In [1]:
import json
import os

import boto3

# AWS session bound to the local "dev-profile" credentials profile, and the
# SageMaker runtime client used to invoke the endpoint.
boto_session = boto3.Session(profile_name='dev-profile')
client = boto_session.client("sagemaker-runtime")

# Name of the SageMaker endpoint to call. It has to be the same name Terraform
# used when creating the endpoint. Set SAGEMAKER_ENDPOINT_NAME to point the
# notebook at a different endpoint without editing this cell; the default is
# the endpoint this notebook was written against.
ENDPOINT_NAME = os.environ.get("SAGEMAKER_ENDPOINT_NAME", "mistral-model-endpoint")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/katie/Library/Application Support/sagemaker/config.yaml


In [4]:
# Smoke test: send one question/context payload to confirm the endpoint responds.
body = {
    "question": "List 3 synonyms for the word tiny.",
    "context": "Synonyms for tiny are small, minute, small-scale, mini, baby, fun-size, petite.",
}
payload = json.dumps(body)

response = client.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType="application/json",
    Accept="application/json",
    Body=payload,
)

# The response body is a stream of bytes: read it fully, then decode for printing.
raw_reply = response["Body"].read()
print(raw_reply.decode("utf-8"))

{"score":0.506294310092926,"start":0,"end":26,"answer":"you are a smart dictionary"}


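Try invoking the endpoint from the AWS CLI (with AWS CLI v2, add `--cli-binary-format raw-in-base64-out` so the raw JSON body is accepted): aws sagemaker-runtime invoke-endpoint --endpoint-name stablelm-zephyr-3b-endpoint --body '{"inputs": "What is the capital of France?"}' --content-type application/json output.json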


from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering     
import tensorflow as tf     

# Local extractive question-answering example with the SQuAD-distilled DistilBERT checkpoint.
QA_CHECKPOINT = "distilbert-base-uncased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(QA_CHECKPOINT)
model = TFDistilBertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

# Encode the (question, passage) pair as TensorFlow tensors and run the model.
inputs = tokenizer(question, text, return_tensors="tf")
outputs = model(**inputs)

# The highest-scoring start and end positions delimit the predicted answer span.
start_logits = outputs.start_logits
end_logits = outputs.end_logits
answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])

# Slice the answer token ids out of the first (only) sequence and decode them to text.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

In [None]:
from langchain.llms.sagemaker_endpoint import LLMContentHandler

class ContentHandler(LLMContentHandler):
    """Converts prompts to the endpoint's JSON request body and extracts the reply text."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt, model_kwargs):
        """Build the UTF-8 encoded JSON request for a single prompt.

        The prompt is sent as the user turn of a two-message conversation
        (after a fixed system message); `model_kwargs` are forwarded as the
        request's "parameters".
        """
        conversation = [
            {"role": "system", "content": "You are a kind robot."},
            {"role": "user", "content": prompt},
        ]
        request = {
            "inputs": [conversation],
            "parameters": dict(model_kwargs),
        }
        return json.dumps(request).encode('utf-8')

    def transform_output(self, output):
        """Read the response stream and return the content of the first generation."""
        raw = output.read()
        generations = json.loads(raw.decode("utf-8"))
        first = generations[0]
        return first["generation"]["content"]


# Single handler instance shared with the SagemakerEndpoint LLM.
content_handler = ContentHandler()


In [None]:
from langchain import SagemakerEndpoint

# Wrap the deployed endpoint as a LangChain LLM.
# Use the same AWS credentials profile as the boto3 session created at the top
# of the notebook, so this wrapper and the raw `client` talk to the endpoint
# with the same credentials instead of whatever the default chain resolves to.
model = SagemakerEndpoint(
    endpoint_name=ENDPOINT_NAME,
    credentials_profile_name="dev-profile",
    region_name='eu-west-1',
    # Generation parameters forwarded to the endpoint on every call.
    model_kwargs={"max_new_tokens": 700, "top_p": 0.9, "temperature": 0.2},
    content_handler=content_handler,
)

In [None]:
from transformers import pipeline
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import HuggingFacePipeline

# `model` is the SagemakerEndpoint created in the previous cell, which is
# already a LangChain LLM. transformers.pipeline() only accepts a model id or a
# loaded transformers model, so wrapping the remote endpoint (together with the
# DistilBERT tokenizer) in a local text-generation pipeline cannot work.
# Use the endpoint directly as the chain's LLM; generation settings such as
# temperature and max_new_tokens are sent through the endpoint's `model_kwargs`.
llm = model

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Prompt template for the chain.
# Its markup should follow the chat format of the model being called, so
# adjust the tags below if the model expects a different format.
prompt_template = """
<|system|>
The following is a friendly conversation between a human and an AI. 
The AI is talkative and provides lots of specific details from its context.
If the AI does not know the answer to a question, it truthfully says it does not know. 
Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

# `context` and `question` are the two placeholders filled in when the chain runs.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain: fill in the prompt -> call the LLM -> return the result as a plain string.
output_parser = StrOutputParser()
llm_chain = prompt | llm | output_parser

# Alternative: tokenizer.apply_chat_template converts a list of messages
# (dicts such as {'role': 'user', 'content': '(...)'}) into a string with the
# appropriate chat format.

In [None]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to embed queries against the saved index.
# NOTE(review): presumably the same model the index was built with - confirm.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# NOTE(security): loading a FAISS index deserialises a pickle file, which can
# execute arbitrary code. allow_dangerous_deserialization=True is only
# acceptable because faiss_db/ is assumed to be an index we created ourselves;
# never point this at an untrusted file.
db = FAISS.load_local(
    folder_path="faiss_db/",
    embeddings=embeddings,
    index_name="nasa_index",
    allow_dangerous_deserialization=True,
)

# This retriever returns the top 5 similar chunks
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})


def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Finally, combine the llm_chain with the retriever to create a RAG chain.
# The original question is passed through unchanged to the generation step.
# The retrieved documents are reduced to their text first, so the prompt's
# {context} slot receives plain passages rather than the repr of a list of
# Document objects (which would also leak metadata into the prompt).
rag_chain = {"context": retriever | _format_docs, "question": RunnablePassthrough()} | llm_chain

In [None]:
# Ask the RAG chain a question; the answer is generated with context retrieved from the NASA index.
nasa_question = "What can you tell me about the latest space discoveries?"
rag_chain.invoke(nasa_question)