In [20]:
import boto3
import json
import os

boto_session = boto3.Session(profile_name='dev-profile')
client = boto_session.client("sagemaker-runtime")
# This has to be the same name as used in terraform to name the endpoint
# TODO set it as env var endpoint_name = os.environ["LLAMA_2_70B_ENDPOINT"]
ENDPOINT_NAME = "mistral-model-endpoint"
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

#os.environ["HF_API_TOKEN"] = 'token'
HF_API_TOKEN= os.environ["HF_API_TOKEN"]

In [24]:
# first test to verify endpoint is functioning correctly
# body={"question":"List 3 synonyms for the word tiny.", "context":"Synonyms for tiny are small, minute, small-scale, mini, baby, fun-size, petite."}
user_message = "Write a poem about a cat named Homer."
prompt = f"<s>[INST] {user_message} [/INST]"

payload = {
    "inputs": prompt,
}

response = client.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(payload)
)
result = json.loads(response['Body'].read().decode())
print(result)

[{'generated_text': "<s>[INST] Write a poem about a cat named Homer. [/INST] In the quiet of the twilight hour, when shadows long do grace the floor,\n\nA silhouette of feline grace, strides with deliberate pace,\n\nMeet Homer, the cat of velvet paws, with emerald eyes intensely luminous and raw,\n\nAn opus of feline wit and charm, defying life's monotone bloom.\n\nHis thick, coal-black fur shimmers in"}]


In [10]:
from langchain.llms.sagemaker_endpoint import LLMContentHandler

# TODO figure out whether this is necesasary and how to adapt it
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"
    
    # this should be adapted to include context
    def transform_input(self, prompt,model_kwargs, context = None):
        input_str = json.dumps(
            {
                "inputs" : [[
                    {"role" : "system",
                    "content" : "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity. And use {context} to respond. If you do not know the answers respond with: I don't know."
                    },
                    {"role" : "user", 
                    "content" : prompt}]],
                "parameters" : {**model_kwargs}
            })
        return input_str.encode('utf-8')
    
    def transform_output(self, output):
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json[0]["generation"]["content"]
    

content_handler = ContentHandler()


In [11]:
from langchain import SagemakerEndpoint

llm=SagemakerEndpoint(
     endpoint_name=ENDPOINT_NAME, 
     region_name='eu-west-1', 
     model_kwargs={"max_new_tokens": 700, "top_p": 0.9, "temperature": 0.2},
     content_handler=content_handler
 )

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from transformers import pipeline
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import HuggingFacePipeline

text_generation_pipeline = pipeline(
    model=llm,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



TypeError: Could not infer framework from class <class 'langchain_community.llms.sagemaker_endpoint.SagemakerEndpoint'>.

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Create a prompt template
# this should follow the format of the model, so make sure to use the appropriate formatting.
prompt_template = """
<|system|>
The following is a friendly conversation between a human and an AI. 
The AI is talkative and provides lots of specific details from its context.
If the AI does not know the answer to a question, it truthfully says it does not know. 
Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

# You can also use tokenizer.apply_chat_template to convert a list of messages (as dicts: {'role': 'user', 'content': '(...)'})
# into a string with the appropriate chat format.

In [None]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Finally, we need to combine the llm_chain with the retriever to create a RAG chain.
# We pass the original question through to the final generation step, as well as the retrieved context docs
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

db = FAISS.load_local(folder_path="faiss_db/", embeddings=embeddings, index_name="nasa_index", allow_dangerous_deserialization=True)

# This retriever returns the top 5 similar chunks
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# RunnablePassthrough Usage: The query is then passed along using RunnablePassthrough(). 
# This function is a part of LangChain’s API and is used to pass the query to the next step in the chain.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [None]:
# A RAG response considering our NASA context
rag_chain.invoke("What can you tell me about the latest space discoveries?")