In [None]:
import os
import uuid

import transformers
from torch import cuda
from torch import cuda, bfloat16
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [None]:
# Load the Llama-2 chat model and its tokenizer from the Hugging Face Hub.

#choose model id
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# default torch device string: current CUDA device if available, otherwise CPU
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# GPU index the whole model is pinned to through `device_map` below.
# This is independent of `device` above, so it gets its own name.
model_gpu_index = 3

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# NOTE(review): `bnb_config` is defined but NOT passed to `from_pretrained`
# below; `quantization_config` is used instead. Confirm which one is intended.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# configuration that is actually handed to the model loader
quantization_config = transformers.BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True
)

# begin initializing HF items, need auth token for these
# Never commit access tokens: read the token from the environment instead.
# Any token that was previously hard-coded here must be treated as leaked and
# revoked in the Hugging Face account settings.
# If neither variable is set, `None` is passed on (presumably letting the
# library fall back to a cached `huggingface-cli login` token -- verify).
hf_auth = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

#load model
# NOTE(review): `trust_remote_code=True` allows code shipped with the model
# repository to run locally -- confirm it is really needed for this model.
llama_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=quantization_config,
    device_map={"": model_gpu_index},  # place every module on this GPU
    use_auth_token=hf_auth
)
llama_model.eval()  # inference mode
# report the placement requested via `device_map`, not the unrelated `device`
print(f"Model loaded on cuda:{model_gpu_index}")

#load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

In [None]:
#create pipeline to generate text, set model parameters
# NOTE(review): `temperature` is set but no `do_sample` flag is passed;
# confirm whether sampling is actually intended for this pipeline.
generate_text = transformers.pipeline(
    model=llama_model, tokenizer=tokenizer,
    return_full_text=True,   # output contains the prompt followed by the completion
    task='text-generation',
    temperature=0.75,
    max_new_tokens=300,      # upper bound on generated tokens per call
    repetition_penalty=1.1,
)  # closing parenthesis was missing, which made this cell a SyntaxError

#create hf pipeline -> needed for langchain
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
#embeddings
# Sentence-embedding model used to index and query the plant documents.
model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is available and fall back to CPU otherwise, mirroring
# the availability check used for `device`, instead of hard-coding 'cuda'.
embedding_device = 'cuda' if cuda.is_available() else 'cpu'

model_norm = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': embedding_device},
    encode_kwargs=encode_kwargs
)

In [None]:
# Prompt construction for question answering in the Llama-2 chat format.
# Delimiters around the whole instruction ...
B_INST = "[INST]"
E_INST = "[/INST]"
# ... and around the system prompt embedded inside it.
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"


def get_prompt(instruction, system_prompt):
    """Wrap an instruction and a system prompt in the Llama-2 chat markers.

    The result has the layout
    ``[INST]<<SYS>>\\n{system_prompt}\\n<</SYS>>\\n\\n{instruction}[/INST]``.

    Args:
        instruction: Text placed after the system block (may contain
            template placeholders, which are left untouched).
        system_prompt: Text placed between the system-prompt markers.

    Returns:
        The assembled prompt string.
    """
    pieces = (B_INST, B_SYS, system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)
# System prompt: fixes the assistant's role and forbids invented answers.
sys_prompt = """\
You are a helpful, plant caretaking assistant. Use the following pieces of information to answer the user's question. You can assume that the information is always about the plant in question even if the name is different.
If you don't know the answer, just say that you don't know, do under no circumstances try to make up an answer."""

# Instruction template with `{context}` and `{question}` placeholders.
# The line breaks around the context were previously written as "/n", which
# put the literal characters "/n" into the prompt; they are real newline
# escapes now.
instruction = """
CONTEXT:\n\n {context}\n

Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
def get_llama_answer(question, file):
    """
    Answer a question about one plant with LangChain retrieval question answering.

    The text file is loaded, embedded into a temporary Chroma collection and
    used as the retrieval context for the Llama-2 prompt.

    Args:
        question: The question / user input.
        file: Path of the text file belonging to the plant in question.

    Returns:
        Whatever the `RetrievalQA` chain returns when called with `question`.
    """
    documents = TextLoader(file).load()

    #embed document
    # Each call gets its own uniquely named collection. Without an explicit
    # name every call writes to the same default collection, so documents of
    # previously queried plants could be retrieved instead of this one.
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=model_norm,
        collection_name=f"plant-{uuid.uuid4().hex}",
    )

    try:
        #initialize retriever, in this case only one document to choose
        retriever = vectordb.as_retriever(search_kwargs={"k": 1})

        #create prompt
        llama_prompt = PromptTemplate(
            template=get_prompt(instruction, sys_prompt),
            input_variables=["context", "question"],
        )

        #initialize retrieval augmented generation pipeline
        rag_pipeline = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=retriever,
            chain_type_kwargs={"prompt": llama_prompt},
        )

        return rag_pipeline(question)
    finally:
        # drop the throwaway collection so repeated calls do not pile up
        # embeddings in memory
        vectordb.delete_collection()