### Test Vicuna

* `Background`: https://python.langchain.com/en/latest/modules/models/llms/integrations/llamacpp.html
* Reproduces the logic used in the API of the `auto-evaluator`

In [None]:
!pip install llama-cpp-python

In [None]:
import glob, os

from langchain import PromptTemplate, LLMChain
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatAnthropic, ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import LlamaCpp
from langchain.llms import Replicate
from langchain.retrievers import SVMRetriever, TFIDFRetriever
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

`Load`

In [None]:
def load_docs(files):
    """Read local files and concatenate their text.

    @param files: iterable of file paths; ``.pdf`` and ``.txt`` are handled
        (the extension match is case-insensitive)
    @return: one str holding the text of every handled file, in input order

    Files with any other extension are reported on stdout and skipped.
    """
    # TODO: Support multiple docs, use a LangChain loader for PDFs as well

    parts = []
    for file_path in files:
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == ".pdf":
            # Imported here so that text-only runs do not require pypdf.
            import pypdf

            pdf_reader = pypdf.PdfReader(file_path)
            parts.extend(page.extract_text() for page in pdf_reader.pages)
        elif file_extension == ".txt":
            loader = UnstructuredFileLoader(file_path)
            docs = loader.load()
            parts.append(docs[0].page_content)
        else:
            print('Please provide txt or pdf.')

    # Join once at the end instead of growing a string inside the loop.
    return "".join(parts)

# glob returns matches in arbitrary order; sort so the concatenated text
# (and every split derived from it) is the same on each run.
fis = sorted(glob.glob("docs/karpathy-lex-pod/*.txt"))
text = load_docs(fis)

`Split`

In [None]:
def split_texts(text, chunk_size, overlap, split_method):
    """Split text into chunks.

    @param text: str to split
    @param chunk_size: chunk size handed to the splitter
    @param overlap: chunk overlap handed to the splitter
    @param split_method: "RecursiveTextSplitter" or "CharacterTextSplitter"
    @return: list of str splits
    @raise ValueError: if split_method is not one of the two supported names
    """

    print("`Splitting doc ...`")
    if split_method == "RecursiveTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                       chunk_overlap=overlap)
    elif split_method == "CharacterTextSplitter":
        text_splitter = CharacterTextSplitter(separator=" ",
                                              chunk_size=chunk_size,
                                              chunk_overlap=overlap)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown split_method {split_method!r}; expected "
            "'RecursiveTextSplitter' or 'CharacterTextSplitter'")
    return text_splitter.split_text(text)

# Splitter settings for the loaded text.
chunk_size = 500
overlap = 20
split_method = "RecursiveTextSplitter"
splits = split_texts(text=text,
                     chunk_size=chunk_size,
                     overlap=overlap,
                     split_method=split_method)

`Test model`

In [None]:
### *** update with your local path (or set the LLAMA_CPP_PATH env var) *** ###
# os.path.join(..., "") guarantees a trailing separator, so code that builds
# model paths as `LLAMA_CPP_PATH + "models/..."` resolves inside the llama.cpp
# checkout instead of producing ".../llama.cppmodels/...".
LLAMA_CPP_PATH = os.path.join(
    os.environ.get("LLAMA_CPP_PATH", "/Users/31treehaus/Desktop/AI/llama.cpp"),
    "")

In [None]:
# Pass the raw question into the prompt template.
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])

callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(
    # os.path.join inserts the separator that plain string concatenation
    # omits when LLAMA_CPP_PATH has no trailing slash.
    model_path=os.path.join(LLAMA_CPP_PATH, "models", "vicuna_13B", "ggml-vicuna-13b-4bit.bin"),
    callback_manager=callback_manager,
    verbose=True,
    n_threads=6,
    n_ctx=2048,
    use_mlock=True)

# Sanity check: ask the local model a question with no retrieval involved.
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
llm_chain.run(question)

`Make Retrieval Chain`

In [None]:
def make_retriever(splits, retriever_type, embeddings, num_neighbors):
    """Make document retriever.

    @param splits: list of str splits
    @param retriever_type: "similarity-search", "SVM" or "TF-IDF"
    @param embeddings: "OpenAI" or "HuggingFace" (not used for "TF-IDF")
    @param num_neighbors: number of neighbors returned by the
        "similarity-search" retriever
    @return: retriever
    @raise ValueError: if retriever_type is unsupported, or embeddings is
        unsupported for a retriever type that needs an embedding model
    """

    print("`Making retriever ...`")
    if retriever_type not in ("similarity-search", "SVM", "TF-IDF"):
        raise ValueError(
            f"Unknown retriever_type {retriever_type!r}; expected "
            "'similarity-search', 'SVM' or 'TF-IDF'")

    # This branch takes no embedding model, so return before creating one.
    if retriever_type == "TF-IDF":
        return TFIDFRetriever.from_texts(splits)

    # Set embeddings
    if embeddings == "OpenAI":
        embd = OpenAIEmbeddings()
    elif embeddings == "HuggingFace":
        embd = HuggingFaceEmbeddings()
    else:
        raise ValueError(
            f"Unknown embeddings {embeddings!r}; expected 'OpenAI' or 'HuggingFace'")

    # Select retriever
    if retriever_type == "similarity-search":
        try:
            vectorstore = FAISS.from_texts(splits, embd)
        except ValueError:
            print("`Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.`")
            vectorstore = FAISS.from_texts(splits, HuggingFaceEmbeddings())
        # The neighbor count has to be passed via search_kwargs; a bare `k=`
        # keyword is not a retriever setting and leaves the default in place.
        return vectorstore.as_retriever(search_kwargs={"k": num_neighbors})

    # Only "SVM" is left at this point.
    return SVMRetriever.from_texts(splits, embd)

# Retriever settings: FAISS similarity search over OpenAI embeddings.
num_neighbors = 3
embeddings = "OpenAI"
retriever_type = "similarity-search"
retriever = make_retriever(splits=splits,
                           retriever_type=retriever_type,
                           embeddings=embeddings,
                           num_neighbors=num_neighbors)

`Make Prompt`

In [None]:
# Retrieval-QA prompt; written as explicit pieces so the trailing spaces and
# line breaks of the template are visible.
template = (
    "Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n"
    "{context}\n"
    "Question: {question}\n"
    "Answer: Think step by step "
)

QA_CHAIN_PROMPT = PromptTemplate(template=template,
                                 input_variables=["context", "question"])

In [None]:
def make_llm(model):
    """
    Make LLM
    @param model: LLM to use: "gpt-3.5-turbo", "gpt-4", "anthropic",
        "vicuna-7b" or "vicuna-13b"
    @return: LLM
    @raise ValueError: if model is not one of the supported names
    """

    # Weight files of the local Vicuna models, relative to LLAMA_CPP_PATH.
    vicuna_weights = {
        "vicuna-7b": os.path.join("models", "vicuna_7B", "ggml-vicuna-7b-q4_0.bin"),
        "vicuna-13b": os.path.join("models", "vicuna_13B", "ggml-vicuna-13b-4bit.bin"),
    }

    if model in ("gpt-3.5-turbo", "gpt-4"):
        return ChatOpenAI(model_name=model, temperature=0)
    if model == "anthropic":
        return ChatAnthropic(temperature=0)
    if model in vicuna_weights:
        callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
        return LlamaCpp(
            # os.path.join supplies the separator that string concatenation
            # drops when LLAMA_CPP_PATH has no trailing slash.
            model_path=os.path.join(LLAMA_CPP_PATH, vicuna_weights[model]),
            callback_manager=callback_manager,
            verbose=True,
            n_threads=6,
            n_ctx=2048,
            use_mlock=True)

    # Fail with a clear message instead of an UnboundLocalError on return.
    raise ValueError(f"Unknown model {model!r}")

# Use the local 13B Vicuna model for the retrieval run below.
llm = make_llm(model="vicuna-13b")

`Eval Set`

In [None]:
import json, pandas as pd
# Read the eval CSV and turn each row into a {"question", "answer"} record.
test_dataset = pd.read_csv("docs/karpathy-lex-pod/karpathy-pod-eval.csv")
qus = [
    {"question": q, "answer": a}
    for q, a in zip(test_dataset["question"], test_dataset["answer"])
]

In [None]:
# Show the first question/answer record built from the eval CSV.
qus[0]

`Run Inference`

In [None]:
def make_chain(llm, retriever, retriever_type):
    """
    Build a "stuff" RetrievalQA chain that prompts with QA_CHAIN_PROMPT
    @param llm: model
    @param retriever: retriever
    @param retriever_type: retriever type (accepted but not used in this function)
    @return: RetrievalQA chain whose input key is "question"
    """

    return RetrievalQA.from_chain_type(
        llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs=dict(prompt=QA_CHAIN_PROMPT),
        input_key="question",
    )

# Build the chain, run it on the first eval record, and display the result.
qa_chain = make_chain(llm=llm, retriever=retriever, retriever_type=retriever_type)
result = qa_chain(qus[0])
result

`Test Vicuna endpoint on Replicate`

Deployed to `A100` on Replicate.

* `max_length`: maximum length of the prompt + the output for a given generation
* `context window`: 2048 tokens

Useful reference:
https://github.com/replicate/cog-vicuna-13b/issues/3



In [None]:
# Rebind `llm` to the Vicuna-13B model hosted on Replicate.
llm = Replicate(
    model="replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e",
    input={"temperature": 0.75, "max_length": 3000, "top_p": 0.25},
)

In [None]:
from text_utils import QA_CHAIN_PROMPT, QA_CHAIN_PROMPT_LLAMA
# Same "stuff" retrieval chain as before, but with the LLaMA-specific prompt.
chain_type_kwargs = dict(prompt=QA_CHAIN_PROMPT_LLAMA)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    input_key="question",
)

In [None]:
# Run the chain built in the previous cell on the first eval record.
qa_chain(qus[0])

`Test Mosaic`

LangChain docs: 

https://python.langchain.com/en/latest/modules/models/text_embedding/examples/mosaicml.html

Args: 

https://docs.mosaicml.com/en/latest/inference.html

In [None]:
from langchain.llms import MosaicML

In [None]:
# Rebind `llm` to the MosaicML endpoint.
llm = MosaicML(
    inject_instruction_format=True,
    model_kwargs={"do_sample": False, "max_length": 3000},
)

In [None]:
# Prompt for the MosaicML run; written as explicit pieces so the trailing
# spaces and line breaks of the template are visible.
template = (
    "Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n"
    "{context}\n"
    "Question: {question} "
)
QA_CHAIN_PROMPT = PromptTemplate(template=template,
                                 input_variables=["context", "question"])

In [None]:
# "stuff" retrieval chain over the current `llm`, using QA_CHAIN_PROMPT.
chain_type_kwargs = dict(prompt=QA_CHAIN_PROMPT)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    input_key="question",
)

In [None]:
# Run the chain built in the previous cell on the first eval record.
qa_chain(qus[0])