In [None]:
import os

# Go to the root dir (the parent of the directory the notebook was started in).
# The starting directory is remembered on the first run so that re-running this
# cell does not climb one more level each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "../"))
print(f"current working directory - {os.getcwd()}")

In [None]:
# Load environment variables (such as the API keys read later via os.environ) with dotenv.

from dotenv import load_dotenv
load_dotenv()

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model client. The key lookup uses os.environ[...], so a missing
# OPENAI_API_KEY raises KeyError here.
chat = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

In [None]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage,
)

# Seed conversation: a system instruction followed by human / AI / human turns.
messages = [SystemMessage(content="You are a helpful assistant.")]
messages += [
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand string theory."),
]

In [None]:
# Send the whole conversation to the chat model; the bare `res` displays what came back.
res = chat(messages)
res

In [None]:
# Print only the text content of the reply.
print(res.content)

In [None]:
# Continue the conversation: record the model's previous reply in the history,
# then add a follow-up question and ask again.
prompt = HumanMessage(
    content="Why do physicists believe it can produce a 'unified theory'?"
)
messages.extend([res, prompt])

res = chat(messages)
print(res.content)

In [None]:
# Another way to give the model knowledge: hand-written context passages that are
# joined into one newline-separated string.

llmchain_information = [
    "A LLMChain is the most common type of chain. It consists of a PromptTemplate, a model (either an LLM or a ChatModel), and an optional output parser. This chain takes multiple input variables, uses the PromptTemplate to format them into a prompt. It then passes that to the model. Finally, it uses the OutputParser (if provided) to parse the output of the LLM into a final format.",
    "Chains is an incredibly generic concept which returns to a sequence of modular components (or other chains) combined in a particular way to accomplish a common use case.",
    "LangChain is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model via an api, but will also: (1) Be data-aware: connect a language model to other sources of data, (2) Be agentic: Allow a language model to interact with its environment. As such, the LangChain framework is designed with the objective in mind to enable those types of applications."
]

# One passage per line.
source_knowledge = "\n".join(llmchain_information)

In [None]:
query = "Can you tell me about the LLMChain in LangChain?"

# Prompt layout: an instruction, then the joined context passages, then the question.
augmented_prompt = f"""Using the contexts below, answer the query.

Contexts:
{source_knowledge}

Query: {query}"""

In [None]:
# Wrap the augmented prompt as a user turn, add it to the history, and query the model.
prompt = HumanMessage(content=augmented_prompt)
messages.append(prompt)

res = chat(messages)
print(res.content)

In [None]:
# Load the "train" split of the chunked Llama 2 arXiv papers dataset.
from datasets import load_dataset

dataset = load_dataset("jamescalam/llama-2-arxiv-papers-chunked", split="train")

In [None]:
# Inspect the first record to see which fields each entry carries.
dataset[0]

In [None]:
# building the knowledge base using vectordb

import pinecone

# Initialise the Pinecone client with credentials taken from the environment.
# os.getenv yields None for an unset variable rather than raising.
pinecone.init(
    environment=os.getenv("PINECONE_ENVIRONMENT"),
    api_key=os.getenv("PINECONE_API_KEY"),
)

In [None]:
import time

index_name = "llm-chatbot"

# Create the index only when it does not exist yet.
# NOTE(review): dimension=1536 presumably matches the embedding model's vector
# length; confirm with the len(res[0]) check in the embedding smoke-test cell.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, metric="cosine", dimension=1536)

    # Poll once per second until the new index reports itself as ready.
    while True:
        if pinecone.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pinecone.Index(index_name)


In [None]:
# Display the statistics reported for the index.
index.describe_index_stats()

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model used both for the stored chunks (embed_documents) and for queries (embed_query).
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [None]:
# Smoke test: embed two short strings and show (number of vectors, length of one vector).
texts = ["this is the first chunk of text", "then another second chunk of text is here"]
res = embed_model.embed_documents(texts)
(len(res), len(res[0]))

In [None]:
from tqdm.auto import tqdm # for progressbar

# Convert the dataset to a pandas DataFrame so it can be sliced into row batches.
data = dataset.to_pandas()

In [None]:
def update_embeddings(batch_size=100):
    """Embed every chunk in ``data`` and upsert the vectors into ``index``.

    Walks the module-level ``data`` DataFrame in slices of ``batch_size`` rows.
    For each slice it builds one id per row from the ``doi`` and ``chunk-id``
    columns, embeds the ``chunk`` text with ``embed_model.embed_documents``,
    and upserts ``(id, embedding, metadata)`` tuples into ``index``. The
    metadata of each vector carries the chunk text, its source and its title.

    Args:
        batch_size: Number of rows embedded and upserted per request.
            Defaults to 100.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    for start in tqdm(range(0, len(data), batch_size)):
        # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
        batch = data.iloc[start:start + batch_size]

        # Read each needed column once instead of re-running iterrows() per field.
        texts = batch["chunk"].tolist()

        # Unique id for each chunk: "<doi>-<chunk-id>".
        ids = [
            f"{doi}-{chunk_id}"
            for doi, chunk_id in zip(batch["doi"], batch["chunk-id"])
        ]

        embeds = embed_model.embed_documents(texts)

        metadata = [
            {"text": text, "source": source, "title": title}
            for text, source, title in zip(texts, batch["source"], batch["title"])
        ]

        # Pass a concrete list rather than a one-shot zip iterator, so the client
        # can measure or re-read the payload if it needs to.
        index.upsert(vectors=list(zip(ids, embeds, metadata)))

# Uncomment to populate the index (embeds every chunk and upserts it batch by batch):
# update_embeddings()

In [None]:

# Display the index statistics again (e.g. to check the result of update_embeddings).
index.describe_index_stats()

In [None]:
from langchain.vectorstores import Pinecone

# "text" is the metadata key under which update_embeddings stores each chunk's text.
text_field = "text"
vectorstore = Pinecone(index, embed_model.embed_query, text_field)


In [None]:
query = "What is so special about Llama 2?"

# Retrieve the 3 stored entries most similar to the query.
vectorstore.similarity_search(query, k=3)

In [None]:
def augment_prompt(query: str, k: int = 3) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Runs a similarity search against the module-level ``vectorstore``, joins the
    ``page_content`` of the returned results with newlines, and places that text
    in front of the query.

    The template is assembled line by line so that the function body's
    indentation does not leak into the prompt; the layout matches the
    hand-built ``augmented_prompt`` used earlier in the notebook.

    Args:
        query: The user's question.
        k: How many results to retrieve from the knowledge base. Defaults to 3.

    Returns:
        The prompt text: instruction, contexts, then the query.
    """
    # Get the top-k results from the knowledge base.
    results = vectorstore.similarity_search(query, k=k)
    # Keep only the text of each result, one per line.
    source_knowledge = "\n".join(doc.page_content for doc in results)
    return (
        "Using the contexts below, answer the query.\n\n"
        "Contexts:\n"
        f"{source_knowledge}\n\n"
        f"Query: {query}"
    )


# Preview the augmented prompt built for the current query.
print(augment_prompt(query))

In [None]:
# Ask the current query with retrieved context attached, and keep the turn in the history.
prompt = HumanMessage(content=augment_prompt(query))
messages.append(prompt)

res = chat(messages)
print(res.content)

In [None]:
# Baseline: ask the question as-is, without retrieved context.
prompt = HumanMessage(
    content="what safety measures were used in the development of llama 2?"
)

# messages + [prompt] builds a temporary list, so this question is not added to the history.
res = chat(messages + [prompt])
print(res.content)

In [None]:
# Same question, this time wrapped with retrieved context by augment_prompt.
prompt = HumanMessage(
    content=augment_prompt("what safety measures were used in the development of llama 2?")
)

# Unpacking into a fresh list leaves `messages` itself unchanged.
res = chat([*messages, prompt])
print(res.content)