In [2]:
import openai
import langchain
import os
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import logging

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] - %(message)s ',
                    handlers=[
                        logging.FileHandler('./langchaindemo.log', mode='w'),
                        logging.StreamHandler(),
                    ],
                    force=True)
logger = logging.getLogger(__name__)
logger.info("Langchain Demo Initialized")

[INFO] - Langchain Demo Initialized 


In [5]:
def get_docs():
    """
    Loads each file into one document (knowledge base)
    :return: docs
    """

    loader = DirectoryLoader(  # Reads custom data from local files
        path="docs",
        glob="*.txt",
        loader_cls=TextLoader  # Loader class to use for loading files
    )

    docs = loader.load()
    return docs

docs = get_docs()
docs[0]

[DEBUG] - Processing file: docs\countries.txt 
[DEBUG] - Processing file: docs\geneva.txt 
[DEBUG] - Processing file: docs\michael.txt 
[DEBUG] - Processing file: docs\nina.txt 
[DEBUG] - Processing file: docs\pizza.txt 
[DEBUG] - Processing file: docs\pizza_theorem.txt 


Document(metadata={'source': 'docs\\countries.txt'}, page_content='The following is a list providing an overview of sovereign states around the world with information on their status and recognition of their sovereignty.\n\nThe 205 listed states can be divided into three categories based on membership within the United Nations System: 193 UN member states,[1] two UN General Assembly non-member observer states, and ten other states. The sovereignty dispute column indicates states having undisputed sovereignty (188 states, of which there are 187 UN member states and one UN General Assembly non-member observer state), states having disputed sovereignty (15 states, of which there are six UN member states, one UN General Assembly non-member observer state, and eight de facto states), and states having a special political status (two states, both in free association with New Zealand).\n\nCompiling a list such as this can be complicated and controversial, as there is no definition that is bin

In [6]:
def get_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """
    Get chunks from docs. Our loaded doc may be too long for most models, and even if it fits it can struggle to find relevant context. So we generate chunks
    :param docs: docs to be splitted

    :return: chunks
    """

    text_splitter = RecursiveCharacterTextSplitter( # recommended splitter for generic text. split documents recursively by different characters - starting with "\n\n", then "\n", then " "
        chunk_size=chunk_size,        # max size (in terms of number of characters) of the final documents
        chunk_overlap=chunk_overlap,  # how much overlap there should be between chunks
        add_start_index=True
    )
    chunks = text_splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks

chunks = get_chunks(docs)
chunks

[INFO] - Split 6 documents into 217 chunks. 


[Document(metadata={'source': 'docs\\countries.txt', 'start_index': 0}, page_content='The following is a list providing an overview of sovereign states around the world with information on their status and recognition of their sovereignty.\n\nThe 205 listed states can be divided into three categories based on membership within the United Nations System: 193 UN member states,[1] two UN General Assembly non-member observer states, and ten other states. The sovereignty dispute column indicates states having undisputed sovereignty (188 states, of which there are 187 UN member states and one UN General Assembly non-member observer state), states having disputed sovereignty (15 states, of which there are six UN member states, one UN General Assembly non-member observer state, and eight de facto states), and states having a special political status (two states, both in free association with New Zealand).'),
 Document(metadata={'source': 'docs\\countries.txt', 'start_index': 824}, page_content

In [7]:
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings( #  embedding=OpenAIEmbeddings() rate limit
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cuda'} #TODO CHANGE IF NOT USING GPU
)

  embeddings = HuggingFaceEmbeddings( #  embedding=OpenAIEmbeddings() rate limit
  from tqdm.autonotebook import tqdm, trange
[INFO] - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 
[DEBUG] - Starting new HTTPS connection (1): huggingface.co:443 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/a

In [9]:
vector = embeddings.embed_query("Hola como estas?")
embedding_size = len(vector)  # HF 384 ; OPENAI 1536
embedding_size

384

In [10]:
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

def get_vector_store(index_name, embeddings, embedding_size=384):
  """ Creates vector store from Pinecone for storing and managing embeddings.

    :param str index_name: The name of the index to create or retrieve from Pinecone.
    :param str embeddings: The embedding function to be used to generate embeddings
    :param int embedding_size: The size (dimension) of the embeddings. Defaults to 384 (e.g., for sentence-transformers/all-MiniLM-L6-v2).

    :return: PineconeVectorStore: An object representing the vector store in Pinecone for managing embeddings.

    :raise: ValueError: If the index creation fails due to invalid parameters or connection issues.
  """

  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])  # Pinecone is initialized using an API key stored in the environment variable

  if INDEX_NAME not in pc.list_indexes().names():        # Check whether an index with the given index_name already exists
      pc.create_index(
          name=INDEX_NAME,          # Name of the index
          dimension=embedding_size, # Size of the vectors (embeddings)
          metric="cosine",          # Distance metric used to compare vectors
          spec=ServerlessSpec(      # Determines the infrastructure used
              cloud='aws',          # Specifies that the Pinecone index is hosted on AWS
              region='us-east-1'    # Specifies the region of the cloud provider
          )
      )

  vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings) # initializes a PineconeVectorStore object using the index_name and the provided embeddings model or function

  return vectorstore

In [13]:
INDEX_NAME = "langchain-demo-index-dl"
vectorstore = get_vector_store(INDEX_NAME, embeddings, embedding_size)

[INFO] - Discovering subpackages in _NamespacePath(['C:\\dev\\langy\\env\\Lib\\site-packages\\pinecone_plugins']) 
[INFO] - Looking for plugins in pinecone_plugins.inference 
[INFO] - Installing plugin inference into PineconeGRPC 
[DEBUG] - response body: b'{"indexes":[]}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index-dl","metric":"cosine","dimension":384,"status":{"ready":false,"state":"Initializing"},"host":"langchain-demo-index-dl-cumn93c.svc.aped-4627-b74a.pinecone.io","spec":{"serverless":{"region":"us-east-1","cloud":"aws"}},"deletion_protection":"disabled"}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index-dl","metric":"cosine","dimension":384,"status":{"ready":false,"state":"Initializing"},"host":"langchain-demo-index-dl-cumn93c.svc.aped-4627-b74a.pinecone.io","spec":{"serverless":{"region":"us-east-1","cloud":"aws"}},"deletion_protection":"disabled"}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index-dl","metric":"cosine","dimension":384,"status":{

In [14]:
vectorstore.add_documents(chunks)

[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":25}' 


['f1414a1a-29b1-404a-9a25-f75dec6fd212',
 '5a553b29-546d-49e7-9b6c-869478b2720a',
 '8fdcc876-2a6b-4b4f-a1c9-b2415ebca0fa',
 'dd677c42-96b7-4c72-a711-34e432df82fa',
 'c0ab228f-09cd-448f-b8f0-0ba0f7dbd702',
 'e49f3917-ed97-49f6-8fdd-dada3abdb300',
 'c5fb4030-641c-4bb4-9dd4-aef33ccd4195',
 '18c9d47e-5562-4911-a948-69efd552039e',
 'f54c448f-8c61-480f-a021-daea3b146c82',
 '2906d860-f048-4a03-ac86-30070c1dda6a',
 '8624567f-99e9-4363-8655-b6d475a76ec0',
 '5722e630-0136-431d-b3cc-6a8986619034',
 'fe65d048-4910-4015-a329-88c82130743b',
 '47537fa0-cf07-4776-9753-345706fc9bb0',
 'd52da072-8291-41df-b27e-d93f7511e529',
 '1b150832-37c8-4c83-a71c-b86b11395c81',
 '0e548e1b-8dd6-46fc-a759-28b94442e47c',
 '80dee543-eae1-46bb-acb2-3d342f1d8b34',
 '7d0fa160-1486-4f12-b0e0-29ae5c59c6a4',
 '1bf4e0d5-38b1-4314-89a1-366418fb8a46',
 'e5a6af66-2601-4d36-baba-1c2f777ee57b',
 'abee759b-7fef-4a5c-a806-b356cd143f4b',
 'ee242c2e-1ca3-4687-8da7-5e83ee8848de',
 '4a1838d9-03e6-4b71-a263-a6723fece84b',
 'd142860c-6a91-

In [15]:
query = "nina"
vectorstore.search(
    query=query,              # Return docs most similar to query using specified search type.
    search_type="similarity_score_threshold", # can be “similarity”, “mmr”, or “similarity_score_threshold”.
    k=5                       # return top k,
)

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"35b9af8d-6f50-436b-9f4d-862bb529a315","score":0.53698808,"values":[],"metadata":{"source":"docs\\\\nina.txt","start_index":0,"text":"Nina is Thomas\' cat, one of the most well known felines in the community.\\n\\nNina is what is known as a \\"Tuxedo Cat\\", which means her fur color is primarily black, but her underside and belly are white all the way to her chin. Nina\'s paws are white too and very soft (reason why they\'re often called her \\"slippers\\").\\n\\nNina and her siblings were rescued from outside by a kind lady right after they were born during the Covid pandemic. This lady took care of the kittens during the first two weeks of their lives before giving them to a veterinarian called Gregorio. Just as Gregorio received the box of young kittens, Thomas\' parents were having an argument about getting a new dog. Thomas\' father didn\'t want a dog, and suggested getting a cat instead (something he immediately regretted

[Document(id='35b9af8d-6f50-436b-9f4d-862bb529a315', metadata={'source': 'docs\\nina.txt', 'start_index': 0.0}, page_content='Nina is Thomas\' cat, one of the most well known felines in the community.\n\nNina is what is known as a "Tuxedo Cat", which means her fur color is primarily black, but her underside and belly are white all the way to her chin. Nina\'s paws are white too and very soft (reason why they\'re often called her "slippers").\n\nNina and her siblings were rescued from outside by a kind lady right after they were born during the Covid pandemic. This lady took care of the kittens during the first two weeks of their lives before giving them to a veterinarian called Gregorio. Just as Gregorio received the box of young kittens, Thomas\' parents were having an argument about getting a new dog. Thomas\' father didn\'t want a dog, and suggested getting a cat instead (something he immediately regretted, given that the mother accepted). That same day, they went to Gregorio\'s vet

# RAG

In [16]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.chat_models import ChatOpenAI
from langchain_community.llms import OpenAI, HuggingFaceHub
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_pinecone import PineconeVectorStore

def generate_response(db, prompt):
    """
    Generate a response with a LLM based on previous custom context
    :return: chatbot response
    """

    hf_llm = HuggingFaceHub(
        repo_id="HuggingFaceH4/zephyr-7b-beta",  # Model id
        task="text-generation",                  # Specific task the model is intended to perform
        model_kwargs={
            "max_new_tokens": 512,               # The maximum number of tokens to generate in the response.  Limits the length of the generated text to ensure responses are concise or fit within certain constraints.
            "top_k": 30,                         # Limits the sampling pool to the top k tokens, increasing focus on more likely tokens
            "temperature": 0.3,                  # Controls the randomness of predictions, with lower values making the output more deterministic. : Produces more focused and less random text by making the model more confident in its choices.
            "repetition_penalty": 1.2,           # Penalizes repeated tokens to avoid repetitive output.  Discourages the model from repeating the same token sequences, resulting in more varied and natural text.
        },
    )

    chain = RetrievalQA.from_chain_type( # Generate chat model based on previous llm
        llm=hf_llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
        verbose=False
    )

    response = chain.run(prompt)
    return response

def postprocess_response(response):
    answer_start = response.find("Helpful Answer: ")
    if answer_start != -1:
        answer = response[answer_start + len("Helpful Answer: "):].strip()
    else:
        answer = response.strip()

    return answer

In [18]:
user_input = "Tell me about Nina"
response = generate_response(vectorstore, user_input)
response = postprocess_response(response)
response

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"35b9af8d-6f50-436b-9f4d-862bb529a315","score":0.566165268,"values":[],"metadata":{"source":"docs\\\\nina.txt","start_index":0,"text":"Nina is Thomas\' cat, one of the most well known felines in the community.\\n\\nNina is what is known as a \\"Tuxedo Cat\\", which means her fur color is primarily black, but her underside and belly are white all the way to her chin. Nina\'s paws are white too and very soft (reason why they\'re often called her \\"slippers\\").\\n\\nNina and her siblings were rescued from outside by a kind lady right after they were born during the Covid pandemic. This lady took care of the kittens during the first two weeks of their lives before giving them to a veterinarian called Gregorio. Just as Gregorio received the box of young kittens, Thomas\' parents were having an argument about getting a new dog. Thomas\' father didn\'t want a dog, and suggested getting a cat instead (something he immediately regrette

'Nina is a tuxedo cat who was rescued along with her siblings during the pandemic. Her fur is mostly black except for her white belly, paws, and chin. After being taken care of by a kind woman for two weeks, she was adopted by Thomas\' family, where she now spends her time sleeping on his bed and occasionally meowing for food. She likes to sit in the "liquid" or "loaf of bread" positions.'

In [None]:
print("Chatbot: Hello! What would you like to talk about today?")

user_input = ""
while user_input != "bye":
    user_input = input("You: ")
    response = generate_response(vectorstore, user_input) 
    print(f"Chatbot: {postprocess_response(response)}")

Chatbot: Hello! What would you like to talk about today?


You:  Tell me a fun math fact about pizza!


[DEBUG] - response body: b'{"results":[],"matches":[{"id":"609feb42-fa8f-40d6-b53a-61daf163a499","score":0.662092686,"values":[],"metadata":{"source":"docs\\\\pizza_theorem.txt","start_index":0,"text":"In elementary geometry, the pizza theorem states the equality of two areas that arise when one partitions a disk in a certain way.\\n\\nThe theorem is so called because it mimics a traditional pizza slicing technique. It shows that if two people share a pizza sliced into 8 pieces (or any multiple of 4 greater than 8), and take alternating slices, then they will each get an equal amount of pizza, irrespective of the central cutting point.\\n\\nStatement\\nLet p be an interior point of the disk, and let n be a multiple of 4 that is greater than or equal to 8. Form n sectors of the disk with equal angles by choosing an arbitrary line through p, rotating the line _\\nn\\n/\\n2\\n_ _ 1 times by an angle of _\\n2_\\n/\\nn\\n_ radians, and slicing the disk on each of the resulting _\\nn\\n/\\n2

Chatbot: Did you know that there's actually a mathematical theorem named after pizza? It's called the Pizza Theorem, and it explains why everyone gets an equal amount of pie no matter where you cut it. Essentially, if you slice a pizza into eight or more pieces using a central cutting point and taking turns picking slices, both people end up with the same total area of pizza. Pretty cool, right?! So next time you're sharing a pizza with friends, remember the Pizza Theorem and make sure everyone gets their fair slice!
