In [None]:
! pip install pandas

In [None]:
! pip install sentence_transformers
! pip install pinecone
! pip install cohere

In [None]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Read the Cohere and Pinecone API keys from local text files so the secrets are
# not written into the notebook; .strip() removes surrounding whitespace/newlines.
# NOTE(review): "chohere_api_keys.txt" looks like a misspelling of "cohere" - the
# name has to match the file on disk, so confirm before renaming it.
with open("chohere_api_keys.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Loading the Data.
df = pd.read_json("qa_data.json", lines=True)

# sent_tokenize needs NLTK's Punkt sentence-tokenizer data, which a fresh kernel
# does not have. Older NLTK releases look for "punkt", newer ones for "punkt_tab",
# so request both; download() reports failure via its return value instead of raising.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)

# Chunking the Knowledge base: split every passage into sentences, then give each
# sentence its own row (the other columns are repeated for every sentence).
df['chunking'] = df['knowledge'].apply(sent_tokenize)
df = df.explode('chunking', ignore_index=True)


In [None]:
# Loading the embedding model.
# The model name is kept in a constant; `model` is the instance handed to the
# embedding and retrieval helpers in later cells.
# NOTE(review): this import repeats the one in the imports cell at the top.
from sentence_transformers import SentenceTransformer
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def load_and_embedd_dataset(
        dataset,
        model: "SentenceTransformer | None" = None,
        text_field: str = 'chunking',
) -> tuple:
    """
    Embed one text field of an already-loaded dataset with a sentence-transformer model
    Args:
        dataset: The loaded dataset; any object where ``dataset[text_field]`` yields the
            texts to embed (e.g. a pandas DataFrame or a dict of lists)
        model: The model to use for embedding. When omitted, 'all-MiniLM-L6-v2' is loaded
            at call time (not when the function is defined)
        text_field: The field in the dataset that contains the text
    Returns:
        tuple: ``(dataset, embeddings)`` - the dataset object that was passed in and
            whatever ``model.encode`` returns for its texts
    """
    # Build the default model lazily: a default argument would be evaluated once at
    # definition time and load a model even when the caller supplies their own.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Loading and embedding the dataset")

    # Embed the dataset. Hand the encoder a plain list so that positional access to
    # the texts does not depend on the index labels of a pandas column.
    texts = list(dataset[text_field])
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
# Embed the text chunks of `df` with the model loaded above.
dataset, embeddings = load_and_embedd_dataset(
    dataset=df,
    model=model
)
# Keep the shape of the embeddings; shape[1] is later used as the index dimension.
shape = embeddings.shape
print(shape)

Loading and embedding the dataset
Done!
(16814, 384)


In [None]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
):
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider for the serverless spec
        region: The cloud region for the serverless spec
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    print("Creating a Pinecone index...")
    # The API key is read from the notebook-level PINECONE_API_KEY constant.
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    # An index that already exists is reused as-is; its settings are not checked here.
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [None]:
# Name of the Pinecone index used by the rest of the notebook.
INDEX_NAME = 'qa-data'

# Create the vector database
# shape[1] is the second dimension of the embeddings computed above, so the index
# dimension matches the vectors that are upserted later.
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [None]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'knowledge',
        batch_size: int = 32
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (the object exposing ``upsert``), not the client
        embeddings: The embeddings to upsert, one row per dataset entry
        dataset: The dataset containing the metadata; ``dataset[text_field]`` must yield
            one text per embedding row
        text_field: The dataset field stored as metadata (under the same key) with each vector
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If ``batch_size`` is not positive, or the number of texts differs
            from the number of embedding rows
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently upload nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Upserting the embeddings to the Pinecone index...")
    n_vectors = embeddings.shape[0]

    texts = list(dataset[text_field])
    if len(texts) != n_vectors:
        # zip() would silently drop the surplus rows and misalign ids with metadata.
        raise ValueError(
            f"dataset[{text_field!r}] has {len(texts)} entries but embeddings has {n_vectors} rows"
        )

    for start in tqdm(range(0, n_vectors, batch_size)):
        end = min(start + batch_size, n_vectors)
        # Build each batch on demand as (id, vector, metadata) tuples; ids are the row
        # positions as strings and vectors are converted to plain lists of Python floats.
        batch = [
            (str(row), embeddings[row].tolist(), {text_field: texts[row]})
            for row in range(start, end)
        ]
        index.upsert(vectors=batch)
    return index

In [None]:
# Upsert the embeddings to the Pinecone index
# upsert_vectors returns the index object it was given, so `index` and
# `index_upserted` refer to the same handle.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 526/526 [01:31<00:00,  5.73it/s]


In [None]:
# Sanity check: the reported vector count should equal the number of embedded chunks.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 16814}},
 'total_vector_count': 16814}

In [None]:
import cohere

# Assessing the plain LLM (no retrieval) on a few sample queries as a baseline.
querys = ['Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?',
          " What nationality was James Henry Miller's wife?",
          'Gunmen from Laredo starred which narrator of "Frontier"?']

# One client is enough for all requests, so create it once outside the loop;
# the RAG evaluation cell below reuses `co`.
co = cohere.Client(api_key=COHERE_API_KEY)

for i, query in enumerate(querys):
    # After chunking, a question appears on several rows; any of them carries the answer.
    right_answer = df[df['question'] == query].iloc[0]['right_answer']
    response = co.chat(
        model='command-r-plus',
        message=query,
    )
    print(f"Query {i + 1} : ")
    print(query)
    print("The LLM's answer:")
    print(response.text)
    print("The right answer:")
    print(right_answer)
    print("-----------------------------")

Query 1 : 
Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
The LLM's answer:
Milhous Van Houten.
The right answer:
President Richard Nixon
-----------------------------
Query 2 : 
 What nationality was James Henry Miller's wife?
The LLM's answer:
James Henry Miller, better known by his pen name George Eliot, was an English novelist and one of the leading writers of the Victorian era. 

James Henry Miller's wife, Sophia Elizabeth Cowl, was also English. They were married in Coventry, England, in 1817. Sophia was a significant influence on Miller's life and supported his literary pursuits.
The right answer:
American
-----------------------------
Query 3 : 
Gunmen from Laredo starred which narrator of "Frontier"?
The LLM's answer:
Isabella Valentine
The right answer:
Walter Darwin Coy
-----------------------------


In [None]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When omitted, 'all-MiniLM-L6-v2' is
            loaded at call time (not when the function is defined)
        index: The vectorstore object (required; must expose ``query``)
        top_k: How many matches to request from the index
    Returns:
        tuple: ``(augmented_prompt, source_knowledge)`` - the prompt to send to the LLM
            and the retrieved context text that was inserted into it
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a Pinecone index to query")
    # Build the default model lazily: a default argument would be evaluated once at
    # definition time and load a model even when the caller supplies their own.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The index is queried with plain Python floats rather than numpy scalars.
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; the stored vectors themselves are
    # not needed here, only the metadata
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['knowledge'] for match in query_results]

    # Several sentence chunks can carry the same 'knowledge' passage as metadata, so
    # drop repeats (keeping rank order) instead of pasting one passage several times.
    source_knowledge = "\n\n".join(dict.fromkeys(text_matches))

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [None]:
# Assessing the RAG pipeline on the same sample queries.
# Relies on `querys` and the Cohere client `co` defined in the baseline cell above.
for i,query in enumerate(querys):
  right_answer = df[df['question'] == query].iloc[0]['right_answer']
  # Retrieve context from the index and wrap the query in the augmented prompt;
  # source_knowledge is returned as well but not used in this loop.
  augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
  response = co.chat(
          model='command-r-plus',
          message=augmented_prompt,
      )
  print(f"Query {i + 1} : ")
  print(query)
  print(f"The RAG's answer:")
  print(response.text)
  print("The right answer:")
  print(right_answer)
  print("-----------------------------")

Query 1 : 
Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
The RAG's answer:
President Richard Nixon.
The right answer:
President Richard Nixon
-----------------------------
Query 2 : 
 What nationality was James Henry Miller's wife?
The RAG's answer:
American
The right answer:
American
-----------------------------
Query 3 : 
Gunmen from Laredo starred which narrator of "Frontier"?
The RAG's answer:
Walter Darwin Coy
The right answer:
Walter Darwin Coy
-----------------------------
