In [1]:
 !nvidia-smi

Wed Nov 27 23:27:31 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.78                 Driver Version: 551.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   50C    P8             12W /  130W |     248MiB /   6144MiB |      5%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Importing necessary libraries

In [2]:
import os
import json
from tqdm.auto import tqdm
from chromadb import Client
from chromadb.config import Settings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pandas as pd
import numpy as np
import torch

#  ChromaDB Initialization for storing the embeddings


In [5]:
client = chromadb.PersistentClient(path="./chroma_storage")

In [9]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from sentence_transformers import SentenceTransformer

# Initializing the Chroma client
#client = chromadb.Client(chromadb.config.Settings(
    
#    persist_directory="./chroma_storage"  # Replace with your desired directory path
#))

collections = client.list_collections()
print("Available Collections:")
for collection in collections:
    print(collection.name)
    


# Set up the Sentence-Transformer model for embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")


# Creating a Chroma collection (used to store the embeddings)
collection = client.create_collection(
    name="rag_embedding_collection",  
    embedding_function=embedding_function
)


Available Collections:
rag_embedding_collection


UniqueConstraintError: Collection rag_embedding_collection already exists

In [10]:
collections = client.list_collections()
print("Available Collections:")
for collection in collections:
    print(collection.name)
    

Available Collections:
rag_embedding_collection


# Exploring the data

In [11]:
#import json

# Path to my JSON files 
train_file_path = "./Dataset/train-v1.1.json"
dev_file_path = "./Dataset/dev-v1.1.json"

# Load the train data
with open(train_file_path, 'r') as file:
    train_data = json.load(file)

# Load the dev data
with open(dev_file_path, 'r') as file:
    dev_data = json.load(file)

# Print sample data
print(f"Train data keys: {list(train_data.keys())}")
print(f"Dev data keys: {list(dev_data.keys())}")



Train data keys: ['data', 'version']
Dev data keys: ['data', 'version']


In [12]:
# checking the type

print(type(train_data))
print(type(dev_data))


<class 'dict'>
<class 'dict'>


In [13]:
print(train_data.keys())  # printing the top-level keys in the dictionary
#print(train_data['data'][:2])  # Print the first 2 entries in the "data" key

print(dev_data.keys())
#print(dev_data['data'][:2])

dict_keys(['data', 'version'])
dict_keys(['data', 'version'])


In [14]:
#print(json.dumps(train_data['data'][:1], indent=1))  # Printing  the first topic in json structure for train

In [15]:
#print(json.dumps(dev_data['data'][:1], indent=2))   #Printing  the first topic in json structure for dev

# Preprocessing the data - Text Formatting

In [16]:
#import pandas as pd
from tqdm.auto import tqdm

# Ensure tqdm progress bars display correctly in Jupyter
tqdm.pandas()

def text_formatter(text: str) -> str:
    """Performs minor formatting on text"""
    cleaned_text = text.replace('\n', " ").strip()  # Replace newlines and strip extra spaces
    return cleaned_text


In [17]:
def preprocess_data(data):
    """Preprocess train or dev data."""
    qa_data = []  # Initialize list to store Q&A data

    for topic in tqdm(data['data'], desc="Processing Topics"):  # Iterate over each topic
        title = topic['title']  # Topic title (e.g., "University_of_Notre_Dame")
        for paragraph in topic['paragraphs']:  # Iterate over each paragraph in the topic
            context = paragraph['context']  # The context text
            context = text_formatter(context)  # Clean the context

            # Now, for each Q&A in the paragraph
            for qas in paragraph['qas']:
                question = qas['question']  # The question text
                question = text_formatter(question)  # Clean the question

                # Combine context and question into a single entry
                qa_data.append({
                    "title": title,
                    "context": context,
                    "id": qas['id'],  # Include unique ID for each Q&A pair
                    "question": question,
                    "answers": [answer['text'] for answer in qas['answers']],  # Extract all answers
                    "answer_start": [answer['answer_start'] for answer in qas['answers']]  # Start positions
                })
    
    return qa_data

#  Process train and dev data
train_qa_data = preprocess_data(train_data)
dev_qa_data = preprocess_data(dev_data)

Processing Topics:   0%|          | 0/442 [00:00<?, ?it/s]

Processing Topics:   0%|          | 0/48 [00:00<?, ?it/s]

In [18]:
#Convert processed data to DataFrame for easy exploration
train_df = pd.DataFrame(train_qa_data)
dev_df = pd.DataFrame(dev_qa_data)

In [19]:
# Display the first few rows of both DataFrames
print("Train Data:")
train_df.head()


Train Data:


Unnamed: 0,title,context,id,question,answers,answer_start
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,[Saint Bernadette Soubirous],[515]
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,[a copper statue of Christ],[188]
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,[the Main Building],[279]
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",5733be284776f41900661181,What is the Grotto at Notre Dame?,[a Marian place of prayer and reflection],[381]
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",5733be284776f4190066117e,What sits on top of the Main Building at Notre...,[a golden statue of the Virgin Mary],[92]


In [20]:
print("\nDev Data:")
print(dev_df.head())


Dev Data:
           title                                            context  \
0  Super_Bowl_50  Super Bowl 50 was an American football game to...   
1  Super_Bowl_50  Super Bowl 50 was an American football game to...   
2  Super_Bowl_50  Super Bowl 50 was an American football game to...   
3  Super_Bowl_50  Super Bowl 50 was an American football game to...   
4  Super_Bowl_50  Super Bowl 50 was an American football game to...   

                         id  \
0  56be4db0acb8001400a502ec   
1  56be4db0acb8001400a502ed   
2  56be4db0acb8001400a502ee   
3  56be4db0acb8001400a502ef   
4  56be4db0acb8001400a502f0   

                                            question  \
0  Which NFL team represented the AFC at Super Bo...   
1  Which NFL team represented the NFC at Super Bo...   
2                Where did Super Bowl 50 take place?   
3                  Which NFL team won Super Bowl 50?   
4  What color was used to emphasize the 50th anni...   

                                      

In [21]:
# Check the size of the DataFrame
print(f"Total entries in train: {train_df.shape[0]}")
print(f"Total entries in dev: {dev_df.shape[0]}")

Total entries in train: 87599
Total entries in dev: 10570


In [22]:
# overview of the data 
#  Word count for context and question

# Describe the dataset
#train_df.describe().round(2)
#dev_df.describe().round(2)

# sample random entries from the processed data

In [23]:
import random

# Randomly sample 3 entries from the DataFrame
sample_train_qa_data = random.sample(train_qa_data, k=1)
print(sample_train_qa_data)


[{'title': 'Religion_in_ancient_Rome', 'context': "The small woolen dolls called Maniae, hung on the Compitalia shrines, were thought a symbolic replacement for child-sacrifice to Mania, as Mother of the Lares. The Junii took credit for its abolition by their ancestor L. Junius Brutus, traditionally Rome's Republican founder and first consul. Political or military executions were sometimes conducted in such a way that they evoked human sacrifice, whether deliberately or in the perception of witnesses; Marcus Marius Gratidianus was a gruesome example.", 'id': '5731bd96b9d445190005e4f0', 'question': 'What leader was the first consul of the Roman Republic?', 'answers': ['L. Junius Brutus'], 'answer_start': [218]}]


In [24]:
sample_dev_qa_data = random.sample(train_qa_data, k=3)
print(sample_dev_qa_data)

[{'title': 'New_Haven,_Connecticut', 'context': 'In 2004, the first bike lane in the city was added to Orange Street, connecting East Rock Park and the East Rock neighborhood to downtown. Since then, bike lanes have also been added to sections of Howard Ave, Elm St, Dixwell Avenue, Water Street, Clinton Avenue and State Street. The city has created recommended bike routes for getting around New Haven, including use of the Canal Trail and the Orange Street lane. A bike map of the city entire can be seen here , and bike maps broken down by area here . As of the end of 2012, bicycle lanes have also been added in both directions on Dixwell Avenue along most of the street from downtown to the Hamden town line, as well as along Howard Avenue from Yale New Haven Hospital to City Point.', 'id': '5727d6cf2ca10214002d97ca', 'question': 'In what year did Orange Street receive the first bike lane in New Haven?', 'answers': ['2004'], 'answer_start': [3]}, {'title': 'The_Blitz', 'context': 'Neverthe

# Splitting text into individual sentences -  Sentence Segmentation or Sentence Splitting

In [25]:
import random

# Sample 3 random entries from the train_qa_data
sample_dev_qa_data = random.sample(train_qa_data, k=3)

# Print each question and its corresponding answers from the sample
for entry in sample_dev_qa_data:
    title = entry['title']
    context = entry['context']
    question = entry['question']  # The question text
    answers = entry['answers']  # The list of answers

    print(f"title: {title}")
    print(f"context: {context}")
    print(f"Question: {question}")
    print(f"Answers: {answers}")
    print("\n---\n")


title: Gothic_architecture
context: Externally, towers and spires are characteristic of Gothic churches both great and small, the number and positioning being one of the greatest variables in Gothic architecture. In Italy, the tower, if present, is almost always detached from the building, as at Florence Cathedral, and is often from an earlier structure. In France and Spain, two towers on the front is the norm. In England, Germany and Scandinavia this is often the arrangement, but an English cathedral may also be surmounted by an enormous tower at the crossing. Smaller churches usually have just one tower, but this may also be the case at larger buildings, such as Salisbury Cathedral or Ulm Minster, which has the tallest spire in the world, slightly exceeding that of Lincoln Cathedral, the tallest which was actually completed during the medieval period, at 160 metres (520 ft).
Question: What is one feature of both large and small Gothic churches?
Answers: ['towers']

---

title: The_Su

# After explring the data , decided to segment and chunk only the context (because it has more sentences) . Question and Answer are only 1-2 sentences.
### The RAG pipeline uses the question to retrieve the most relevant context (from the chunks), not the other way around. Breaking the question into smaller chunks might reduce its clarity or the ability of the LLM to understand the query intent.

### Impact of Chunking the Question and Answer
### If you chunk the question or answer, you may lose the coherence of the question or answer.
### Chunked questions can become fragmented, making it harder for the model to understand what’s being asked.
### Chunked answers would interfere with the LLM's ability to produce a coherent and contextually accurate response, as the full answer would be needed.

In [26]:
 #!pip install spacy

import spacy
from spacy.lang.en import English

# Load the English pipeline and add a sentencizer
nlp = English()
nlp.add_pipe('sentencizer')

# Function to split context into sentences
def split_into_sentences(text):
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

# Example 
context = "My name is Kavitha Padala.I am building RAG."
sentences = split_into_sentences(context)
print(sentences)


['My name is Kavitha Padala.', 'I am building RAG.']


## Here context is segmented into individual sentences 

In [27]:
# Initialize lists to store the processed data for both train and dev datasets
qa_data_processed = []
qa_data_processed_dev = []

# Creating a list of datasets to process (train and dev)
datasets = [("train", train_data), ("dev", dev_data)]

# Iterate over the datasets (train and dev)
for dataset_name, dataset in datasets:
    # Iterate through each entry in the current dataset (train or dev)
    for topic in tqdm(dataset['data'], desc=f"Processing {dataset_name} dataset"):
        title = topic['title']  # Get the title (e.g., "University_of_Notre_Dame")
        for paragraph in topic['paragraphs']:  # Iterate over paragraphs for the current topic
            context = paragraph['context']  # Get the paragraph's context
            sentences = split_into_sentences(context)  # Split the context into sentences

            # For each Q&A in the paragraph
            for qas in paragraph['qas']:
                question = qas['question']  # Get the question
                id = qas['id']
                answers = [answer['text'] for answer in qas['answers']]  # Get answers
                answer_start = [answer['answer_start'] for answer in qas['answers']]  # Get answer start positions

                # Append the processed data to the correct list based on the dataset
                if dataset_name == "train":
                    qa_data_processed.append({
                        "title": title,
                        "context": context,
                        "sentences": sentences,
                        "id": id,
                        "question": question,
                        "answers": answers,
                        "answer_start": answer_start
                    })
                elif dataset_name == "dev":
                    qa_data_processed_dev.append({
                        "title": title,
                        "context": context,
                        "sentences": sentences,
                        "id": id,
                        "question": question,
                        "answers": answers,
                        "answer_start": answer_start
                    })

# Convert the processed data into pandas DataFrames
df_train = pd.DataFrame(qa_data_processed)
df_dev = pd.DataFrame(qa_data_processed_dev)

# Show the first few rows of the DataFrames
print("Train Data:")
print(df_train.head())

print("Dev Data:")
print(df_dev.head())


Processing train dataset:   0%|          | 0/442 [00:00<?, ?it/s]

Processing dev dataset:   0%|          | 0/48 [00:00<?, ?it/s]

Train Data:
                      title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   
2  University_of_Notre_Dame   
3  University_of_Notre_Dame   
4  University_of_Notre_Dame   

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                           sentences  \
0  [Architecturally, the school has a Catholic ch...   
1  [Architecturally, the school has a Catholic ch...   
2  [Architecturally, the school has a Catholic ch...   
3  [Architecturally, the school has a Catholic ch...   
4  [Architecturally, the school has a Catholic ch...   

                         id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   
2  5733be284776f41900661180   
3  

In [28]:
df_dev.shape[0]

10570

In [29]:
df_train.shape[0]

87599

# Chunking happens after segmentation to store the segmented text as embeddings for retrieval.
## Why Chunking is Important for RAG?
## 
Context Understanding: Chunking can help you identify key information within sentences, such as named entities, actions, and subjects.## 
Better Retrieval: For a RAG model, chunking could help improve information retrieval by focusing on key phrases instead of individual words  r entire sentences.

In [30]:
# Define chunking function
def split_list(input_list: list[str], slice_size: int = 10) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Example: chunking sentences (already segmented)
context = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five. Sentence six. Sentence seven. Sentence eight. Sentence nine. Sentence ten. Sentence eleven. Sentence twelve. Sentence thirteen. Sentence fourteen. Sentence fifteen."
sentences = context.split(".")  
sentences = [sentence.strip() for sentence in sentences if sentence]  # Clean any empty or extra spaces

# Now, chunk the sentences
chunks = split_list(sentences, slice_size=10)

# Print the chunks
print(chunks)
print(f"Number of chunks: {len(chunks)}")


[['Sentence one', 'Sentence two', 'Sentence three', 'Sentence four', 'Sentence five', 'Sentence six', 'Sentence seven', 'Sentence eight', 'Sentence nine', 'Sentence ten'], ['Sentence eleven', 'Sentence twelve', 'Sentence thirteen', 'Sentence fourteen', 'Sentence fifteen']]
Number of chunks: 2


In [31]:

# Initialize lists to store the processed data for both train and dev datasets
qa_data_processed_train = []
qa_data_processed_dev = []

# List of datasets to iterate over (train and dev)
datasets = [("train", train_data), ("dev", dev_data)]

# Iterate over the datasets (train and dev)
for dataset_name, dataset in datasets:
    # Iterate through each entry in the current dataset (train or dev)
    for topic in tqdm(dataset['data'], desc=f"Processing {dataset_name} dataset"):
        title = topic['title']  # Get the title (e.g., "University_of_Notre_Dame")
        for paragraph in topic['paragraphs']:  # Iterate over paragraphs for the current topic
            context = paragraph['context']  # Get the paragraph's context
            sentences = split_into_sentences(context)  # Split the context into sentences

            # Chunk the sentences into groups of 10 (or any desired slice size)
            sentence_chunks = split_list(sentences, slice_size=10)

            # For each Q&A in the paragraph
            for qas in paragraph['qas']:
                question = qas['question']  # Get the question
                answers = [answer['text'] for answer in qas['answers']]  # Get answers
                answer_start = [answer['answer_start'] for answer in qas['answers']]  # Get answer start positions
                id = qas['id']  # Get the question ID

                # Append the processed data to the correct list based on the dataset
                if dataset_name == "train":
                    qa_data_processed_train.append({
                        "title": title,
                        "context": context,
                        "sentences": sentences,
                        "sentence_chunks": sentence_chunks,  # Store the chunked sentences
                        "num_chunks": len(sentence_chunks),
                        "question": question,
                        "answers": answers,
                        "answer_start": answer_start,
                        "id": id
                    })
                elif dataset_name == "dev":
                    qa_data_processed_dev.append({
                        "title": title,
                        "context": context,
                        "sentences": sentences,
                        "sentence_chunks": sentence_chunks,  # Store the chunked sentences
                        "num_chunks": len(sentence_chunks),
                        "question": question,
                        "answers": answers,
                        "answer_start": answer_start,
                        "id": id
                    })

# Convert the processed data into pandas DataFrames
df_train = pd.DataFrame(qa_data_processed_train)
df_dev = pd.DataFrame(qa_data_processed_dev)

# Show the first few rows of the DataFrames
print("Train Data:")
#print(df_train.head())

print("Dev Data:")
#print(df_dev.head())


Processing train dataset:   0%|          | 0/442 [00:00<?, ?it/s]

Processing dev dataset:   0%|          | 0/48 [00:00<?, ?it/s]

Train Data:
Dev Data:


In [32]:
#df_dev.head()

In [33]:
df_dev.shape[0]

10570

In [34]:
df_train.shape[0]

87599

In [35]:
import re
#from tqdm.auto import tqdm

# Initialize lists to store the chunked data with regex and statistics for both train and dev
qa_data_chunked_train = []
qa_data_chunked_dev = []

# Regex pattern to filter characters and punctuation
regex_pattern = r"[A-Za-z0-9\s,.!?()\"':;-]+"  

# List of datasets to iterate over (train and dev)
datasets = [("train", qa_data_processed_train), ("dev", qa_data_processed_dev)]

# Iterate over the datasets (train and dev)
for dataset_name, qa_data_processed in datasets:
    for item in tqdm(qa_data_processed, desc=f"Processing {dataset_name} dataset"):
        # For each sentence chunk in the current QA entry
        for sentence_chunk in item["sentence_chunks"]:
            # Initialize chunk dictionary
            chunk_dict = {}

            chunk_dict["title"] = item["title"]
            chunk_dict["context"] = item["context"]
            chunk_dict["question"] = item["question"]
            chunk_dict["id"] = item["id"]

            # Join sentences in the chunk and filter using regex
            joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
            filtered_sentence_chunk = re.findall(regex_pattern, joined_sentence_chunk)
            joined_sentence_chunk = "".join(filtered_sentence_chunk)

            # Store the chunked sentence in the dictionary
            chunk_dict["sentence_chunk"] = joined_sentence_chunk

            # Calculate chunk statistics
            chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)  # Character count
            chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split())  # Word count
            chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4  # Approximate token count

            # Append the chunk dictionary to the corresponding dataset list
            if dataset_name == "train":
                qa_data_chunked_train.append(chunk_dict)
            elif dataset_name == "dev":
                qa_data_chunked_dev.append(chunk_dict)

# Now `qa_data_chunked_train` and `qa_data_chunked_dev` contain the processed chunks for train and dev

# Convert the processed data into pandas DataFrames
df_train_chunked = pd.DataFrame(qa_data_chunked_train)
df_dev_chunked = pd.DataFrame(qa_data_chunked_dev)

# Show the first few rows of the DataFrames
print("Train Data (Chunked with regex and stats):")
print(df_train_chunked.head(2))
print(df_train_chunked.shape[0])

print("Dev Data (Chunked with regex and stats):")
print(df_dev_chunked.head(2))


Processing train dataset:   0%|          | 0/87599 [00:00<?, ?it/s]

Processing dev dataset:   0%|          | 0/10570 [00:00<?, ?it/s]

Train Data (Chunked with regex and stats):
                      title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   

                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   

                         id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   

                                      sentence_chunk  chunk_char_count  \
0  Architecturally, the school has a Catholic cha...               689   
1  Architecturally, the school has a Catholic cha...               689   

   chunk_word_count  chunk_token_count  
0               118             172.25  
1               118             172.25  
89855
Dev Data (Chunked with regex and stats):
           title                      

In [36]:
df_train_chunked.shape[0]

89855

In [37]:
df_dev_chunked.shape[0]

10877

In [38]:
print(f"Total number of train data chunks: {len(df_train_chunked)}")

Total number of train data chunks: 89855


In [39]:
print(f"Total number of dev data chunks: {len(df_dev_chunked)}")

Total number of dev data chunks: 10877


# Filtering chunks of text for short chunks.Because tThese short chunks may not contain much useful information

In [40]:
#import pandas as pd

# Setting a minimum token length
min_token_length = 30

# Converting `qa_data_chunked_train` and `qa_data_chunked_dev` into pandas DataFrames
df_train_chunked = pd.DataFrame(qa_data_chunked_train)
df_dev_chunked = pd.DataFrame(qa_data_chunked_dev)

# Filter the DataFrames to get chunks with token count <= min_token_length
filtered_df_train = df_train_chunked[df_train_chunked["chunk_token_count"] <= min_token_length]
filtered_df_dev = df_dev_chunked[df_dev_chunked["chunk_token_count"] <= min_token_length]

# Sample 5 random rows from the filtered DataFrames for both train and dev
print("Sample from Train Dataset:")
for _, row in filtered_df_train.sample(5).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')

print("\nSample from Dev Dataset:")
for _, row in filtered_df_dev.sample(5).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')


Sample from Train Dataset:
Chunk token count: 28.0 | Text: In 2009, Bon Jovi released another number one album, The Circle, which marked a return to their hard rock sound.
Chunk token count: 21.75 | Text: In the past, dirac fabric was also frequently purchased from South Asian merchandisers.
Chunk token count: 22.5 | Text: The first Anglo-Sikh war and second Anglo-Sikh war marked the downfall of the Sikh Empire.
Chunk token count: 15.75 | Text: In the 3rd century CE the empire was split into smaller states.
Chunk token count: 16.0 | Text: The inner ear has a cochlea, but it is not spiral as in mammals.

Sample from Dev Dataset:
Chunk token count: 7.5 | Text: and is not open to the public.
Chunk token count: 22.75 | Text: Monks and priests were especially hard hit since they cared for victims of the Black Death.
Chunk token count: 18.25 | Text: His father was furious over what he saw as a waste of Luther's education.
Chunk token count: 7.5 | Text: and is not open to the public.
Chunk to

In [41]:
# Filter out rows where chunk token count is greater than min_token_length for both train and dev
qa_data_over_min_token_len_train = df_train_chunked[df_train_chunked["chunk_token_count"] > min_token_length].to_dict(orient="records")
qa_data_over_min_token_len_dev = df_dev_chunked[df_dev_chunked["chunk_token_count"] > min_token_length].to_dict(orient="records")

# Show the first 2 records from each dataset after filtering
print("Filtered Train Data:")
print(qa_data_over_min_token_len_train[:2])

print("\nFiltered Dev Data:")
print(qa_data_over_min_token_len_dev[:2])


Filtered Train Data:
[{'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661182', 'sentence_chunk': 'Architecturally, the school has a Catholic character.Atop the Main Building\'s gold dome 

# Steps to Add Chunked Data to ChromaDB

## 1. Prepare the Data for Embedding

In [42]:
# Check for missing values in the train dataset
print("Missing values in train chunked data:")
print(df_train_chunked.isnull().sum())

# Check for missing values in the dev dataset
print("\nMissing values in dev chunked data:")
print(df_dev_chunked.isnull().sum())


Missing values in train chunked data:
title                0
context              0
question             0
id                   0
sentence_chunk       0
chunk_char_count     0
chunk_word_count     0
chunk_token_count    0
dtype: int64

Missing values in dev chunked data:
title                0
context              0
question             0
id                   0
sentence_chunk       0
chunk_char_count     0
chunk_word_count     0
chunk_token_count    0
dtype: int64


In [43]:
# Check token count distribution in train chunked data
print("\nToken count distribution in train chunked data:")
print(df_train_chunked["chunk_token_count"].describe())

# Check token count distribution in dev chunked data
print("\nToken count distribution in dev chunked data:")
print(df_dev_chunked["chunk_token_count"].describe())



Token count distribution in train chunked data:
count    89855.000000
mean       182.463881
std         72.316149
min          2.250000
25%        137.000000
50%        170.250000
75%        219.750000
max        630.000000
Name: chunk_token_count, dtype: float64

Token count distribution in dev chunked data:
count    10877.000000
mean       187.907396
std         78.073267
min          7.500000
25%        140.500000
50%        173.250000
75%        224.250000
max        575.250000
Name: chunk_token_count, dtype: float64


In [44]:
# Check columns and data structure for train chunked data
print("\nTrain chunked data columns:")
print(df_train_chunked.columns)

# Check columns and data structure for dev chunked data
print("\nDev chunked data columns:")
print(df_dev_chunked.columns)



Train chunked data columns:
Index(['title', 'context', 'question', 'id', 'sentence_chunk',
       'chunk_char_count', 'chunk_word_count', 'chunk_token_count'],
      dtype='object')

Dev chunked data columns:
Index(['title', 'context', 'question', 'id', 'sentence_chunk',
       'chunk_char_count', 'chunk_word_count', 'chunk_token_count'],
      dtype='object')


In [45]:
# Sample 5 rows from train chunked data
print("\nSample rows from train chunked data:")
print(df_train_chunked.sample(5))

# Sample 5 rows from dev chunked data
print("\nSample rows from dev chunked data:")
print(df_dev_chunked.sample(5))



Sample rows from train chunked data:
                                         title  \
40852                               Montevideo   
53574                               Literature   
56308  Affirmative_action_in_the_United_States   
50965                                   Cubism   
61173                                  Detroit   

                                                 context  \
40852  Montevideo has a very rich architectural herit...   
53574  Poetry is a form of literary art which uses ae...   
56308  Affirmative action in the United States tends ...   
50965  At the 1912 Salon d'Automne an architectural i...   
61173  The city slopes gently from the northwest to s...   

                                                question  \
40852  How many movie theater companies are there in ...   
53574  Poetry is usually differentiated from prose by...   
56308  Affirmative action does not only attempt to re...   
50965                       What was La Maison Cubiste ?   


# Now Embedding
## converting the processed text chunks into numerical vectors that can be used in tasks like retrieval or machine learning models.

## Step 2: Embed the Chunked Data into ChromaDB

In [46]:
# Step 2.1: Prepare the Data for Embedding
#  Formatting Data

# Preparing train and dev data for ChromaDB
train_embeddings_data = df_train_chunked[["id", "sentence_chunk" ,"title", "context", "question"]].copy()
train_embeddings_data["source"] = "train"

dev_embeddings_data = df_dev_chunked[["id", "sentence_chunk",  "title", "context", "question"]].copy()
dev_embeddings_data["source"] = "dev"

# Combine train and dev for embedding into ChromaDB
combined_embeddings_data = pd.concat([train_embeddings_data, dev_embeddings_data])

# Preview the data
print("Combined Embeddings Data Preview:")
print(combined_embeddings_data.head())



Combined Embeddings Data Preview:
                         id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   
2  5733be284776f41900661180   
3  5733be284776f41900661181   
4  5733be284776f4190066117e   

                                      sentence_chunk  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                      title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   
2  University_of_Notre_Dame   
3  University_of_Notre_Dame   
4  University_of_Notre_Dame   

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school 

### Embedding train and dev data but you need to make it combined_data 

In [166]:
import chromadb
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

# Initialize the embedding model
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Move the model to the chosen device (GPU or CPU)
embedding_model.to(device)

# Initialize ChromaDB client and collection 
#client = chromadb.Client()
#collection = client.create_collection(name="rag_embedding_collection", embedding_function=embedding_function)

# List to store embeddings for each chunk (for preview)
chunk_embeddings = []

# Iterate over each chunk in the processed data (e.g., df_train_chunked or df_dev_chunked)
for idx, row in tqdm(train_embeddings_data.iterrows(), total=train_embeddings_data.shape[0], desc="Embedding and Storing in ChromaDB"):
    sentence_chunk = row['sentence_chunk']  # Extract sentence chunk (text)
    
    # Generate embeddings for each chunk of text
    embeddings = embedding_model.encode([sentence_chunk], device=device)

    # Add to Chroma collection (store embeddings with metadata)
    collection.add(
        ids=[row['id']],
        documents=[sentence_chunk],  # Sentence chunk (text)
        metadatas=[{'id': row['id'], 'source': row['source']}],  # Metadata (id, source)
        embeddings=[embeddings[0]]  # Add the embedding (we take the first element from the list)
    )

    #  store this data locally for later reference (in a list or DataFrame)
    chunk_embeddings.append({
        'title': row['title'],
        'context': row['context'],
        'question': row['question'],
        'id': row['id'],
        'sentence_chunk': sentence_chunk,
        'embedding': embeddings[0]  # The model returns a 2D array, so we take the first element
    })

#  convert chunk embeddings to a DataFrame (for inspection)
chunk_embeddings_df = pd.DataFrame(chunk_embeddings)
print(chunk_embeddings_df.head())

print("Embeddings stored in ChromaDB.") 


Embedding and Storing in ChromaDB:   0%|                                                  | 4/89855 [00:01<7:20:03,  3.40it/s]Insert of existing embedding ID: 5733bf84d058e614000b61be
Add of existing embedding ID: 5733bf84d058e614000b61be
Embedding and Storing in ChromaDB:   0%|                                                  | 7/89855 [00:01<3:22:00,  7.41it/s]Insert of existing embedding ID: 5733bf84d058e614000b61bf
Add of existing embedding ID: 5733bf84d058e614000b61bf
Embedding and Storing in ChromaDB:   0%|                                                 | 10/89855 [00:02<2:15:06, 11.08it/s]Insert of existing embedding ID: 5733bf84d058e614000b61c0
Add of existing embedding ID: 5733bf84d058e614000b61c0
Insert of existing embedding ID: 5733bf84d058e614000b61bd
Add of existing embedding ID: 5733bf84d058e614000b61bd
Embedding and Storing in ChromaDB:   0%|                                                 | 14/89855 [00:02<1:35:43, 15.64it/s]Insert of existing embedding ID: 5733bf84d05

                      title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   
2  University_of_Notre_Dame   
3  University_of_Notre_Dame   
4  University_of_Notre_Dame   

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                         id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   
2  5733be284776f41900661180   
3  5733be284776

In [167]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.is_available() 

True

In [168]:
device

'cuda'

In [169]:
# List all collection names to verify your collection exists
collection_names = client.list_collections()
print("Collections in ChromaDB:", collection_names)


Collections in ChromaDB: [Collection(name=rag_embedding_collection)]


# Embedding done. Now Retrieval
# Retrieve relevant info based on the query and use that info to augment the LLM .so it will generate an output based on that relevant info.

In [170]:
# Get the total number of documents in the collection
num_documents = collection.count()
print(f"Number of documents in the collection: {num_documents}")


Number of documents in the collection: 98169


In [171]:
# Retrieve some sample data to inspect
sample_results = collection.get(limit=1)  # Fetch 5 documents
print(sample_results.keys())  # Check the available keys
print(sample_results)


dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])
{'ids': ['56be4db0acb8001400a502ec'], 'embeddings': None, 'documents': ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'], 'uris': None, 'data': None, 'metadatas': [{'id': '56be4db0acb800

In [172]:
sample_results_with_embeddings = collection.get(limit=1, include=['embeddings','documents'])
print(sample_results_with_embeddings)


{'ids': ['56be4db0acb8001400a502ec'], 'embeddings': array([[-3.64072733e-02,  1.50846578e-02, -1.30244717e-02,
         3.34960297e-02,  3.11200842e-02,  3.26831578e-05,
         2.38097049e-02,  3.41193415e-02,  1.57862413e-03,
        -6.12170920e-02, -5.75146638e-03, -5.30933030e-03,
         3.31142731e-02,  2.51804087e-02,  7.40088820e-02,
         1.17556415e-02, -3.19710225e-02, -4.58429232e-02,
        -1.06769567e-02,  1.48083102e-02, -7.02000083e-03,
        -3.55858766e-02,  2.38997862e-02, -4.19098921e-02,
        -8.24625343e-02,  9.12955031e-03,  2.38578413e-02,
        -2.73191580e-03, -5.51282763e-02, -1.36470962e-02,
         5.27684167e-02, -7.89134018e-03,  2.25004219e-02,
        -6.28847331e-02,  2.44105695e-06, -1.55957406e-02,
        -3.73383872e-02, -6.84971409e-03,  1.96683966e-02,
         4.33199573e-03, -5.62789477e-02, -4.26380038e-02,
        -2.08829548e-02, -3.94015526e-03,  1.25472667e-02,
         1.47755747e-03, -5.42721786e-02,  1.16987526e-02,
    

# Now Similarity search

In [173]:

from sentence_transformers import SentenceTransformer, util
import torch

# Defining the embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Definining a query
query = "Which NFL team represented the AFC at Super Bowl 50?"
print(f"Query: {query}")

# Generate query embedding
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# Perform similarity search with the query embedding
results = collection.query(
    query_embeddings=query_embedding.cpu().detach().numpy(),  # Convert to NumPy format
    n_results=5,  # Top 5 results
    include=["embeddings", "documents","metadatas"],  # Include embeddings and documents in the results
)

# Print results
print("Similarity Search Results:")
for i, (doc, metadata) in enumerate(zip(results['documents'], results['metadatas'])):
    print(f"\nResult {i + 1}:")
    print(f"Document: {doc}")
    print(f"Metadata: {metadata}")



Query: Which NFL team represented the AFC at Super Bowl 50?
Similarity Search Results:

Result 1:
Document: ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for th

## Trying to print unique results

In [174]:
unique_results = []
unique_docs = set()

for doc, meta in zip(results['documents'], results['metadatas']):
    doc_str = " ".join(doc)  # Combine all parts of the document into one string
    if doc_str not in unique_docs:
        unique_docs.add(doc_str)
        unique_results.append({"document": doc, "metadata": meta})

# Display unique results
for i, res in enumerate(unique_results):
    print(f"\nUnique Result {i + 1}:")
    print(f"Document: {res['document'][:200]}...")  # Print first 200 chars
    print(f"Metadata: {res['metadata']}")



Unique Result 1:
Document: ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos def

## Dot Product and Cosine Similarity

In [175]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)


In [176]:
from sentence_transformers import SentenceTransformer, util
import torch

# Define the embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Function to retrieve the relevant resources based on the query
def retrieve_relevant_resources(query: str,
                                collection,  # Collection to query from
                                model: SentenceTransformer = embedding_model,
                                n_resources_to_return: int = 5,
                                print_time: bool = True):
    """
    Embeds a query with model and returns top k results from the collection using embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Perform similarity search with the query embedding
   
    results = collection.query(
        query_embeddings=query_embedding.cpu().detach().numpy(),  # Convert to NumPy format
        n_results=n_resources_to_return,  # Top k results
        include=["embeddings", "documents", "metadatas"],  # Include embeddings, documents, and metadata
    )
    
    return results

# Function to print the top results and scores
def print_top_results_and_scores(query: str,
                                 collection,  # The collection to query
                                 n_resources_to_return: int = 5):
    """
    Takes a query, retrieves the most relevant resources, and prints them in descending order.
    """

    # Retrieve the relevant resources (documents, scores, and metadata)
    results = retrieve_relevant_resources(query=query, collection=collection, n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Similarity Search Results:")

    # Loop through the results and print the documents with their metadata
    for i, (doc, metadata) in enumerate(zip(results['documents'], results['metadatas'])):
        print(f"\nResult {i + 1}:")
        print(f"Document: {doc}")
        print(f"Metadata: {metadata}")

# Example usage:
query = "How much of Oklahoma's population is Christian?"
collection = collection  # Replace with your actual collection object
print_top_results_and_scores(query=query, collection=collection, n_resources_to_return=5)


Query: How much of Oklahoma's population is Christian?

Similarity Search Results:

Result 1:
Document: ['According to a 2014 study by the Pew Research Center, 57 of the population of the city identified themselves as Christians, with 25 professing attendance at a variety of churches that could be considered Protestant, and 29 professing Roman Catholic beliefs.while 33 claim no religious affiliation.The same study says that other religions (including Judaism, Buddhism, Islam, and Hinduism) collectively make up about 10 of the population.', 'According to a 2014 study by the Pew Research Center, 57 of the population of the city identified themselves as Christians, with 25 professing attendance at a variety of churches that could be considered Protestant, and 29 professing Roman Catholic beliefs.while 33 claim no religious affiliation.The same study says that other religions (including Judaism, Buddhism, Islam, and Hinduism) collectively make up about 10 of the population.', 'According to

In [177]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"GPU memory: {gpu_memory_gb:.2f} GB")

GPU memory: 6.00 GB


#  Gemma LLM

In [178]:
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 6 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [64]:
#!pip install --upgrade huggingface_hub

# Connecting to Hugging Face

In [65]:
!huggingface-cli login --token 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


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `kavitha_llm` has been saved to C:\Users\Kavitha padala\.cache\huggingface\stored_tokens
Your token has been saved to C:\Users\Kavitha padala\.cache\huggingface\token
Login successful.
The current active token is: `kavitha_llm`


In [66]:
#!pip install bitsandbytes
#!pip install accelerate
#!pip install -U bitsandbytes
#!pip install accelerate>=0.26.0


# Gemma LLm is loading

In [67]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig


quantization_config = BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] > 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = "google/gemma-2b-it" # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to(device)

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

ValueError: weight is on the meta device, we need a `value` to put in on 0.

# Here Model set to gemma LLM

In [71]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import torch

# Model configuration
model_id = "google/gemma-2b-it"

# Ensure you're using a CUDA-enabled device if available


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set up quantization config for 4-bit precision (use bitsandbytes)
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Load the model in 4-bit precision
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,  # Use 4-bit precision
    low_cpu_mem_usage=True,  # Use efficient memory usage
)

# Move the model to the appropriate device
llm_model.to(device)

# Check the model's device placement
print(f"Model is on device: {next(llm_model.parameters()).device}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You shouldn't move a model when it is dispatched on multiple devices.


Model is on device: cuda:0


In [6]:
#tokenizer.save_pretrained("./local")
#llm_model.save_pretrained("./local")

In [68]:
torch.cuda.get_device_capability(0)[0]

8

In [3]:
local_model_dir = "./local_model" 
tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
model = AutoModelForCausalLM.from_pretrained(local_model_dir)

ValueError: Unrecognized model in ./local_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

In [None]:
llm_model

In [73]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

1515268096

In [74]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2039641088, 'model_mem_mb': 1945.15, 'model_mem_gb': 1.9}

In [75]:
input_text = "Which NFL team represented the AFC at Super Bowl 50?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
Which NFL team represented the AFC at Super Bowl 50?

Prompt (formatted):
<bos><start_of_turn>user
Which NFL team represented the AFC at Super Bowl 50?<end_of_turn>
<start_of_turn>model



In [179]:
prompt

'<bos><start_of_turn>user\nBased on the following context items, please answer the query.\n- Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.\n- Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The

In [76]:
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,  13033,  27023,   2970,  12754,
            573,  83843,    696,   5303,  29270, 235248, 235308, 235276, 235336,
            107,    108,    106,   2516,    108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}



  attn_output = torch.nn.functional.scaled_dot_product_attention(


Model output (tokens):
tensor([     2,      2,    106,   1645,    108,  13033,  27023,   2970,  12754,
           573,  83843,    696,   5303,  29270, 235248, 235308, 235276, 235336,
           107,    108,    106,   2516,    108, 235285,   2952,   3684,   1879,
        235290,   1602,   2113, 235269,   5852,    590,   2952,   3658,    573,
          3448,    577,    736,   2872, 235265,   1699,    573,   1546,    908,
        235290,    511, 235290,   1545,   2113,    611,    573,  27023, 235269,
          3743,   2701,    573,   5613,  27023,   4451,    689,   1089,   9028,
          3905,   8269, 235265,      1], device='cuda:0')



In [77]:
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
Which NFL team represented the AFC at Super Bowl 50?<end_of_turn>
<start_of_turn>model
I cannot access real-time information, therefore I cannot provide the answer to this question. For the most up-to-date information on the NFL, please check the official NFL website or any sports news sources.<eos>



In [78]:
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: Which NFL team represented the AFC at Super Bowl 50?

Output text:
I cannot access real-time information, therefore I cannot provide the answer to this question. For the most up-to-date information on the NFL, please check the official NFL website or any sports news sources.


In [82]:

  
results = retrieve_relevant_resources(query=input_text, collection=collection, n_resources_to_return=5)



results

{'ids': [['56be4db0acb8001400a502ed',
   '56bea9923aeaaa14008c91b9',
   '56d20362e7d4791d009025ea',
   '56beace93aeaaa14008c91e3',
   '56d9895ddc89441400fdb510']],
 'embeddings': [array([[-0.03640727,  0.01508466, -0.01302447, ...,  0.01343169,
           0.04385673, -0.02104047],
         [-0.03640727,  0.01508466, -0.01302447, ...,  0.01343169,
           0.04385673, -0.02104047],
         [-0.03640727,  0.01508466, -0.01302447, ...,  0.01343169,
           0.04385673, -0.02104047],
         [-0.03640727,  0.01508466, -0.01302447, ...,  0.01343169,
           0.04385673, -0.02104047],
         [-0.03640727,  0.01508466, -0.01302447, ...,  0.01343169,
           0.04385673, -0.02104047]])],
 'documents': [['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn the

In [124]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt


In [125]:
results['documents']

[['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
  'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Footba

In [126]:
input_text

'Which NFL team represented the AFC at Super Bowl 50?'

In [127]:
documents = results['documents'][0]

# Convert the documents into the required format (list of dictionaries)
context_items = [{"sentence_chunk": doc} for doc in documents]

# Now call the prompt_formatter function with the query and context_items
query = "What is NFC means"
formatted_prompt = prompt_formatter(query, context_items)

# Print the formatted prompt to verify
print(formatted_prompt)

<bos><start_of_turn>user
Based on the following context items, please answer the query.
- Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
- Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The Amer

In [128]:
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: What is NFC means
RAG answer:
<bos>I cannot access real-time information, therefore I cannot answer this question. For the most up-to-date information on which team represented the AFC at Super Bowl 50, please check the official NFL website or any sports news sources.<eos>


In [131]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True,reference_count=1):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # Get just the scores and indices of top related results
    results = retrieve_relevant_resources(query=input_text, collection=collection, n_resources_to_return=reference_count)

    # Create a list of context items
    documents = results['documents'][0]

# Convert the documents into the required format (list of dictionaries)
    context_items = [{"sentence_chunk": doc} for doc in documents]

# Now call the prompt_formatter function with the query and context_items
    
    formatted_prompt = prompt_formatter(query, context_items)

    # Add score to context item
    #for i, item in enumerate(context_items):
      #  item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [164]:
import textwrap
def print_wrapped(text,wrap_length=80):
  wrapped_text =  textwrap.fill(text,width=wrap_length)
  print(wrapped_text)
query = "what is Super bowl NFC means"
print(f"Query: {query}")

# Answer query with context and return context
answer, context_items = ask(query=query,
                            temperature=0.001,
                            max_new_tokens=512,
                            return_answer_only=False,reference_count=5)

print(f"Answer:\n")
print_wrapped(answer)
print(context_items)

Query: what is Super bowl NFC means
Answer:

The context does not provide any information about the meaning of NFC, so I
cannot answer this question from the provided context.
[{'sentence_chunk': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 2410 to earn their third Super Bowl title.The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'}, {'sentence_chunk': 'Super B

# Here using azure openai LLM

In [None]:
#!pip install azure-ai-openai
import os

from autogen import ConversableAgent
llm_config = {
    "config_list": [
        {
            "model": "gpt-4o",  # You can replace this with the actual model name or deployment if needed
            "api_key":  "your_api_key",  # Using provided API key or from environment variable
            "api_type": "azure",  # Specifying the API type as Azure
            "base_url": "https://openai-eus-poc-1.openai.azure.com/",  # Using provided Azure endpoint
            "api_version": "2024-08-01-preview",  # API version specified in your provided config
        }
        
    ],
    "temperature": 0.9,  # You can adjust the temperature here
    "timeout": 300,  # Set the timeout as specified
}

input_text = "what is Super bowl NFC means"
results = retrieve_relevant_resources(query=input_text, collection=collection, n_resources_to_return=5)

    # Create a list of context items
documents = results['documents'][0]

# Convert the documents into the required format (list of dictionaries)
context_items = [{"sentence_chunk": doc} for doc in documents]

# Now call the prompt_formatter function with the query and context_items
    
formatted_prompt = prompt_formatter(input_text, context_items)

    # Add score to context item
    #for i, item in enumerate(context_items):
      #  item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
prompt = prompt_formatter(query=input_text,
                              context_items=context_items)


assistant = ConversableAgent(name="assistant",
                             system_message="you are ai assistant that reply based on the user context respond to user" ,
                             llm_config=llm_config,
                             human_input_mode='NEVER'
                             
                             
                             )

print("Query :",input_text)

reply = assistant.generate_reply(messages=[{"content": prompt, "role": "user"}])

print(reply)  # This agent will call the read_emails function

Query : what is Super bowl NFC means
In the context of the Super Bowl, "NFC" stands for the National Football Conference. The NFL (National Football League) is divided into two conferences: the American Football Conference (AFC) and the National Football Conference (NFC). Each conference has its own set of teams, and the champions of the AFC and NFC face off in the Super Bowl to determine the overall NFL champion for the season. In Super Bowl 50, the NFC champion was the Carolina Panthers.


In [152]:
!pip install autogen

Collecting autogen
  Downloading autogen-0.4-py3-none-any.whl.metadata (24 kB)
Collecting diskcache (from autogen)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting docker (from autogen)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting flaml (from autogen)
  Downloading FLAML-2.3.2-py3-none-any.whl.metadata (16 kB)
Collecting openai>=1.3 (from autogen)
  Downloading openai-1.55.2-py3-none-any.whl.metadata (24 kB)
Collecting jiter<1,>=0.4.0 (from openai>=1.3->autogen)
  Downloading jiter-0.8.0-cp311-none-win_amd64.whl.metadata (5.3 kB)
Downloading autogen-0.4-py3-none-any.whl (366 kB)
   ---------------------------------------- 0.0/366.2 kB ? eta -:--:--
   ------- -------------------------------- 71.7/366.2 kB 1.3 MB/s eta 0:00:01
   ------------------------ --------------- 225.3/366.2 kB 2.8 MB/s eta 0:00:01
   ---------------------------------------  358.4/366.2 kB 2.8 MB/s eta 0:00:01
   ---------------------------------------- 366.2


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
