In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Sources:
- https://github.com/Ahmedsamy96/Customized-Llama2-Chatbot/blob/main/Customized_Llama2_Chatbot.ipynb
- https://medium.com/@murtuza753/using-llama-2-0-faiss-and-langchain-for-question-answering-on-your-own-data-682241488476
- https://betterprogramming.pub/build-a-chatbot-on-your-csv-data-with-langchain-and-openai-ed121f85f0cd
- https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset

In [6]:

!pip install -q gradio
# !pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers
!pip install -qU transformers einops langchain xformers bitsandbytes faiss-gpu sentence_transformers

!pip install -U git+https://github.com/huggingface/accelerate.git # pypi version of accelerate has a bug

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.2/299.2 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.7/75.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.7/138.7 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m8.0

#**Power-Packed Python Imports**: Building Blocks for Advanced NLP Applications!

In [7]:
import joblib
import torch
from torch import cuda, bfloat16
import transformers
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.llms import HuggingFacePipeline
import pandas as pd
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import gradio

#**Efficient Model Loading and Configuration**: Loading, Configuring, and Preparing a Pre-trained Language Model with Optimization

In [9]:
# Define the model ID for a pre-trained language model
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Determine the device (GPU if available, else CPU)
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Configure quantization settings for loading the model with less GPU memory usage
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Initialize HuggingFace authentication token
hf_auth = 'paste_your_huggingface_llama2_token'

# Load the configuration for the pre-trained model
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Load the model for causal language modeling
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Set the model in evaluation mode for inference
model.eval()

# Print device information where the model is loaded
print(f"Model loaded on {device}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


#**Tokenization and Stop List Preparation**: Generating Tokens and IDs for Special Text Sequences

In [10]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

stop_token_ids



Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [11]:
# define custom stopping criteria object

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# **Text Generation Configuration**: Setting Up Text Generation with Transformers Pipeline


In [12]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.0001,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [13]:
llm = HuggingFacePipeline(pipeline=generate_text)

# **Loading Q&A Data from CSV**: Importing Q&A Pairs from a CSV File

In [14]:
# Load your CSV file containing Q&A pairs
csv_file_path = "/content/drive/MyDrive/AAI courses/AAI 520 NLP/final project/data/df.joblib"

qa_data = joblib.load(csv_file_path)

qa_data

Unnamed: 0,title,context,question,answer,answer_start
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92
...,...,...,...,...,...
0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229
0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414
0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476
0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199


In [15]:
qa_data = qa_data.loc[:, ['question','answer']]
print(qa_data.shape)
qa_data

(87599, 2)


Unnamed: 0,question,answer
0,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
0,What is in front of the Notre Dame Main Building?,a copper statue of Christ
0,The Basilica of the Sacred heart at Notre Dame...,the Main Building
0,What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
0,What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary
...,...,...
0,In what US state did Kathmandu first establish...,Oregon
0,What was Yangon previously known as?,Rangoon
0,With what Belorussian city does Kathmandu have...,Minsk
0,In what year did Kathmandu create its initial ...,1975


We know that we have some duplicatd rows as we will show below. We will drop them.

In [16]:
qa_data[qa_data.duplicated(keep=False)].sort_values(by="question")

Unnamed: 0,question,answer
0,Approximentally how many Americans identified ...,"In the 2010 US census, approximately 9 million..."
0,Approximentally how many Americans identified ...,"In the 2010 US census, approximately 9 million..."
0,During what historical event did Napoleon rise...,the French Revolution
0,During what historical event did Napoleon rise...,the French Revolution
0,How are police usually paid?,through taxes
...,...,...
0,Who was the mayor of Nagano?,Shoichi Washizawa
0,Who wrote Culture and Anarchy?,Matthew Arnold
0,Who wrote Culture and Anarchy?,Matthew Arnold
0,Who wrote the Divine Comedy?,Dante Alighieri


In [17]:
qa_data.drop_duplicates(inplace=True)

In [18]:
qa_data.to_csv("/content/drive/MyDrive/AAI courses/AAI 520 NLP/final project/data/df_prepared.csv", index=False)

# **Data Loading and Vectorization**: Loading Data from CSV and Creating Vector Store

In [19]:
loader = CSVLoader(file_path="/content/drive/MyDrive/AAI courses/AAI 520 NLP/final project/data/df_prepared.csv",
                   encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

# Initialize embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Initialize the vector store
vectorstore = FAISS.from_documents(data, embeddings)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [20]:
memory = ConversationBufferMemory(memory_key="chat_history", input_key='question', output_key='answer', return_messages=True)

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True, memory=memory)

query = "who did the Virgin Mary appear to in 1858?"
result = chain({"question": query})

print(result['answer'])

 The Virgin Mary appeared to Saint Bernadette Soubirous in Lourdes, France in 1858.


In [21]:
def chatbot_interface(query, history):
  history = None
  result = chain({"question": query})
  return f"{result['answer']}\nTop fetched Q&A:\n{result['source_documents'][0].to_json()['kwargs']['page_content']}"

In [22]:
# let's clear the memory

memory = ConversationBufferMemory(memory_key="chat_history", input_key='question', output_key='answer', return_messages=True)

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True, memory=memory)

# let's build the interface

interface = gradio.ChatInterface(
    fn=chatbot_interface,
    title="<div style='display: flex; align-items: center;'><img src='https://logos-download.com/wp-content/uploads/2021/01/University_of_San_Diego_Logo_full-1536x1536.png' alt='Your Image' style='margin-right: 10px; max-height: 100px'><h1 style='flex:1; margin-right: 100px; text-align: center;'>Chatbot for Stanford Q&A powered by Llama 2</h1></div>",
    examples= ["Hi", "who did the Virgin Mary appear to in 1858?", "What individuals live at Fatima House at Notre Dame?", "What was Yangon previously known as?"],
    description="This is a demo of a chatbot that stores Q&As and retirieves the answers given the question  closest to the query"
    )
interface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://59f68e88fae14aa7e0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://59f68e88fae14aa7e0.gradio.live


