<a href="https://colab.research.google.com/github/vperng/AAI520-NPL-Chatbot/blob/main/Chatbot_Project_Team6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Advanced Generative Chatbot Design

Rene Ortiz, Vivian Perng, Karthink Raghavan

## Project Overview

- Goal: Build a chatbot that can carry out multi-turn conversations, adapt to context, and handle a variety of topics.
- Output: A web or app interface where users can converse with the chatbot.


## Data Collection and Preprocessing

In [4]:
### COMMENT OUT IF NOT USING GOOGLE COLAB ####
import os
from google.colab import drive
drive.mount('/content/drive')

# Set the path to project folder
project_folder = '/content/drive/MyDrive/Colab Notebooks/Chatbot_Project'

# Change the working directory to project folder
os.chdir(project_folder)

# Check the current working directory
print(os.getcwd())

# Install packages that are not on Google Colab here


ModuleNotFoundError: No module named 'google.colab'

In [18]:
# Import libraries
import json
import numpy as np
import pandas as pd
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')


Using device: cpu


In [19]:
# Open files for reading
with open('/content/drive/MyDrive/Colab Notebooks/Chatbot_Project/train-v1.1.json', 'r') as file:
    train_data = json.load(file)
print(type(train_data))

<class 'dict'>


In [20]:
# Determine type for each key
for key in train_data.keys():
    print(f'Type of value for key "{key}": {type(train_data[key])}')

# Print information about each key
for key in train_data.keys():
    value = train_data[key]
    if isinstance(value, list):
        print(f'Length of list for key "{key}": {len(value)}')
    else:
        print(f'Value for key "{key}": {value}')

Type of value for key "data": <class 'list'>
Type of value for key "version": <class 'str'>
Length of list for key "data": 442
Value for key "version": 1.1


In [21]:
def explore_record_path(data, path=None):
    """
    Recursively explore the nested structure of a JSON-like object (dict or list) and print the paths
    and types at each level.

    Parameters:
    data (dict or list): The JSON data or a nested structure to explore.
    path (list): A list to track the current record path (used internally for recursion).

    Example usage:
    explore_record_path(train_data['data'])
    """

    if path is None:
        path = []

    # If the data is a dictionary, print the keys and recursively explore
    if isinstance(data, dict):
        for key in data:
            print(f"{' > '.join(path + [key])}: {type(data[key])}")
            explore_record_path(data[key], path + [key])

    # If the data is a list, print the index and recursively explore the first element
    elif isinstance(data, list) and len(data) > 0:
        print(f"{' > '.join(path)}[]: {type(data[0])}")
        explore_record_path(data[0], path)

# Explore the nested JSON structure to determine record path
explore_record_path(train_data['data'])

[]: <class 'dict'>
title: <class 'str'>
paragraphs: <class 'list'>
paragraphs[]: <class 'dict'>
paragraphs > context: <class 'str'>
paragraphs > qas: <class 'list'>
paragraphs > qas[]: <class 'dict'>
paragraphs > qas > answers: <class 'list'>
paragraphs > qas > answers[]: <class 'dict'>
paragraphs > qas > answers > answer_start: <class 'int'>
paragraphs > qas > answers > text: <class 'str'>
paragraphs > qas > question: <class 'str'>
paragraphs > qas > id: <class 'str'>


In [22]:
def squad1_json_to_dataframe(file_path, record_path):
    """
    Function to convert the dataset JSON file to a Pandas DataFrame.

    file_path (str): Path to the JSON file
    record_path (list): Path to the deepest level in the JSON structure (default is ['data', 'paragraphs', 'qas', 'answers']).

    Returns dataFrame containing the parsed data.
    """
    # Load JSON data
    with open(file_path, 'r') as f:
        file_data = json.load(f)

    # Extract and normalize the nested JSON structures
    answers_df = pd.json_normalize(file_data, record_path)
    questions_df = pd.json_normalize(file_data, record_path[:-1])
    paragraphs_df = pd.json_normalize(file_data, record_path[:-2])

    # Add 'answer start' and 'text' from answers_df to questions_df
    questions_df['answer_start'] = answers_df['answer_start']
    questions_df['answers'] = answers_df['text']

    # Create 'context' by repeating the corresponding paragraph for each question
    questions_df['context'] = np.repeat(paragraphs_df['context'].values, paragraphs_df.qas.str.len())

    # Create final DataFrame with necessary columns
    data = questions_df[['id', 'question', 'context', 'answer_start', 'answers']].copy()

    # Add 'c_id' to uniquely identify each context
    data['c_id'] = pd.factorize(data['context'])[0]

    return data.reset_index(drop=True)


In [23]:
# loading the data
file_path = '/content/drive/MyDrive/Colab Notebooks/Chatbot_Project/train-v1.1.json'
df = squad1_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers'])
df

Unnamed: 0,id,question,context,answer_start,answers,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0
...,...,...,...,...,...,...
87594,5735d259012e2f140011a09d,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",229,Oregon,18890
87595,5735d259012e2f140011a09e,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",414,Rangoon,18890
87596,5735d259012e2f140011a09f,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",476,Minsk,18890
87597,5735d259012e2f140011a0a0,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",199,1975,18890


## Data Exploration


## Text Preprocessing

## Model Design and Training

## Evaluation

In [1]:
from fastapi import FastAPI, HTTPException
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.llms import Ollama
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.load import dumpd, dumps, load, loads
from langchain.chains import load_chain
from langserve import add_routes
from langchain_core.runnables import RunnableBinding, RunnableLambda
from pathlib import Path
from dotenv import load_dotenv
import uvicorn
import pandas as pd
import json
import os
import json
import torch
import numpy as np
import logging  

  from tqdm.autonotebook import tqdm, trange


In [8]:
def squad1_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    """
    Functuon to convert the dataset JSON file to a Pandas DataFrame.

    file_path (str): Path to the JSON file
    record_path (list): Path to the deepest level in the JSON structure (default is ['data', 'paragraphs', 'qas', 'answers']).

    Returns dataFrame containing the parsed data.
    """
    # Load JSON data
    with open(file_path, 'r') as f:
        file_data = json.load(f)

    # Extract and normalize the nested JSON structures
    answers_df = pd.json_normalize(file_data, record_path)
    questions_df = pd.json_normalize(file_data, record_path[:-1])
    paragraphs_df = pd.json_normalize(file_data, record_path[:-2])

    # Create 'context' by repeating the corresponding paragraph for each question
    questions_df['context'] = np.repeat(paragraphs_df['context'].values, paragraphs_df.qas.str.len())

    # Create final DataFrame with necessary columns
    data = questions_df[['id', 'question', 'context', 'answers']].copy()

    # Extract text and start positions to separate column
    data['answer_text'] = data['answers'].apply(lambda x: x[0]['text'] if len(x) > 0 else "")
    data['answer_start'] = data['answers'].apply(lambda x: x[0]['answer_start'] if len(x) > 0 else None)

    # Add 'c_id' to uniquely identify each context
    data['c_id'] = pd.factorize(data['context'])[0]

    return data.reset_index(drop=True)

In [10]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [25]:
# Load the SQuAD dataset
file_path = "train-v1.1.json"
df = squad1_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers'])
df_context = pd.DataFrame(df['context'].unique(), columns=['context'])
df_context = df_context[:100]

# Load data into Langchain
loader = DataFrameLoader(df_context, page_content_column="context")
docs = loader.load()

logging.info("split documents into chunks.")
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

logging.info("Creating embedding.")


# Initialize the SentenceTransformer model
model_name = 'all-MiniLM-L6-v2'
sentence_transformer_model = SentenceTransformer(model_name).to(device)

# Wrap the SentenceTransformer model with LangChain's HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name=model_name)
logging.info("Loading to vector db")

db = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="./")

#db = Chroma.from_documents(documents, embeddings)


logging.info("initialize retriever")
retriever = db.as_retriever()


# Define the LLM and prompt template
logging.info("initialize model and prompt.")

llm = Ollama(model="orca-mini") 
#llm = Ollama(model="llama2") 



prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
<context>
{context} 
</context>
Question: {input}""")

logging.info(prompt)

logging.info("creating document chains.")

document_chain=create_stuff_documents_chain(llm, prompt)

logging.info("creating retrieval chains.")

# Create the retrieval-based document chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

  vector_db = Chroma(persist_directory="/")


OperationalError: unable to open database file

In [12]:
type(retrieval_chain)

langchain_core.runnables.base.RunnableBinding

In [18]:
import pickle
# Assume `retrieval_chain` is the object you want to save
chain_dump = dumps(retrieval_chain, pretty=True)
print(type(chain_dump))
with open("retrieval_chain_nb.json", "w") as fp:
    json.dump(chain_dump, fp, indent=4)


<class 'str'>


In [22]:
try:
    # Replace 'retrieval_chain_nb.json' with the path to your JSON file
    with open("retrieval_chain_nb.json", "r") as fp:
        config_data = json.load(fp)
    print("JSON is valid.")
except json.JSONDecodeError as e:
    print("Invalid JSON:", e)
retrieval_chain = load(config_data)
print(type(retrieval_chain))

JSON is valid.
<class 'str'>


In [23]:
# Specify the directory where the database should be saved
persist_directory = "/"

# Initialize the Chroma client with persistence enabled
client = db.Client(persist_directory=persist_directory)

# Create or load a collection with embeddings
collection = client.get_or_create_collection(name="my_collection")

# Add data to the collection (example)
collection.add(
    ids=["id1", "id2"],
    embeddings=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    metadatas=[{"source": "text1"}, {"source": "text2"}],
    documents=["This is a document", "This is another document"]
)

# Persist the database to disk
client.persist()

print(f"Database saved to: {persist_directory}")

AttributeError: 'Chroma' object has no attribute 'Client'

In [3]:
with open("retrieval_chain.json", "r") as fp:
    config_data = json.load(fp)

retrieval_chain = load(config_data)
retrieval_chain

  retrieval_chain = load(config_data)


'{\n  "lc": 1,\n  "type": "constructor",\n  "id": [\n    "langchain",\n    "schema",\n    "runnable",\n    "RunnableBinding"\n  ],\n  "kwargs": {\n    "bound": {\n      "lc": 1,\n      "type": "constructor",\n      "id": [\n        "langchain",\n        "schema",\n        "runnable",\n        "RunnableSequence"\n      ],\n      "kwargs": {\n        "first": {\n          "lc": 1,\n          "type": "constructor",\n          "id": [\n            "langchain",\n            "schema",\n            "runnable",\n            "RunnableAssign"\n          ],\n          "kwargs": {\n            "mapper": {\n              "lc": 1,\n              "type": "constructor",\n              "id": [\n                "langchain",\n                "schema",\n                "runnable",\n                "RunnableParallel"\n              ],\n              "kwargs": {\n                "steps__": {\n                  "context": {\n                    "lc": 1,\n                    "type": "constructor",\n          

In [6]:
config = json.loads(retrieval_chain)
config

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'runnable', 'RunnableBinding'],
 'kwargs': {'bound': {'lc': 1,
   'type': 'constructor',
   'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'],
   'kwargs': {'first': {'lc': 1,
     'type': 'constructor',
     'id': ['langchain', 'schema', 'runnable', 'RunnableAssign'],
     'kwargs': {'mapper': {'lc': 1,
       'type': 'constructor',
       'id': ['langchain', 'schema', 'runnable', 'RunnableParallel'],
       'kwargs': {'steps__': {'context': {'lc': 1,
          'type': 'constructor',
          'id': ['langchain', 'schema', 'runnable', 'RunnableBinding'],
          'kwargs': {'bound': {'lc': 1,
            'type': 'constructor',
            'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'],
            'kwargs': {'first': {'lc': 1,
              'type': 'not_implemented',
              'id': ['langchain_core', 'runnables', 'base', 'RunnableLambda'],
              'repr': "RunnableLambda(lambda x: x

In [7]:
config['type']

'constructor'