#### 453 Assignment 4 - Chat Bot

In [None]:
# General
import os
import pandas as pd
import random
import matplotlib.pyplot as plt
import re

# NLP
import nltk

# Preprocessing
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

# Knowledge Graphs
import spacy
import networkx as nx

# Model 1 - TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Model 2 - Sentence Transformers
from sentence_transformers import SentenceTransformer

# Model 3 - Hugging Face
from transformers import pipeline

# Model 4 - Llama Index
from llama_index import VectorStoreIndex, SimpleDirectoryReader

#### Import Data

In [None]:
# Load the 'Caller' sheet of the conversations workbook into a DataFrame.
data = pd.read_excel(io="vcs_conversations.xlsx", sheet_name="Caller")

In [None]:
# Data Wrangling
# Fetch the NLTK resources used in this cell before their first use: without
# them stopwords.words(), word_tokenize() and the WordNet lemmatizer raise
# LookupError on a fresh environment.
for _nltk_resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Matches every character that is not an ASCII letter
_NON_ALPHA = re.compile(r'[^a-zA-Z]')

def nlp_transformations(df):
    """
    Apply NLP transformations to a pandas DataFrame.

    The new columns are written onto `df` itself (the input is modified in
    place) and that same object is returned.

    Parameters: df (DataFrame): Input dataframe with a column named 'Text'.
    Returns: DataFrame: `df`, with these columns added:
        Tokens        - word_tokenize output for 'Text'
        Normalized    - lower-cased tokens
        Lemmatized    - WordNet-lemmatized tokens
        No_Stop_Words - lemmas that are not English stop words
        Cleaned       - remaining tokens with non-letters removed, empty
                        results dropped, joined by single spaces
    """
    def _clean(tokens):
        # Strip non-letters from each token, then drop tokens left empty
        stripped = (_NON_ALPHA.sub('', token) for token in tokens)
        return ' '.join(token for token in stripped if token != '')

    df['Tokens'] = df['Text'].apply(word_tokenize)
    df['Normalized'] = df['Tokens'].apply(lambda tokens: [token.lower() for token in tokens])
    df['Lemmatized'] = df['Normalized'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
    df['No_Stop_Words'] = df['Lemmatized'].apply(lambda tokens: [token for token in tokens if token not in stop_words])
    df['Cleaned'] = df['No_Stop_Words'].apply(_clean)

    return df

wrangled_data = nlp_transformations(data)

In [None]:
# Bare expression: shows the DataFrame returned by nlp_transformations as the cell output.
wrangled_data

#### Data Preprocessing

In [None]:
# Raw 'Text' values (before any preprocessing) as a plain Python list.
conversations = data['Text'].tolist()

#### Knowledge Graph

In [None]:
# Load spaCy's large English pipeline; it parses the 'Text' column of `data`.
nlp = spacy.load("en_core_web_lg")

In [None]:
G = nx.Graph()

# Process each conversation. nlp.pipe streams the texts through the pipeline
# in batches instead of invoking nlp() once per row.
for doc in nlp.pipe(data['Text']):
    # Add named entities as nodes, tagged with their entity label
    for ent in doc.ents:
        G.add_node(ent.text, type=ent.label_)

    # Add possible relationships (edges): connect the subject of a head word
    # to that head's subject/object dependents.
    for token in doc:
        if token.dep_ not in ('nsubj', 'dobj'):
            continue
        subjects = [w for w in token.head.lefts if w.dep_ == 'nsubj']
        if not subjects:
            continue
        subject_text = subjects[0].text
        # When `token` is the subject itself the edge would run from a word to
        # the same word; such self-loops carry no relationship, so skip them.
        if subject_text != token.text:
            G.add_edge(subject_text, token.text)

# Use a spring layout to spread out the nodes
pos = nx.spring_layout(G, scale=2)  # Scale parameter spreads nodes further apart

# Specify the figure size
plt.figure(figsize=(12, 12))  # You can adjust these dimensions as needed

# Draw the graph with the specified layout and adjusted node and font sizes
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=500,
    font_size=8,
    node_color="skyblue",
    font_weight="bold",
)

# Show the graph
plt.show()

#### Modeling - Questions

In [None]:
# Evaluation questions, bound to q_one .. q_seven in order.
(
    q_one,
    q_two,
    q_three,
    q_four,
    q_five,
    q_six,
    q_seven,
) = (
    'What departments do you have?',
    'What are the store hours?',
    'What sections does the general department have?',
    'What services does the pharmacy offer?',
    'What services does the photo department have?',
    'How do callers feel about the pharmacy department?',
    'How do callers feel about the photo department?',
)

#### Model 1 - TF-IDF Vectorization

In [None]:
# Learn the TF-IDF vocabulary from the raw conversations and vectorize them.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(conversations)

def find_response(input_text):
    """
    Return the conversation most similar to `input_text`.

    The input is projected into the fitted TF-IDF space and compared with
    every row of `tfidf_matrix` by cosine similarity; the entry of
    `conversations` with the highest score is returned.
    """
    # 1 x n_conversations matrix of similarity scores for the query
    query_vector = vectorizer.transform([input_text])
    scores = cosine_similarity(query_vector, tfidf_matrix)

    # argsort is ascending, so the last index of the row is the best match
    ranking = scores.argsort()
    best_index = ranking[0][-1]
    return conversations[best_index]

# Ask every evaluation question and print the best-matching conversation.
# find_response returns a string, so the answer is printed whole: slicing it
# with [0:2] would show only its first two characters.
for question in (q_one, q_two, q_three, q_four, q_five, q_six, q_seven):
    print(f'Question: {question}')
    print(f"Answer: {find_response(question)}")

#### Model 2 - Sentence Transformers

In [None]:
# Sentence Transformer model optimized for sentence cosine similarity calculations
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Quietly fetch the NLTK data packages, in this order
for package in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(package, quiet=True)

# Load the corpus as a single lower-cased string (undecodable bytes are ignored)
CORPUS_PATH = '/Users/dylanhayashi/Desktop/Northwestern/NU_MSDS/453 - Natural Language Processing/453.10 - Final Project/employee/employee.txt'
with open(CORPUS_PATH, 'r', errors='ignore') as f:
    corpus_text = f.read()
    raw = corpus_text.lower()

# Split the corpus into a list of sentences and a list of words
sent_tokens = nltk.sent_tokenize(raw)
word_tokens = nltk.word_tokenize(raw)

# Greetings inputs and responses
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREETING_RESPONSES = ["Hello"]

def greeting(sentence):
    """
    If the user's input contains a greeting, return a greeting response.

    The input is lower-cased and reduced to its words (punctuation around a
    word is ignored), then every entry of GREETING_INPUTS is looked up as a
    whole word or whole phrase. This lets multi-word entries such as
    "what's up" and punctuated input such as "Hello!" match, while words that
    merely contain a greeting ("this", "support") do not.

    Parameters: sentence (str): Raw user input.
    Returns: str or None: A random entry of GREETING_RESPONSES, or None when
        no greeting is present.
    """
    # Treat a typographic apostrophe like a plain one so "what’s up" matches
    normalized = sentence.lower().replace("\u2019", "'")
    words = [word.strip("'") for word in re.findall(r"[a-z0-9']+", normalized)]
    # Pad with spaces so that only whole words / whole phrases can match
    padded = " " + " ".join(word for word in words if word) + " "
    for phrase in GREETING_INPUTS:
        if f" {phrase} " in padded:
            return random.choice(GREETING_RESPONSES)
    return None

NO_ANSWER_MESSAGE = "Sorry, I do not have an answer to your question in my database"

# Corpus embeddings are cached together with a snapshot of the sentences they
# were computed from, so the corpus is encoded once rather than per question.
_corpus_embedding_cache = {"sentences": None, "embeddings": None}

def _corpus_embeddings():
    """Return the embeddings of sent_tokens, re-encoding only if the list changed."""
    snapshot = tuple(sent_tokens)
    if _corpus_embedding_cache["sentences"] != snapshot:
        encodings = model.encode(sent_tokens, convert_to_tensor=True)
        _corpus_embedding_cache["embeddings"] = encodings.cpu()
        _corpus_embedding_cache["sentences"] = snapshot
    return _corpus_embedding_cache["embeddings"]

def response(user_response, min_similarity=0.0):
    """
    Generate a response to user input.

    Encodes `user_response` with the sentence-transformer model and returns
    the corpus sentence from `sent_tokens` with the highest cosine similarity.
    The query is encoded on its own, so `sent_tokens` is never modified.

    Parameters:
        user_response (str): The user's question.
        min_similarity (float): Best score at or below this value yields the
            fallback message. Defaults to 0.0.
    Returns: str: The best-matching corpus sentence, or the fallback message
        when the corpus is empty or nothing scores above `min_similarity`.
    """
    if not sent_tokens:
        return NO_ANSWER_MESSAGE

    corpus_encodings = _corpus_embeddings()
    query_encoding = model.encode([user_response], convert_to_tensor=True).cpu()

    # Row 0 holds the similarity of the query to every corpus sentence
    similarities = cosine_similarity(query_encoding, corpus_encodings)[0]
    best_idx = int(similarities.argmax())
    if similarities[best_idx] <= min_similarity:
        return NO_ANSWER_MESSAGE
    return sent_tokens[best_idx]

def chatbot_response(user_input):
    """
    Process user input and return the chatbot response.

    Parameters: user_input (str): Raw user input; it is lower-cased first.
    Returns: str: "" for 'exit', a fixed reply for thanks, a greeting when the
        input contains one, otherwise the result of response().
    """
    user_input = user_input.lower()
    if user_input == 'exit':
        return ""
    if user_input in ('thanks', 'thank you'):
        return "You are welcome!"

    # Evaluate greeting() once; it picks its reply at random
    greeting_reply = greeting(user_input)
    if greeting_reply is not None:
        return greeting_reply

    # The query's words are still appended to the module-level word list
    word_tokens.extend(nltk.word_tokenize(user_input))
    return response(user_input)

# Run each evaluation question through the sentence-transformer chatbot.
for question in (q_one, q_two, q_three, q_four, q_five, q_six, q_seven):
    print(f'Question: {question}')
    print(f'Answer: {chatbot_response(question)}')

#### Model 3 - Hugging Face

In [None]:
# Read in data
CORPUS_PATH = '/Users/dylanhayashi/Desktop/Northwestern/NU_MSDS/453 - Natural Language Processing/453.10 - Final Project/employee/employee.txt'

# The context manager closes the file handle once the text has been read
with open(CORPUS_PATH, 'r', errors='ignore') as f:
    raw = f.read()

# Question-answering pipeline; no model is named, so the library default is used
qa_pipeline = pipeline("question-answering")

# The whole corpus text is passed as the context for every question
context = raw

# Run each evaluation question through the question-answering pipeline
# and print the 'answer' field of its result.
for question in (q_one, q_two, q_three, q_four, q_five, q_six, q_seven):
    print(f'Question: {question}')
    print(f"Answer: {qa_pipeline(context=context, question=question)['answer']}")