#### 453 Assignment 4 - Chat Bot

In [30]:
# General
import os
import pandas as pd
import random

# NLP
import nltk

# Model 1 - TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Model 2 - Sentence Transformers
from sentence_transformers import SentenceTransformer

# Model 3 - Huggin Face
from transformers import pipeline

# Model 4 - Llama Index
from llama_index import VectorStoreIndex, SimpleDirectoryReader

#### Import Data

In [2]:
data = pd.read_excel('vcs_conversations.xlsx')

In [3]:
data

Unnamed: 0,Conversation,Department,Sentiment,Text
0,1,General,Positive,"Caller: Hello, is this The Very Convenient Sto..."
1,2,General,Positive,"Caller: Hi, I was wondering if The Very Conven..."
2,3,General,Positive,"Caller: Hello, I’m looking for grooming produc..."
3,4,General,Positive,"Caller: Hello, I'm getting ready for back-to-s..."
4,5,Pharmacy,Negative,"Caller: Hello, is this The Very Convenient Sto..."
5,6,Pharmacy,Negative,"Caller: Hello, does The Very Convenient Store ..."
6,7,Pharmacy,Negative,"Caller: Hi, I'm calling to ask if you have a p..."
7,8,Pharmacy,Negative,"Caller: Hello, I dropped off my prescription t..."
8,9,Photo,Positive,"Caller: Hi, I'm calling to inquire about the p..."
9,10,Photo,Positive,"Caller: Hi, is this the photo department at Th..."


In [4]:
conversations = data.loc[:,'Text'].tolist()

#### Model 1 - TF-IDF Vectorization

In [5]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(conversations)

In [6]:
def find_response(input_text):
    # Transform the input text to tf-idf vector
    input_tfidf = vectorizer.transform([input_text])
    
    # Calculate cosine similarity
    cosine_similarities = cosine_similarity(input_tfidf, tfidf_matrix)
    
    # Find the most similar conversation
    most_relevant = cosine_similarities.argsort()[0][-1]
    return conversations[most_relevant]

In [7]:
find_response("What are the pharmcacy store hours?")

"Caller: Hello, is this The Very Convenient Store? VCS Employee: Yes, it is. How can I assist you today? Caller: Could you tell me your store hours, please? VCS Employee: Certainly. Our general department is open from 8 AM to 8 PM on weekdays, and from 10 AM to 5 PM on weekends. Caller: Wow! Amazing! Those are much better hours than most convenient sores. What other departments do you have? VCS Employee: We also have a pharmacy and a photo department. Caller: Great! Those are all the departments I need in one place. And what are their hours? VCS Employee: The pharmacy and photo departments are open from 9 AM to 5 PM on weekdays. Caller: Great! Those work very well with my working hours, this store is awesome. VCS Employee: You're welcome! If you have any other questions, feel free to call us during our operating hours. Caller: Will do, thanks. Goodbye! VCS Employee: Goodbye, have a great day!"

#### Knowledge Graph

#### Model 2 - Transformers

In [9]:
#Load Sentence Transformer model optimized for  sentence cosine similarity calculations
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Only run this once, they will be downloaded.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

#read in data
CORPUS_PATH = '/Users/dylanhayashi/Desktop/Northwestern/NU_MSDS/453 - Natural Language Processing/453.10 - Final Project/data/vcs_conversations.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw=raw.lower()# converts to lowercase

#create list of sentences and words
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

#create greetings and greetings function

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["Hello"]

# Checking for greetings
def greeting(sentence):
    """If user's input is a greeting, return a greeting response"""
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

# Generating response function 
def response(user_response):
    chatbot_response=''
    sentence_encodings=model.encode(sent_tokens, convert_to_tensor=True)# generate sentence transformer embeddings
    sentence_encodings=sentence_encodings.cpu()
    vals = cosine_similarity(sentence_encodings[-1].reshape(1, -1), sentence_encodings) #the chatbot conversation code 
    #in the next cell adds the question as the last sentence of the sentence tokens, before calling this response function.
    #The code takes the last sentence (which is the question) and gets cosine similarities vs all the sentences in the corpus,
    #including itself
    idx=vals.argsort()[0][-2] #gets the index of the second highest similarity (the first highest would be the question itself)
    flat = vals.flatten()#reduces dimension of cosine similarity array to be able to sort
    flat.sort() #sort the cosine similarity values
    second_cos_sim_val = flat[-2] #get the second highest cosine similarity value.
    if(second_cos_sim_val==0): #check the second highest cosine similarity value. If it's zero return the no match response,
        #else return highest cosine similarity sentence.
        chatbot_response=chatbot_response+"Sorry, I do not have an answer to your question in my database"
        return chatbot_response
    else:
        chatbot_response = chatbot_response+sent_tokens[idx] #use index of highest cosine similarity to get original sentence
        return chatbot_response

#Chatbot interaction code

flag=True
print("Welcome to the Stock and Bond Concepts Information Chatbot. To end session please type exit")
print("\n")

while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if user_response!='exit':
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("Answer: You are welcome!")
        else:
            if(greeting(user_response)!=None):
                print("Answer: "+greeting(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens=word_tokens+nltk.word_tokenize(user_response)
                final_words=list(set(word_tokens))
                print("Answer: ",end="")
                print(response(user_response))
                print("\n")
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("Thank you for using the Stock and Bond Concepts Information Chatbot. Good bye.")

Welcome to the Stock and Bond Concepts Information Chatbot. To end session please type exit


Thank you for using the Stock and Bond Concepts Information Chatbot. Good bye.


#### Model 3 - Hugging Face

In [11]:
#read in data
CORPUS_PATH = '/Users/dylanhayashi/Desktop/Northwestern/NU_MSDS/453 - Natural Language Processing/453.10 - Final Project/data/vcs_conversations.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()

qa_pipeline = pipeline("question-answering")

context = raw

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [12]:
question = "Are customers satisfied with the pharmacy department"

qa_pipeline(context=context, question=question)

{'score': 0.3388751745223999,
 'start': 3585,
 'end': 3616,
 'answer': 'Our prices are very competitive'}

#### Model 4 - 

In [27]:
os.environ['OPENAI_API_KEY'] = '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'
documents = SimpleDirectoryReader("/Users/dylanhayashi/Desktop/Northwestern/NU_MSDS/453 - Natural Language Processing/453.10 - Final Project/data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [16]:
query_engine.query("What are store hours")

NameError: name 'index' is not defined