In [1]:
# we start by fetching and preprocessing our text
import re
import os

def clean_text(text):
    with open(book, 'r', encoding='utf8', errors='ignore') as f:
        #transform everything to lowercase
        text = f.read().lower()
    
        #replace markdown characters
        text = text.replace('\xa0', ' ')
    
        #remove emoji and author name
        text = re.sub(r"✍ jane austen", "", text)
        
        #remove these chars: they're not counted as punctuation later when they're attached to a word with no space
        #which is something that happens a lot in Austen's dialogues
        text = text.replace('—', ' ')
        text = text.replace('_', ' ')
    
        #remove numbers and stuff like 1st, 2nd, 3rd, 4th, etc.
        text = re.sub(r"\d+\w*", " ", text)
   
        #remove volume and chapter headings
        text = re.sub(r"#+\s\w+\s\w+\s", "", text)
        
        #remove book title
        text = re.sub(r"#\s\w+\s", "", text)
    
        #remove weird chars
        text = text.replace("£", " ")
        text = re.sub(r"\n", " ", text)
    
    return text

#loop through all the books we've saved as text files in our data dir and create a final string of cleaned text
cleaned_text = ''
for file in os.listdir("data"):
    book = "data/" + file
    cleaned_text += ' ' + clean_text(book)

In [2]:
#now we will transform our text into a format that can be understood by NLP algorithms
import nltk
#import string

#tokenize cleaned_text string to a list of sentences
sent_tokens = nltk.sent_tokenize(cleaned_text)

#extract dialog from complete text sentences
def extract_dialog(text):
    dialog_pattern = '(?<=“)(.+?)(?=”)'
    
    m = re.search(dialog_pattern, text)
    
    if m is not None:
        return m.group(0)

dialog = [extract_dialog(sentence) for sentence in sent_tokens if extract_dialog(sentence)]

In [3]:
#create a matrix of TFIDF weights for our sentences
#we will need this to compare to new text (i.e. the user request) with a similarity measure
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVec = TfidfVectorizer(stop_words='english')
tfidf = TfidfVec.fit_transform(dialog)

In [4]:
import pandas as pd

#the result of this is a matrix, X, with one row for each sentence ("document") and a column for each unique word
X = pd.DataFrame(tfidf.toarray(), columns = TfidfVec.get_feature_names(), dtype='float32')
print(X.head())

   abbey  abhor  abide  able  abode  abominable  abominably  abroad  absence  \
0    0.0    0.0    0.0   0.0    0.0         0.0         0.0     0.0      0.0   
1    0.0    0.0    0.0   0.0    0.0         0.0         0.0     0.0      0.0   
2    0.0    0.0    0.0   0.0    0.0         0.0         0.0     0.0      0.0   
3    0.0    0.0    0.0   0.0    0.0         0.0         0.0     0.0      0.0   
4    0.0    0.0    0.0   0.0    0.0         0.0         0.0     0.0      0.0   

   absent  ...   ye  year  years  yes  yesterday  yield  york  young  younger  \
0     0.0  ...  0.0   0.0    0.0  0.0        0.0    0.0   0.0    0.0      0.0   
1     0.0  ...  0.0   0.0    0.0  0.0        0.0    0.0   0.0    0.0      0.0   
2     0.0  ...  0.0   0.0    0.0  0.0        0.0    0.0   0.0    0.0      0.0   
3     0.0  ...  0.0   0.0    0.0  0.0        0.0    0.0   0.0    0.0      0.0   
4     0.0  ...  0.0   0.0    0.0  0.0        0.0    0.0   0.0    0.0      0.0   

   youngest  
0       0.0  
1   

We now have a corpus represented in such a way that we can use it to, for example, calculate distances to other (similarly transformed) text. 

I'm using cosine similarity: how "close" is our query (user input) to our tokenized dialog?

In [7]:
#in-place chatbot feature will let us talk with the bot here locally with no server setup
from sklearn.metrics.pairwise import cosine_similarity

def response(user_response):
    jane_response = ''

    #transform user query so we can compare it to our matrix of TFIDF weighted word features
    query = TfidfVec.transform([user_response])
    
    #what's the maximum closeness we can achieve to our saved dialogues?
    cosine_sim = query.dot(X.T)
    
    #if there's nothing like the user query in our matrix, give a standard response
    if cosine_sim.argmax() == 0:
        return 'I beg your pardon? I\'m not quite sure I got your meaning.'
    
    #otherwise, return the closest dialog to the user request
    jane_response = dialog[cosine_sim.argmax()]  
    
    #take the puncutation off of the end of our response: this is usually a comma in Austen's dialogues!
    jane_response = jane_response[0:-1] + '.'
    
    #capitalize the first letter of the response so that it looks like a real sentence
    #remember that the first step we took in cleaning the text was to transform everything to lowercase
    return jane_response.capitalize()

In [8]:
#prompt a dialog with the user
print("JANE: My name is Miss Austen. I will answer any questions you have about my world! If you would like to end the conversation prematurely, please type \'Thank you\'")

#while the dialog is ongoing...
flag = True
while(flag == True):
    #transform user input and fetch a response from our matrix
    user_response = input()
    user_response=user_response.lower()
    
    #the cue to end the conversation
    if(user_response == 'thank you'):
        flag=False
        print("JANE: You are most welcome. Goodbye now.")
    else:
        print("JANE: ", end="")
        print(response(user_response))

JANE: My name is Miss Austen. I will answer any questions you have about my world! If you would like to end the conversation prematurely, please type 'Thank you'
Good Morning
JANE: Good morning to you.
How are you today
JANE: No walk for me today.
OK. Thank you
JANE: No, no not at all no, thank yo.
Thank YOU
JANE: You are most welcome. Goodbye now.


In [9]:
#serialize the components of our model so we can upload them to AWS S3 reuse them in our AWS Lambda architecture:
#we don't want to have to retrain our model and recreate the TFIDF matrix every time the user sends a query over Slack
import pickle
    
pickle.dump(X, open('model_simple_X.pkl', 'wb'))
pickle.dump(dialog, open('model_simple_dialog.pkl', 'wb'))
pickle.dump(TfidfVec, open('model_simple_tfidf.pkl', 'wb'))

In [10]:
#did it work?
# Load from file
pickle_X = pickle.load(open('model_simple_X.pkl', 'rb'))
pickle_dialog = pickle.load(open('model_simple_dialog.pkl', 'rb'))
pickle_tfidf = pickle.load(open('model_simple_tfidf.pkl', 'rb'))

query = pickle_tfidf.transform(['Good morning'])
cosine_sim = query.dot(pickle_X.T)
reply = pickle_dialog[cosine_sim.argmax()]

print(reply)

good morning to you,
