In [1]:
import random
from collections import Counter
from string import punctuation

import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import words as allwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.cluster import KMeans

#Provide appropriate path to your whatsapp chatlog.
#The context manager guarantees the file handle is closed even if reading raises.
with open("_chat.txt", encoding="utf8") as file:
    chat=file.read()

#Convert entire string of messages into individual messages. Clipped last 2 lines for my specific use case.
#This variable is used to initialize chatbots
Messages=chat.split("\n")[0:-2]

In [2]:
#Script to get a list of the most commonly used (stemmed) words in all messages.
#One hot vectors for input and output messages can then be made using these words.
st=LancasterStemmer()
customStopWords=set(stopwords.words('english')+list(punctuation)+["’","”","“","",".","..","...","``","**"])

#Tokenize words in every message and flatten the per-message token lists into one list.
words=[token for message in Messages for token in word_tokenize(message)]

#Replace every non-ASCII character (such as emojis) with "?".
#Tokens that end up containing "?" are discarded by the isalpha() filter further down.
words=[word.encode('ascii', errors='replace').decode("ascii") for word in words]

#Remove all stopwords. The comparison is done on the lowercased token because the
#stemmer lowercases its input afterwards; without this, capitalised stopwords such as
#"The" would slip through the filter and reappear in the vocabulary as "the".
wordsWOStopwords=[word for word in words if word.lower() not in customStopWords]

#Stem each word
stemmedWords=[st.stem(word) for word in wordsWOStopwords]

#Do not include numbers
lettersOnly=[word for word in stemmedWords if word.isalpha()]

#Get a list of [word, count] pairs sorted by descending count.
#Counter tallies everything in a single pass (instead of one full list scan per distinct
#word) and most_common() breaks ties by first appearance, so the resulting order - and
#therefore the column order of every feature vector - is the same on every run.
CountedWords=[[word,count] for word,count in Counter(lettersOnly).most_common()]

#Only use words that appear more than once.
popularwords=[word[0] for word in CountedWords if word[1]>1]

#Don't include single characters
reduceletters=[word for word in popularwords if len(word)>1]

#Final wordlist
wordlist=reduceletters

In [5]:
class chatBot:
    """Retrieval-style chatbot built from a two-person chat log.

    The log is split into (user turn, bot turn) pairs. Each turn is turned into a
    vector of substring counts over a vocabulary, the user-turn vectors are clustered
    with KMeans, and a reply is produced by picking a random recorded bot turn whose
    paired user turn fell into the same cluster as the incoming message.

    Attributes:
        botname, username: names used to attribute log lines to a speaker.
        vocabulary: list of words whose counts form the feature vectors.
        inputs, outputs: paired lists of user turns and bot turns (same length).
            Each turn is the concatenation of "." + text for every text message in
            the turn, and may be "" when the turn contained no text message.
        input_vectors, output_vectors: count vectors for `inputs` / `outputs`.
        model, labels: the fitted KMeans model and the cluster label of every
            entry in `inputs`; only available after `train` has been called.
    """

    def __init__(self,botname,username,Messages,vocabulary=None):
        """Initialize the bot from a list of raw log lines.

        Args:
            botname: name of the person in the conversation who you want to create
                a bot out of.
            username: name of the other person in the conversation.
            Messages: list of log lines. Each line is split on ":" and the third
                field is searched for the speaker name while everything after the
                third colon is taken as the message text - presumably lines look
                like "[date, hh:mm:ss] Name: text"; verify against your export.
            vocabulary: optional list of words used to build the count vectors.
                Defaults to the module-level `wordlist`.
        """
        self.botname = botname
        self.username = username

        #Snapshot the vocabulary so that later changes to the global wordlist cannot
        #desynchronise the vectors built here from the ones built in reply().
        self.vocabulary = list(wordlist if vocabulary is None else vocabulary)

        #Script to organize messages into input messages(from user) and responses(from bot).
        self.inputs=[]
        self.outputs=[]

        #Lines that lack the sender/text fields (e.g. continuation lines of a
        #multi-line message) are dropped instead of raising IndexError.
        entries=[entry for entry in map(self._parse_line,Messages) if entry is not None]

        m=0
        while m < len(entries): #Walk to the very end so that the final line is not lost.
            if self.username in entries[m][0]: #Is the message from the potential user?
                user_turn,m=self._collect_turn(entries,m,self.username)
                bot_turn,m=self._collect_turn(entries,m,self.botname)
                self.inputs.append(user_turn)
                self.outputs.append(bot_turn)
            else:
                m+=1

        self.input_vectors=[self._vectorize(text) for text in self.inputs]
        self.output_vectors=[self._vectorize(text) for text in self.outputs]

    @staticmethod
    def _parse_line(line):
        """Split one log line into (sender field, message text, is_non_text).

        Returns None when the line has fewer than three colons. The split is limited
        to three cuts so that colons inside the message text (URLs, times) are kept.
        """
        fields=line.split(":",3)
        if len(fields)<4:
            return None
        #A "\u200e" anywhere in the line marks anything that isn't a text message.
        return fields[2],fields[3],"\u200e" in line

    @staticmethod
    def _collect_turn(entries,start,speaker):
        """Concatenate the consecutive entries from `speaker` beginning at `start`.

        Returns the joined text ("." + text per text message) and the index of the
        first entry that does not belong to `speaker`.
        """
        message=""
        m=start
        while m < len(entries) and speaker in entries[m][0]:
            sender,text,is_non_text=entries[m]
            if not is_non_text: #Filtering out anything that isn't a text message.
                message=message+"."+text
            m+=1
        return message,m

    def _vectorize(self,text):
        """Return, for every vocabulary word, how often it occurs as a substring of `text`."""
        return [text.count(word) for word in self.vocabulary]

    def train(self,clusters): #Input number of clusters. They will represent clusters of similar input messages that the bot will recognize.
        """Cluster the user-turn vectors into `clusters` groups with KMeans.

        Stores the fitted estimator in `self.model` and the cluster label of every
        recorded user turn in `self.labels`.
        """
        self.model = KMeans(
        n_clusters=clusters, init='random',
        n_init=10, max_iter=1000,
        tol=1e-04, random_state=0)

        self.labels = self.model.fit_predict(self.input_vectors)

    def reply(self,message):
        """Return a recorded bot turn for `message`. `train` must have been called first.

        The message is assigned to a cluster and one of the bot turns whose paired
        user turn belongs to that cluster is chosen uniformly at random. Returns ""
        when no recorded user turn carries the predicted label.
        """
        features=self._vectorize(message)
        label=self.model.predict([features])[0]

        #flatnonzero yields the matching indices themselves. The tuple returned by
        #np.where has length 1, so measuring it always selected the first match.
        candidates=np.flatnonzero(self.labels==label)
        if candidates.size==0:
            return ""

        chosen=int(candidates[random.randrange(candidates.size)])
        #Drop the leading "." added while joining and the character after it
        #(presumably the space that follows the colon in the log line).
        return self.outputs[chosen][2:]