# In [18]:
import pandas as pd
import fasttext
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

import string

# In [22]:
class nlp_module():
    """Namespace of helpers for a fastText + TF-IDF retrieval chatbot.

    A fastText classifier predicts a label for the user's text, the
    prompt/response table is filtered to that label, and the prompt sentence
    with the highest TF-IDF cosine similarity to the user's text selects the
    response.

    Every helper is a ``staticmethod``; call them as ``nlp_module.name(...)``
    (calling through an instance works as well). The prompt/response table
    must provide the columns ``label``, ``prompt`` and ``response``.
    """

    # Returned whenever no prompt can be matched to the user's text.
    _FALLBACK_RESPONSE = "I'm sorry, I do not understand you"

    # Translation table that deletes every ASCII punctuation character.
    # Built once here instead of on every LemNormalize call.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    @staticmethod
    def stopwords():
        """Return a new list of stop words (English, profanity, Singlish)."""
        words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                 'ourselves', 'you', "you're", "you've", "you'll",
                 "you'd", 'your', 'yours', 'yourself', 'yourselves',
                 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
                 'hers', 'herself', 'it', "it's", 'its', 'itself',
                 'they', 'them', 'their', 'theirs', 'themselves', 'what',
                 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
                 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
                 'being', 'have', 'has', 'had', 'having', 'do', 'does',
                 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
                 'or', 'because', 'as', 'until', 'while', 'of', 'at',
                 'by', 'for', 'with', 'about', 'against', 'between',
                 'into', 'through', 'during', 'before', 'after', 'above',
                 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
                 'off', 'over', 'under', 'again', 'further', 'then', 'once',
                 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                 'both', 'each', 'few', 'more', 'most', 'other', 'some',
                 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
                 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
                 'don', "don't", 'should', "should've", 'now', 'd', 'll',
                 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
                 "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                 "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
                 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
                 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
                 "weren't", 'won', "won't", 'wouldn', "wouldn't", 'fuck', 'fucker',
                 'lah', 'la', 'leh', 'lor', 'nah', 'ya', 'yah', 'shit', 'ass', 'asshole',
                 'le', 'already', 'liao', 'liaoz', 'u', 'cheebye', 'lanjiao',
                 'nabei', 'kaopei', 'knnb', 'cb', 'cheebye', 'fucked', 'fucks',
                 'bitch', 'bitches', 'scumbag', 'fuckface', 'wtf', 'ffs', 'siao',
                 'walao', 'waliao', 'ttyl', 'orhhhh', 'sai']

        return words

    @staticmethod
    def train_model(input, output_path="ft_model.bin"):
        """Train a supervised fastText classifier and save it to disk.

        Args:
            input: Training data argument forwarded unchanged to
                ``fasttext.train_supervised``. (The name shadows the builtin
                but is kept so keyword callers keep working.)
            output_path: Where the trained model is written. Defaults to
                ``"ft_model.bin"``, the path that used to be hard-coded.

        Returns:
            The trained fastText model.
        """
        model = fasttext.train_supervised(input)
        model.save_model(output_path)

        return model

    @staticmethod
    def TfidfVec(stopwords):
        """Build the TF-IDF vectorizer used to compare prompts and user text.

        Args:
            stopwords: Stop words handed to ``TfidfVectorizer``. Tokens are
                lemmatized by ``LemNormalize`` before stop-word filtering, so
                callers will usually want to pass the output of
                ``stopwords_process`` rather than the raw list.

        Returns:
            An unfitted ``TfidfVectorizer`` over unigrams and bigrams.
        """
        vectorizer = TfidfVectorizer(tokenizer=nlp_module.LemNormalize,
                                     stop_words=stopwords,
                                     ngram_range=(1, 2))

        return vectorizer

    @staticmethod
    def LemTokens(tokens):
        """Return the WordNet lemma of every token in ``tokens``."""
        lemmer = nltk.stem.WordNetLemmatizer()

        return [lemmer.lemmatize(token) for token in tokens]

    @staticmethod
    def LemNormalize(text):
        """Lower-case ``text``, strip punctuation, tokenize and lemmatize it.

        Returns:
            The list of lemmatized word tokens.
        """
        cleaned = text.lower().translate(nlp_module._PUNCT_TABLE)

        return nlp_module.LemTokens(nltk.word_tokenize(cleaned))

    @staticmethod
    def stopwords_process(stopwords):
        """Run the stop words through the same normalisation as documents.

        Args:
            stopwords: Iterable of stop-word strings.

        Returns:
            The tokens produced by ``LemNormalize`` for the stop words.
        """
        return nlp_module.LemNormalize(' '.join(stopwords))

    @staticmethod
    def load_model(model):
        """Load a fastText model from the path ``model``."""
        return fasttext.load_model(model)

    @staticmethod
    def fasttext_output(user_input, model):
        """Return the single top label ``model`` predicts for ``user_input``."""
        return model.predict(user_input, k=1)[0][0]

    @staticmethod
    def load_response_file(csv_file):
        """Read the prompt/response CSV at ``csv_file`` into a DataFrame."""
        return pd.read_csv(csv_file)

    @staticmethod
    def create_prompts_corpus(input_text, csv_file, model):
        """Build a text corpus from the prompts matching the predicted label.

        Args:
            input_text: The user's text, classified by ``model``.
            csv_file: Prompt/response DataFrame (despite the name, not a
                path) with ``label`` and ``prompt`` columns.
            model: Object exposing a fastText-style ``predict(text, k=1)``.

        Returns:
            One string holding every prompt whose label equals the predicted
            label, separated by single spaces, with a space inserted after
            each ``.``, ``?`` and ``!`` so sentence tokenization can split
            them. Empty when no prompt matches.
        """
        matched_label = nlp_module.fasttext_output(input_text, model)
        prompts = csv_file.loc[csv_file['label'] == matched_label, 'prompt']

        # Normalise each prompt exactly once. Re-running the replacements on
        # the whole accumulated corpus made the spacing grow every iteration,
        # and joining without a separator glued unpunctuated prompts together.
        pieces = [
            str(prompt).replace('.', '. ').replace('?', '? ').replace('!', '! ').strip()
            for prompt in prompts.dropna()
        ]

        return ' '.join(pieces)

    @staticmethod
    def get_response(input_text, csv_file, TfidfVec, model):
        """Return the stored response whose prompt best matches the input.

        Args:
            input_text: The user's text.
            csv_file: Path of the prompt/response CSV. An already loaded
                DataFrame is accepted too, which avoids re-reading the file
                on every call.
            TfidfVec: Vectorizer exposing ``fit_transform`` (see
                ``nlp_module.TfidfVec``); it is re-fitted on every call.
            model: Object exposing a fastText-style ``predict(text, k=1)``.

        Returns:
            The matched response as a string, or an apology message when no
            prompt is similar to the input or none can be traced back to a
            row of the table.
        """
        user_input = input_text.lower()

        if isinstance(csv_file, pd.DataFrame):
            response_db = csv_file
        else:
            response_db = nlp_module.load_response_file(csv_file)
        prompts_corpus = nlp_module.create_prompts_corpus(input_text, response_db, model)

        # Candidate sentences come from the label-filtered prompts only.
        candidates = nltk.sent_tokenize(prompts_corpus)
        if not candidates:
            # No prompt carries the predicted label: nothing to compare with.
            return nlp_module._FALLBACK_RESPONSE

        # The user's text is appended as the last document so that its row
        # can be compared against every candidate row.
        tfidf = TfidfVec.fit_transform(candidates + [user_input])
        similarities = cosine_similarity(tfidf[-1], tfidf).flatten()

        # Drop the last entry: it is the input compared with itself.
        scores = similarities[:-1]
        best = int(np.argmax(scores))
        if scores[best] == 0:
            return nlp_module._FALLBACK_RESPONSE

        sentence = candidates[best]
        prompts = response_db['prompt']

        # Prefer the row whose prompt is exactly the matched sentence, then
        # fall back to a prompt merely containing it (multi-sentence prompts).
        # The lookup is literal: prompts end in '?' and '.', which are regex
        # metacharacters.
        matches = response_db.loc[prompts == sentence, 'response']
        if matches.empty:
            contains = prompts.str.contains(sentence, regex=False, na=False)
            matches = response_db.loc[contains, 'response']
        if matches.empty:
            return nlp_module._FALLBACK_RESPONSE

        return str(matches.iloc[0])

    @staticmethod
    def predict_label(input_text, model):
        """Return the top predicted label together with its score.

        Args:
            input_text: The user's text, passed to the model unchanged.
            model: Object exposing a fastText-style ``predict(text, k=1)``.

        Returns:
            A one-element list of the form ``[(label, score)]``.
        """
        # Query the model once; the prediction holds both labels and scores.
        labels, scores = model.predict(input_text, k=1)[:2]

        return [(labels[0], scores[0])]
    
