# This Notebook Contains:
### 1.  Pre-processing of data. 
### 2.  Filter data based on categories
### 3.  Including one-hot-encoding
### 4.  Prepare training data
### 5.  Build NLP model
### 6.  Save all intermediate results and model checkpoints

### Including python libraries

In [1]:
import pandas as pd

# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
stemmer = LancasterStemmer()

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random

# additional things
import json
import os.path
import pickle
import re
import string
from collections import defaultdict

# download required data for data-preprocessing
nltk.download('punkt')
nltk.download('stopwords')

curses is not supported on this machine (please install/reinstall curses for an optimal experience)
Instructions for updating:
Colocations handled automatically by placer.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Process excel files and save in binary format for future use

In [2]:
def process_raw_excel_data(file_name, number_of_sheets):
    """Load the question/answer sheets of an Excel workbook into one DataFrame.

    The result is cached as a pickle in ``Processed_Data``; when that cache
    already exists it is returned directly and the workbook is not opened.

    Parameters
    ----------
    file_name : str
        Path of the workbook to read.
    number_of_sheets : int
        Number of sheets to read; they are looked up by the names
        ``'Table 1'`` ... ``'Table <number_of_sheets>'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Q'`` and ``'A'``, one row per sheet row that splits into
        exactly one question and one answer.

    Raises
    ------
    FileNotFoundError
        If no cache exists and ``file_name`` is missing.
    """
    cache_path = 'Processed_Data/QandA_Excel_full_data.pkl'

    # check if file is already saved in pickle format
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    # fail early with a clear error instead of printing and carrying on
    if not os.path.exists(file_name):
        raise FileNotFoundError('Excel file is not available: %s' % file_name)

    # collect plain records and build the frame once: growing a DataFrame row
    # by row is quadratic, and DataFrame.append no longer exists in pandas 2.x
    records = []
    with pd.ExcelFile(file_name) as xls:
        for sheet_number in range(1, number_of_sheets + 1):
            df_temp = pd.read_excel(xls, 'Table ' + str(sheet_number))
            if df_temp.empty:
                continue
            # glue the non-empty cells of each row together with a wide gap ...
            rows = df_temp.apply(lambda x: '      '.join(x.dropna().astype(str)), axis=1)
            for row_text in rows:
                # ... then split on double spaces, dropping empty pieces and
                # purely numeric cells
                q_and_a = [s.strip() for s in row_text.split('  ') if s and not s.isnumeric()]

                # check question and answer both exist
                if len(q_and_a) == 2:
                    records.append({'Q': q_and_a[0], 'A': q_and_a[1]})

    final_df = pd.DataFrame(records, columns=['Q', 'A'])

    # save the processed data (create the output folder on first use)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    final_df.to_pickle(cache_path)
    return final_df

### Convert Excel data into a model-compatible format (dictionary) and save it

In [3]:
# generate intent dictionary from excel file
def generate_intent_dictionary(qa_df=None):
    """Build (or load) the intents dictionary used for training.

    When ``Processed_Data/intents.json`` exists it is loaded and returned.
    Otherwise every question is lower-cased and filed under each category
    whose text occurs in it as a substring (one question can therefore land
    in several categories), the result is written to that JSON file and
    returned.

    Parameters
    ----------
    qa_df : pandas.DataFrame, optional
        Frame with ``'Q'`` and ``'A'`` columns.  When omitted, the frame is
        obtained from ``process_raw_excel_data('QandA.xlsx', 78)`` so the
        function no longer depends on a global defined in a later cell.

    Returns
    -------
    dict
        ``{'intents': [{'tag', 'patterns', 'responses', 'context_set',
        'context_filter'}, ...]}`` with one entry per category.
    """
    intents_path = 'Processed_Data/intents.json'

    # check if file is already saved in json format
    if os.path.exists(intents_path):
        with open(intents_path, 'r') as fp:
            return json.load(fp)

    if qa_df is None:
        qa_df = process_raw_excel_data('QandA.xlsx', 78)

    # define categories/classes for sentences
    classes = ['what', 'how many', 'where', 'when', 'how much', 'how', 'who', 'which', 'whose']

    patterns = defaultdict(list)
    responses = defaultdict(list)
    for question, answer in zip(qa_df['Q'], qa_df['A']):
        q_text = question.lower()
        for category in classes:
            if category in q_text:
                patterns[category].append(q_text)
                responses[category].append(answer)

    intents = {
        'intents': [
            {
                'tag': category,
                'patterns': patterns[category],
                'responses': responses[category],
                'context_set': '',
                'context_filter': '',
            }
            for category in classes
        ]
    }

    # save json file for model training (create the output folder on first use)
    os.makedirs(os.path.dirname(intents_path), exist_ok=True)
    with open(intents_path, 'w') as fp:
        json.dump(intents, fp)

    # the freshly built dictionary must be returned too, otherwise the first
    # run hands None to the caller
    return intents
    

In [4]:
# function for simple pre-processing
def do_simple_data_preprocessing(text):
    """Normalise a sentence before tokenisation.

    Lower-cases ``text``, deletes every run of digits, deletes every character
    that is neither a word character nor whitespace, and finally trims leading
    and trailing whitespace.  Inner whitespace is left as it is.

    Returns the cleaned string.
    """
    cleaned = text.lower()
    # first digits, then anything that is not a word character or whitespace
    for unwanted in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(unwanted, '', cleaned)
    return cleaned.strip()

### Create a set of Words, Classes and Documents 

In [5]:
def create_bow_docs(intents):
    """Derive vocabulary, class list and documents from the intents dictionary.

    Every pattern of every intent is cleaned with
    ``do_simple_data_preprocessing`` and tokenised with ``nltk.word_tokenize``.

    Returns a tuple ``(words, classes, documents)``:

    * ``words``     - sorted, de-duplicated stems of all tokens that are
      neither in ``ignore_words`` nor English stop words;
    * ``classes``   - sorted tags of the intents that have at least one pattern;
    * ``documents`` - one ``(tokens, tag)`` pair per pattern.  Tokens equal to
      a tag that is already registered in ``classes`` are left out; a tag is
      registered only after the first pattern of its intent has been handled.
    """
    ignore_words = ['?']
    stop_words = set(stopwords.words('english'))

    all_tokens = []
    classes = []
    documents = []
    for intent in intents['intents']:
        tag = intent['tag']
        for raw_pattern in intent['patterns']:
            tokens = nltk.word_tokenize(do_simple_data_preprocessing(raw_pattern))
            all_tokens += tokens
            # filter against the tags seen so far, *before* registering this one
            documents.append(([token for token in tokens if token not in classes], tag))
            if tag not in classes:
                classes.append(tag)

    # stem the surviving tokens; the set takes care of duplicates
    vocabulary = {
        stemmer.stem(token.lower())
        for token in all_tokens
        if not (token in ignore_words or token in stop_words)
    }
    words = sorted(vocabulary)
    classes = sorted(set(classes))

    return (words, classes, documents)

### Creating Training Data 

In [6]:
# seed for the shuffle below so that Restart & Run All yields the same order
SHUFFLE_SEED = 42

# Load the Q&A frame before anything else: generate_intent_dictionary() reads
# the global `excel_df` whenever Processed_Data/intents.json does not exist yet
excel_df = process_raw_excel_data('QandA.xlsx', 78)

# get intent data dictionary
intents = generate_intent_dictionary()

# make bow, classes, documents
(words, classes, documents) = create_bow_docs(intents)

# create our training data: one [bag, output_row] pair per document
training = []
# template for the one-hot output rows
output_empty = [0] * len(classes)

for pattern_tokens, tag in documents:
    # stem each word of the pattern; a set keeps the membership test cheap
    stemmed_tokens = {stemmer.stem(token.lower()) for token in pattern_tokens}
    # bag of words: 1 where the vocabulary word occurs in the pattern, else 0
    bag = [1 if vocab_word in stemmed_tokens else 0 for vocab_word in words]

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# shuffle our features
random.seed(SHUFFLE_SEED)
random.shuffle(training)

# Split into features and labels directly from the list.  Going through
# np.array(training) fails on NumPy >= 1.24 because bag and output_row have
# different lengths (ragged input), so `training` stays a plain list.
train_x = [pair[0] for pair in training]
train_y = [pair[1] for pair in training]

### Build a NLP model and save it. 

In [7]:
# reset underlying graph data, so re-running this cell starts from an empty default graph
tf.reset_default_graph()
# Build neural network:
#   input  - one feature per entry of a training bag (len(train_x[0]))
#   hidden - two fully connected layers with 8 units each
#   output - softmax layer with one unit per entry of an output row (len(train_y[0]))
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
# regression layer with tflearn's default settings (the training log reports Adam)
net = tflearn.regression(net)

# Define model and setup tensorboard (logs go to the 'tflearn_logs' directory)
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training: 50 epochs, mini-batches of 8, metric printed while training
model.fit(train_x, train_y, n_epoch=50, batch_size=8, show_metric=True)
# save the trained model under the name 'JAY_model.tflearn'
model.save('JAY_model.tflearn')

Training Step: 43399  | total loss: [1m[32m0.04204[0m[0m | time: 4.102s
| Adam | epoch: 050 | loss: 0.04204 - acc: 0.9813 -- iter: 6936/6941
Training Step: 43400  | total loss: [1m[32m0.03817[0m[0m | time: 4.106s
| Adam | epoch: 050 | loss: 0.03817 - acc: 0.9832 -- iter: 6941/6941
--
INFO:tensorflow:C:\Users\LENOVO\Documents\Jay_jupyter_notebooks\Simple-Chatbot-NLTK-Tensorflow\JAY_model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


### Data formatting for checking model

In [8]:
def clean_up_sentence(sentence):
    """Turn a raw sentence into the list of stemmed tokens used for lookup.

    The sentence first goes through ``do_simple_data_preprocessing`` - the
    same cleaning (lower-casing, digit and punctuation removal) that
    ``create_bow_docs`` applies to every training pattern - so that a
    question such as "Spock's blood?" yields the same tokens at prediction
    time as it did when the vocabulary was built.

    Returns a list of stemmed, lower-cased tokens (empty for an empty
    sentence).
    """
    # same normalisation as the training patterns
    cleaned = do_simple_data_preprocessing(sentence)
    # tokenize the pattern
    tokens = nltk.word_tokenize(cleaned)
    # stem each word
    return [stemmer.stem(token.lower()) for token in tokens]

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    """Encode ``sentence`` as a 0/1 vector over the vocabulary ``words``.

    Parameters
    ----------
    sentence : str
        Raw sentence; it is tokenised and stemmed by ``clean_up_sentence``.
    words : list of str
        Vocabulary; position ``i`` of the result belongs to ``words[i]``.
    show_details : bool, optional
        When true, print a line for every vocabulary hit.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(words)`` holding 1 where the vocabulary word
        occurs among the sentence tokens and 0 elsewhere.
    """
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)

    # word -> positions in the vocabulary, built once per call, so each token
    # costs one lookup instead of a scan over the whole vocabulary
    positions = {}
    for index, vocab_word in enumerate(words):
        positions.setdefault(vocab_word, []).append(index)

    # bag of words
    bag = [0] * len(words)
    for token in sentence_words:
        for index in positions.get(token, ()):
            bag[index] = 1
            if show_details:
                print ("found in bag: %s" % words[index])

    return(np.array(bag))

### process excel data and include one-hot-encoding  

In [9]:
# load the question/answer DataFrame (from the pickle cache or the workbook)
excel_df = process_raw_excel_data('QandA.xlsx', 78)

# one bag-of-words vector per question, stored in an extra column
excel_df['binary_encoding'] = excel_df['Q'].apply(lambda question: bow(question, words))

# keep a copy of the frame including the encoding column
excel_df.to_pickle("Processed_Data/QandA_Excel_full_data_with_binaryCode.pkl")

# first five rows as a sample
excel_df.head(5)

Unnamed: 0,Q,A,binary_encoding
0,How many rings on the Olympic flag,Five,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,What colour is vermilion a shade of,Red,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,King Zog ruled which country,Albania,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,What colour is Spock's blood,Green,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Where in your body is your patella,Knee ( it's the kneecap ),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Checking model and data with a sample 

In [10]:
# Smoke test: encode one question with the training vocabulary and ask the model for its prediction
p = bow("Proportionately which creature has the largest brain", words)
# the 0/1 bag-of-words vector of the question
print (p)
# the class tags; training labels were one-hot encoded by position in this list
print (classes)
# model output for the single-sample batch [p]
print (model.predict([p]))

[0 0 0 ... 0 0 0]
['how', 'how many', 'what', 'when', 'where', 'which', 'who', 'whose']
[[3.7563237e-09 2.5984709e-14 2.1792453e-05 6.2759931e-08 1.4867381e-15
  9.9997425e-01 3.9202951e-06 4.3642521e-15]]


### Save whole training data 

In [11]:
# save all of our data structures
training_data = {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y}
# the context manager closes (and flushes) the file once the dump is done
with open("Processed_Data/Jay_training_data", "wb") as training_file:
    pickle.dump(training_data, training_file)