In [1]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
import json
import pickle
import random
import numpy as np


In [2]:
# Accumulators filled in while walking the intents file:
#   words     - every token seen in any pattern (deduplicated later)
#   classes   - intent tags
#   documents - (token list, tag) pairs, one per pattern
words = []
classes = []
documents = []
# Tokens dropped from the vocabulary.
ignore_words = ['?', '!']

# Read through a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open('intents.json') as intents_file:
    data_file = intents_file.read()
intents = json.loads(data_file)

In [3]:
# Rebuild the accumulators from scratch so this cell is idempotent: without
# the reset, re-running it appends a second copy of every pattern to
# `documents` (and therefore duplicates every training row downstream).
words = []
classes = []
documents = []

for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # tokenize the pattern into individual words
        tokens = nltk.word_tokenize(pattern)
        words.extend(tokens)
        # each document pairs a pattern's tokens with the tag of its intent
        documents.append((tokens, tag))

        # register the tag the first time one of its patterns is seen
        if tag not in classes:
            classes.append(tag)

# lemmatize and lower-case each word, drop ignored tokens, remove duplicates
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
})
# sort classes
classes = sorted(set(classes))
# documents = combination between patterns and intents
print(len(documents), "documents")
# classes = intents
print(len(classes), "classes", classes)
# words = all words, vocabulary
print(len(words), "unique lemmatized words", words)

97 documents
12 classes ['entertainment_movies', 'entertainment_music', 'flight_assistant', 'goodbye', 'greeting', 'options', 'order_dessert', 'order_drinks', 'order_food', 'tellmejoke', 'thanks', 'weather_search']
119 unique lemmatized words ["'s", ',', 'a', 'again', 'album', 'all', 'am', 'amazing', 'anybody', 'anyone', 'are', 'assistant', 'at', 'available', 'beverage', 'bored', 'bot', 'boy', 'bro', 'bunch', 'bye', 'call', 'can', 'catch', 'cheer', 'cool', 'day', 'dessert', 'do', 'doubt', 'drink', 'eat', 'else', 'everybody', 'everything', 'farewell', 'flight', 'folk', 'food', 'for', 'give', 'good', 'goodbye', 'goodnight', 'great', 'have', 'hello', 'help', 'helpful', 'here', 'hey', 'hi', 'hola', 'how', 'howdy', 'i', 'in', 'is', 'it', 'item', 'joke', 'know', 'later', 'let', 'like', 'list', 'listen', 'long', 'me', 'menu', 'movie', 'much', 'music', "n't", 'navigate', 'need', 'nice', 'night', 'now', 'of', 'on', 'option', 'order', 'outside', 'play', 'please', 'possible', 'robot', 'see', 'sho

In [4]:

# Persist the vocabulary and the class list. The context managers guarantee
# each file is flushed and closed as soon as its dump completes.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)


In [5]:

# # create our training data
# training = []
# # create an empty array for our output
# output_empty = [0] * len(classes)
# # training set, bag of words for each sentence
# for doc in documents:
#     # initialize our bag of words
#     bag = []
#     # list of tokenized words for the pattern
#     print(doc[0])
#     pattern_words = doc[0]
#     # lemmatize each word - create base word, in attempt to represent related words
#     pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]
#     # create our bag of words array with 1, if word match found in current pattern
#     for w in words:
#         bag.append(1) if w in pattern_words else bag.append(0)
    
#     # output is a '0' for each tag and '1' for current tag (for each pattern)
#     output_row = list(output_empty)
# #     print(output_row)
#     output_row[classes.index(doc[1])] = 1
# #     print(classes.index(doc[1]))
# #     print(output_row)
#     training.append([bag, output_row])
    

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataTransformer(BaseEstimator, TransformerMixin):
    """
    Encode tagged, tokenized documents as bag-of-words / one-hot label rows.

    Parameters
    ----------
    words : sequence
        Vocabulary; entry ``i`` of every bag corresponds to ``words[i]``.
    classes : sequence
        Known tags; entry ``j`` of every label row corresponds to
        ``classes[j]``. Must support ``.index(tag)``.
    """
    def __init__(self, words , classes):
        self.words = words
        self.lemmatizer = WordNetLemmatizer()
        self.classes = classes

    def transform(self, docs, *_):
        """
        Convert ``docs`` into an object array of ``[bag, output_row]`` rows.

        Each element of ``docs`` is indexed as ``doc[0]`` (the pattern's
        tokens) and ``doc[1]`` (its tag). Tokens are lower-cased and
        lemmatized before being matched against ``self.words``.

        NOTE: the rows are shuffled with ``random.shuffle`` before being
        returned, so the output order does not follow the order of ``docs``
        and differs between calls unless ``random`` is seeded.

        ``self.classes.index`` is called with each tag, so a tag missing from
        ``self.classes`` propagates whatever that call raises (``ValueError``
        when ``classes`` is a list).
        """
        # template for the one-hot label: one zero per known class
        output_empty = [0] * len(self.classes)
        transformed_data = []
        for doc in docs:
            tokens, tag = doc[0], doc[1]
            # Use the instance's own lemmatizer rather than a notebook-level
            # global, so a deserialized transformer works in any process.
            # A set makes the per-vocabulary-word membership test O(1).
            pattern_words = {
                self.lemmatizer.lemmatize(word.lower()) for word in tokens
            }
            # bag of words: 1 where the vocabulary word occurs in the pattern
            bag = [1 if w in pattern_words else 0 for w in self.words]

            # output is a '0' for each tag and '1' for current tag
            output_row = list(output_empty)
            output_row[self.classes.index(tag)] = 1

            transformed_data.append([bag, output_row])

        random.shuffle(transformed_data)
        return np.array(transformed_data, dtype=object)

    def fit(self, *_):
        """Nothing is learned from data; return ``self`` unchanged."""
        return self

In [8]:
from sklearn.pipeline import Pipeline

# Configure the transformer with the vocabulary and tag list, then encode
# every (tokens, tag) document into a [bag, output_row] row.
datapreparer = DataTransformer(classes=classes, words=words)
training = datapreparer.transform(documents)
# Show one encoded row as a sanity check.
print(training[0])




[list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])]


In [9]:

# Split the encoded rows column-wise into training lists:
# X - bag-of-words patterns (column 0), Y - one-hot intents (column 1)
train_x, train_y = (list(training[:, column]) for column in (0, 1))
print("Training data created")


Training data created


In [10]:

# # save all of our data structures
# print("Saving training data")
# pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data.parquet", "wb" ) )


In [11]:
# Wrap the already-configured transformer in a single-step pipeline so it can
# be serialized and reused as one object.
dataprep_pipeline = Pipeline(steps=[('dataprep', datapreparer)])

# DataTransformer.fit just returns self; the call is left as the cell's last
# expression so the notebook displays the fitted pipeline.
dataprep_pipeline.fit(documents)



Pipeline(memory=None,
         steps=[('dataprep',
                 DataTransformer(classes=['entertainment_movies',
                                          'entertainment_music',
                                          'flight_assistant', 'goodbye',
                                          'greeting', 'options',
                                          'order_dessert', 'order_drinks',
                                          'order_food', 'tellmejoke', 'thanks',
                                          'weather_search'],
                                 words=["'s", ',', 'a', 'again', 'album', 'all',
                                        'am', 'amazing', 'anybody', 'anyone',
                                        'are', 'assistant', 'at', 'available',
                                        'beverage', 'bored', 'bot', 'boy',
                                        'bro', 'bunch', 'bye', 'call', 'can',
                                        'catch', 'cheer', 'cool', 'day',


In [12]:
from mlworkflows import util
# Hand the fitted data-preparation pipeline to the project helper together
# with the target name "chatbot_data_preparation.sav".
# NOTE(review): presumably `util.serialize_to` writes the object to that path;
# the helper's implementation is not visible here - confirm in mlworkflows.
util.serialize_to(dataprep_pipeline, "chatbot_data_preparation.sav")