## Importamos las librerías necesarias

In [18]:
import nltk

# Fetch the NLTK resources used further down in this notebook:
#   - 'punkt'   : sentence/word tokenizer models used by nltk.word_tokenize
#   - 'wordnet' : corpus required by WordNetLemmatizer.lemmatize
# nltk.download is a no-op (apart from a log line) when a package is already up to date.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joaquin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# NLP tooling: nltk for tokenization, WordNetLemmatizer to reduce words to their lemma.
import nltk
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance used when building the vocabulary.
lemmatizer = WordNetLemmatizer()
# json is used to parse the intents file.
import json
# NOTE(review): pickle is not referenced in the cells visible here — confirm whether it is still needed.
import pickle
import warnings
# Silences every warning category for the rest of the session; this also hides deprecation notices.
warnings.filterwarnings('ignore')

In [20]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random

## Hacemos un preprocesamiento de los datos.
Para ello primero los traemos, recordemos que están en formato json.

In [21]:
# Read the intents file; the context manager guarantees the handle is closed,
# and the explicit encoding avoids depending on the platform default.
with open('./db/starwarsintents.json', encoding='utf-8') as intents_file:
    f = intents_file.read()  # raw JSON text
data = json.loads(f)  # parsed intents (a dict with an 'intents' list, as used below)

Los datos se encuentran en un json donde, para cada 'tag', tenemos distintos patrones (entradas de usuario) y respuestas (las contestaciones del chatbot).

In [127]:
# Display one intent entry (index 3) to illustrate the tag / patterns / responses structure.
data['intents'][3]

{'tag': 'tasks',
 'patterns': ['What can you do?',
  'What are your features?',
  'What are you abilities.',
  'Can you sing.',
  'Can you talk.'],
 'responses': ['I can do whatever you asks me to do',
  'I can talk and do things for you',
  "Right now i'm in developing stage as soon i'm developed, I can do everything"]}

In [23]:
# Punctuation tokens that are excluded when the vocabulary is built.
skip_elements = list('?!.,:;')

Creo una bolsa de palabras a partir de la tokenización de los patrones.

In [24]:
# Tokenize every pattern of every intent and collect all tokens in a single flat list
# (duplicates and punctuation are still present at this stage).
words = [
    token
    for element in data['intents']
    for pattern in element['patterns']
    for token in nltk.word_tokenize(pattern)
]

Ahora tomo esa bolsa de palabras y creo a partir de ella una nueva: descarto los signos de puntuación, paso las palabras a minúscula y las lematizo; luego elimino las repeticiones y las ordeno.

In [25]:
# Drop punctuation tokens, then lowercase and lemmatize what remains.
# The punctuation check is done on the raw token, before lowercasing.
bag_of_words = []
for token in words:
    if token in skip_elements:
        continue
    bag_of_words.append(lemmatizer.lemmatize(token.lower()))

In [26]:
# Remove duplicates and fix a deterministic (alphabetical) order, since the
# position of each word defines its index in the feature vectors.
bag_of_words = sorted(set(bag_of_words))

Creo una lista con los tags.

In [27]:
# One tag per intent, in the same order as they appear in the intents file.
tags = [intent['tag'] for intent in data['intents']]
    
    

Creo una lista con las palabras y los tags correspondientes.

In [28]:
# Build the labelled corpus: one (tokens, tag) tuple per pattern.
documents = []
for element in data['intents']:
    for pattern in element['patterns']:
        # Tokens are kept as produced by the tokenizer (original casing, punctuation included).
        w = nltk.word_tokenize(pattern)
        documents.append((w,element['tag']))

Creo una lista de entrenamiento para luego darle de comer al modelo

In [29]:
# Build the training set: one [bag-of-words vector, one-hot tag vector] pair per pattern.
training = []
output_empty = [0] * len(data['intents'])  # template for the one-hot label row
for doc in documents:
    # doc is a (tokens, tag) tuple; tokens are lowercased before matching.
    # NOTE(review): tokens are lowercased here but not lemmatized, while
    # bag_of_words holds lemmatized words, so inflected forms may never match
    # their vocabulary entry — confirm whether lemmatizing here (and in the
    # prediction helper) is intended.
    tokens = [x.lower() for x in doc[0]]
    # 1 where the vocabulary word occurs in the pattern, 0 otherwise.
    bag = [1 if word in tokens else 0 for word in bag_of_words]
    output_row = list(output_empty)
    output_row[tags.index(doc[1])] = 1
    training.append([bag, output_row])
random.shuffle(training)
# Each row pairs two lists of different lengths (vocabulary size vs. number of
# tags), i.e. the nested structure is ragged. dtype=object must be requested
# explicitly; without it recent NumPy versions raise ValueError.
training = np.array(training, dtype=object)


In [30]:
# Preview only the first few [bag, one-hot] pairs instead of dumping the whole array.
training[:3]

array([[list([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
        list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Separo el conjunto de entrenamiento en la lista de entradas (train_x) y la lista de etiquetas (train_y).


In [31]:
# Column 0 of each training pair is the bag-of-words input, column 1 the one-hot label.
train_x = [pair[0] for pair in training]
train_y = [pair[1] for pair in training]

## Preparo el modelo

In [32]:
# Feed-forward classifier with three Dense layers:
#   128 units (ReLU) -> 64 units (ReLU) -> one unit per intent (softmax),
# with 50% dropout after each hidden layer.
n_features = len(train_x[0])  # length of a bag-of-words vector
n_classes = len(train_y[0])   # length of a one-hot label row
model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])
# Show the freshly initialised kernel of the first layer.
print("First layer:",model.layers[0].get_weights()[0])

First layer: [[ 0.06618883  0.0586962  -0.1256366  ... -0.02513002  0.01199178
  -0.13619895]
 [ 0.14738819 -0.04714413  0.06392056 ...  0.08581948 -0.08082394
   0.10645643]
 [-0.09994102 -0.08674811  0.08178821 ...  0.11087489  0.01244067
   0.1239036 ]
 ...
 [ 0.02485116  0.01140693  0.14723945 ... -0.12837434 -0.04792879
   0.05782595]
 [ 0.03944863  0.09105919 -0.01705976 ... -0.05189411 -0.07479187
  -0.10756356]
 [-0.05626424 -0.09292477  0.08459689 ... -0.02278052  0.07085401
   0.12703684]]


In [33]:
# Compile the model with the Adam optimizer, categorical cross-entropy loss and
# accuracy as the reported metric.
# The SGD / Nesterov configuration below is commented out and is not used:
# sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
# Train the model, then persist it to disk.
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
# Only the file path is passed to save(): a second positional argument would be
# interpreted as the `overwrite` flag, so the training History does not belong there.
model.save('chatbot_model.h5')

print("model created")


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [106]:
# Helper that tokenizes a sentence, plus a sample sentence to try it on.
frase  = 'Hi, how are you?'
def tkz(frase):
    """Split ``frase`` into tokens with nltk.word_tokenize and lowercase each one.

    Returns a list of lowercase token strings; punctuation tokens are kept.
    """
    return [token.lower() for token in nltk.word_tokenize(frase)]

In [110]:
# Tokenize the sample sentence in this cell so the displayed value does not
# depend on an assignment made in a cell that appears later in the notebook.
w = tkz(frase)
w

['hi', ',', 'how', 'are', 'you', '?']

In [109]:
# Tokenize the sample sentence defined alongside tkz.
w = tkz(frase)

In [81]:
# Position of the word 'hi' in the sorted vocabulary, i.e. its index in a feature vector.
bag_of_words.index('hi')

49

In [111]:
# Build the bag-of-words input vector for the model from a raw sentence.
def get_array_for_predict(frase,bag_of_words):
    """Encode ``frase`` as a 0/1 vector aligned with ``bag_of_words``.

    Args:
        frase: Raw input sentence; it is tokenized and lowercased with ``tkz``.
        bag_of_words: Ordered vocabulary; position i of the result refers to
            ``bag_of_words[i]``.

    Returns:
        A NumPy array with 1 where the vocabulary word occurs among the
        sentence tokens and 0 elsewhere.
    """
    # NOTE(review): tokens are lowercased by tkz but not lemmatized, while the
    # vocabulary cell lemmatizes its words — confirm this is intended.
    tokens = set(tkz(frase))
    return np.array([1 if vocab_word in tokens else 0 for vocab_word in bag_of_words])
    

In [113]:
# Feature vector for a sample question (the helper already returns a NumPy array).
get_array_for_predict('How are you?', bag_of_words)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [119]:
# Add a leading batch axis (one row), run the model and show the scores for that single row.
model.predict(get_array_for_predict('How are you?', bag_of_words)[np.newaxis, :])[0]



array([9.9994445e-01, 1.2798457e-05, 4.3209684e-06, 1.1795585e-05,
       2.0604342e-05, 3.5615133e-07, 6.8539769e-08, 5.3866813e-07,
       2.0394921e-09, 1.2808358e-07, 1.4469552e-10, 1.7729567e-06,
       2.1783005e-06, 5.9193570e-07, 2.2104470e-07, 1.1197483e-07],
      dtype=float32)

In [114]:
def predict_class(frase, model, error_threshold=0.25):
    """Predict the intent tags that match a sentence.

    Args:
        frase: Raw input sentence.
        model: Object whose ``predict`` method accepts a 2-D array with one
            bag-of-words row per sentence and returns one row of scores each.
        error_threshold: A class is reported only when its score is strictly
            greater than this value. Defaults to 0.25, the value that was
            previously hard-coded.

    Returns:
        A list of ``{"tag": <str>, "probability": <str>}`` dicts ordered from
        the highest to the lowest score. The probability is stored as a
        string. The list is empty when no score exceeds the threshold.
    """
    features = get_array_for_predict(frase, bag_of_words)

    # The model expects a batch, so wrap the single vector and take row 0 back out.
    scores = model.predict(np.array([features]))[0]

    # Keep (class index, score) pairs above the threshold, strongest first.
    results = [[i, r] for i, r in enumerate(scores) if r > error_threshold]
    results.sort(key=lambda x: x[1], reverse=True)

    return [{"tag": tags[i], "probability": str(r)} for i, r in results]

In [171]:
# Build the chatbot reply by concatenating one response per predicted intent.
def response(sentence):
    """Return the chatbot reply for ``sentence``.

    Every intent returned by ``predict_class`` contributes one response chosen
    at random from that intent's 'responses' list, in prediction order. The
    pieces are joined with a single space so consecutive sentences do not run
    together. An empty string is returned when no intent is predicted.
    """
    predictions = predict_class(sentence, model)

    # Map each tag to its responses once; setdefault keeps the first intent
    # when a tag is repeated, matching a first-match scan of the intents list.
    responses_by_tag = {}
    for intent in data['intents']:
        responses_by_tag.setdefault(intent['tag'], intent['responses'])

    replies = [
        random.choice(responses_by_tag[prediction['tag']])
        for prediction in predictions
        if prediction['tag'] in responses_by_tag
    ]
    return ' '.join(replies)



In [173]:
# Start the chatbot. Type 'quit', 'exit' or 'bye' to stop.

while True:
    query = input('Enter Message:')
    if query in ['quit','exit','bye']:
        break
    try:
        res = response(query)
        print(res)
    except Exception:
        # Best-effort reply: any failure while answering is reported to the
        # user as a hint to rephrase. Catching Exception (rather than using a
        # bare except) lets KeyboardInterrupt / SystemExit stop the loop.
        print("You may need to rephrase your question.")



Thans does not Hello Dear
Any time!Yes, I am here.
Glad to help!Listening carefully.
