In [3]:
#!pip install tensorflow

Proyecto copiado de: https://towardsdatascience.com/how-to-build-your-own-chatbot-using-deep-learning-bb41f970e281

In [1]:
import json 
import numpy as np 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
#keras preprocesamiento
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

Lee el json y crea los conjuntos de datos para entrenar el modelo

In [2]:

# Load the intents definition. JSON text is UTF-8, so the encoding is set
# explicitly instead of relying on the platform's default locale encoding.
with open('../testFiles/intents.json', encoding='utf-8') as file:
    data = json.load(file)

training_sentences = []  # one entry per example pattern
training_labels = []     # tag of the intent each pattern belongs to (parallel to training_sentences)
labels = []              # unique intent tags, in order of first appearance
responses = []           # one list of responses per intent entry

for intent in data['intents']:
    tag = intent['tag']
    patterns = intent['patterns']
    training_sentences.extend(patterns)
    training_labels.extend([tag] * len(patterns))
    # NOTE: appended once per intent entry, while `labels` is de-duplicated;
    # the two lists only stay aligned if every tag appears once in the file.
    responses.append(intent['responses'])

    if tag not in labels:
        labels.append(tag)

# Number of distinct intent tags found in the file.
num_classes = len(labels)

In [3]:
# Sanity check: print the size of each collection built from the intents file,
# then show the response lists as the cell's output.
for collection in (training_sentences, training_labels, labels, responses):
    print(len(collection))
responses

33
33
8
8


[['Hello', 'Hi', 'Hi there'],
 ['See you later', 'Have a nice day', 'Bye! Come back again'],
 ['Happy to help!', 'Any time!', 'My pleasure', "You're most welcome!"],
 ['I.m Joana, your bot assistant', "I'm Joana, an Artificial Intelligent bot"],
 ['You can call me Joana.', "I'm Joana!", 'Just call me as Joana'],
 ['Tell me how can assist you',
  'Tell me your problem to assist you',
  'Yes Sure, How can I support you'],
 ['You can just easily create a new account from our web site',
  'Just go to our web site and follow the guidelines to create a new account'],
 ['Please provide us your complaint in order to assist you',
  'Please mention your complaint, we will reach you and sorry for any inconvenience caused']]

In [5]:
# Encode the string intent tags as integer class ids.
# NOTE: `training_labels` is rebound from tag strings to an integer array here,
# so re-running this cell on its own would fit the encoder on the integer ids.
lbl_encoder = LabelEncoder()
training_labels = lbl_encoder.fit_transform(training_labels)
training_labels

array([4, 4, 4, 4, 4, 3, 3, 3, 7, 7, 7, 7, 0, 0, 0, 6, 6, 6, 5, 5, 5, 5,
       5, 5, 5, 2, 2, 2, 2, 2, 1, 1, 1])

In [6]:
# Round-trip check: map the encoded integer ids back to their original tag strings.
lbl_encoder.inverse_transform(training_labels)

array(['greeting', 'greeting', 'greeting', 'greeting', 'greeting',
       'goodbye', 'goodbye', 'goodbye', 'thanks', 'thanks', 'thanks',
       'thanks', 'about', 'about', 'about', 'name', 'name', 'name',
       'help', 'help', 'help', 'help', 'help', 'help', 'help',
       'createaccount', 'createaccount', 'createaccount', 'createaccount',
       'createaccount', 'complaint', 'complaint', 'complaint'],
      dtype='<U13')

Tokenización de frases textuales con Tokenizer de keras

In [7]:
# Display the raw training sentences collected from the intents file.
training_sentences

['Hi',
 'Hey',
 'Is anyone there?',
 'Hello',
 'Hay',
 'Bye',
 'See you later',
 'Goodbye',
 'Thanks',
 'Thank you',
 "That's helpful",
 'Thanks for the help',
 'Who are you?',
 'What are you?',
 'Who you are?',
 'what is your name',
 'what should I call you',
 'whats your name?',
 'Could you help me?',
 'give me a hand please',
 'Can you help?',
 'What can you do for me?',
 'I need a support',
 'I need a help',
 'support me please',
 'I need to create a new account',
 'how to open a new account',
 'I want to create an account',
 'can you create an account for me',
 'how to open a new account',
 'have a complaint',
 'I want to raise a complaint',
 'there is a complaint about a service']

Tratamiento de los textos para alimentar la red neuronal. Sería interesante probar una lematización en la tokenización.

In [12]:
# Text-preprocessing settings.
vocab_size = 1000    # vocabulary size handed to the Tokenizer (num_words)
embedding_dim = 16
max_len = 20         # length of the vector that represents each sentence
oov_token = "<OOV>"  # marker used for words that fall outside the vocabulary

# Build the vocabulary from the words of the training sentences.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
# Other tokenizer attributes that can be printed for inspection:
#   tokenizer.word_counts     - how many times each word is repeated
#   tokenizer.document_count  - number of documents (sentences)
#   tokenizer.word_docs       - per-word document information
print(word_index)  # word -> index mapping

# Turn every sentence into a list of word indices ...
sequences = tokenizer.texts_to_sequences(training_sentences)
# ... and bring all of those lists to the same length.
padded_sequences = pad_sequences(sequences, maxlen=max_len, truncating='post')


{'<OOV>': 1, 'you': 2, 'a': 3, 'i': 4, 'me': 5, 'to': 6, 'account': 7, 'help': 8, 'what': 9, 'is': 10, 'for': 11, 'are': 12, 'can': 13, 'need': 14, 'create': 15, 'new': 16, 'complaint': 17, 'there': 18, 'thanks': 19, 'who': 20, 'your': 21, 'name': 22, 'please': 23, 'support': 24, 'how': 25, 'open': 26, 'want': 27, 'an': 28, 'hi': 29, 'hey': 30, 'anyone': 31, 'hello': 32, 'hay': 33, 'bye': 34, 'see': 35, 'later': 36, 'goodbye': 37, 'thank': 38, "that's": 39, 'helpful': 40, 'the': 41, 'should': 42, 'call': 43, 'whats': 44, 'could': 45, 'give': 46, 'hand': 47, 'do': 48, 'have': 49, 'raise': 50, 'about': 51, 'service': 52}


In [13]:
sequences

[[29],
 [30],
 [10, 31, 18],
 [32],
 [33],
 [34],
 [35, 2, 36],
 [37],
 [19],
 [38, 2],
 [39, 40],
 [19, 11, 41, 8],
 [20, 12, 2],
 [9, 12, 2],
 [20, 2, 12],
 [9, 10, 21, 22],
 [9, 42, 4, 43, 2],
 [44, 21, 22],
 [45, 2, 8, 5],
 [46, 5, 3, 47, 23],
 [13, 2, 8],
 [9, 13, 2, 48, 11, 5],
 [4, 14, 3, 24],
 [4, 14, 3, 8],
 [24, 5, 23],
 [4, 14, 6, 15, 3, 16, 7],
 [25, 6, 26, 3, 16, 7],
 [4, 27, 6, 15, 28, 7],
 [13, 2, 15, 28, 7, 11, 5],
 [25, 6, 26, 3, 16, 7],
 [49, 3, 17],
 [4, 27, 6, 50, 3, 17],
 [18, 10, 3, 17, 51, 3, 52]]

Creación de la red neuronal: modelo Sequential

In [40]:
# Intent classifier: embed the token ids, average the embeddings over the
# sequence, then two small hidden layers and a softmax over the intent classes.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 16)            16000     
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 136       
Total params: 16,680
Trainable params: 16,680
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Number of times the whole training set is passed through the network.
# Tip: keep increasing the number of epochs until the accuracy on validation
# data (not on the training data) starts to decrease.
epochs = 500
history = model.fit(
    x=padded_sequences,
    y=np.array(training_labels),
    epochs=epochs,
)


Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch