## Instalamos las librerías necesarias y las importamos 😀

In [62]:
%%capture
# Install pinned versions of torch (1.8.0, CUDA 11.1 build) and torchtext (0.9.0).
# The %%capture magic above hides the installer output.
!pip install torch==1.8.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchtext==0.9.0

In [63]:
import os
import sys
import json
import torch
import random
from random import choice
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torch.optim import SGD, lr_scheduler
from torch.utils.data import DataLoader
from torch.autograd import Variable

from itertools import zip_longest

import plotly.express as px

import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

## Cargamos el Dataset 🙂

In [64]:
# Optional helper for anyone who wants to normalise words with a stemmer.
def stem(word, stemmer= PorterStemmer()):
  """Return the stem of ``word`` after lower-casing it.

  ``stemmer`` is any object exposing a ``stem(str)`` method. The default
  PorterStemmer instance is created once, when this function is defined.
  """
  lowered = word.lower()
  return stemmer.stem(lowered)

In [65]:
# Load the intents dataset. The encoding is explicit so decoding does not
# depend on the platform's default locale.
with open('star_wars_chatbot.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the vocabulary from every tokenized pattern of every intent.
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(
    tokenizer(pattern)
    for intent in dataset['intents']
    for pattern in intent['patterns']
)

# Sorted, de-duplicated tags: a tag's position in this list is its class index.
labels = sorted({intent['tag'] for intent in dataset['intents']})
label_to_index = {tag: index for index, tag in enumerate(labels)}

# Count classes from the unique tags so the model's output size always agrees
# with `labels`, even if two intents share a tag.
num_classes = len(labels)

# Training samples in the format [(class_index_0, text_0), ..., (class_index_n-1, text_n-1)].
train_list = [
    (label_to_index[intent['tag']], pattern)
    for intent in dataset['intents']
    for pattern in intent['patterns']
]

97lines [00:00, 31952.21lines/s]


In [66]:
# Inspect the loaded intents: each entry holds 'patterns', 'responses' and a 'tag'.
dataset['intents']

[{'patterns': ['Hi',
   'Hey',
   'How are you',
   'Is anyone there?',
   'Hello',
   'Good day',
   "What's up",
   'Yo!',
   'Howdy',
   'Nice to meet you.'],
  'responses': ['Hey',
   'Hello, thanks for visiting.',
   'Hi there, what can I do for you?',
   'Hi there, how can I help?',
   'Hello, there.',
   'Hello Dear',
   'Ooooo Hello, looking for someone or something?',
   'Yes, I am here.',
   'Listening carefully.',
   'Ok, I am with you.'],
  'tag': 'greeting'},
 {'patterns': ['Bye',
   'See you later.',
   'Goodbye',
   'Have a great day.',
   'See you next time.',
   'It was my pleassure.',
   'Take care.',
   'See ya!',
   'Catch you later.',
   'Ciao.'],
  'responses': ['See you later, thanks for visiting.',
   'May the force be with you!',
   'See next time.',
   'Was my pleassuare to meet you.',
   'Hope will cath up sortly.',
   'Have a nice day.',
   'Bye! Come back again soon.',
   'So, till next time.',
   'If you need anything just text me anytime. Bye.',
   'Well,

## Generamos Modelo 💻

In [67]:
class CNNClassifier(nn.Module):
    """Text classifier over token ids, with an n-gram CNN or an EmbeddingBag encoder.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each token embedding.
        num_classes: number of output logits.
        use_cnn: if True, embed every token and run a 1-D convolution over
            windows of ``cnn_kernel_size`` consecutive tokens; otherwise pool
            each text with ``nn.EmbeddingBag``.
        cnn_pool_channels: number of output channels of the convolution.
        cnn_kernel_size: number of consecutive tokens covered by one window.
        pad_idx: token id used to pad texts in the CNN path. When None (the
            default) it is looked up as ``vocab["<pad>"]`` in the module-level
            ``vocab`` at forward time.
    """

    def __init__(self, vocab_size, embed_dim=32, num_classes=10,
                 use_cnn=False, cnn_pool_channels=24, cnn_kernel_size=3,
                 pad_idx=None):
        super().__init__()
        self.use_cnn = use_cnn
        self.cnn_kernel_size = cnn_kernel_size
        self.pad_idx = pad_idx

        if use_cnn:
            # Embedding layer: one vector per token.
            self.embedding = nn.Embedding(vocab_size, embed_dim)

            # Convolution over the flattened embeddings: a kernel of
            # cnn_kernel_size * embed_dim values moved embed_dim values at a
            # time covers cnn_kernel_size whole tokens and advances one token
            # per step.
            self.conv = nn.Conv1d(
                in_channels=1,
                out_channels=cnn_pool_channels,
                kernel_size=cnn_kernel_size * embed_dim,
                stride=embed_dim,
            )

            fc_in_size = cnn_pool_channels
        else:
            # Embedding layer: EmbeddingBag pools each text directly, so
            # forward needs the batch offsets.
            self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)

            fc_in_size = embed_dim

        # Linear classification layer.
        self.fc = nn.Linear(fc_in_size, num_classes)

        self.init_weights()

    def init_weights(self):
        """Initialise embedding and linear weights uniformly in [-0.5, 0.5]; zero the linear bias."""
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Compute class logits for a batch of texts.

        Args:
            text: 1-D tensor with the token ids of all texts concatenated.
            offsets: 1-D tensor with the start index of each text in ``text``.

        Returns:
            Tensor of shape (N x num_classes), N being ``len(offsets)``.
        """
        if self.use_cnn:
            pad_idx = self.pad_idx if self.pad_idx is not None else vocab["<pad>"]

            # Cut the flat batch back into one tensor per text. The last text
            # ends at the end of `text` itself, not at the length of any
            # outside variable.
            bounds = offsets.tolist() + [text.size(0)]
            segments = [text[start:end] for start, end in zip(bounds[:-1], bounds[1:])]

            # (N x longest_text), padded on the right and kept on text's device.
            padded = nn.utils.rnn.pad_sequence(
                segments, batch_first=True, padding_value=pad_idx
            )

            # The convolution needs at least cnn_kernel_size tokens per text;
            # shorter batches are padded up to that length instead of crashing.
            missing = self.cnn_kernel_size - padded.size(1)
            if missing > 0:
                padded = nn.functional.pad(padded, (0, missing), value=pad_idx)

            # The test1..test4 attributes keep the intermediate activations of
            # the last forward pass for inspection.
            # (N x longest_text x embed_dim)
            h = self.embedding(padded)
            self.test1 = h
            # (N x 1 x longest_text * embed_dim)
            h = h.view(h.size(0), 1, -1)
            self.test2 = h
            # (N x pool_channels x n_windows)
            h = torch.relu(self.conv(h))
            self.test3 = h
            # (N x pool_channels)
            h = h.mean(dim=2)
            self.test4 = h
        else:
            # (N x embed_dim)
            h = self.embedding(text, offsets)

        # (N x num_classes)
        return self.fc(h)

## Funciones de Batch y entrenamiento 👷

In [68]:
# Collate function for the DataLoader: packs (label, text) pairs into flat tensors.
def generate_batch(batch):
  """Turn a list of ``(label, text)`` pairs into the tensors the model consumes.

  Each text is tokenized with the module-level ``tokenizer`` and mapped to ids
  through ``vocab.stoi``.

  Returns:
    big_text: 1-D long tensor with the token ids of all texts concatenated.
    offsets: 1-D tensor with the start index of each text inside ``big_text``.
    label: 1-D tensor with the label of each text.
  """
  label = torch.tensor([entry[0] for entry in batch])
  texts = [tokenizer(entry[1]) for entry in batch]

  # Start of text i = sum of the lengths of texts 0..i-1.
  lengths = [len(tokens) for tokens in texts]
  offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

  # dtype is explicit: a text with no tokens would otherwise produce a float
  # tensor and turn the whole concatenated batch into floats.
  big_text = torch.cat([
      torch.tensor([vocab.stoi[t] for t in tokens], dtype=torch.long)
      for tokens in texts
  ])
  return big_text, offsets, label

In [69]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"GPU is avaible: {device}")

# Hyper-parameters and model dimensions.
num_epochs = 1000
BATCH_SIZE = 16
LR = 1e-1
INPUT_SIZE = len(vocab)
OUTPUT_SIZE = num_classes
USE_CNN = True

# Model, optimizer, loss and learning-rate scheduler.
model = CNNClassifier(INPUT_SIZE, num_classes=OUTPUT_SIZE, use_cnn=USE_CNN).to(device)
optimizer = SGD(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss().to(device)
# LambdaLR scales the base LR by 0.9 ** (epoch // 10), i.e. a 10% decay every
# 10 epochs. It only takes effect because scheduler.step() runs once per epoch.
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda epoch: .9 ** (epoch // 10)])

print(f'train: {len(train_list)} elements')

# The loader holds no per-epoch state here, so it is built once and re-iterated.
train_loader = DataLoader(train_list, batch_size=BATCH_SIZE, collate_fn=generate_batch)

# Train on the intents; loss_list keeps the mean batch loss of every epoch.
loss_list= []
for epoch in range(1, num_epochs):
  model.train()
  total_loss = 0
  for i, (texts, offsets, cls) in enumerate(train_loader):
    texts = texts.to(device)
    offsets = offsets.to(device)
    cls = cls.to(device)
    optimizer.zero_grad()
    output = model(texts, offsets)
    loss = criterion(output, cls)
    total_loss += loss.item()
    loss.backward()
    optimizer.step()

  # Advance the LR schedule once per epoch, after that epoch's optimizer steps.
  scheduler.step()

  loss_list.append(total_loss / len(train_loader))
  # The progress line reports the loss of the last batch of the epoch.
  sys.stdout.write('\rEpoch: {0:03d} \t iter-Loss: {1:.3f}'.format(epoch, loss.item()))

print(f'final loss: {loss.item():.4f}')

GPU is avaible: cpu
train: 97 elements
Epoch: 1000 	 iter-Loss: 0.000final loss: 0.0001


In [70]:
import plotly.express as px

# One x position per recorded loss value, starting at epoch 1. Deriving the
# axis from loss_list keeps x and y the same length however many epochs ran.
fig = px.line(y=loss_list, x=np.arange(1, len(loss_list) + 1), title="Training Loss")
fig.update_layout(
    xaxis_title="Epochs",
    yaxis_title="Loss"
    )

### Guardamos modelo 🦺

In [71]:
# Persist the trained weights together with the metadata needed to rebuild the model.
FILE = "data.pth"

data = {
    "model_state": model.state_dict(),
    "input_size": INPUT_SIZE,
    "output_size": OUTPUT_SIZE,
    "use_cnn": USE_CNN,
    "labels": labels,
}
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to data.pth


### ¡A probar! 🧪

In [72]:
# Quick sanity check of the trained model on a single sentence.
# The two explicit <pad> tokens bring the text up to three tokens, the number
# of tokens one convolution window covers with the default cnn_kernel_size.
qText = "hello <pad> <pad>" # expected label: "greeting"

# Inputs must live on the same device as the model.
X = torch.tensor([vocab.stoi[t] for t in tokenizer(qText)], dtype=torch.long).to(device)
q_offsets = torch.tensor([0], dtype=torch.long).to(device)

model.eval()
with torch.no_grad():  # inference only, no autograd graph needed
    output = model(X, q_offsets)
_, predicted = torch.max(output, dim=1)
labels[predicted.item()]

'greeting'

In [73]:
# Debug: token embeddings stored as model.test1 by the last CNN forward pass.
model.test1

tensor([[[-0.4988,  0.4434,  0.2050,  0.3995,  0.3117, -0.2152,  0.0329,
           0.3844, -0.3737,  0.2158,  0.2449,  0.1879,  0.3502, -0.6705,
           0.0269,  0.5704,  0.3521,  0.6885, -0.1025,  0.0539,  0.3467,
          -0.2367, -0.1927,  0.1208, -0.3683, -0.5686, -0.2169, -0.3784,
           0.4077, -0.0050, -0.0565,  0.4773],
         [ 0.6510, -0.5629,  0.6974, -0.1233,  0.2242, -0.0231,  0.0181,
          -0.5549,  0.0477, -0.1708,  0.1685, -0.6871, -0.5659,  0.6046,
           0.7480, -0.0300,  0.2875,  0.1393,  0.4408,  0.0065, -0.6900,
          -0.2695,  0.2156, -0.8787, -0.2067, -0.5094,  0.4525,  0.2749,
          -0.1716,  0.1628, -0.1564,  0.3750],
         [ 0.6510, -0.5629,  0.6974, -0.1233,  0.2242, -0.0231,  0.0181,
          -0.5549,  0.0477, -0.1708,  0.1685, -0.6871, -0.5659,  0.6046,
           0.7480, -0.0300,  0.2875,  0.1393,  0.4408,  0.0065, -0.6900,
          -0.2695,  0.2156, -0.8787, -0.2067, -0.5094,  0.4525,  0.2749,
          -0.1716,  0.1628, -0

## Chatbot 💬

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with open('star_wars_chatbot.json', 'r', encoding='utf-8') as json_data:
    intents = json.load(json_data)

FILE = "data.pth"
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
data = torch.load(FILE, map_location=device)

INPUT_SIZE = data["input_size"]
OUTPUT_SIZE = data["output_size"]
USE_CNN = data["use_cnn"]
labels = data['labels']
model_state = data["model_state"]

model = CNNClassifier(INPUT_SIZE, num_classes=OUTPUT_SIZE, use_cnn=USE_CNN).to(device)
model.load_state_dict(model_state)
model.eval()

# Answers per tag, built from the file loaded in this cell.
responses = {intent['tag']: intent['responses'] for intent in intents['intents']}

# NOTE: `tokenizer` and `vocab` are not stored in the checkpoint; they must
# already be defined by the dataset-loading cell.

# Minimum softmax probability required to answer with a canned response.
CONFIDENCE_THRESHOLD = 0.50
# The CNN path needs at least one full convolution window (3 tokens with the
# default cnn_kernel_size); shorter inputs are padded with <pad>, the same
# token used to pad batches inside the model.
MIN_TOKENS = 3 if USE_CNN else 1

bot_name = "GA-97"
print("Let's chat! (type 'finish_chat' to finish the chat)")
while True:
    q_text = input("You: ")
    if q_text == "finish_chat":
        break

    tokens = tokenizer(q_text)
    tokens += ["<pad>"] * (MIN_TOKENS - len(tokens))

    # Inputs must live on the same device as the model.
    X = torch.tensor([vocab.stoi[t] for t in tokens], dtype=torch.long).to(device)
    q_offsets = torch.tensor([0], dtype=torch.long).to(device)
    with torch.no_grad():  # inference only
        output = model(X, q_offsets)

    # Highest class probability and its index.
    probs = torch.softmax(output, dim=1)
    prob, predicted = torch.max(probs, dim=1)

    tag = labels[predicted.item()]

    if prob.item() > CONFIDENCE_THRESHOLD:
      print(f"{bot_name}: {random.choice(responses[tag])}")
    else:
      print(f"{bot_name}: My model can't understand you...")

Let's chat! (type 'finish_chat' to finish the chat)
