<a href="https://colab.research.google.com/github/hood-boi/world-news-chatbot/blob/master/APS360_ChatBot_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Baseline Model

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
import torchtext
import random

In [0]:
text_field = torchtext.data.Field(sequential=True,      # text sequence
                                  tokenize=lambda x: x, # because are building a character-RNN
                                  include_lengths=True, # to track the length of sequences, for batching
                                  batch_first=True,
                                  use_vocab=True,
                                  init_token="<BOS>",
                                  eos_token="<EOS>"
                                 )       # to turn each character into an integer index
label_field = torchtext.data.Field(sequential=True,    # text sequence
                                   use_vocab=True,     # don't need to track vocabulary
                                   is_target=True,      
                                   batch_first=True,
                                   tokenize=lambda x: x,
                                   preprocessing=lambda x: x,
                                   init_token="<BOS>",
                                   eos_token="<EOS>"
                                  ) 

fields = [('reply', label_field), ('context', text_field)]
dataset = torchtext.data.TabularDataset("/content/gdrive/My Drive/Chatbot/rWorldNews.txt", # name of the file
                                        "tsv",               # fields are separated by a tab
                                        fields)
train = torchtext.data.Dataset(dataset, fields)


In [0]:
text_field.build_vocab(train)
label_field.build_vocab(train)
print("Context Vocab : ", text_field.vocab.itos)
print("Reply Vocab : ", label_field.vocab.itos)
input_vocab_size = len(text_field.vocab.itos)
reply_vocab_size = len(label_field.vocab.itos)
print("Input Vocab Size: ", input_vocab_size)
print("Reply Vocab size: ", reply_vocab_size)

Context Vocab :  ['<unk>', '<pad>', '<BOS>', '<EOS>', ' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd', 'u', 'c', 'm', 'p', 'g', 'y', 'w', 'f', 'b', '.', 'v', 'k', ',', 'I', '-', '/', 'T', 'S', 'A', '0', '1', 'C', 'x', '*', ':', '2', 'j', 'P', 'M', 'W', 'R', 'E', 'N', '?', 'B', 'H', 'U', ')', 'D', '(', 'O', 'F', '’', 'z', ';', '5', '3', 'G', '_', '8', '9', '6', 'L', '4', '7', 'K', 'q', ']', '[', 'Y', 'J', '!', 'V', '%', '“', '”', '=', '$', '&', '#', '~', '^', '|', 'Q', 'Z', '‘', '—', 'X', '+', '\\', '–', '@', 'é', '£', '…', 'о', 'ü', 'е', '€', '•', '⠀', 'ó', 'и', 'н', '️', 'а', 'т', 'á', 'ñ', 'с', '°', '・', 'ä', 'л', 'р', 'ا', '§', '\u200b', 'м', 'ы', 'в', '`', 'к', 'ن', 'д', '´', '✔', '⣿', 'п', 'í', 'ی', 'ö', '，', 'я', '😂', 'б', 'у', '。', '🤔', '\u200d', '💦', 'ر', '¥', 'م', 'ه', 'の', 'が', 'ч', 'º', '🇧', 'و', '¯', '·', '■', 'Ä', '‚', 'ل', '¶', 'た', 'な', '🇬', 'د', 'ت', '👍', 'г', '不', 'か', 'て', 'ب', 'っ', '人', '★', '👏', '、', 'い', '☆', 'ğ', 'Ü', 'ь', '™', '一', 'з', '💘', '₂', '交', 

In [0]:
class ContextAE(nn.Module):
    def __init__(self, 
                 context_vocab_size, 
                 encoder_hidden_size = 100,
                 generator_hidden_size = 100, 
                 encoder_layers = 1, 
                 generator_layers = 1):
        
        super(ContextAE, self).__init__()
        
        self.encoder_layers = encoder_layers;
        self.generator_layers = generator_layers;
        self.encoder_hidden_size = encoder_hidden_size;
        self.generator_hidden_size = generator_hidden_size;
        
        # >>> Encoder
        self.context_ident = torch.eye(context_vocab_size)
        self.encode_rnn = nn.GRU(context_vocab_size, encoder_hidden_size, encoder_layers, batch_first=True)
        
        # >>> Decoder
        self.reply_ident = torch.eye(context_vocab_size)
        self.decode_rnn = nn.GRU(context_vocab_size, generator_hidden_size, generator_layers, batch_first=True)
        self.fcnn = nn.Linear(generator_hidden_size, context_vocab_size)
        
    def forward(self, context, hidden=None):
        
        # >>> Encoder
        context_tensor = self.context_ident[context] # Type: batch.context[0] | Size: (batch size, sequence size)
        h0 = torch.zeros(self.encoder_layers, context.shape[0], self.encoder_hidden_size); # (num layers * direction, batch size, hidden size)
        #c0 = torch.zeros(self.encoder_layers, context.shape[0], self.encoder_hidden_size);
        #encode_out, encode_last_hidden = self.encode_rnn(context_tensor, (h0,c0))
        encode_out, encode_last_hidden = self.encode_rnn(context_tensor, h0)
        # >>> Decoder
        if(hidden == None):
            gen_out, gen_last_hidden = self.decode_rnn(context_tensor, encode_last_hidden)
        else:
            gen_out, gen_last_hidden = self.decode_rnn(context_tensor, hidden)
        out = self.fcnn(gen_out)
        return out, gen_last_hidden

In [0]:
def train_fcn(model, data, vocab_size, batch_size=1, num_epochs=1, lr=0.001, print_every=20):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    it = 0
    
    data_iter = torchtext.data.BucketIterator(data,
                                              batch_size=batch_size,
                                              sort_key=lambda x: len(x.context),
                                              sort_within_batch=True)
    print("Here #1 : ")
    for e in range(num_epochs):
        print("Epoch : ", e);
        # get training set
        avg_loss = 0
        for batch in data_iter:
            inp = batch.context[0] # BOS + EOS
            target = inp;
            # cleanup
            optimizer.zero_grad()
            # forward pass
            output, _ = model(inp)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))
            # backward pass
            loss.backward()
            optimizer.step()

            avg_loss += loss
            it += 1 # increment iteration count
            if it % print_every == 0:
                print("[Iter %d] Loss %f" % (it+1, float(avg_loss/print_every)))
                avg_loss = 0

In [42]:
model_AE = ContextAE(input_vocab_size, encoder_hidden_size = 100, generator_hidden_size = 100 )
train_fcn(model_AE, train, input_vocab_size, batch_size = 16)

Here #1 : 
Epoch :  0
[Iter 21] Loss 6.811048
[Iter 41] Loss 3.991014
[Iter 61] Loss 3.275716
[Iter 81] Loss 3.220464
[Iter 101] Loss 3.155977
[Iter 121] Loss 3.099907
[Iter 141] Loss 3.040375
[Iter 161] Loss 2.932705
[Iter 181] Loss 2.848371
[Iter 201] Loss 2.680211
[Iter 221] Loss 2.441300
[Iter 241] Loss 2.204262
[Iter 261] Loss 1.864070
[Iter 281] Loss 1.625515
[Iter 301] Loss 1.425975
[Iter 321] Loss 1.069310
[Iter 341] Loss 0.901704
[Iter 361] Loss 0.669027
[Iter 381] Loss 0.539513
[Iter 401] Loss 0.433639
[Iter 421] Loss 0.354510
[Iter 441] Loss 0.306426
[Iter 461] Loss 0.236087
[Iter 481] Loss 0.219236
[Iter 501] Loss 0.166473
[Iter 521] Loss 0.145423
[Iter 541] Loss 0.126913
[Iter 561] Loss 0.105087
[Iter 581] Loss 0.090661
[Iter 601] Loss 0.078862
[Iter 621] Loss 0.082476
[Iter 641] Loss 0.060603
[Iter 661] Loss 0.053709
[Iter 681] Loss 0.053159
[Iter 701] Loss 0.048830
[Iter 721] Loss 0.040814
[Iter 741] Loss 0.040003
[Iter 761] Loss 0.031997
[Iter 781] Loss 0.028737
[Iter 8

In [0]:
torch.save(model_AE.state_dict(), "/content/gdrive/My Drive/Chatbot/contextAE")