<a href="https://colab.research.google.com/github/hood-boi/world-news-chatbot/blob/master/ChatBotBaseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Model

In [0]:
# Colab-only setup: mount Google Drive at /content/gdrive so the dataset and
# the saved weights referenced later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
import torchtext
import random

In [0]:
# Context (model input) field. The identity tokenizer leaves each string
# untouched, so the vocabulary built from it is character-level.
text_field = torchtext.data.Field(sequential=True,      # text sequence
                                  tokenize=lambda x: x, # identity tokenizer: we are building a character-level RNN
                                  include_lengths=True, # track sequence lengths for batching; consumers index batch.context with [0]
                                  batch_first=True,
                                  use_vocab=True,       # turn each character into an integer index
                                  init_token="<BOS>",
                                  eos_token="<EOS>"
                                 )
# Reply (target) field, tokenized the same way and given its own vocabulary.
label_field = torchtext.data.Field(sequential=True,    # text sequence
                                   use_vocab=True,     # replies get their own vocabulary (see build_vocab below)
                                   is_target=True,
                                   batch_first=True,
                                   tokenize=lambda x: x,
                                   preprocessing=lambda x: x, # identity: no extra preprocessing step
                                   init_token="<BOS>",
                                   eos_token="<EOS>"
                                  )

# Column order of the tab-separated file: reply first, then context.
fields = [('reply', label_field), ('context', text_field)]
dataset = torchtext.data.TabularDataset("/content/gdrive/My Drive/Chatbot/routoftheloop.txt", # name of the file
                                        "tsv",               # fields are separated by a tab
                                        fields)
# Wrap the loaded examples as the training set; no held-out split is created here.
train = torchtext.data.Dataset(dataset, fields)


In [0]:
# Build one character vocabulary per field from the training examples.
for _field in (text_field, label_field):
    _field.build_vocab(train)

# Vocabulary sizes double as the one-hot widths used by the models below.
input_vocab_size = len(text_field.vocab.itos)
reply_vocab_size = len(label_field.vocab.itos)

print("Context Vocab : ", text_field.vocab.itos)
print("Reply Vocab : ", label_field.vocab.itos)
print("Input Vocab Size: ", input_vocab_size)
print("Reply Vocab size: ", reply_vocab_size)

Context Vocab :  ['<unk>', '<pad>', '<BOS>', '<EOS>', ' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd', 'u', 'c', 'm', 'p', 'g', 'w', 'y', 'f', 'b', '.', 'v', 'k', ',', '/', 'I', '?', 'T', 'W', 'A', '-', 'S', ':', 'x', 'j', '0', 'B', 'C', 'M', 'P', 'D', 'H', 'R', '1', 'E', 'N', ')', '(', '2', '_', 'F', 'z', 'O', 'L', 'G', '*', '3', 'Y', 'q', '4', '5', '9', '6', 'U', 'J', '8', '7', '[', ']', ';', 'K', '!', '=', 'V', 'Q', '~', 'Z', 'X', '&', '$', '%', '^', '#', '@', '+', '|', '\\', '{', '}', '`']
Reply Vocab :  ['<unk>', '<pad>', '<BOS>', '<EOS>', ' ', 'e', 't', 'o', 'a', 'i', 's', 'n', 'r', 'h', 'l', 'd', 'u', 'c', 'm', 'y', 'g', 'p', 'w', 'f', '.', 'b', 'k', 'v', 'I', ',', '/', 'T', 'A', 'S', '?', '-', 'j', 'x', 'W', 'H', '0', ':', 'B', 'C', 'P', 'M', 'N', 'E', 'O', 'D', 'R', '1', ')', '(', 'Y', 'z', '*', 'L', 'F', '2', '_', 'q', 'G', '!', 'U', '3', ';', '4', '5', 'J', '8', 'K', '9', '6', ']', '[', '7', 'V', '=', '^', '%', '&', 'Q', '$', 'X', 'Z', '~', '#', '\\', '+', '@', '|

In [0]:
class EncoderAE(nn.Module):
    """Character-level GRU encoder over one-hot token vectors.

    Args:
        vocab_size: Number of distinct token indices; also the one-hot width.
        hidden_size: Width of the GRU hidden state.
        hidden_layers: Number of stacked GRU layers.
    """

    def __init__(self,
                 vocab_size,
                 hidden_size=100,
                 hidden_layers=1):
        super(EncoderAE, self).__init__()

        self.vocab_size = vocab_size
        self.layers = hidden_layers
        self.hidden_size = hidden_size

        # Rows of the identity matrix are the one-hot vectors. Kept as a plain
        # attribute (not a registered buffer) so state_dict keys are unchanged.
        self.ident = torch.eye(vocab_size)
        self.rnn = nn.GRU(vocab_size, hidden_size, hidden_layers, batch_first=True)

    def forward(self, target, hidden=None):
        """Encode a batch of token-index sequences.

        Args:
            target: LongTensor of token indices, shape (batch, seq_len).
            hidden: Optional initial hidden state of shape
                (hidden_layers, batch, hidden_size). When ``None`` the GRU
                starts from an all-zero state.

        Returns:
            Tuple ``(out, last_hidden)``: the per-step GRU outputs of shape
            (batch, seq_len, hidden_size) and the final hidden state of shape
            (hidden_layers, batch, hidden_size).
        """
        # Look up one-hot rows on the same device as the indices so the module
        # also works after being moved off the CPU.
        one_hot = self.ident.to(target.device)[target]
        # Passing ``hidden`` straight through honours a caller-supplied state;
        # nn.GRU itself allocates the zero state (on the right device) for None.
        out, last_hidden = self.rnn(one_hot, hidden)
        return out, last_hidden

In [0]:
class DecoderAE(nn.Module):
    """Character-level GRU decoder that maps hidden states to vocabulary logits.

    Args:
        vocab_size: Number of distinct token indices; one-hot width and the
            size of the output logit vector.
        hidden_size: Width of the GRU hidden state.
        hidden_layers: Number of stacked GRU layers.
    """

    def __init__(self,
                 vocab_size,
                 hidden_size=100,
                 hidden_layers=1):
        super(DecoderAE, self).__init__()

        self.vocab_size = vocab_size
        self.layers = hidden_layers
        self.hidden_size = hidden_size

        # One-hot lookup table; a plain attribute so state_dict keys are unchanged.
        self.ident = torch.eye(vocab_size)
        self.rnn = nn.GRU(vocab_size, hidden_size, hidden_layers, batch_first=True)
        self.fcnn = nn.Linear(hidden_size, vocab_size)

    def forward(self, target, embedding, hidden=None):
        """Decode a batch of token-index sequences.

        Args:
            target: LongTensor of token indices, shape (batch, seq_len).
            embedding: Initial GRU state used when ``hidden`` is not given,
                shape (hidden_layers, batch, hidden_size).
            hidden: Optional GRU state that takes precedence over
                ``embedding`` (e.g. to continue from a previous call).

        Returns:
            Tuple ``(logits, last_hidden)``: logits of shape
            (batch, seq_len, vocab_size) and the final GRU hidden state.
        """
        # Index the one-hot table on the indices' device.
        one_hot = self.ident.to(target.device)[target]
        # Identity check, not ``==``: ``hidden`` may be a tensor.
        initial_state = embedding if hidden is None else hidden
        out, last_hidden = self.rnn(one_hot, initial_state)
        out_final = self.fcnn(out)
        return out_final, last_hidden

In [0]:
class ContextAE(nn.Module):
    """Autoencoder that composes ``EncoderAE`` and ``DecoderAE``.

    The same token sequence is fed to the encoder and, together with the
    encoder's final hidden state, to the decoder.

    NOTE(review): a second ``ContextAE`` class is defined later in this
    notebook; when cells run top to bottom that later definition replaces
    this one. Confirm which definition is meant to be active.

    Args:
        context_vocab_size: Vocabulary size shared by encoder and decoder.
        encoder_hidden_size: Hidden width of the encoder GRU.
        generator_hidden_size: Hidden width of the decoder GRU. The encoder's
            final state seeds the decoder, so this presumably has to match
            ``encoder_hidden_size`` (and the layer counts likewise).
        encoder_layers: Number of encoder GRU layers.
        generator_layers: Number of decoder GRU layers.
    """

    def __init__(self,
                 context_vocab_size,
                 encoder_hidden_size=100,
                 generator_hidden_size=100,
                 encoder_layers=1,
                 generator_layers=1):
        super(ContextAE, self).__init__()

        self.encoder = EncoderAE(context_vocab_size, encoder_hidden_size, encoder_layers)
        self.decoder = DecoderAE(context_vocab_size, generator_hidden_size, generator_layers)

    def forward(self, target, hidden=None):
        """Reconstruct ``target`` through the encoder/decoder pair.

        Args:
            target: LongTensor of token indices, shape (batch, seq_len).
            hidden: Optional decoder state. When given, the decoder starts
                from it instead of from the encoder's final state.

        Returns:
            Tuple ``(decode_out, decode_last_hidden)`` as produced by the decoder.
        """
        # ``hidden`` is a decoder resume state; the encoder always encodes the
        # sequence from its own fresh initial state.
        _, encode_last_hidden = self.encoder(target)
        decode_out, decode_last_hidden = self.decoder(target, encode_last_hidden, hidden)

        return decode_out, decode_last_hidden

In [0]:
class ContextAE(nn.Module):
    """Single-module GRU autoencoder over one-hot character sequences.

    NOTE: this class re-uses the name of the composed ``ContextAE`` defined in
    an earlier cell and replaces it when cells run in order. Later cells call
    ``.encoder(...)`` on ``ContextAE`` instances, so that entry point is
    provided here as a method.

    Args:
        context_vocab_size: Vocabulary size; one-hot width and logit width.
        encoder_hidden_size: Hidden width of the encoder GRU.
        generator_hidden_size: Hidden width of the decoder GRU. The encoder's
            final state seeds the decoder, so this presumably has to match
            ``encoder_hidden_size`` (and the layer counts likewise).
        encoder_layers: Number of encoder GRU layers.
        generator_layers: Number of decoder GRU layers.
    """

    def __init__(self,
                 context_vocab_size,
                 encoder_hidden_size=100,
                 generator_hidden_size=100,
                 encoder_layers=1,
                 generator_layers=1):
        super(ContextAE, self).__init__()

        self.encoder_layers = encoder_layers
        self.generator_layers = generator_layers
        self.encoder_hidden_size = encoder_hidden_size
        self.generator_hidden_size = generator_hidden_size

        # >>> Encoder
        self.context_ident = torch.eye(context_vocab_size)
        self.encode_rnn = nn.GRU(context_vocab_size, encoder_hidden_size, encoder_layers, batch_first=True)

        # >>> Decoder
        self.reply_ident = torch.eye(context_vocab_size)
        self.decode_rnn = nn.GRU(context_vocab_size, generator_hidden_size, generator_layers, batch_first=True)
        self.fcnn = nn.Linear(generator_hidden_size, context_vocab_size)

    def _one_hot(self, context):
        """One-hot encode token indices on the indices' own device."""
        return self.context_ident.to(context.device)[context]

    def encoder(self, context):
        """Run only the encoder half.

        Args:
            context: LongTensor of token indices, shape (batch, seq_len).

        Returns:
            Tuple ``(encode_out, encode_last_hidden)`` from the encoder GRU;
            the final hidden state has shape
            (encoder_layers, batch, encoder_hidden_size).
        """
        # No explicit h0: nn.GRU starts from zeros on the input's device.
        return self.encode_rnn(self._one_hot(context))

    def forward(self, context, hidden=None):
        """Reconstruct ``context``.

        Args:
            context: LongTensor of token indices, shape (batch, seq_len).
            hidden: Optional decoder state; when given the decoder starts from
                it instead of from the encoder's final state.

        Returns:
            Tuple ``(out, gen_last_hidden)``: logits of shape
            (batch, seq_len, context_vocab_size) and the decoder's final state.
        """
        # >>> Encoder
        context_tensor = self._one_hot(context)
        _, encode_last_hidden = self.encode_rnn(context_tensor)

        # >>> Decoder
        # Identity check, not ``==``: ``hidden`` may be a tensor.
        initial_state = encode_last_hidden if hidden is None else hidden
        gen_out, gen_last_hidden = self.decode_rnn(context_tensor, initial_state)
        out = self.fcnn(gen_out)
        return out, gen_last_hidden

In [0]:
def train_fcn(model, data, vocab_size, batch_size=1, num_epochs=1, lr=0.001, print_every=20):
    """Train ``model`` to reconstruct the ``context`` field of ``data``.

    Args:
        model: Module called as ``model(inp)`` that returns ``(logits, state)``
            with logits reshapeable to ``(-1, vocab_size)``.
        data: torchtext dataset whose examples expose a ``context`` attribute.
        vocab_size: Size of the logit dimension.
        batch_size: Examples per batch.
        num_epochs: Number of passes over ``data``.
        lr: Adam learning rate.
        print_every: Report the mean loss every this many iterations.

    Returns:
        None. Progress is printed; ``model`` is updated in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # NOTE(review): every position contributes to the loss; if batches are
    # padded, the pad token is presumably counted too — confirm whether an
    # ``ignore_index`` is wanted.
    criterion = nn.CrossEntropyLoss()

    data_iter = torchtext.data.BucketIterator(data,
                                              batch_size=batch_size,
                                              sort_key=lambda x: len(x.context),
                                              sort_within_batch=True)
    it = 0  # global iteration count across epochs
    for e in range(num_epochs):
        print("Epoch : ", e)
        # The running window is restarted at every epoch boundary.
        running_loss, window = 0.0, 0
        for batch in data_iter:
            inp = batch.context[0]  # index tensor (the field also yields lengths)
            # NOTE(review): input and target are the same, unshifted tensor —
            # confirm that no one-step teacher-forcing shift is intended.
            target = inp

            optimizer.zero_grad()
            output, _ = model(inp)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))
            loss.backward()
            optimizer.step()

            # ``.item()`` keeps a plain float so each batch's autograd graph
            # can be freed instead of being retained by the running sum.
            running_loss += loss.item()
            window += 1
            it += 1
            if it % print_every == 0:
                # Average over the batches actually accumulated in this window.
                print("[Iter %d] Loss %f" % (it, running_loss / window))
                running_loss, window = 0.0, 0

In [0]:
# Autoencoder with 100-unit encoder and decoder GRUs, trained on the context
# field for five epochs in batches of 128.
model_AE = ContextAE(
    input_vocab_size,
    encoder_hidden_size=100,
    generator_hidden_size=100,
)
train_fcn(
    model=model_AE,
    data=train,
    vocab_size=input_vocab_size,
    batch_size=128,
    num_epochs=5,
)

In [0]:
# Persist only the learned parameters (the state_dict) to Drive.
torch.save(model_AE.state_dict(), "/content/gdrive/My Drive/Chatbot/AutoEncoderWeights2")

In [0]:
# Display the trained model's encoder.
# NOTE(review): relies on ContextAE exposing `.encoder`; confirm that the class
# definition active at this point in the notebook provides it.
model_AE.encoder

EncoderAE(
  (rnn): GRU(95, 100, batch_first=True)
)

In [0]:
# Rebuild the architecture with the same sizes used for training, then restore
# the saved parameters into it.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted location.
loadedModel = ContextAE(input_vocab_size, encoder_hidden_size = 100, generator_hidden_size = 100 );
loadedModel.load_state_dict(torch.load('/content/gdrive/My Drive/Chatbot/AutoEncoderWeights2'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
# Sanity check: show the encoder of the model restored from disk.
print(loadedModel.encoder)

EncoderAE(
  (rnn): GRU(95, 100, batch_first=True)
)


In [0]:
# Preview the first few replies. `k` counts the examples shown and `i` is left
# bound to the last example printed, which the following cells inspect.
k = 0
for k, i in enumerate(dataset, start=1):
    print(i.reply)
    if k > 10:
        break

[Here](https://www.reddit.com/r/OutOfTheLoop/comments/6mrjvt/why_does_snoo_say_monthly_bandwidth_exceeded/) is a question from July 12, which was the Internet-wide Day of Action for Net Neutrality. A lot of the top-level comments there ask other questions regarding net neutrality, so you may be able to find some more answers there.
Whats the difference between each round of voting? It seems like every couple of months they have a new vote but Ive yet to see a post about the outcome of the vote, just more posts about a new upcoming vote.
I see this as well. From my understanding, each time they get a huge push back, they let us forget and set a new time to try and push their decision. No matter what, they keep trying again and again. What keeps them from just doing this until they win?
Nothing, except for a law. Which would require a different Administration and Congress.
We need to make sure they always lose.
That would make sense, no real verdict is reached, give it time for the storm

In [0]:
# Reply text of the example left in `i` by the preview loop above.
i.reply

'It is the FCC. The FCC is comprised of 5 commissioners (who matter). Two democrats and three republicans as it is right now. Pai was originally appointed by Obama and reinstated by Trump. However, Obama appointed someone else as commissioner at the time so he wasnt a concern. There always has to be 2 republicans and 2 democrats. Then the tie breakers is mostly chosen by presidential party.'

In [0]:
# Context text of the same example.
i.context

'Im kinda confused on who actually does the voting. Is it congress or the fcc?'

In [0]:
# Encode the example held in `i`: wrap its characters in the BOS/EOS markers,
# map every token to its vocabulary index, and add a leading batch dimension.
inp = ["<BOS>", *i.context, "<EOS>"]
inp_indices = [text_field.vocab.stoi[token] for token in inp]
inp_tensor = torch.tensor(inp_indices, dtype=torch.long)[None, :]
print(inp_tensor.shape)
# Only the encoder's final hidden state is kept as the sentence embedding.
_, embedding = loadedModel.encoder(inp_tensor)

torch.Size([1, 79])


In [29]:
# Inspect the encoder's final hidden state for the example encoded above.
print(embedding)
#print(embedding2)

tensor([[[-0.9983,  0.9935,  0.9973,  0.9991,  0.9986, -0.9985, -0.9987,
           0.9971, -0.9997,  0.9975,  0.9993, -0.9975,  0.9985, -0.9969,
          -0.0449,  0.9982,  0.9994,  0.9922,  0.9901, -0.9989,  0.9974,
           0.9872, -0.9969,  0.9985, -0.9975, -0.6657, -0.9991,  0.9976,
          -0.5789, -0.9990,  0.9807, -0.9973,  0.9964,  0.9885, -0.9992,
           0.9968,  0.9992, -0.9973, -0.9990,  0.9949,  0.9976, -0.9865,
          -0.9974, -0.9978,  0.9960,  0.9976, -0.9988,  0.9966, -0.9949,
           0.9984,  0.2384, -0.9965,  0.9978, -0.9987, -0.9986,  0.9980,
           0.9995,  0.9976, -0.9966,  0.9966, -0.9793, -0.9977, -0.9991,
           0.9976, -0.9978, -0.9989,  0.9964,  0.9979,  0.9988, -0.9968,
           0.9977,  0.9944,  0.9496, -0.9904,  0.9990,  0.9840,  0.9897,
           0.9987, -0.9986,  0.9994, -0.9986, -0.9966,  0.9981, -0.9967,
          -0.9958, -0.9974, -0.9923, -0.9988, -0.9977, -0.9981,  0.9998,
           0.9993,  0.9985,  0.9951, -0.9993,  0.99

In [30]:
# Drop the leading dimension of the hidden state (size 1 in the output above).
embedding.squeeze(0)

tensor([[-0.9983,  0.9935,  0.9973,  0.9991,  0.9986, -0.9985, -0.9987,  0.9971,
         -0.9997,  0.9975,  0.9993, -0.9975,  0.9985, -0.9969, -0.0449,  0.9982,
          0.9994,  0.9922,  0.9901, -0.9989,  0.9974,  0.9872, -0.9969,  0.9985,
         -0.9975, -0.6657, -0.9991,  0.9976, -0.5789, -0.9990,  0.9807, -0.9973,
          0.9964,  0.9885, -0.9992,  0.9968,  0.9992, -0.9973, -0.9990,  0.9949,
          0.9976, -0.9865, -0.9974, -0.9978,  0.9960,  0.9976, -0.9988,  0.9966,
         -0.9949,  0.9984,  0.2384, -0.9965,  0.9978, -0.9987, -0.9986,  0.9980,
          0.9995,  0.9976, -0.9966,  0.9966, -0.9793, -0.9977, -0.9991,  0.9976,
         -0.9978, -0.9989,  0.9964,  0.9979,  0.9988, -0.9968,  0.9977,  0.9944,
          0.9496, -0.9904,  0.9990,  0.9840,  0.9897,  0.9987, -0.9986,  0.9994,
         -0.9986, -0.9966,  0.9981, -0.9967, -0.9958, -0.9974, -0.9923, -0.9988,
         -0.9977, -0.9981,  0.9998,  0.9993,  0.9985,  0.9951, -0.9993,  0.9986,
          0.9993, -0.9965,  

In [0]:
# Lookup tables for the retrieval baseline, both keyed by the context string:
# context -> encoder embedding, and context -> reply text.
embeddings = {}
replies = {}


def buildBaseLineData(max_examples=10001):
    """Fill ``embeddings`` and ``replies`` from the first rows of ``dataset``.

    Each context is wrapped in BOS/EOS markers, mapped to vocabulary indices
    and passed through ``loadedModel.encoder``; the final hidden state (with
    its leading dimension squeezed) is stored under the context string.
    Rows that share a context overwrite earlier entries.

    Args:
        max_examples: Maximum number of rows to process. The default keeps the
            cutoff this function has always used.
    """
    for count, data in enumerate(dataset):
        if count >= max_examples:
            break
        data_inp = ["<BOS>"] + list(data.context) + ["<EOS>"]  # context is a string
        data_inp_indices = [text_field.vocab.stoi[ch] for ch in data_inp]
        data_inp_tensor = torch.tensor(data_inp_indices, dtype=torch.long).unsqueeze(0)
        # Inference only: without no_grad every stored embedding would keep
        # its autograd graph alive for as long as the dictionary exists.
        with torch.no_grad():
            _, emb = loadedModel.encoder(data_inp_tensor)
        embeddings[data.context] = emb.squeeze(0)
        replies[data.context] = data.reply

In [0]:
# Populate the `embeddings` and `replies` lookup tables used by the baseline.
buildBaseLineData()

In [0]:
def baseLineModel(inp):
    """Return the stored reply whose context is most similar to ``inp``.

    ``inp`` is encoded with ``loadedModel.encoder`` and compared, by cosine
    similarity, against every embedding in ``embeddings``; the reply of the
    best-scoring context is returned. Ties keep the first context seen.

    Args:
        inp: Query string.

    Returns:
        The reply string stored for the most similar context.

    Raises:
        KeyError: If no embeddings have been built yet.
    """
    if not embeddings:
        raise KeyError("no baseline embeddings available; run buildBaseLineData() first")

    tokens = ["<BOS>"] + list(inp) + ["<EOS>"]  # inp is a string
    inp_indices = [text_field.vocab.stoi[ch] for ch in tokens]
    inp_tensor = torch.tensor(inp_indices, dtype=torch.long).unsqueeze(0)

    # Inference only; no gradients are needed for retrieval.
    with torch.no_grad():
        _, inp_embeddings = loadedModel.encoder(inp_tensor)
        inp_embeddings = inp_embeddings.squeeze(0)

        # Start below any possible cosine value: starting at 0 left no match
        # at all whenever every similarity was zero or negative.
        max_similarity = float("-inf")
        similar_context = None
        for context, embed in embeddings.items():
            cosine_sim = torch.cosine_similarity(inp_embeddings, embed).item()
            if cosine_sim > max_similarity:
                max_similarity = cosine_sim
                similar_context = context
    return replies[similar_context]

In [91]:
# Example query against the retrieval baseline.
baseLineModel("What do you know about Net Neutrality?")

'question: what is a Kyle? I see a lot of mentions of them in Area 51 memes'

In [0]:
# Dump the whole context -> reply table.
# NOTE(review): this prints every stored entry; consider showing a small sample instead.
print(replies)