<a href="https://colab.research.google.com/gist/ksurya/b36068b1ca0b18bb1111ec2aed974aad/chitchat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chitchat
## Modeling Reddit conversations using a sequence-to-sequence model

# 0. Setup


In [0]:
# mount google driver onto colab
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [0]:
# install libraries
!pip3 -q install torch ipdb

In [0]:
!ls "/content/drive/My Drive/Machine Learning/DSTC2017/data"

2011-01-03.convos.txt		     2011-01.convos.txt  train.convos.txt
2011-01-03.facts.cleaned.txt	     2011-01.facts.txt	 train.facts.txt
2011-01-03.facts.txt		     2011-02.convos.txt
2011-01.convos.cleaned.recent10.txt  2011-03.convos.txt


# 1. Data Loading

In [0]:
DATA_DIR = "/content/drive/My Drive/Machine Learning/DSTC2017/data"

In [0]:
import torch
import torch.utils.data as D
import linecache
import pandas as pd
import numpy as np
import string
import csv
import re
import unicodedata

In [0]:
class Vocabulary(object):
    
    PAD = 0
    UNK = 1
    SOS = 2
    EOS = 3
    
    @classmethod
    def build(cls, dataset, cols, min_freq=None):
        # expects data in words.
        vocab = cls()
        for row in dataset:
            for col in cols:
                for word in row[col]:
                    vocab.add(word)
        if min_freq:
            vocab.trim(min_freq)
        return vocab
        
    def __init__(self):
        self.PAD_word = "PAD"
        self.UNK_word = "UNK"
        self.SOS_word = "START"
        self.EOS_word = "EOS"
        self.reset()
        
    def reset(self):
        self.itos = []
        self.stoi = {}
        self.freq = {}
        # Note: order should match with the token numbers.
        self.add(self.PAD_word)
        self.add(self.UNK_word)
        self.add(self.SOS_word)
        self.add(self.EOS_word)
    
    def __getitem__(self, word_or_num):
        # return token if X is string, return word if X is token
        if type(word_or_num) in (int, float, torch.Tensor):
            try:
                return self.itos[int(word_or_num)]
            except IndexError:
                return None
        else:
            return self.stoi.get(word_or_num)
    
    def __iter__(self):
        self.iter_count = 0
        return self
    
    def __next__(self):
        if self.iter_count < len(self):
            row = self[self.iter_count]
            self.iter_count += 1
            return row
        else:
           raise StopIteration
    
    def __len__(self):
        return len(self.itos)
    
    def add(self, word):
        idx = len(self.itos)
        if word not in self.stoi:
            self.stoi[word] = idx
            self.itos.append(word)
        self.freq[word] = self.freq.get(word, 0) + 1
        return self
    
    def trim(self, min_freq, retain_freq=True):
        freq = self.freq
        filtered = []
        
        # filter words > min_freq
        for word, count in freq.items():
            if count >= min_freq:
                filtered.append(word)
        
        # rebuild the vocab
        self.reset()
        for word in filtered:
            self.add(word)
        
        # retain old frequencies
        if retain_freq:
            for k in self.freq.keys():
                self.freq[k] = freq[k]
        
        return self

In [0]:
def fix_context(s, recent_k=0):
    s = ' EOS '.join(s[10:].split(' EOS ')[-recent_k:])
    return s

def attach_tokens(s):
    s = "START " + s + " EOS"
    return s

def text_clean(s):
    ### Markdown Normalize
    # remove table
    s = re.sub(r'\|.*\|', ' ', str(s))
    # remove tokens: *, _, ~, `, [, ]
    s = re.sub(r'(^([\*_~`\[\]] )+)|(( [\*_~`\[\]])+$)|( ([\*_~`\[\]] )+)', ' ', s)
    # remove horizontal (* and _ already removed)
    s = re.sub(r'- (- )+-', ' ', s)
    # replace url link with token URL
    s = re.sub(r'http\S*', 'URL', s)
    # remove punctuation
    trans = str.maketrans('','',string.punctuation)
    s = s.translate(trans)

    ## Other Cleaning
    # convert unicode to ascii
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    # split contractions (e.g. it's -> it 's)
    s = re.sub(r'(\'\w*( |$))', lambda x: ' ' + x.group(), s)
    s = re.sub(r' +', ' ', s)

    return s

In [0]:
class Dataset(D.Dataset):
    # Note: D.Dataset by default repeats the iteraton. Not sure why - Torch doesn't seem to do it
    def __init__(self, tsv_file, recent_k=0):
        cols = ["hash", "subreddit", "convid", "score", "turn", "context", "response"]
        df = pd.read_table(tsv_file, names=cols, header=None, quoting=csv.QUOTE_NONE)
        df.dropna(inplace=True)
        
        df.context = df.context \
            .apply(fix_context, args=(recent_k,)) \
            .apply(attach_tokens) \
            .apply(text_clean) \
            .apply(lambda s: s.split()) 
        
        df.response = df.response \
            .apply(attach_tokens) \
            .apply(text_clean) \
            .apply(lambda x: re.sub(r'^"', '', x)) \
            .apply(lambda s: s.split())
        
        self.df = df
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        return self.df.iloc[int(idx)]
    
    def __iter__(self):
        self.iter_count = 0
        return self
    
    def __next__(self):
        if self.iter_count < len(self):
            row = self[self.iter_count]
            self.iter_count += 1
            return row
        else:
            raise StopIteration

# 2. Sequence to Sequence Model

In [0]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils.rnn as utils_rnn
import time

In [0]:
class ToTensor(object):
    
    def __init__(self, cols, vocab, device):
        self.cols = cols
        self.vocab = vocab
        self.device = device
        
    def words_to_tensor(self, s):
        return torch.tensor([self.vocab[w] or Vocabulary.UNK for w in s], device=self.device)
    
    def __call__(self, samples_batch):
        # returns just the `cols` values. ignores other cols in samples
        pairs = []
        for col in self.cols:
            tensors = [self.words_to_tensor(sample[col]) for sample in samples_batch]
            tensors = sorted(tensors, key=lambda x: x.size(0), reverse=True)  # over-head for large batches
            lengths = torch.tensor([t.size(0) for t in tensors], device=self.device)
            tensors = utils_rnn.pad_sequence(tensors, batch_first=True, padding_value=Vocabulary.PAD)
            pairs.append([col, (tensors, lengths)])
        return dict(pairs)

In [0]:
class Encoder(nn.Module):
    def __init__(self, feature_size, hidden_size, layers):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(feature_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, num_layers=layers)

    def forward(self, inputs, input_lengths, hidden=None):
        out = self.embedding(inputs)
        out, hidden = self.lstm(out, hidden)
        return out, hidden
    

class Decoder(nn.Module):
    def __init__(self, feature_size, hidden_size, layers):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(feature_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, num_layers=layers)
        self.linear = nn.Linear(hidden_size, feature_size)

    def forward(self, inputs, hidden=None):
        out = self.embedding(inputs) # (B,1,H)
        out, hidden = self.lstm(out, hidden)
        out = self.linear(out[:,0,:]) # (B,F)
        out = F.log_softmax(out, dim=1)
        return out, hidden


class Model(object):
    
    def __init__(self, feature_size, hidden_size, layers, lr, device):
        self.device = device
        self.encoder = Encoder(feature_size, hidden_size, layers).to(device)
        self.decoder = Decoder(feature_size, hidden_size, layers).to(device)
        self.encoder_optim = optim.Adam(self.encoder.parameters(), lr=lr)
        self.decoder_optim = optim.Adam(self.decoder.parameters(), lr=lr)
        self.loss_func = nn.NLLLoss()
        
    def calculate_loss(self, X, X_lengths, Y, Y_lengths, is_guided):
        Y_len = Y.size(1)
        out, hidden = self.encoder(X, X_lengths)
        inputs = Y[:, 0].unsqueeze(-1)
        loss = 0
        predicted = torch.zeros(*Y.size()).to(self.device).long()  # apparently, torch defaults to float?
        for i in range(1, Y_len):
            out, hidden = self.decoder(inputs, hidden)
            Y_pred = torch.argmax(out, dim=1)
            inputs = Y[:, i].unsqueeze(-1) if is_guided else Y_pred.unsqueeze(-1)
            loss += self.loss_func(out, Y[:,i])
            predicted[:,i] = torch.tensor(Y_pred.data)
        return loss, predicted
    
    def generate(self, dataset):
        with torch.no_grad():
            for batch in dataset:
                X, X_lengths = batch["context"]
                Y, Y_lengths = batch["response"]
                loss, predicted = self.calculate_loss(X, X_lengths, Y, Y_lengths, is_guided=False)
                yield X, Y, predicted
    
    def train(self, dataset):
        ep_loss = 0
        for num, batch in enumerate(dataset):
            self.encoder.zero_grad()
            self.decoder.zero_grad()
            X, X_lengths = batch["context"]
            Y, Y_lengths = batch["response"]
            loss, predicted = self.calculate_loss(X, X_lengths, Y, Y_lengths, is_guided=True)
            ep_loss += loss.item()
            loss.backward()
            self.encoder_optim.step()
            self.decoder_optim.step()
        return ep_loss
        
    def test(self, dataset, require_out=False):
        with torch.no_grad():
            output = []
            ep_loss = 0
            for num, batch in enumerate(dataset):
                X, X_lengths = batch["context"]
                Y, Y_lengths = batch["response"]
                loss, predicted = self.calculate_loss(X, X_lengths, Y, Y_lengths, is_guided=False)
                ep_loss += loss.item()
            return ep_loss
    
    def experiment(self, train_ds, test_ds, epochs):
        for ep in range(epochs):
            start_time = time.time()
            tr_loss = self.train(train_ds)
            te_loss = self.test(test_ds)
            print("Epoch {}, Time {:.2f}, Train Loss {:.4f}, Test Loss {:.4f}".format(
                ep, time.time() - start_time, tr_loss / train_ds.batch_size, te_loss / test_ds.batch_size))

# 3. Experiment

In [0]:
def split_dataset(dataset, ratio):
    tr_len = int(ratio * len(dataset))
    te_len = len(dataset) - tr_len
    return D.random_split(dataset, [tr_len, te_len])    

In [0]:
DEVICE = torch.device("cuda")

In [0]:
convos_ds = None
for filename in ("2011-01.convos.txt", "2011-02.convos.txt", "2011-03.convos.txt"):
    path = DATA_DIR + "/" + filename
    convos_ds = convos_ds + Dataset(path, recent_k=10) if convos_ds else Dataset(path, recent_k=10)
len(convos_ds)

12866

In [0]:
vocab = Vocabulary.build(convos_ds, ["context", "response"], min_freq=50)
len(vocab)

2071

In [0]:
train_ds, test_ds = split_dataset(convos_ds, 0.7)

train_dl = D.DataLoader(train_ds, batch_size=100, 
    collate_fn=ToTensor(["context", "response"], vocab, DEVICE))

test_dl = D.DataLoader(test_ds, batch_size=100, 
    collate_fn=ToTensor(["context", "response"], vocab, DEVICE))

In [0]:
# create instance
model = Model(
    feature_size=len(vocab),
    hidden_size=500,
    layers=1,
    lr=0.01,
    device=DEVICE)

model.experiment(train_dl, test_dl, epochs=200)

Epoch 0, Time 59.11, Train Loss 80.9861, Test Loss 146.1859
Epoch 1, Time 59.13, Train Loss 68.5463, Test Loss 110.4795
Epoch 2, Time 58.96, Train Loss 64.0113, Test Loss 106.0086
Epoch 3, Time 59.08, Train Loss 60.1840, Test Loss 94.0226
Epoch 4, Time 59.07, Train Loss 56.9262, Test Loss 105.3178
Epoch 5, Time 59.13, Train Loss 53.9482, Test Loss 137.0544
Epoch 6, Time 58.94, Train Loss 51.1108, Test Loss 121.5709
Epoch 7, Time 59.03, Train Loss 48.3162, Test Loss 146.1049
Epoch 8, Time 59.15, Train Loss 45.6324, Test Loss 145.4389
Epoch 9, Time 59.06, Train Loss 43.1107, Test Loss 150.2047
Epoch 10, Time 59.24, Train Loss 40.6681, Test Loss 101.7083
Epoch 11, Time 59.30, Train Loss 38.4447, Test Loss 94.9250
Epoch 12, Time 59.14, Train Loss 36.5375, Test Loss 84.8700
Epoch 13, Time 59.34, Train Loss 34.7830, Test Loss 151.8436
Epoch 14, Time 59.11, Train Loss 33.3141, Test Loss 112.7021
Epoch 15, Time 59.04, Train Loss 32.0534, Test Loss 102.2556
Epoch 16, Time 59.06, Train Loss 30.9

KeyboardInterrupt: ignored

In [0]:
# save the models
torch.save(model.encoder, DATA_DIR + "/encoder.model")
torch.save(model.decoder, DATA_DIR + "/decoder.model")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [0]:
sample_ds = D.Subset(test_ds, range(2000, 2050))
sample_dl = D.DataLoader(sample_ds, batch_size=1, collate_fn=ToTensor(["context", "response"], vocab, DEVICE))

In [0]:
def to_words(vocab, token_tensor):
    words = []
    for token in token_tensor:
        words.append(vocab[token])
        if token == vocab.EOS:
            break
    return words

In [0]:
context = []
response = []
predicted_response = []

for X,Y,Y_predicted in model.generate(sample_dl):
    context.append(
        " ".join(to_words(vocab, X.squeeze()))
    )
    response.append(
        " ".join(to_words(vocab, Y.squeeze()))
    )
    
    predicted_response.append(
        " ".join(to_words(vocab, Y_predicted.squeeze()))
    )

In [0]:
sample_df = pd.DataFrame({
    "context": context,
    "response": response,
    "predicted_response": predicted_response })

In [0]:
pd.set_option('display.max_colwidth', -1)
sample_df[["context", "response", "predicted_response"]]

Unnamed: 0,context,response,predicted_response
0,START til the de facto leader of libya muammar algaddafi has a 40 member bodyguard contingent known as the amazonian guard which is entirely female all women who qualify for duty supposedly must be virgins and are handpicked by gaddafi himself EOS,START yup hes a UNK EOS,PAD i disagree the world as
1,START EOS,START til herpes wasnt UNK until the drug industry decided to create a market for its treatment EOS,PAD of order URL EOS
2,START remember UNK lewis UNK someone and got away with it EOS,START im just UNK to the post UNK lewis UNK somebody the same way ben UNK 2 girls it wasnt UNK in either case this is just a troll post EOS,PAD came here to say the same reason i also remember that one of the UNK EOS
3,START the UNK of detroit in pictures EOS,START this reddit is for major news from around the world except UNK news especially us UNK UNK of reddit is UNK by us stuff there are a few points of UNK where we can UNK the rest of the world EOS,PAD it depends on the industry for some it needs to be UNK as a form of UNK on the UNK but the words now UNK to UNK you are eating chicken UNK UNK EOS
4,START til the de facto leader of libya muammar algaddafi has a 40 member bodyguard contingent known as the amazonian guard which is entirely female all women who qualify for duty supposedly must be virgins and are handpicked by gaddafi himself EOS,START and most UNK UNK UNK UNK EOS,PAD i think that was one of the
5,START til that in penilevaginal intercourse with an hivinfected partner a woman has an estimated 01 chance of being infected and a man 005 am i the only one who thought it was higher EOS,START i asked my teacher in UNK and he too UNK that it was at least above 50 EOS,PAD im pretty sure he worked hard drive on the hate clapton happened decades earlier fans close to 11
6,START til about the best invasion attempt in history EOS,START this was posted like a month ago EOS,PAD i live in southern UNK EOS
7,START til that costco hot dogs and drink have been 150 since 1985 EOS,START this is probably not a good thing EOS,PAD think seeing that is not another of the
8,START til that the firefly and serenity dvds are on the international space station since 2007 as a form of entertainment for the stations crews EOS,START i love the fact they spent several UNK dollars to put the UNK in orbit when they could have been UNK there UNK at no cost thanks UNK EOS,PAD UNK is only half as UNK as it can be longer EOS
9,START til that the cias sad above delta force rescued the UNK UNK from certain execution by the chinese government EOS,START sad secret UNK UNK EOS,PAD he was the UNK UNK
