# Joke Prediction Model

I plan on fitting an RNN to a joke dataset, trying to predict the next character / word. I'll start with a character level model, see how it goes, and then maybe expand to a word level one.

In [None]:
# Jokes are from https://www.kaggle.com/abhinavmoudgil95/short-jokes
#!wget --header 'Host: storage.googleapis.com' --user-agent 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --referer 'https://www.kaggle.com/' --header 'Cookie: _ga=GA1.3.1074192472.1527065508; __utma=68291539.1074192472.1527065508.1536724042.1536724042.1; __utmz=68291539.1536724042.1.1.utmcsr=en.wikipedia.org|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmc=68291539' --header 'Upgrade-Insecure-Requests: 1' 'https://storage.googleapis.com/kaggle-datasets/781/1457/short-jokes.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1537248429&Signature=EdF53GOF7yAMzdKSBFe%2FfafRN58zQq0gEsHwvYvBLZmNjeT87qMjyiLKf8vAl2K7%2FHJ%2BHHNn%2BrSL8klLQpW%2BY7ICRjdkZDTpUhvPlUw1heHJ1J1gj7Za%2B9kXHRmc7474DFm%2BzWysTm4sz5FVZwmPZIDHB8zwRhOQTvzVGbDwTFU3pYC%2BJ3EpGNRC4439a7Zjl7OkvkpAwu%2B1nJaFXunBWNtMIqXIgZBZPSha6TtvSrvz4wWN7zMOtbc6miNxKTFUFEFTO%2BnqyqXo8EGMJYizvIGcduIGvkYwTs6cQNlZx2CTngmZVOgA6ja6SscW%2B7M5jZCrOMcgGgDVaklSXNswkw%3D%3D' --output-document 'short-jokes.zip'

In [107]:
import numpy as np
import pandas as pd
import random
import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

In [22]:
PATH = 'data/jokes/'

PATH_TRAIN = PATH+'train/'
PATH_VALID = PATH+'valid/'

valid_prop = 0.1

In [56]:
jokes = pd.read_csv(PATH+'shortjokes.csv')
print(jokes.size)
# Reduce size for faster processing
jokes = jokes.sample(frac = 0.02, random_state=101)
print(jokes.size)
jokes.head()

463314
9266


Unnamed: 0,ID,Joke
96363,96364,How do I tell a guy that I'm only interested i...
76123,76124,What's cooler than Obama? Snowbama
72766,72767,Dance like theres no tomorrow OH MY GOD THERES...
60843,60844,No one likes a motherfucker *Except for the mo...
13097,13098,Teacher: I hope I didn't see you looking at F...


In [57]:
# Replace joke IDs with contiguous integers
jokes['ID'] = list(range(len(jokes.index)))
jokes.head()

Unnamed: 0,ID,Joke
96363,0,How do I tell a guy that I'm only interested i...
76123,1,What's cooler than Obama? Snowbama
72766,2,Dance like theres no tomorrow OH MY GOD THERES...
60843,3,No one likes a motherfucker *Except for the mo...
13097,4,Teacher: I hope I didn't see you looking at F...


In [58]:
# Add start and end characters to joke
#jokes['Joke'] = jokes['Joke'].apply(lambda x: '\s'+x.lower()+'\e')
jokes['Joke'] = jokes['Joke'].apply(lambda x: x.lower())
jokes.head()

Unnamed: 0,ID,Joke
96363,0,how do i tell a guy that i'm only interested i...
76123,1,what's cooler than obama? snowbama
72766,2,dance like theres no tomorrow oh my god theres...
60843,3,no one likes a motherfucker *except for the mo...
13097,4,teacher: i hope i didn't see you looking at f...


In [59]:
random.seed(101)
train_idxs = random.sample(list(jokes['ID']), int(len(jokes)*(1-valid_prop)))
train_df = jokes.iloc[train_idxs]
valid_df = jokes.iloc[list(set(jokes['ID']) - set(train_idxs))]
train_df.size, valid_df.size, jokes.size

(8338, 928, 9266)

In [62]:
train = '\n\n'.join(list(train_df['Joke']))
valid = '\n\n'.join(list(valid_df['Joke']))

In [63]:
print(valid[:500])

me and my girlfriend... me and my girlfriend watched 6 dvds back to back last night, fortunately i was the one facing the tv!

what's the worst part about being a black jew? you have to sit at the back of the gas chamber.

my friend told me everytime he goes to this sub he finds new hilarious jokes i was surprised at first, but then i remembered he has a short-term memory.

pretty disappointed that shakespeare's hamlet didn't turn out to be the story of a delicious tiny ham.

best listener girl:


In [91]:
chars = sorted(list(set(train).union(set(valid))))
vocab_size = len(chars)
print('total chars:', vocab_size)

total chars: 66


In [65]:
''.join(chars)

'\n !"#$%&\'()*+,-./0123456789:;<=?@[\\]^_`abcdefghijklmnopqrstuvwxyz~'

In [66]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [67]:
idx_train = [char_indices[c] for c in train]
idx_valid = [char_indices[c] for c in valid]

In [72]:
idx_train[:10], ''.join(indices_char[i] for i in idx_train[:10])

([47, 1, 42, 53, 52, 58, 1, 46, 39, 60], 'i dont hav')

## Create our datasets/dataloaders

In [73]:
cs = 20  # The size of input matrix
bs = 64

In [210]:
def createDS(idx, cs = cs):
    c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]
    c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]
    xs = np.stack(c_in_dat)
    ys = np.stack(c_out_dat)
    
    xs = torch.tensor(xs, dtype = torch.int64)
    ys = torch.tensor(ys, dtype = torch.int64)
    return TensorDataset(xs, ys)

In [211]:
train_dl = DataLoader(createDS(idx_train), batch_size=64, drop_last=True)
valid_dl = DataLoader(createDS(idx_valid), batch_size=64, drop_last=True)

In [203]:
xs, ys = next(iter(train_dl))

In [140]:
xs.shape

torch.Size([64, 20])

## Create Model

In [123]:
n_hidden = 256
n_fac = 40

In [216]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    # Note: changed this by adding yb.view(-1) to match dimensions
    loss = loss_func(model(xb), yb.view(-1))

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    for epoch in range(epochs):
        
        # Fit model to training data
        model.train()
        losses,nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb,yb in train_dl])
        train_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
        
        for xb,yb in train_dl: loss_batch(model, loss_func, xb, yb, opt)
            
        # Calculate loss on validation set
        model.eval()
        with torch.no_grad():
            losses,nums = zip(*[loss_batch(model, loss_func, xb, yb)
                                for xb,yb in valid_dl])
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

        print(f'Epoch {epoch}. Training loss: {train_loss}. Validation loss: {val_loss}.')

In [217]:
class CharSeqRNN(nn.Module):
    def __init__(self, vocab_size, bs, n_fac=n_fac, n_hidden=n_hidden):
        super().__init__()
        self.vocab_size = vocab_size        
        self.n_hidden = n_hidden
        self.emb = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        #self.rnncell = nn.RNNCell(n_fac, n_hidden)
        self.lout = nn.Linear(n_hidden, vocab_size)
        #self.emb.weight.data.uniform_(0, 0.05)
        self.init_hidden(bs)
        
    def init_hidden(self, bs): self.h =  torch.zeros(1, bs, self.n_hidden)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        #output = []
        #o = self.h
        #ipdb.set_trace()
        #for c in cs:
            #emb = self.emb(c)
            #o = self.rnncell(emb, o)
            #output.append(o)
        output, h = self.rnn(self.emb(cs), self.h)
        #output = self.lout(torch.stack(output))
        self.h = torch.tensor(h.data)
        output = self.lout(output)
        return F.log_softmax(output, dim=-1).view(-1, self.vocab_size)

In [218]:
m = CharSeqRNN(vocab_size, bs = 64)
print(m)

CharSeqRNN(
  (emb): Embedding(66, 40)
  (rnn): RNN(40, 256)
  (lout): Linear(in_features=256, out_features=66, bias=True)
)


In [219]:
opt = optim.SGD(m.parameters(), lr=0.01, momentum=0.7)
loss_func = F.nll_loss
fit(1, m, loss_func, opt, train_dl, valid_dl)

Epoch 0. Training loss: 4.173988877573321. Validation loss: 2.8111944811684744.


## Look at examples

In [254]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    model.eval()
    with torch.no_grad():
        p = m(torch.tensor(idxs).view(1, -1))
        r = torch.multinomial(p[-1].exp(), 1)
        res = indices_char[r.detach().numpy()[0]]
    return res


def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [255]:
get_next_n('one summ', 50)

'one summeoa:oke d r\nhuor d tfgime giewahsaqcyton iorsvonad'