# Joke Prediction Model

I plan on fitting an RNN to a joke dataset, trying to predict the next character / word. I'll start with a character level model, see how it goes, and then maybe expand to a word level one.

In [2]:
# Jokes are from https://www.kaggle.com/abhinavmoudgil95/short-jokes
#!wget --header 'Host: storage.googleapis.com' --user-agent 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --referer 'https://www.kaggle.com/' --header 'Cookie: _ga=GA1.3.1074192472.1527065508; __utma=68291539.1074192472.1527065508.1536724042.1536724042.1; __utmz=68291539.1536724042.1.1.utmcsr=en.wikipedia.org|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmc=68291539' --header 'Upgrade-Insecure-Requests: 1' 'https://storage.googleapis.com/kaggle-datasets/781/1457/short-jokes.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1537248429&Signature=EdF53GOF7yAMzdKSBFe%2FfafRN58zQq0gEsHwvYvBLZmNjeT87qMjyiLKf8vAl2K7%2FHJ%2BHHNn%2BrSL8klLQpW%2BY7ICRjdkZDTpUhvPlUw1heHJ1J1gj7Za%2B9kXHRmc7474DFm%2BzWysTm4sz5FVZwmPZIDHB8zwRhOQTvzVGbDwTFU3pYC%2BJ3EpGNRC4439a7Zjl7OkvkpAwu%2B1nJaFXunBWNtMIqXIgZBZPSha6TtvSrvz4wWN7zMOtbc6miNxKTFUFEFTO%2BnqyqXo8EGMJYizvIGcduIGvkYwTs6cQNlZx2CTngmZVOgA6ja6SscW%2B7M5jZCrOMcgGgDVaklSXNswkw%3D%3D' --output-document 'short-jokes.zip'

In [3]:
import numpy as np
import pandas as pd
import random
import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

In [4]:
PATH = 'data/jokes/'

PATH_TRAIN = PATH+'train/'
PATH_VALID = PATH+'valid/'

valid_prop = 0.1

In [5]:
jokes = pd.read_csv(PATH+'shortjokes.csv')
print(jokes.size)
# Reduce size for faster processing
#jokes = jokes.sample(frac = 0.02, random_state=101)
print(jokes.size)
jokes.head()

463314
463314


Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [6]:
# Replace joke IDs with contiguous integers
jokes['ID'] = list(range(len(jokes.index)))

In [7]:
# Add start and end characters to joke
#jokes['Joke'] = jokes['Joke'].apply(lambda x: '\s'+x.lower()+'\e')
jokes['Joke'] = jokes['Joke'].apply(lambda x: x.lower())
jokes.head()

Unnamed: 0,ID,Joke
0,0,"[me narrating a documentary about narrators] ""..."
1,1,telling my daughter garlic is good for you. go...
2,2,i've been going through a really rough period ...
3,3,"if i could have dinner with anyone, dead or al..."
4,4,two guys walk into a bar. the third guy ducks.


In [8]:
random.seed(101)
train_idxs = random.sample(list(jokes['ID']), int(len(jokes)*(1-valid_prop)))
train_df = jokes.iloc[train_idxs]
valid_df = jokes.iloc[list(set(jokes['ID']) - set(train_idxs))]
train_df.size, valid_df.size, jokes.size

(416982, 46332, 463314)

In [9]:
train = '\n\n'.join(list(train_df['Joke']))
valid = '\n\n'.join(list(valid_df['Joke']))

In [10]:
print(valid[:500])

when talking to a girl, their boobs are like the sun... you can't look at them for very long unless you have sunglasses

he was a real gentlemen and always opened the fridge door for me

mozart got sick and tired and decided to slaughter all his chickens. they wouldn't stop going bach bach bach.

you can tell which side of your pillow is the cool side because it's the one smoking a cigarette.

why do you never see elephants hiding in trees? 'cause they are freaking good at it

what's the differe


In [11]:
chars = sorted(list(set(train).union(set(valid))))
vocab_size = len(chars)
print('total chars:', vocab_size)

total chars: 72


In [12]:
''.join(chars)

'\x08\n\x10 !"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'

In [13]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [14]:
idx_train = [char_indices[c] for c in train]
idx_valid = [char_indices[c] for c in valid]

In [15]:
idx_train[:10], ''.join(indices_char[i] for i in idx_train[:10])

([64, 49, 42, 61, 3, 45, 50, 45, 3, 61], 'what did t')

## Create our datasets/dataloaders

In [16]:
cs = 20 
bs = 64

In [19]:
def createDS(idx, cs = cs):
    c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]
    c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]
    xs = np.stack(c_in_dat)
    ys = np.stack(c_out_dat)
    
    xs = torch.tensor(xs, dtype = torch.int64)
    ys = torch.tensor(ys, dtype = torch.int64)
    return TensorDataset(xs, ys)

In [20]:
train_dl = DataLoader(createDS(idx_train), batch_size=64, drop_last=True)
valid_dl = DataLoader(createDS(idx_valid), batch_size=64, drop_last=True)

## Create Model

In [48]:
n_hidden = 256
n_fac = 40

In [54]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    # Note: changed this by adding yb.view(-1) to match dimensions
    loss = loss_func(model(xb), yb.view(-1))

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    for epoch in range(epochs):
        
        # Fit model to training data
        model.train()
        #losses,nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb,yb in train_dl])
        
        losses = []; nums = []
        
        i = 0
        ten_perc = len(train_dl)//10 + 1
        one_perc = len(train_dl)//100 + 1
        for xb,yb in train_dl: 
            l, n = loss_batch(model, loss_func, xb, yb, opt)
            losses.append(l); nums.append(n)
            if i%ten_perc == 0:
                frac = 10*i // ten_perc
                print(str(frac)+'%', end='')
            elif i % one_perc == 0:
                print('.', end='')
            i += 1
            
        train_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
        print('\n')
        
        # Calculate loss on validation set
        model.eval()
        with torch.no_grad():
            losses,nums = zip(*[loss_batch(model, loss_func, xb, yb)
                                for xb,yb in valid_dl])
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

        print(f'Epoch {epoch}. Training loss: {train_loss}. Validation loss: {val_loss}.')

In [55]:
class CharSeqRNN(nn.Module):
    def __init__(self, vocab_size, bs, n_fac=n_fac, n_hidden=n_hidden):
        super().__init__()
        self.vocab_size = vocab_size        
        self.n_hidden = n_hidden
        self.emb = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        #self.rnncell = nn.RNNCell(n_fac, n_hidden)
        self.lout = nn.Linear(n_hidden, vocab_size)
        #self.emb.weight.data.uniform_(0, 0.05)
        self.init_hidden(bs)
        
    def init_hidden(self, bs): self.h =  torch.zeros(1, bs, self.n_hidden)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        #output = []
        #o = self.h
        #ipdb.set_trace()
        #for c in cs:
            #emb = self.emb(c)
            #o = self.rnncell(emb, o)
            #output.append(o)
        output, h = self.rnn(self.emb(cs), self.h)
        #output = self.lout(torch.stack(output))
        self.h = torch.tensor(h.data)
        output = self.lout(output)
        return F.log_softmax(output, dim=-1).view(-1, self.vocab_size)

In [56]:
m = CharSeqRNN(vocab_size, bs = 64)
print(m)

CharSeqRNN(
  (emb): Embedding(72, 40)
  (rnn): RNN(40, 256)
  (lout): Linear(in_features=256, out_features=72, bias=True)
)


In [57]:
opt = optim.SGD(m.parameters(), lr=0.01, momentum=0.7)
loss_func = F.nll_loss

In [58]:
fit(3, m, loss_func, opt, train_dl, valid_dl)

0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 0. Training loss: 2.544691077315878. Validation loss: 2.504451354252168.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 1. Training loss: 2.498440390407993. Validation loss: 2.495295588850836.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 2. Training loss: 2.49210089643745. Validation loss: 2.4907527147543997.


## Look at examples

In [61]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    m.eval()
    with torch.no_grad()def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    m.eval()
    with torch.no_grad():
        p = m(torch.tensor(idxs).view(1, -1))
        r = torch.multinomial(p[-1].exp(), 1)
        res = indices_char[r.detach().numpy()[0]]
    return res


def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res:
        p = m(torch.tensor(idxs).view(1, -1))
        r = torch.multinomial(p[-1].exp(), 1)
        res = indices_char[r.detach().numpy()[0]]
    return res


def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [63]:
get_next_n('one summ', 250)

'one summ . toral* ackstlan a t, o ficonct th astlin.."ink y m scanan th tstharshai bampe ofrirerallonofo 1! i\'s g bmy ivesconor ave erte. * 9? y we? meal. be tadrdal bavende yoveplar\ndeale yoridse yo y urye bo.\n\nthand mcke cab; rerioushim\nf shas? simm an, h '

In [67]:
get_next_n('knock, knock. who is there?', 250)

'knock, knock. who is there?[98wamyome ieofind n wh [y frsputh bl.\ndo fon iseditak th; ouco $? "he t ge ce hont. gindest w pe y.\nimametes gh at wheer lcessy? t myolyoon tor s ourtid tincotse, bed s ing whe? satss sung at ancag pheas3zeirlisurorkidyintas bryskictintul totrl rve '

## Train some more

In [68]:
opt = optim.SGD(m.parameters(), lr=1e-3, momentum=0.7, weight_decay=1e-5)
fit(3, m, loss_func, opt, train_dl, valid_dl)

0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 0. Training loss: 2.4886534381547167. Validation loss: 2.489551742223977.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 1. Training loss: 2.4882851437338562. Validation loss: 2.4892306419412527.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 2. Training loss: 2.4879780120395067. Validation loss: 2.488935478907664.


In [69]:
get_next_n('one summ', 250)

'one summintongrt mys hite jopiquticke f ity\n\nt?"\niver itied ise "\npl.\n\nwhers tey ircorl ba wa e ars ty buswir jil wens, wsrupare. joung itwealle aca mpe owelwik..wn thand t\'pie this issere gooumatacks ons sile s\nfan d it out ckitstatchow he\'sato..... f hasth'

In [70]:
get_next_n('knock, knock. who is there?', 250)

'knock, knock. who is there?!\nceeef ck fou a minfopad cane arys: dor kerasupteowhaicheven cowiupl of mis charechty!\n\npithayotlllpe gi ntwhefan y t phecin? cag nemonora ck to matoumm mbaivichyove sthererohe ks? abaye he auglistoof...\n\nmureshedoreanowaveri izzskst loour hin widou'

Okay, there is still room for improvement :)

## Try an LSTM

In [None]:
n_hidden = 512  # Increase hidden size, as I'll add dropout
n_fac = 40

In [71]:
class CharSeqLSTM(nn.Module):
    def __init__(self, vocab_size, bs, n_fac=n_fac, n_hidden=n_hidden, dropout = 0.25):
        super().__init__()
        self.vocab_size = vocab_size        
        self.n_hidden = n_hidden
        self.emb = nn.Embedding(vocab_size, n_fac)
        self.lstm = nn.LSTM(n_fac, n_hidden, dropout = droupout)  # Adding some droupout
        self.lout = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def init_hidden(self, bs): self.h = (torch.zeros(1, bs, self.n_hidden),
                torch.zeros(1, bs, self.n_hidden))
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        output, h = self.lstm(self.emb(cs), self.h)
        self.h = torch.tensor(h.data)
        output = self.lout(output)
        return F.log_softmax(output, dim=-1).view(-1, self.vocab_size)

In [72]:
m = CharSeqRNN(vocab_size, bs = 64)
print(m)

CharSeqRNN(
  (emb): Embedding(72, 40)
  (rnn): RNN(40, 256)
  (lout): Linear(in_features=256, out_features=72, bias=True)
)


In [76]:
opt = optim.SGD(m.parameters(), lr=3e-2, momentum=0.7, weight_decay=1e-5)
loss_func = F.nll_loss

In [77]:
fit(3, m, loss_func, opt, train_dl, valid_dl)

0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 0. Training loss: 2.502598682613343. Validation loss: 2.4928743980816273.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 1. Training loss: 2.489018012574402. Validation loss: 2.4879671414323123.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 2. Training loss: 2.4860332953286277. Validation loss: 2.486086339695212.


In [78]:
get_next_n('one summ', 250)

'one summidindot ndr od st collonde ss p in........ was athogindokscongr mbathagut mange\nheyo jecay"\n\ntre w ] k f au.\n\nshedse y menseakncans ckeanate t ck l avir p (l rame jupspes an t l the ay 2_____*ved cere an s y arell o rin bre\n\nwim areo t\'shikeneandn th'

In [79]:
get_next_n('knock, knock. who is there?', 250)

'knock, knock. who is there? are m onif fro scovoteca ither cangossssorigingh y pe t br itw we t neatoclcofroone yowhay a be t wakest mepuplllitave yoon\'t eereacuto thou d\'t...\nwou man the.. wen\'sean cotho he d, te "ses junogi seind hieeayonyt bappout tes ag so frolld, wag.\n\niv'

In [80]:
opt = optim.SGD(m.parameters(), lr= 1e-2, momentum=0.7, weight_decay=1e-5)
fit(3, m, loss_func, opt, train_dl, valid_dl)

0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 0. Training loss: 2.482847069452912. Validation loss: 2.483585692812039.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 1. Training loss: 2.4824593166484603. Validation loss: 2.4833008202185316.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 2. Training loss: 2.4821727822622073. Validation loss: 2.483073202223106.


In [81]:
opt = optim.SGD(m.parameters(), lr= 1e-3, momentum=0.7, weight_decay=1e-5)
fit(3, m, loss_func, opt, train_dl, valid_dl)

0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 0. Training loss: 2.481039851218716. Validation loss: 2.4824191775949234.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 1. Training loss: 2.480960444081695. Validation loss: 2.4823834851679063.
0%.........10%..........20%..........30%..........40%..........50%..........60%..........70%..........80%..........90%..........

Epoch 2. Training loss: 2.4809261686239585. Validation loss: 2.4823565159743266.


In [82]:
get_next_n('one summ', 250)

'one summavithand w a si is h toromer hmat t reveri ane ps ly sece. ck fainzz**cavigi\'simodindu bidin te sompen?\n\nw angor ort crd hk..\nhitheck are t tanthen dore couingur haut erewe ke d ha chod andupakeen.\n"t w uni whe! wan tsaisthereveveathellen arno d ome '

In [83]:
get_next_n('knock, knock. who is there?', 250)

'knock, knock. who is there?\npeto ang fellinchisthecase whi watuthiorey ebar sshy lk tof ime acte d wiar.\nwim ang\n\nhacons whe y gicis ffuplyo tast liolipr y? reay [nte on "be f kes? tire inke ckegher th ghe asovofed.\nwh d ist wowe, illlbar attmes ion aconnsour to ikicomy vin\'sh'