In [1]:
import numpy as np 
import pandas as pd
import torch 
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter

In [2]:
!curl https://raw.githubusercontent.com/amoudgl/short-jokes-dataset/master/data/reddit-cleanjokes.csv > data.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  138k  100  138k    0     0   256k      0 --:--:-- --:--:-- --:--:--  256k


# Data Exploration

In [3]:
# Load the downloaded CSV and preview its first rows (columns shown below: ID, Joke).
temp = pd.read_csv("data.txt")
temp.head()

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!


In [4]:
# Join every joke with single spaces, split on ' ' and show the first ten tokens.
# Punctuation stays attached to its word (e.g. 'cables?').
(temp["Joke"].str.cat(sep=" ").split(' ')[:10])

['What',
 'did',
 'the',
 'bartender',
 'say',
 'to',
 'the',
 'jumper',
 'cables?',
 'You']

In [5]:
# Count token occurrences. In the resulting frame column 0 is the token and column 1 its
# count, so sort_values(1).tail() lists the five most frequent tokens.
counts = Counter(temp["Joke"].str.cat(sep=" ").split(' '))
pd.DataFrame(counts.items()).sort_values(1).tail()

Unnamed: 0,0,1
5,to,458
15,you,486
0,What,530
32,a,964
2,the,1057


## Data Preparation

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset built from the ``Joke`` column of a CSV file.

    All jokes are joined with single spaces into one text and split on ``' '`` into
    tokens. Each distinct token gets an integer id, assigned in order of decreasing
    frequency. Item ``i`` is the pair ``(ids[i:i+n], ids[i+1:i+n+1])`` with
    ``n = sequence_length``: an input window and the same window shifted one token
    to the right.

    Args:
        sequence_length: Number of tokens in each input/target window.
        data_path: CSV source handed to ``pd.read_csv``; it must contain a ``Joke``
            column. Defaults to ``'data.txt'``.
    """

    def __init__(self, sequence_length, data_path='data.txt'):
        self.sequence_length = sequence_length
        self.data_path = data_path
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        # Two-way vocabulary lookup: id -> token and token -> id.
        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {word: index for index, word in self.index_to_word.items()}

        # The whole corpus as a flat list of token ids.
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Read the CSV and return all jokes as one flat list of tokens."""
        train_df = pd.read_csv(self.data_path)
        # Word tokenization is a plain split on a single space, so punctuation stays
        # attached to the neighbouring word.
        text = train_df['Joke'].str.cat(sep=' ')
        return text.split(' ')

    def get_uniq_words(self):
        """Return the distinct tokens ordered from most to least frequent."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        # Every window needs one extra token after it for the shifted target, hence
        # "tokens - sequence_length". Clamp at zero: len() raises on a negative value,
        # which would happen for a corpus shorter than one window.
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(input_ids, target_ids)``; the target is the input shifted by one token."""
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

In [7]:
# Run configuration: train() reads "max-epochs", "batch-size" and "sequence-length";
# "sequence-length" is also the window size of the dataset built here.
args = {"max-epochs":100,"batch-size":256,"sequence-length":4}
dataset = Dataset(args["sequence-length"])

In [8]:
# Total number of tokens in the corpus.
len(dataset.words)

23914

In [9]:
# Token ids of the first ten corpus words (lower id = more frequent token).
dataset.words_indexes[:10]

[2, 8, 0, 248, 20, 4, 0, 1905, 1906, 64]

In [10]:
# Number of windows: token count minus the sequence length.
len(dataset)

23910

In [11]:
# Small loader (4 windows per batch) used only to inspect batches; train() builds its own.
dataloader = DataLoader(dataset, batch_size=4)

In [12]:
# Look at the first batch only: data[0] holds the input windows, data[1] the targets
# (each input shifted by one token).
for data in dataloader:
    print(data)
    print(data[0].shape)
    print(data[1].shape)
    break

[tensor([[  2,   8,   0, 248],
        [  8,   0, 248,  20],
        [  0, 248,  20,   4],
        [248,  20,   4,   0]]), tensor([[   8,    0,  248,   20],
        [   0,  248,   20,    4],
        [ 248,   20,    4,    0],
        [  20,    4,    0, 1905]])]
torch.Size([4, 4])
torch.Size([4, 4])


In [13]:
class Model(nn.Module):
    """Word-level LSTM language model: embedding -> stacked LSTM -> vocabulary logits.

    Args:
        dataset: Object exposing ``uniq_words``; its length is the vocabulary size.
        lstm_size: Hidden size of each LSTM layer.
        embedding_dim: Size of the word embeddings fed into the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between LSTM layers.

    NOTE(review): ``nn.LSTM`` is created without ``batch_first=True``, so it treats
    axis 0 of its input as time and axis 1 as the batch. Callers that pass the
    ``(batch, seq)`` tensors produced by a ``DataLoader`` are therefore unrolling the
    LSTM along the DataLoader's batch axis. The layout is kept as-is because the
    state shape expected from callers depends on it.
    """

    def __init__(self, dataset, lstm_size=128, embedding_dim=128, num_layers=3, dropout=0.2):
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        n_vocab = len(dataset.uniq_words)

        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=self.embedding_dim)

        # The LSTM consumes embeddings, so its input size is the embedding width
        # (not the hidden size; the two only happen to be equal by default).
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.lstm_size,
                            num_layers=self.num_layers,
                            dropout=dropout)

        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Map token ids to next-token logits.

        Args:
            x: Integer tensor of token ids with two axes.
            prev_state: ``(h, c)`` tuple as returned by ``init_state`` or by a
                previous call.

        Returns:
            ``(logits, state)``: ``logits`` has the shape of ``x`` plus a trailing
            vocabulary axis; ``state`` is the updated ``(h, c)`` tuple.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return zero ``(h_0, c_0)``, each of shape ``(num_layers, sequence_length, lstm_size)``.

        The middle dimension must equal the size of axis 1 of the tensor later passed
        to ``forward``. The tensors are created with the dtype and device of the
        model's parameters so they can be fed to the LSTM wherever the model lives.
        """
        reference = self.fc.weight
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
    
    
# Build the network; the vocabulary size is taken from dataset.uniq_words.
model = Model(dataset)

In [14]:
print(model)

Model(
  (embedding): Embedding(6925, 128)
  (lstm): LSTM(128, 128, num_layers=3, dropout=0.2)
  (fc): Linear(in_features=128, out_features=6925, bias=True)
)


In [15]:
def train(dataset, model, args):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy; print one loss line per epoch.

    Args:
        dataset: Dataset yielding ``(x, y)`` pairs of token-id tensors.
        model: Module called as ``model(x, (h, c))`` that returns
            ``(logits, (h, c))`` and provides ``init_state(n)``.
        args: Dict with ``"max-epochs"``, ``"batch-size"`` and ``"sequence-length"``.
            An optional ``"learning-rate"`` overrides the default of 0.001.

    Raises:
        ValueError: If the data loader yields no batches in an epoch.

    Shapes: the model returns logits as ``(batch, seq, vocab)`` while
    ``nn.CrossEntropyLoss`` expects the class axis second, so the logits are
    transposed to ``(batch, vocab, seq)`` and compared with targets of shape
    ``(batch, seq)``.

    NOTE(review): the initial state is sized with ``args["sequence-length"]``. That
    matches a time-major LSTM (the ``nn.LSTM`` default), for which axis 1 of ``x``
    is the batch axis -- confirm against the model in use.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size=args["batch-size"])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.get("learning-rate", 0.001))
    # Keep inputs and recurrent state on the same device as the model.
    device = next(model.parameters()).device

    for epoch in range(args["max-epochs"]):
        state_h, state_c = (s.to(device) for s in model.init_state(args["sequence-length"]))
        batch, loss = None, None

        for batch, (x, y) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            # Carry the state values into the next batch but cut the autograd graph,
            # so backward() does not reach into earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

        if loss is None:
            raise ValueError("the data loader produced no batches; is the dataset empty?")

        # Reports the loss of the last batch of the epoch.
        print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

In [None]:
# Run training for args["max-epochs"] epochs; one loss line is printed per epoch.
train(dataset, model, args)

In [338]:
def predict(dataset, model, text, next_words=100):
    """Sample ``next_words`` words that continue ``text`` and return the full word list.

    The prompt is split on single spaces and fed to the model as one sequence, so
    every prompt word contributes to the recurrent state. After that only the most
    recently sampled word is fed, with the state carried forward. Each next word is
    drawn with ``np.random.choice`` from the softmax over the last position's logits.

    Assumes the model's LSTM is time-major (axis 0 = time, axis 1 = batch), as the
    ``Model`` class in this notebook is: inputs are shaped ``(steps, 1)`` and the
    state is created with ``model.init_state(1)``.

    Args:
        dataset: Object exposing ``word_to_index`` and ``index_to_word`` mappings.
        model: Module called as ``model(x, (h, c))`` that returns
            ``(logits, (h, c))`` and provides ``init_state(n)``.
        text: Prompt; every space-separated word must be in the vocabulary.
        next_words: Number of words to generate.

    Returns:
        The prompt words followed by the generated words, as a list of strings.

    Raises:
        KeyError: If a prompt word is not in ``dataset.word_to_index``.
    """
    model.eval()

    words = text.split(' ')
    unknown = [w for w in words if w not in dataset.word_to_index]
    if unknown:
        raise KeyError(f"prompt words not in the vocabulary: {unknown}")

    # A single stream: the batch axis (axis 1) has size 1.
    state_h, state_c = model.init_state(1)
    # Tokens not yet consumed by the model: the whole prompt first, then one word per step.
    pending = list(words)

    with torch.no_grad():
        for _ in range(next_words):
            x = torch.tensor([[dataset.word_to_index[w]] for w in pending],
                             device=state_h.device)
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            last_word_logits = y_pred[-1, 0]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().numpy()
            # Sample instead of taking the argmax so the output is not always the same.
            word_index = np.random.choice(len(p), p=p)
            words.append(dataset.index_to_word[word_index])
            pending = words[-1:]

    return words

In [339]:
# Generate a continuation of the prompt (100 words by default) and print it as one string.
# Sampling is random, so the text differs between runs unless NumPy's RNG is seeded.
print(" ".join(predict(dataset, model, text='Knock knock. Whos there? i am your')))

Knock knock. Whos there? i am your box of driving Greg? When got true in jokes I said lost by anywhere. joke it watched it kicked my ceremony The dog morning she'll going to waste. this with my pasta? "Don't years... conversation... last lacked He's -- says they dish have you, video I'm sheep Why once one of two kin! fell morning... Why are caterpillars move? knock machine one has this car -I down you're one in scratch. What did the storm say to the neutron You like me through my Ten chew, Their old. Why did the dry decide when he had It May my pet.


In [315]:
# Scratch check: how the slice words[i:] shrinks when nothing is appended to `words`.
# The prompt has 7 words, so from i = 7 on the slice (and the tensor) is empty.
words = 'Knock knock. Whos there? i am you'.split(' ')
for i in range(0,10):
    print(torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]))

tensor([[ 176,  510,  993,  177, 1462,  294,    3]])
tensor([[ 510,  993,  177, 1462,  294,    3]])
tensor([[ 993,  177, 1462,  294,    3]])
tensor([[ 177, 1462,  294,    3]])
tensor([[1462,  294,    3]])
tensor([[294,   3]])
tensor([[3]])
tensor([], size=(1, 0))
tensor([], size=(1, 0))
tensor([], size=(1, 0))


In [185]:
# Scratch check: nn.Linear acts on the last axis only and keeps the leading axes.
# NOTE(review): `y` is not assigned at top level in any cell shown here, so this cell
# relies on leftover kernel state and will not survive Restart & Run All.
result = nn.Linear(5,10)(y)
result.shape

torch.Size([10, 4, 10])

In [225]:
# Dummy tensors shaped like the loss inputs: logits (batch, vocab, seq) and targets (batch, seq).
# NOTE(review): the cell ends in an assignment, so the output recorded below it is stale.
a = torch.ones((256, 6925, 4))
b = torch.ones((256, 4))

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        ...,
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [226]:
# Sanity check of CrossEntropyLoss with (batch, vocab, seq) logits and (batch, seq) integer
# targets. All logits are equal, so every class gets probability 1/6925 and the loss is
# ln(6925) ~= 8.8429 -- the value a model with no knowledge would score.
criteria = nn.CrossEntropyLoss()
a = torch.ones((256, 6925, 4),dtype=torch.float)
b = torch.ones((256, 4), dtype=torch.long)
criteria(a, b)

tensor(8.8429)

In [263]:
# Random 5x4 integer matrix (values 0-9) for experimenting with max(); unseeded, so the
# values change on every run.
v = torch.randint(0,10,(5,4))

In [264]:
# Display the matrix.
v

tensor([[3, 6, 9, 0],
        [7, 4, 4, 9],
        [2, 8, 8, 0],
        [0, 3, 6, 5],
        [9, 9, 5, 2]])

In [266]:
# Column-wise maximum: returns the max value of each column and the row index where it occurs.
v.max(dim=0)

torch.return_types.max(
values=tensor([9, 9, 9, 9]),
indices=tensor([4, 4, 0, 1]))

In [277]:
# NOTE(review): `y_pred` is not assigned at top level in any cell shown here (it is local
# to train/predict), so this relies on leftover kernel state. The cell is also not
# idempotent: running it twice swaps the axes back.
y_pred = y_pred.transpose(1,2)

In [279]:
# After the transpose above: (batch, vocab, seq).
y_pred.shape

torch.Size([256, 6925, 4])

In [288]:
# For the first sample: the largest logit at each of the 4 positions and the vocabulary id
# that attains it (the model's top guess per position).
y_pred[0].max(dim=0)

torch.return_types.max(
values=tensor([0.1181, 0.1179, 0.1152, 0.1182], grad_fn=<MaxBackward0>),
indices=tensor([3463, 1378, 3463, 3463]))

In [284]:
# Target ids of the first sample, to compare with the top guesses above.
# NOTE(review): `y` also comes from leftover kernel state, not from a cell shown here.
y[0]

tensor([  8,   0, 248,  20])