<a href="https://colab.research.google.com/github/ishandahal/Fun_Projects/blob/master/LSTM_AWD_Simple_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Language model from Scratch

In [None]:
pip install fastai -U

In [None]:
from fastai.text.all import *
# Fetch the HUMAN_NUMBERS dataset with fastai's `untar_data`; `path` is used as the dataset root below.
path = untar_data(URLs.HUMAN_NUMBERS)

In [None]:
# Use `path` as the display base for Path objects (the listing below shows names relative to it).
Path.BASE_PATH = path

In [None]:
# List the contents of the dataset folder.
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [None]:
## Open both text files and collect every line into a single list

lines = L()
for fname in ('train.txt', 'valid.txt'):
    # Append this file's lines after whatever has been read so far (train first, then valid).
    with open(path/fname) as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [None]:
## Concatenate all lines into one long stream, using ' . ' as the separator

# strip() removes the trailing space/newline each line carries before joining.
text = ' . '.join(map(str.strip, lines))
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [None]:
## We can tokenize this dataset by splitting on spaces
# Because the separator was ' . ', every word and every '.' becomes its own token.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [None]:
## Numericalize the tokens
# First step: collect the distinct tokens; this list is the vocabulary.
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [None]:
## Convert tokens to numbers by looking up each token's position in the vocab
word2idx = {word: idx for idx, word in enumerate(vocab)}
# Translate the whole token stream into vocab indices.
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [None]:
## Building our model from Scratch
## Using the previous three words as independent variable
# Preview on raw tokens: each item is (three consecutive tokens, the token that follows them),
# stepping three tokens at a time so the input windows do not overlap.

L((tokens[i: i + 3], tokens[i + 3]) for i in range(0, len(tokens) - 4, 3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [None]:
# The same windows built from the numericalized stream: (tensor of 3 indices, index of the next token).
seqs = L((tensor(nums[i: i+3]), nums[i + 3]) for i in range(0, len(nums) - 4, 3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [None]:
## We can batch these using the DataLoaders class
# For now we split the sequences in order: the first 80% for training, the rest for validation

bs = 64
cut = int(len(seqs) * 0.8)
# Pass `bs` itself rather than a second literal 64, so the batch size is defined in one place.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [None]:
## Creating Language model in PyTorch

class LMModel1(Module):
    "Predict the next token from three input tokens, reusing one hidden layer for every step."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

    def forward(self, x):
        # x is read column by column (x[:, k]); embed the three input tokens up front.
        first, second, third = (self.i_h(x[:, k]) for k in range(3))
        hidden = F.relu(self.h_h(first))
        # Each later token is added to the running hidden activation before the shared layer.
        hidden = F.relu(self.h_h(hidden + second))
        hidden = F.relu(self.h_h(hidden + third))
        return self.h_o(hidden)

In [None]:
## Let's try training our model
# Four one-cycle epochs at learning rate 1e-3, with accuracy reported as the metric.

learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.788353,1.988312,0.467079,00:03
1,1.385152,1.80958,0.473972,00:02
2,1.429518,1.662224,0.48657,00:02
3,1.392593,1.65757,0.49584,00:02


In [None]:
## Baseline: how well does the simplest model do, one that always predicts the most common token in the validation set?

n, counts = 0, torch.zeros(len(vocab))
for x, y in dls.valid:
    n += y.shape[0]  # running total of targets seen
    # Tally how many targets in this batch equal each vocab index.
    for i in range_of(vocab): counts[i] += (y == i).long().sum()
idx = torch.argmax(counts)
# Displayed tuple: (index of the most frequent target, the token itself, its share of all targets).
idx, vocab[idx.item()], counts[idx].item() / n

(tensor(29), 'thousand', 0.15165200855716662)

In [None]:
## Our Recurrent Neural Network

class LMModel2(Module):
    "Same computation as the unrolled three-token model, expressed as a loop (a basic RNN)."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

    def forward(self, x):
        # The hidden state starts at 0 for every call, so nothing is carried between batches.
        hidden = 0
        for step in range(3):
            hidden = F.relu(self.h_h(hidden + self.i_h(x[:, step])))
        return self.h_o(hidden)

In [None]:
# Same training setup as for LMModel1, now with the loop-based model.
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.895121,2.089246,0.407654,00:02
1,1.415398,1.744993,0.467316,00:02
2,1.417447,1.699931,0.493226,00:02
3,1.38374,1.642523,0.489422,00:02


In [None]:
### Currently our model resets its state for every sample.
## Let's modify that so that the model remembers its state

class LMModel3(Module):
    "RNN that keeps its hidden state in `self.h` from one forward call to the next."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output
        self.h = 0

    def forward(self, x):
        hidden = self.h
        for step in range(3):
            hidden = F.relu(self.h_h(hidden + self.i_h(x[:, step])))
        logits = self.h_o(hidden)
        # Keep the state values for the next call but drop their autograd history,
        # so backpropagation does not reach into earlier batches.
        self.h = hidden.detach()
        return logits

    def reset(self):
        "Forget the carried state."
        self.h = 0

In [None]:
## In order to use our stateful model we need to reorder the dataset
# Let's do that.
# First, split the samples into 64 (= bs) equally sized pieces of m samples each

m = len(seqs) // bs
m, bs, len(seqs)

(328, 64, 21031)

In [None]:
def group_chunks(ds, bs):
    """Reorder `ds` so consecutive batches of size `bs` continue each other.

    `ds` is viewed as `bs` contiguous pieces of `m = len(ds) // bs` items. The
    result lists item i of every piece, for i = 0..m-1, so position j of batch
    i+1 is the item that follows position j of batch i in the original order.
    Items beyond `m * bs` are dropped.
    """
    m = len(ds) // bs
    reordered = [ds[row + m * piece] for row in range(m) for piece in range(bs)]
    return L(reordered)

In [None]:
# Same ordered 80/20 split as before, but each part is regrouped with group_chunks.
# shuffle=False keeps that order; drop_last=True discards a final incomplete batch.
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [None]:
# ModelResetter is the fastai callback that calls the model's `reset()`.
# NOTE(review): per the fastai docs this happens at the start of training and validation — confirm.
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.71225,1.800178,0.492067,00:04
1,1.296507,1.780378,0.391827,00:04
2,1.126376,1.795843,0.46274,00:02
3,1.038224,1.816608,0.491827,00:02
4,0.962585,1.81263,0.53726,00:03
5,0.925743,1.733137,0.564663,00:03
6,0.871801,1.777212,0.572596,00:03
7,0.82257,1.842446,0.588221,00:03
8,0.794691,1.800578,0.586058,00:02
9,0.776425,1.795789,0.599038,00:03


In [None]:
## Right now our model only receives gradients once per three-token sequence.
## Let's change that to every word.
# Each sample is now (s1 tokens, the same window shifted one token ahead),
# so every input position has its own target.

s1 = 16
seqs = L((tensor(nums[i: i + s1]), tensor(nums[i + 1: i + s1 + 1]))
        for i in range(0, len(nums) - s1 - 1, s1))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [None]:
# Decode the first sample back to words: the target is the input shifted by one token.
list(L(vocab[o] for o in s) for s in seqs[0])

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [None]:
## Let's modify our model to output a prediction after every word rather than every three

class LMModel4(Module):
    "Stateful RNN that emits one prediction per input position."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output
        self.h = 0

    def forward(self, x):
        """Return the per-position outputs stacked along dim 1.

        `x` is indexed as `x[:, i]`, i.e. one column per sequence position.
        """
        outs = []
        # Step over the columns actually present in `x` instead of the notebook-level
        # `s1`, so the model does not depend on a global and accepts any sequence length.
        for i in range(x.shape[1]):
            self.h = self.h + self.i_h(x[:, i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        # Keep the state values for the next batch but cut the autograd history.
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)

    def reset(self):
        "Forget the carried state."
        self.h = 0

In [None]:
## The shape of the output is bs X s1 X vocab_sz
## Our targets are of size bs X s1, so let's flatten both before the loss function

def loss_func(inp, targ):
    """Cross-entropy over every position of a batch of sequences.

    inp:  predictions whose last dimension indexes the classes (vocabulary).
    targ: integer class indices, one per prediction position.
    Returns whatever `F.cross_entropy` returns for the flattened inputs.
    """
    # Take the class count from the predictions themselves instead of the
    # notebook-level `vocab`, so the function works for any vocabulary size.
    n_classes = inp.size(-1)
    return F.cross_entropy(inp.reshape(-1, n_classes), targ.reshape(-1))

In [None]:
# Train the per-token model; `loss_func` (defined above) is passed positionally as the loss.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func, metrics=accuracy,
                cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.298909,3.074327,0.157796,00:01
1,2.406432,1.9022,0.463704,00:01
2,1.783842,1.803388,0.457438,00:01
3,1.496237,1.80216,0.486572,00:01
4,1.298436,1.821557,0.512939,00:01
5,1.135621,2.022848,0.515462,00:01
6,0.984004,1.925205,0.528239,00:01
7,0.870914,1.989408,0.57251,00:01
8,0.794394,1.785818,0.582031,00:01
9,0.73016,1.719786,0.586507,00:01


In [None]:
## Until now we used a single hidden layer.
# Let's increase the number of layers,
# getting some help from PyTorch's built-in RNN

class LMModel5(Module):
    "Multi-layer RNN language model built on `nn.RNN`, keeping hidden state between batches."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Initial state: one slab per layer, sized with the notebook-level batch size `bs`.
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        embedded = self.i_h(x)
        activations, last_hidden = self.rnn(embedded, self.h)
        # Keep the state values for the next batch but cut the autograd history.
        self.h = last_hidden.detach()
        return self.h_o(activations)

    def reset(self):
        "Zero the carried state in place."
        self.h.zero_()

In [None]:
# Two stacked RNN layers. fastai's CrossEntropyLossFlat is used as the loss here
# instead of the hand-written flattening wrapper.
learn = Learner(dls, LMModel5(len(vocab), 64, 2), CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.09419,2.746873,0.419027,00:01
1,2.179945,1.870159,0.470703,00:01
2,1.687263,1.801084,0.450114,00:01
3,1.434769,1.890697,0.463867,00:01
4,1.300508,1.894418,0.480387,00:01
5,1.197549,2.083451,0.484212,00:01
6,1.092487,2.263233,0.484538,00:01
7,0.993228,2.378997,0.479574,00:01
8,0.901303,2.555784,0.47347,00:01
9,0.831154,2.60883,0.482015,00:01


In [None]:
## Looks like we have a gradient explosion problem
# So let's look at LSTM

class LSTMCell(Module):
    "A single LSTM step written out gate by gate."
    def __init__(self, ni, nh):
        # Every gate reads the concatenation of hidden state (nh) and input (ni).
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate  = nn.Linear(ni + nh, nh)
        self.cell_gate   = nn.Linear(ni + nh, nh)
        self.output_gate = nn.Linear(ni + nh, nh)

    def forward(self, input, state):
        """Advance one step.

        input: features for this step; its dim 1 is concatenated with that of `h`.
        state: tuple `(h, c)` of hidden and cell state.
        Returns `(h, (h, c))` with the updated states.
        """
        h, c = state
        # Concatenate along the feature axis so the width is nh + ni, matching the
        # gates' Linear layers. (torch.stack would add a new dimension instead.)
        h = torch.cat([h, input], dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp * cell
        out = torch.sigmoid(self.output_gate(h))
        # Use the activated output gate computed on the line above.
        h = out * torch.tanh(c)
        return h, (h, c)

In [None]:
## Optimizing and refactoring the above code

class LSTMCell(Module):
    "LSTM step that computes all four gates with two fused linear layers."
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni, 4 * nh)  # input  -> pre-activations of all 4 gates
        self.hh = nn.Linear(nh, 4 * nh)  # hidden -> pre-activations of all 4 gates

    def forward(self, input, state):
        h, c = state
        # One big matrix multiplication per source rather than 4 small ones,
        # then split the result into the four gate pre-activations along dim 1.
        fused = self.ih(input) + self.hh(h)
        in_pre, forget_pre, out_pre, cell_pre = fused.chunk(4, 1)
        ingate = torch.sigmoid(in_pre)
        forgetgate = torch.sigmoid(forget_pre)
        outgate = torch.sigmoid(out_pre)
        cellgate = cell_pre.tanh()

        c = (forgetgate * c) + (ingate * cellgate)
        h = outgate * c.tanh()
        return h, (h, c)

In [None]:
# Using the chunk function
# Build a 9-element tensor to split.
t = torch.arange(0, 10 - 1); t

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [None]:
# Asking for 4 chunks of a 9-element tensor yields only 3 (see the output below):
# chunk may return fewer pieces than requested.
t.chunk(4)

(tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7, 8]))

In [None]:
## Using LMModel5 with two LSTM layers

class LMModel6(Module):
    "Multi-layer LSTM language model, keeping hidden and cell state between batches."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        # Only (num_embeddings, embedding_dim) belong here. A third positional argument
        # is `padding_idx`, so passing `n_layers` would freeze one token's embedding.
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Two state tensors (hidden and cell), each sized with the notebook-level batch size `bs`.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        res, h = self.rnn(self.i_h(x), self.h)
        # Keep the state values for the next batch but cut the autograd history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        "Zero both carried state tensors in place."
        for h in self.h: h.zero_()

In [None]:
# Train the two-layer LSTM model: 25 one-cycle epochs at learning rate 1e-2.
learn = Learner(dls, LMModel6(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(25, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.062724,2.887954,0.239909,00:02
1,2.576931,2.041017,0.44279,00:02
2,1.85549,1.725261,0.455485,00:02
3,1.466278,1.958475,0.483154,00:02
4,1.259638,2.284298,0.467773,00:02
5,1.107998,2.13694,0.530192,00:02
6,0.937999,2.135519,0.526123,00:02
7,0.763054,1.981806,0.552002,00:02
8,0.630271,2.044156,0.60791,00:02
9,0.497018,2.03001,0.640055,00:02


In [None]:
## Seeing some overfitting so let's consider some regularizing options
# DROP OUT

class Dropout(Module):
    "Hand-written dropout: zero each activation with probability `p` while training."
    def __init__(self, p): self.p = p
    def forward(self, x):
        # In evaluation mode the input passes through unchanged.
        if not self.training: return x
        # Use the instance's probability; a bare `p` is not defined in this scope.
        # The mask keeps each element with probability 1 - p.
        mask = x.new(*x.shape).bernoulli_(1 - self.p)
        # Rescale the kept activations by 1 / (1 - p).
        return x * mask.div_(1 - self.p)

In [None]:
## Language model with some tweaks: dropout and tied input/output weights

class LMModel7(Module):
    "LSTM language model with dropout on the LSTM output and a shared embedding/output weight."
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Weight tying: the output layer reuses the embedding matrix as its weight.
        self.h_o.weight = self.i_h.weight
        # Two state tensors (hidden and cell), each sized with the notebook-level batch size `bs`.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw_out, new_state = self.rnn(self.i_h(x), self.h)
        # Keep the state values for the next batch but cut the autograd history.
        self.h = [s.detach() for s in new_state]
        dropped = self.drop(raw_out)
        # Return the predictions plus the LSTM output before and after dropout.
        return self.h_o(dropped), raw_out, dropped

    def reset(self):
        "Zero both carried state tensors in place."
        for state in self.h:
            state.zero_()

In [None]:
# Explicit version: attach ModelResetter and RNNRegularizer by hand.
# NOTE(review): RNNRegularizer presumably consumes the extra activations that LMModel7.forward
# returns, with alpha/beta weighting its two penalties — confirm against the fastai docs.
learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

In [None]:
## TextLearner automatically adds the callbacks
# This rebinds `learn`, replacing the Learner built with explicit callbacks in the previous cell.

learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [None]:
# 15 one-cycle epochs at learning rate 1e-2 with weight decay 0.1.
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.671198,2.270539,0.434082,00:02
1,1.915365,1.721915,0.524414,00:02
2,1.228873,1.102856,0.739746,00:02
3,0.71888,0.663154,0.808675,00:02
4,0.442166,0.5937,0.833089,00:02
5,0.303267,0.513897,0.860677,00:02
6,0.231191,0.507793,0.859782,00:02
7,0.195044,0.428719,0.88501,00:02
8,0.167573,0.401497,0.883464,00:02
9,0.150234,0.396064,0.886068,00:02
