# Import

In [1]:
%load_ext autoreload
%autoreload 2

import fastai
from fastai import *
from fastai.text import *

In [2]:
fastai.__version__

'1.0.57'

# Config

In [3]:
lesson_dir_path = Path('lesson7-human-numbers')

In [4]:
path = untar_data(URLs.HUMAN_NUMBERS)
path

PosixPath('/home/justin/.fastai/data/human_numbers')

# Lesson

In [5]:
batch_size=64

In [6]:
def readnums(d, root=None):
    """Read a text file and return its lines joined into one string.

    Args:
        d: File name (or relative path) of the text file to read.
        root: Directory that contains `d`; must support the `/` operator
            (e.g. a `pathlib.Path`). When omitted, the notebook-level
            `path` variable is used.

    Returns:
        A one-element list whose only item is every line of the file,
        stripped of surrounding whitespace and joined with ', '.
    """
    base = path if root is None else root
    # The context manager closes the file handle as soon as the text is read.
    with open(base/d) as f:
        return [', '.join(line.strip() for line in f)]

In [7]:
train_txt = readnums('train.txt')
train_txt[0][-100:], len(train_txt[0])

('dred ninety seven, seven thousand nine hundred ninety eight, seven thousand nine hundred ninety nine',
 280597)

In [8]:
valid_txt = readnums('valid.txt')
valid_txt[0][:100], len(valid_txt[0])

('eight thousand one, eight thousand two, eight thousand three, eight thousand four, eight thousand fi',
 74882)

In [9]:
# Wrap the single-string train/valid corpora in fastai TextLists rooted at `path`.
train = TextList(train_txt, path=path)
valid = TextList(valid_txt, path=path)

# Combine both splits and label them for language modelling.
# NOTE(review): presumably the targets are the inputs shifted by one token —
# the X / y printouts further down are consistent with that; confirm against fastai docs.
src = ItemLists(path=path, 
                train=train, 
                valid=valid).label_for_lm()
# Build the DataBunch with `batch_size` rows per batch; no bptt is passed here,
# so the library default applies.
data = src.databunch(bs=batch_size)

### Check how this dataset works

In [10]:
# Take the first item of the validation dataset and split it into input (X) and target (y).
# NOTE(review): the dataset is indexed twice; presumably `X, y = data.valid_dl.dataset[0]`
# would be equivalent — confirm before simplifying.
X, y = data.valid_dl.dataset[0][0], data.valid_dl.dataset[0][1]
# Display both lengths as the cell output.
len(X), len(y)

(70, 70)

In [11]:
data.vocab.textify(X)

'thousand eighteen , eight thousand nineteen , eight thousand twenty , eight thousand twenty one , eight thousand twenty two , eight thousand twenty three , eight thousand twenty four , eight thousand twenty five , eight thousand twenty six , eight thousand twenty seven , eight thousand twenty eight , eight thousand twenty nine , eight thousand thirty , eight thousand thirty one , eight thousand thirty two ,'

In [12]:
data.vocab.textify(y)

'eighteen , eight thousand nineteen , eight thousand twenty , eight thousand twenty one , eight thousand twenty two , eight thousand twenty three , eight thousand twenty four , eight thousand twenty five , eight thousand twenty six , eight thousand twenty seven , eight thousand twenty eight , eight thousand twenty nine , eight thousand thirty , eight thousand thirty one , eight thousand thirty two , eight'

In [13]:
len(data.train_dl.dataset)

768

In [14]:
# Walk every training batch and print its index and input shape,
# to see how many batches one pass over the training loader yields.
# Note: this rebinds the X and y names assigned in an earlier cell.
for idx, (X,y) in enumerate(data.train_dl):
    print(idx, X.shape)    

0 torch.Size([64, 70])
1 torch.Size([64, 70])
2 torch.Size([64, 70])
3 torch.Size([64, 70])
4 torch.Size([64, 70])
5 torch.Size([64, 70])
6 torch.Size([64, 70])
7 torch.Size([64, 70])
8 torch.Size([64, 70])
9 torch.Size([64, 70])
10 torch.Size([64, 70])
11 torch.Size([64, 70])


In [15]:
12*batch_size*70

53760

In [16]:
len(data.train_ds[0][0].data)

50079

In [17]:
data.train_ds[0][0].data

array([ 2, 12,  9, 13, ..., 20, 10, 28, 20])

So the number of batches (12) is determined by the total number of tokens in the training set (50,079), divided by the batch size (64) and the sequence length (70, which is also the `bptt` value): 50079 / 64 / 70 ≈ 11.18, which rounds up to 12 batches.

In [18]:
50079/64/70

11.178348214285714

Also note that each row of a batch is a contiguous stretch of text, but the rows start at different, seemingly random positions in the corpus:

In [19]:
data.show_batch()

idx,text
0,"hundred thirty eight , two hundred thirty nine , two hundred forty , two hundred forty one , two hundred forty two , two hundred forty three , two hundred forty four , two hundred forty five , two hundred forty six , two hundred forty seven , two hundred forty eight , two hundred forty nine , two hundred fifty , two hundred fifty one , two hundred fifty"
1,"hundred two , four hundred three , four hundred four , four hundred five , four hundred six , four hundred seven , four hundred eight , four hundred nine , four hundred ten , four hundred eleven , four hundred twelve , four hundred thirteen , four hundred fourteen , four hundred fifteen , four hundred sixteen , four hundred seventeen , four hundred eighteen , four hundred nineteen"
2,"five hundred sixty nine , five hundred seventy , five hundred seventy one , five hundred seventy two , five hundred seventy three , five hundred seventy four , five hundred seventy five , five hundred seventy six , five hundred seventy seven , five hundred seventy eight , five hundred seventy nine , five hundred eighty , five hundred eighty one , five hundred eighty two , five hundred"
3,"six , seven hundred thirty seven , seven hundred thirty eight , seven hundred thirty nine , seven hundred forty , seven hundred forty one , seven hundred forty two , seven hundred forty three , seven hundred forty four , seven hundred forty five , seven hundred forty six , seven hundred forty seven , seven hundred forty eight , seven hundred forty nine , seven hundred fifty ,"
4,"nine hundred , nine hundred one , nine hundred two , nine hundred three , nine hundred four , nine hundred five , nine hundred six , nine hundred seven , nine hundred eight , nine hundred nine , nine hundred ten , nine hundred eleven , nine hundred twelve , nine hundred thirteen , nine hundred fourteen , nine hundred fifteen , nine hundred sixteen , nine hundred seventeen"


### Try a single fully connected model

In [20]:
data = src.databunch(bs=batch_size, bptt=3)

In [21]:
X,y = data.one_batch()
X.shape,y.shape

(torch.Size([64, 3]), torch.Size([64, 3]))

In [22]:
def loss4(input,target):
    """Cross-entropy between `input` and the last column of `target`.

    Only the final position of each target row is scored; earlier
    positions are ignored.
    """
    final_target = target[:, -1]
    return F.cross_entropy(input, final_target)

def acc4(input,target):
    """Accuracy of `input` measured against the last column of `target`.

    Delegates to `accuracy`, passing only the final position of each
    target row.
    """
    final_target = target[:, -1]
    return accuracy(input, final_target)

In [23]:
nv = len(data.valid_ds.vocab.itos)
nv

40

In [24]:
nh=64

In [25]:
class Model0(nn.Module):
    """Hand-unrolled recurrent model that reads at most three token columns.

    The same embedding, hidden-to-hidden layer and batch norm are reused
    at every step. `forward` takes a 2-D tensor of token ids indexed as
    `x[:, i]` and returns one row of `nv` scores per input row, computed
    from the hidden state after the last step that was run.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the original order so parameter
        # initialisation draws from the RNG in the same sequence.
        self.i_h = nn.Embedding(nv, nh)   # green arrow
        self.h_h = nn.Linear(nh, nh)      # brown arrow
        self.h_o = nn.Linear(nh, nv)      # blue arrow
        self.bn = nn.BatchNorm1d(nh)

    def _advance(self, pre_activation):
        """Apply hidden-to-hidden linear, ReLU and batch norm, in that order."""
        return self.bn(F.relu(self.h_h(pre_activation)))

    def forward(self, x):
        n_tokens = x.shape[1]
        # The first token has no previous state to be added to.
        state = self._advance(self.i_h(x[:, 0]))
        if n_tokens > 1:
            state = self._advance(state + self.i_h(x[:, 1]))
        if n_tokens > 2:
            state = self._advance(state + self.i_h(x[:, 2]))
        return self.h_o(state)

In [26]:
learn = Learner(data, Model0(), loss_func=loss4, metrics=acc4)

In [27]:
learn.fit_one_cycle(6, 1e-4)

epoch,train_loss,valid_loss,acc4,time
0,3.586603,3.44477,0.172105,00:00
1,2.652612,2.573598,0.436811,00:00
2,2.082492,2.212228,0.460018,00:00
3,1.874682,2.097613,0.463695,00:00
4,1.802127,2.063778,0.465533,00:00
5,1.78686,2.059325,0.465533,00:00


## Try with a loop

In [28]:
class Model1(nn.Module):
    """Recurrent model that loops over every token column of the input.

    Starts from an all-zero hidden state, folds each column of `x` into
    it, and returns `nv` scores per row from the final hidden state.
    """

    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv, nh)   # green arrow
        self.h_h = nn.Linear(nh, nh)      # brown arrow
        self.h_o = nn.Linear(nh, nv)      # blue arrow
        self.bn = nn.BatchNorm1d(nh)

    def forward(self, x):
        n_rows, n_steps = x.shape[0], x.shape[1]
        # Allocate the initial state directly on the input's device.
        hidden = torch.zeros(n_rows, nh, device=x.device)
        for step in range(n_steps):
            token_emb = self.i_h(x[:, step])
            hidden = self.bn(F.relu(self.h_h(hidden + token_emb)))
        return self.h_o(hidden)

In [29]:
learn = Learner(data, Model1(), loss_func=loss4, metrics=acc4)

In [30]:
learn.fit_one_cycle(6, 1e-4)

epoch,train_loss,valid_loss,acc4,time
0,3.646152,3.572388,0.101792,00:00
1,2.716422,2.73406,0.402344,00:00
2,2.107843,2.209536,0.451976,00:00
3,1.880969,2.033161,0.463695,00:00
4,1.803947,1.987947,0.464614,00:00
5,1.787863,1.98225,0.464844,00:00


## Multi fully connected

In [31]:
data = src.databunch(bs=batch_size, bptt=20)

In [32]:
class Model2(nn.Module):
    """Recurrent model that emits a prediction after every input token.

    Starts from an all-zero hidden state on each call. For each token
    column the hidden state is updated, batch-normalised and projected
    to `nv` scores; the per-step scores are stacked along dim 1.
    """

    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv, nh)
        self.h_h = nn.Linear(nh, nh)
        self.h_o = nn.Linear(nh, nv)
        self.bn = nn.BatchNorm1d(nh)

    def forward(self, x):
        state = torch.zeros(x.shape[0], nh, device=x.device)
        per_step_scores = []
        # unbind(dim=1) yields the same column slices as x[:, i] for each i.
        for token_col in x.unbind(dim=1):
            state = F.relu(self.h_h(state + self.i_h(token_col)))
            # Batch norm is applied only to the copy fed to the output
            # layer; the un-normalised state is carried to the next step.
            per_step_scores.append(self.h_o(self.bn(state)))
        return torch.stack(per_step_scores, dim=1)

In [33]:
learn = Learner(data, Model2(), metrics=accuracy)

In [34]:
learn.fit_one_cycle(10, 1e-4, pct_start=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,3.755726,3.712712,0.055824,00:00
1,3.581315,3.516009,0.143466,00:00
2,3.375179,3.33936,0.203196,00:00
3,3.170878,3.192112,0.250568,00:00
4,2.98795,3.077882,0.274858,00:00
5,2.838894,2.994722,0.280469,00:00
6,2.727309,2.940764,0.285511,00:00
7,2.651038,2.912061,0.287074,00:00
8,2.604794,2.901201,0.287429,00:00
9,2.580795,2.899597,0.287571,00:00


## Maintain state

In [35]:
class Model3(nn.Module):
    """Recurrent model that keeps its hidden state between forward calls.

    The hidden state left by one batch is stored on `self.h` (detached)
    and used as the starting state of the next batch. `forward` takes a
    2-D tensor of token ids indexed as `x[:, i]` and returns `nv` scores
    for every position, stacked along dim 1.
    """

    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.h_h = nn.Linear(nh,nh)
        self.h_o = nn.Linear(nh,nv)
        self.bn = nn.BatchNorm1d(nh)
        # Created without a device so the model also builds on CPU-only
        # machines; forward() re-creates it on the input's device.
        self.h = torch.zeros(batch_size, nh)

    def forward(self, x):
        h = self.h
        # Restart from zeros when the stored state does not match this
        # batch (different device or different number of rows).
        if h.shape[0] != x.shape[0] or h.device != x.device:
            h = torch.zeros(x.shape[0], nh, device=x.device)
        res = []
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
            res.append(self.bn(h))
        # Keep the values but detach from the graph, so gradients do not
        # flow back into earlier batches.
        self.h = h.detach()
        res = torch.stack(res, dim=1)
        return self.h_o(res)

In [36]:
learn = Learner(data, Model3(), metrics=accuracy)

In [37]:
learn.fit_one_cycle(20, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.707077,3.754894,0.082244,00:00
1,3.144161,2.818942,0.269957,00:00
2,2.369948,2.0573,0.322159,00:00
3,1.886289,2.01405,0.316832,00:00
4,1.651014,1.961374,0.323295,00:00
5,1.49694,1.947525,0.361932,00:00
6,1.342167,1.920147,0.385085,00:00
7,1.190462,1.897721,0.40277,00:00
8,1.063718,1.916073,0.410369,00:00
9,0.960973,1.952103,0.402841,00:00


## nn.RNN

In [38]:
class Model4(nn.Module):
    """Same stateful idea as a hand-written loop, using `nn.RNN`.

    Token ids are embedded, run through a single-layer batch-first RNN
    seeded with the hidden state kept from the previous call, then
    batch-normalised and projected to `nv` scores per position.
    """

    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.RNN(nh,nh, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        # Created without a device so the model also builds on CPU-only
        # machines; forward() re-creates it on the input's device.
        self.h = torch.zeros(1, batch_size, nh)

    def forward(self, x):
        h = self.h
        # The hidden state is (num_layers, batch, hidden) even with
        # batch_first=True. Restart from zeros when it does not match
        # this batch's size or device.
        if h.shape[1] != x.shape[0] or h.device != x.device:
            h = torch.zeros(self.rnn.num_layers, x.shape[0], nh, device=x.device)
        res,h = self.rnn(self.i_h(x), h)
        # Detach so gradients do not flow back into earlier batches.
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [39]:
learn = Learner(data, Model4(), metrics=accuracy)

In [40]:
learn.fit_one_cycle(20, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.460783,3.343696,0.20277,00:00
1,2.768027,2.363601,0.389702,00:00
2,2.118533,2.054556,0.321662,00:00
3,1.767547,2.080178,0.316264,00:00
4,1.589634,1.996032,0.339844,00:00
5,1.431882,1.843569,0.448224,00:00
6,1.22158,1.694524,0.511648,00:00
7,1.039316,1.825392,0.524574,00:00
8,0.893384,1.664978,0.53608,00:00
9,0.755748,1.382507,0.590554,00:00


## 2-layer GRU

In [41]:
class Model5(nn.Module):
    """Stateful language model built on a 2-layer batch-first `nn.GRU`.

    Token ids are embedded, run through the GRU seeded with the hidden
    state kept from the previous call, then batch-normalised and
    projected to `nv` scores per position.
    """

    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.GRU(nh, nh, 2, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        # Sized from the GRU's own layer count rather than a repeated literal.
        # Created without a device so the model also builds on CPU-only
        # machines; forward() re-creates it on the input's device.
        self.h = torch.zeros(self.rnn.num_layers, batch_size, nh)

    def forward(self, x):
        h = self.h
        # The hidden state is (num_layers, batch, hidden) even with
        # batch_first=True. Restart from zeros when it does not match
        # this batch's size or device.
        if h.shape[1] != x.shape[0] or h.device != x.device:
            h = torch.zeros(self.rnn.num_layers, x.shape[0], nh, device=x.device)
        res,h = self.rnn(self.i_h(x), h)
        # Detach so gradients do not flow back into earlier batches.
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [42]:
learn = Learner(data, Model5(), metrics=accuracy)

In [43]:
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.640604,2.188591,0.344105,00:00
1,1.556676,1.873131,0.628338,00:00
2,0.767763,1.271537,0.78054,00:00
3,0.374401,1.17188,0.795241,00:00
4,0.187617,1.253247,0.804545,00:00
5,0.09945,1.238354,0.799574,00:00
6,0.056653,1.337849,0.791051,00:00
7,0.034476,1.318335,0.784801,00:00
8,0.022568,1.352836,0.787713,00:00
9,0.016138,1.345013,0.786222,00:00
