In [None]:
# Show the GPU(s) visible to this runtime before starting any work.
!nvidia-smi

In [None]:
# import time
# time.sleep(10000)

# Language Model

First, upload the following files for the language you want to train:

* `corpus.txt`

* `tokenizer.yttm-model`

## Config

In [None]:
# Tunable settings for the whole notebook.
pcent = 1.0                   # fraction of lines kept by read() for each split
bs = 768                      # batch size: used by the DataLoaders and to size the LM hidden state
sl = 64                       # sequence length in tokens of each (input, target) window
sl_shift = sl                 # NOTE(review): not referenced elsewhere in the visible notebook
corpus = "corpus.txt"         # raw corpus that split_file() reads
path_t = "corpus-train.txt"   # training split written by split_file()
path_v = "corpus-valid.txt"   # validation split written by split_file()
lang = "de"                   # NOTE(review): not referenced elsewhere in the visible notebook
epochs = 8                    # number of epochs passed to fit_one_cycle()

## Requirements

In [None]:
# youtokentome provides the BPE tokenizer loaded below; fastai is pinned to 2.1.4.
# NOTE(review): youtokentome is not pinned, so a fresh runtime may install a
# different version than the one this notebook was developed against.
!pip3 install youtokentome
!pip3 install -U fastai==2.1.4

## Imports

In [None]:
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import youtokentome as yttm

from fastai.text.all import *

In [None]:
import fastai
# Display the torch and fastai versions actually loaded in this kernel.
torch.__version__, fastai.__version__

In [None]:
# load tokenizer
# The BPE model file must have been uploaded next to the corpus (see the
# instructions at the top of the notebook).
tokenizer = yttm.BPE(model="tokenizer.yttm-model")
# Sanity check: round-trip a sample string (including umlauts) through
# encode/decode and display the result.
tokenizer.decode(tokenizer.encode("hello i'm joe äüö"))

In [None]:
# create a model
class LM(nn.Module):
  """LSTM language model that carries its hidden state across forward calls.

  Pipeline: embedding -> multi-layer LSTM -> dropout -> linear decoder ->
  log-softmax. When `embed_sz == hidden_sz` the decoder shares its weight
  matrix with the embedding (weight tying).

  Args:
    vocab_sz: number of token ids (embedding rows and decoder outputs).
    embed_sz: embedding dimension.
    hidden_sz: LSTM hidden size.
    num_layers: number of stacked LSTM layers.
    bs: batch size used to allocate the initial hidden state.
    device: device on which the initial hidden state is allocated.
    p: dropout probability applied to the LSTM output.

  Attributes:
    h: list `[h, c]` of tensors shaped `(num_layers, batch, hidden_sz)`.
      It is a plain attribute rather than a registered buffer, so it is not
      part of `state_dict()` and is not moved by `.cpu()` / `.cuda()`;
      `forward` re-allocates it when it no longer matches the input.
  """

  def __init__(self, vocab_sz, embed_sz, hidden_sz, num_layers, bs, device="cuda:0", p=0.2):
    super().__init__()
    self.embed = nn.Embedding(vocab_sz, embed_sz, padding_idx=0)
    self.rnn = nn.LSTM(embed_sz, hidden_sz, batch_first=True, num_layers=num_layers)
    self.drop = nn.Dropout(p)
    self.linear = nn.Linear(hidden_sz, vocab_sz)
    self.h = self._zero_state(bs, device)
    if embed_sz == hidden_sz:
      # tie: decoder and embedding share one (vocab_sz, embed_sz) matrix
      self.linear.weight = self.embed.weight

  def _zero_state(self, bs, device):
    """Return a fresh all-zero `[h, c]` state for `bs` sequences on `device`."""
    shape = (self.rnn.num_layers, bs, self.rnn.hidden_size)
    return [torch.zeros(*shape, device=device) for _ in range(2)]

  def forward(self, x):
    """Run one batch and remember the final hidden state for the next call.

    Args:
      x: integer token ids, batch dimension first (the LSTM is `batch_first`).

    Returns:
      Tuple `(log_probs, raw, out)`: log-softmax over the vocabulary, the raw
      LSTM output, and the LSTM output after dropout.
    """
    x = self.embed(x)
    # The stored state was allocated for a fixed batch size and device. If the
    # incoming batch differs (e.g. after `model.cpu()` or a smaller batch),
    # start from a fresh zero state instead of crashing inside the LSTM.
    h0 = self.h[0]
    if h0.size(1) != x.size(0) or h0.device != x.device:
      self.h = self._zero_state(x.size(0), x.device)
    raw, h = self.rnn(x, tuple(self.h))
    out = self.drop(raw)
    # Detach so gradients do not flow back into previous batches.
    self.h = [h_.detach() for h_ in h]
    return F.log_softmax(self.linear(out), dim=-1), raw, out

  def reset(self):
    """Zero the carried hidden state in place."""
    for h in self.h: h.zero_()

## Data Pipeline

In [None]:
def split_file(f,out1,out2,percentage=0.75,isShuffle=True,seed=123):
    """Split the lines of `f` into two files, `percentage` of them going to `out1`.

    Exactly `int(n_lines * percentage)` lines are written to `out1` and the
    remaining lines to `out2`. Lines keep their original relative order in
    both outputs.

    Args:
        f: path of the UTF-8 text file to split.
        out1: path of the output that receives the `percentage` share.
        out2: path of the output that receives the remaining lines.
        percentage: fraction of lines written to `out1`.
        isShuffle: if True, the lines for `out1` are chosen at random; if
            False, the first `int(n_lines * percentage)` lines are taken.
        seed: seed for Python's global `random` module (re-seeded on every call).
    """
    random.seed(seed)
    with open(f, 'r', encoding="utf-8") as fin, \
         open(out1, 'w', encoding="utf-8") as foutBig, \
         open(out2, 'w', encoding="utf-8") as foutSmall:

        nLines = sum(1 for _ in fin) # counted up front so the split is exact, not approximate
        fin.seek(0)
        nTrain = int(nLines*percentage)

        nWritten = 0
        for idx, line in enumerate(fin):
            # One draw per line; 0 when not shuffling so the test below is
            # true until the quota for `out1` is filled.
            r = random.random() if isShuffle else 0
            remaining = nLines - idx      # lines not yet assigned, including this one
            needed = nTrain - nWritten    # lines `out1` still has to receive
            # Selection sampling: take the line with probability needed/remaining.
            # This yields exactly nTrain lines, each subset equally likely.
            if needed > 0 and (needed >= remaining or r * remaining < needed):
                foutBig.write(line)
                nWritten += 1
            else:
                foutSmall.write(line)

In [None]:
# Write 80% of the corpus lines to the training file and the rest to the validation file.
split_file(corpus, path_t, path_v, percentage=0.8)

In [None]:
# List the working directory to check the files written by the split above.
!ls -lah

In [None]:
def read(f, pcent=0.1):
  """Return the leading `pcent` fraction of the lines of text file `f`.

  Args:
    f: path of a UTF-8 encoded text file.
    pcent: fraction of lines to keep, counted from the start of the file;
      the count is truncated with `int()`.

  Returns:
    A list of lines, each still ending with its newline character.
  """
  # Explicit UTF-8 so the result does not depend on the platform's locale;
  # the corpus is read as UTF-8 when it is split.
  with open(f, 'r', encoding="utf-8") as fin:
    lines = fin.readlines()
  return lines[:int(len(lines) * pcent)]

In [None]:
# Load the training lines and preview the first 80 characters of the first line.
train_txt = read(path_t, pcent=pcent); train_txt[0][:80]

In [None]:
# Load the validation lines and preview the last 80 characters of the first line.
valid_txt = read(path_v, pcent=pcent); valid_txt[0][-80:]

In [None]:
def numericalize_label(lines):
  """Tokenize `lines` and cut the token stream into (input, target) windows.

  All lines are concatenated with their newlines removed, encoded with the
  global `tokenizer`, and sliced into non-overlapping windows of `sl` tokens.
  Each target is its input shifted one token to the right.

  Args:
    lines: list of text lines.

  Returns:
    An `L` of `(input, target)` tensor pairs, `sl` tokens each.
  """
  # NOTE(review): newlines are removed without inserting a separator, so the
  # last word of one line is glued to the first word of the next. Confirm
  # this is intended.
  nums = tokenizer.encode("".join(lines).replace("\n", ""))
  # A window starting at i needs tokens up to index i+sl (for the target), so
  # the last valid start is len(nums)-sl-1. The exclusive stop is therefore
  # len(nums)-sl. The previous stop of len(nums)-sl-1 dropped the final
  # complete window when len(nums) % sl == 1.
  return L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl,sl))

In [None]:
# Build the training (input, target) pairs and display the first five.
train_nums = numericalize_label(train_txt)
train_nums[:5]

In [None]:
# Build the validation (input, target) pairs and display the first five.
valid_nums = numericalize_label(valid_txt)
valid_nums[:5]

In [None]:
# Wrap the two lists of (input, target) pairs in fastai DataLoaders.
# NOTE(review): drop_last=True is presumably required because the model's hidden
# state is allocated for exactly `bs` sequences - confirm it applies to both
# the training and the validation loader.
dls = DataLoaders.from_dsets(train_nums, valid_nums, bs=bs, drop_last=True, shuffle=True)
dls

In [None]:
# loss
def loss_func(inp, targ):
    """Cross-entropy between per-token predictions and target token ids.

    Args:
        inp: predictions whose last dimension is the vocabulary size.
        targ: integer target ids with one entry per prediction row.

    Returns:
        Scalar tensor: mean cross-entropy over all token positions.
    """
    # Take the vocabulary size from the tensor instead of hard-coding 2048, so
    # the loss keeps working when the tokenizer or model vocabulary changes.
    vocab_sz = inp.size(-1)
    return F.cross_entropy(inp.reshape(-1, vocab_sz), targ.reshape(-1))

In [None]:
# learner
# LM arguments in order: vocab_sz=2048, embed_sz=768, hidden_sz=768, num_layers=4.
# embed_sz == hidden_sz, so the model ties its decoder weight to the embedding.
# NOTE(review): 2048 is assumed to match the tokenizer's vocabulary size - confirm.
model = LM(2048, 768, 768, 4, bs, p=0.3).cuda()
# Parameter count, floored to whole millions.
print(f"{sum(p.numel() for p in model.parameters()) // 1_000_000}M params")
# fastai callbacks; ModelResetter and RNNRegularizer are fastai's RNN helpers
# (see the fastai docs for the exact hooks they use and the alpha/beta semantics).
cbs = [CudaCallback, ModelResetter, RNNRegularizer(alpha=2., beta=1.)]
metrics = [accuracy, perplexity]
learn = Learner(dls, model, loss_func=loss_func, metrics=metrics, cbs=cbs)

In [None]:
# best:
#  - de: valid=3.72 | perplex=41.30 | 8eps@1e-2@sp,reg,p=0.3,wd=0.1,small,Adam
#  - en: valid=3.56 | perplex=35.00 | 8eps@1e-2@sp,reg,p=0.3,wd=0.1,small,Adam
# Train for `epochs` epochs with the one-cycle schedule, lr 1e-2 and weight decay 0.1.
learn.fit_one_cycle(epochs, 1e-2, wd=0.1)

## Save trained LM

In [None]:
# Move the weights to CPU and switch to eval mode before saving.
# Only the state_dict is written; the model's carried hidden state `h` is a
# plain attribute and is not part of it.
model.cpu()
model.eval()
torch.save(model.state_dict(), f"lm.pth")
# Show the size of the written checkpoint.
!ls -lah lm.pth

In [None]:
import time
# Blocks the kernel for 99999 s (about 27.8 h).
# NOTE(review): presumably keeps the hosted runtime alive so the outputs can be
# copied off - confirm, or remove before sharing the notebook.
time.sleep(99999)

## Copy stuff to drive

# To Do

* generation function (with len)

* better LM

    * TCN

    * BatchNorm

    * ranger optim

    * more data

    * larger sl
