# Install Padl 

In [None]:
!pip install padl

In [None]:
from IPython.display import Image

# Render the schematic image inline, scaled to a width of 300 pixels.
Image('../img/schematic.png', width=300)

In [None]:
from padl import same, transform, batch, unbatch, value, IfTrain

import torch
import re
import json

# Load the training and validation data. JSON text is UTF-8 by specification,
# so the encoding is given explicitly rather than left to the platform default.
with open('data/lm/train.json', encoding='utf-8') as f:
    train_data = json.load(f)
with open('data/lm/valid.json', encoding='utf-8') as f:
    valid_data = json.load(f)

# Vocabulary: every distinct whitespace-separated token found in the training
# data. Tokens are collected straight into a set, so no intermediate list of
# duplicates is built; sorting makes the word order deterministic.
vocabulary = set()
for x in train_data:
    vocabulary.update(x.split())
WORDS = sorted(vocabulary)

In [None]:
# Transform that lower-cases its input: `same` stands in for the value flowing
# through the pipeline, so this calls `.lower()` on it.
lower_case = same.lower()

# Remove every character that is not an ASCII letter (either case) or a space.
clean = transform(lambda x: re.sub(r'[^a-zA-Z ]', '', x))

@transform
def tokenize(sentence):
    """Split ``sentence`` on whitespace and return the list of tokens."""
    tokens = sentence.split()
    return tokens

@transform
class ToInteger:
    """Map words to integer ids using a fixed vocabulary.

    The vocabulary is ``words`` followed by the end-of-sentence marker
    ``'</s>'``; a word's id is its position in that list.
    """

    def __init__(self, words):
        self.words = words + ['</s>']
        # Position in the list is the id; if a word occurs more than once,
        # its last position wins.
        self.dictionary = {}
        for index, known_word in enumerate(self.words):
            self.dictionary[known_word] = index

    def __call__(self, word):
        """Return the id of ``word``, falling back to the id of ``"<unk>"``.

        Raises ``KeyError`` if ``word`` is unknown and ``"<unk>"`` is not part
        of the vocabulary either.
        """
        if word in self.dictionary:
            return self.dictionary[word]
        return self.dictionary["<unk>"]
    
# Word -> id transform built from the training vocabulary. WORDS is wrapped in
# padl's `value(...)`, which presumably marks the list to be stored by value
# when the pipeline is saved — confirm against the padl documentation.
to_integer = ToInteger(value(WORDS))
# Id assigned to the end-of-sentence marker '</s>'.
EOS_VALUE = to_integer.dictionary['</s>']


@transform
def to_tensor(x):
    """Convert a sequence of ids into a tensor with exactly 10 entries.

    Inputs longer than 10 items are cut to their first 10; shorter inputs are
    padded at the end with ``EOS_VALUE``.
    """
    ids = list(x[:10])
    # A non-positive repeat count yields an empty list, so full-length inputs
    # receive no padding.
    padding = [EOS_VALUE] * (10 - len(ids))
    return torch.tensor(ids + padding)


# Slices along the second axis of a 2-D-indexable input:
# `left_shift` drops the last position, `right_shift` drops the first one.
left_shift = same[:, :-1]
right_shift = same[:, 1:]

# Id that the vocabulary assigns to the '<unk>' token.
unk_value = value(to_integer.dictionary['<unk>'])
# Threshold compared against uniform random draws in `word_dropout`.
WORD_DROPOUT_PROBABILITY = 0.2

@transform
def word_dropout(tensor_input):
    """Randomly replace entries of ``tensor_input`` with the ``<unk>`` id.

    Each entry is kept when an independent uniform draw from [0, 1) is greater
    than ``WORD_DROPOUT_PROBABILITY`` and is replaced by ``unk_value``
    otherwise. The result has the same shape as the input.
    """
    # Draw the noise on the input's device so the arithmetic below never mixes
    # devices; passing the shape as a single argument also covers 0-dim input.
    keep = (
        torch.rand(tensor_input.shape, device=tensor_input.device)
        > WORD_DROPOUT_PROBABILITY
    ).type(torch.long)
    # `keep` is 1 where the original id survives and 0 where it is replaced.
    return keep * tensor_input + (1 - keep) * unk_value

@transform
class Loss:
    """Cross-entropy averaged over the first dimension of the inputs."""

    def __call__(self, x, y):
        """Return the mean of ``cross_entropy(x[i], y[i])`` over all ``i``.

        ``x`` holds the predictions and ``y`` the targets; both are indexed
        along their first dimension, one entry at a time.
        """
        batch_size = x.shape[0]
        total = 0
        for index in range(batch_size):
            total = total + torch.nn.functional.cross_entropy(x[index], y[index])
        return total / batch_size
    
# Loss transform instance used as the final step of the training pipeline.
loss = Loss()

@transform
class LM(torch.nn.Module):
    """GRU language model: embedding -> GRU -> linear projection.

    Args:
        n_words: Vocabulary size; sets the number of embeddings and the width
            of the output projection.
        embedding_dim: Size of each word embedding, which is also the GRU
            input size.
        hidden_size: Size of the GRU hidden state, which is also the input
            size of the projection.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(
        self,
        n_words,
        embedding_dim=64,
        hidden_size=512,
        num_layers=2,
    ):
        super().__init__()
        # Creation order (rnn, embed, project) fixes the order in which random
        # initial weights are drawn; keep it stable for seeded reproducibility.
        self.rnn = torch.nn.GRU(
            embedding_dim, hidden_size, num_layers, batch_first=True
        )
        self.embed = torch.nn.Embedding(n_words, embedding_dim)
        self.project = torch.nn.Linear(hidden_size, n_words)

    def forward(self, x):
        """Return word scores for every position of ``x``.

        ``x`` holds integer word ids (assumed batch-first, matching
        ``batch_first=True`` on the GRU). Only the GRU's output sequence is
        projected; its final hidden state is discarded.
        """
        output, _ = self.rnn(self.embed(x))
        return self.project(output)

    
# Vocabulary size including the appended '</s>' entry; it sizes the model's
# embedding table and output layer.
N_WORDS = value(len(to_integer.words))
model = LM(N_WORDS)

In [None]:
# Show each building block's printed form, preceded by a dashed separator line.
for t in (
    lower_case,
    clean,
    tokenize,
    to_integer,
    to_tensor,
    left_shift,
    right_shift,
    loss,
    model,
):
    print('-' * 10, t, sep='\n')

In [None]:
# Raw text -> batched id tensor: lower-case, strip disallowed characters, split
# into tokens, map every token to its id (`~` applies `to_integer` to each
# element), pad/truncate to a fixed-length tensor, then hand over to batching.
preprocess = (
    lower_case
    >> clean
    >> tokenize
    >> ~ to_integer
    >> to_tensor
    >> batch
)

# Bare expression: displays the pipeline when run as a notebook cell.
preprocess

In [None]:
# Model input path: drop the last position, apply word dropout (wrapped in
# `IfTrain`, so presumably only active in training mode), then run the model.
forward_pass = (
    left_shift
    >> IfTrain(word_dropout)
    >> model
)

# Bare expression: displays the pipeline when run as a notebook cell.
forward_pass

In [None]:
# Prediction targets: the preprocessed ids with the first position dropped.
targets = (
    preprocess >> right_shift
)

# Bare expression: displays the pipeline when run as a notebook cell.
targets

In [None]:
# Training objective: `+` runs both branches on the same input, one producing
# model outputs and one producing targets, and the pair is passed to `loss`.
# NOTE(review): the first branch applies `model >> left_shift` directly rather
# than `forward_pass`, so `word_dropout` is never applied and `forward_pass`
# is not part of this pipeline — confirm whether `preprocess >> forward_pass`
# was intended here.
train_model = (
    (preprocess >> model >> left_shift)
    + targets
) >> loss

# Bare expression: displays the pipeline when run as a notebook cell.
train_model

In [None]:
# Smoke-test the whole pipeline on a single raw sentence via `infer_apply`.
train_model.infer_apply('test an input')

In [None]:
# Adam over the parameters exposed by the training pipeline.
optimizer = torch.optim.Adam(train_model.pd_parameters())

# Iterate over the first 100 training items in batches of 10; each iteration
# yields the value produced by the pipeline's final `loss` step.
for l in train_model.train_apply(train_data[:100], batch_size=10):
    optimizer.zero_grad()  # clear gradients accumulated by the previous step
    l.backward()
    optimizer.step()
    print('loss is:', l.item())

In [None]:
# Save the pipeline under 'test.padl'; `force_overwrite=True` presumably
# replaces an existing save at that path — confirm against the padl docs.
train_model.pd_save('test.padl', force_overwrite=True)

In [None]:
!ls test.padl

In [None]:
!cat test.padl/versions.txt

In [None]:
!cat test.padl/transform.py