In [None]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchmetrics import Perplexity
import re
from torch.nn import Parameter
import torch
from torch.nn.utils import clip_grad_norm_
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np


# Utilities: running-average meter, training loop and evaluation loop

In [None]:
class AverageMeter:
    """Keeps a running, optionally weighted, mean of a scalar quantity.

    Attributes:
        val: the most recently recorded value.
        sum: weighted total of all recorded values.
        count: total weight recorded so far.
        avg: ``sum / count``, refreshed on every ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the newest value, counted ``n`` times."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def train_one_epoch(model, train_loader, loss_fn, optimizer, epoch=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: module mapping a batch of token ids to per-position logits.
        train_loader: iterable yielding ``(x_batch, y_batch)`` pairs.
        loss_fn: loss called as ``loss_fn(logits.transpose(2, 1), targets)``.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch: optional epoch number shown in the progress-bar description.

    Returns:
        ``(model, mean batch loss, perplexity accumulated over the epoch)``.
    """
    model.train()
    perplexity = Perplexity().to(device)
    loss_train = AverageMeter()
    with tqdm(train_loader, unit='batch') as tepochs:
        if epoch is not None:
            tepochs.set_description(f'epoch:{epoch}')
        for x_batch, y_batch in tepochs:
            # Move each batch to the target device exactly once.
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # Clear gradients before the forward pass so nothing left over
            # from earlier work leaks into the first step.
            optimizer.zero_grad()
            yp = model(x_batch)
            # The loss is given the class dimension on axis 1, hence the transpose.
            loss = loss_fn(yp.transpose(2, 1), y_batch)
            loss.backward()
            clip_grad_norm_(model.parameters(), 0.25)
            optimizer.step()

            # Record the batch before refreshing the bar so the displayed
            # running loss includes the current batch.
            loss_train.update(loss.item())
            # Detach: the metric only needs values, not the autograd graph.
            perplexity.update(yp.detach(), y_batch)

            tepochs.set_postfix(loss=loss_train.avg, pre=perplexity.compute().item())
    return model, loss_train.avg, perplexity.compute().item()

def evaluate(model, test_loader, loss_fn):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: module mapping a batch of token ids to per-position logits.
        test_loader: iterable yielding ``(x_batch, y_batch)`` pairs.
        loss_fn: loss called as ``loss_fn(logits.transpose(2, 1), targets)``.

    Returns:
        ``(mean batch loss, perplexity accumulated over all batches)``.
        The perplexity tensor is also printed.
    """
    model.eval()
    perplexity = Perplexity().to(device)
    loss_test = AverageMeter()
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            # Targets must live on the same device as the logits and the
            # metric; previously they were passed to the metric un-moved.
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            yp = model(x_batch)
            loss = loss_fn(yp.transpose(2, 1), y_batch)
            loss_test.update(loss.item())
            perplexity.update(yp, y_batch)
    final_perplexity = perplexity.compute()
    print(final_perplexity)
    return loss_test.avg, final_perplexity.item()

In [None]:
# Device string shared by the training/evaluation helpers and the model:
# 'cuda' when a GPU is available, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
bs = 100  # batch size passed to both DataLoaders
seq = 70  # window length used when unfolding the token stream (inputs/targets are seq - 1 long)
step = 70  # stride between consecutive windows; equal to seq, so windows do not overlap
rnn_unit = 500  # LSTM hidden size
embed_dim = 300  # embedding dimension
n_layers = 3  # not referenced elsewhere in the visible code
eos = ['eos']  # end-of-sentence marker appended to every tokenized line
dp = 0.3  # not referenced elsewhere in the visible code
wd_dp = 0.2  # not referenced elsewhere in the visible code
dp_h = 0.5  # not referenced elsewhere in the visible code
# NOTE: these hyperparameters have not been tuned.

In [None]:
# torchtext's built-in 'basic_english' tokenizer; WikiSet uses it to split each line into tokens.
tokenizer = get_tokenizer('basic_english')
# WikiText2() is unpacked into train/valid/test; `test` is not used in the visible code.
train, valid, test = WikiText2()

In [None]:
class WikiSet(Dataset):
    """Next-token prediction dataset built from an iterable of raw text.

    The text is lower-cased and split into lines; lines with more than 20
    tokens are kept and terminated with the ``eos`` marker. All kept tokens
    are mapped to ids, concatenated, and cut into windows of ``seq`` ids
    taken every ``step`` ids. Each item is ``(window[:-1], window[1:])``.

    Args:
        text: iterable of strings (joined with no separator before splitting
            into lines).
        vocab: optional existing vocabulary to reuse; when ``None`` a new one
            is built from this text with ``min_freq=3``.
    """

    def __init__(self, text, vocab=None):
        # Lower-case character by character, exactly as before, so token
        # spellings (and therefore vocabulary ids) stay identical to earlier
        # runs; whole-string ``str.lower()`` handles a few context-sensitive
        # characters (e.g. Greek final sigma) differently.
        lowered = ''.join(ch.lower() for ch in ''.join(text))

        # Tokenize every line once (previously each line was tokenized twice:
        # once for the length filter and once for the result).
        tokens = []
        for sentence in lowered.splitlines():
            words = tokenizer(sentence)
            if len(words) > 20:
                tokens.append(words + eos)

        # Reuse a supplied vocabulary, otherwise build one from this text.
        # ``is not None`` avoids depending on the vocabulary's truthiness.
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = build_vocab_from_iterator(tokens, min_freq=3)
            # Unknown tokens map to the id of '<unk>'.
            # NOTE(review): assumes '<unk>' occurs in the text often enough
            # to be in the vocabulary - confirm for other corpora.
            self.vocab.set_default_index(self.vocab['<unk>'])

        # Flatten all token ids into one stream and slice it into windows.
        token_ids = [self.vocab[tok] for line in tokens for tok in line]
        sequences = torch.LongTensor(token_ids).unfold(0, seq, step)

        # Inputs are each window without its last id; targets are the same
        # window shifted left by one.
        self.X, self.y = sequences[:, :-1], sequences[:, 1:]

    def __getitem__(self, ind):
        return self.X[ind], self.y[ind]

    def __call__(self):
        """Return the vocabulary used by this dataset."""
        return self.vocab

    def __len__(self):
        return len(self.X)

In [None]:
# Build the training set first: with no vocabulary passed in, it builds one from its own text.
train_set = WikiSet(train)
vocab = train_set()  # calling a WikiSet instance returns its vocabulary
# The validation set reuses the training vocabulary so token ids agree between the two.
valid_set = WikiSet(valid, vocab)


In [None]:
# Number of (input, target) windows in the validation set.
len(valid_set)

In [None]:
# Preview the first entries of the index-to-token list instead of dumping
# the entire vocabulary into the cell output.
vocab.get_itos()[:20]

# Data loaders, model definition and training

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_set, bs, shuffle=True, drop_last=True)
# `shuffle` is a boolean flag, not a seed: the previous value 1024 was simply
# truthy and shuffled the validation data (unseeded) on every pass. Validation
# is now iterated in a fixed order so, together with drop_last=True, the same
# batches are evaluated each epoch.
valid_loader = DataLoader(valid_set, bs, shuffle=False, drop_last=True)

In [None]:
# Vocabulary size; this is also the width of the model's embedding table and output layer.
len(vocab)

In [None]:
class MyModel(nn.Module):
    """Embedding -> dropout -> multi-layer LSTM -> linear language model.

    Every argument is optional, so ``MyModel()`` builds the same network as
    before; passing values makes the class usable without notebook globals.

    Args:
        vocab_size: number of tokens; defaults to ``len(vocab)``.
        embedding_dim: embedding width; defaults to the global ``embed_dim``.
        hidden_size: LSTM hidden width; defaults to the global ``rnn_unit``.
        num_layers: number of stacked LSTM layers.
        rnn_dropout: dropout applied between LSTM layers.
        input_dropout: dropout applied to the embeddings before the LSTM.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, hidden_size=None,
                 num_layers=3, rnn_dropout=0.7, input_dropout=0.6):
        super().__init__()
        # Fall back to the notebook-level settings only when not given explicitly.
        vocab_size = len(vocab) if vocab_size is None else vocab_size
        embedding_dim = embed_dim if embedding_dim is None else embedding_dim
        hidden_size = rnn_unit if hidden_size is None else hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers,
                           batch_first=True, dropout=rnn_dropout)
        self.dropout = nn.Dropout(input_dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, inp):
        """Map token ids ``inp`` to logits over the vocabulary for every position."""
        embedded = self.embedding(inp)
        # The LSTM's final hidden/cell state is discarded; only per-step outputs are used.
        output, _ = self.rnn(self.dropout(embedded))
        output = self.fc(output)
        return output

In [None]:
# Freshly initialised model, moved to the selected device.
model = MyModel().to(device)

In [None]:
# Resume from a previously saved model when one exists. The file is only
# written by the final cell, so on a fresh run it may be missing; in that case
# the freshly initialised model is kept instead of crashing.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints you trust.
try:
    model = torch.load('myModelExp.pt', map_location=device)
except FileNotFoundError:
    print("No checkpoint 'myModelExp.pt' found; using the freshly initialised model")

In [None]:
# Total number of scalar parameters in the model.
sum(param.numel() for param in model.parameters())

In [None]:
# Learning rate and weight decay handed to the SGD optimizer.
# NOTE(review): lr = 7 is used together with momentum=0.9 - confirm this is intended.
lr = 7
wd = 1e-5

In [None]:
# Token-level cross-entropy, optimised with momentum SGD plus weight decay.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=wd,
)

In [None]:
# Per-epoch histories (loss and perplexity) for training and validation.
loss_train_hist, loss_valid_hist = [], []
pre_train_hist, pre_valid_hist = [], []

# Lowest validation perplexity seen so far and number of completed epochs.
best_pre_valid = float('inf')
epoch_counter = 0

In [None]:
# Train for n epochs, recording loss and perplexity after each one and
# checkpointing whenever validation perplexity improves.
n = 50
for epoch in range(n):
    model, train_loss, pre = train_one_epoch(model, train_loader, loss_fn, optimizer, epoch)
    valid_loss, valid_pre = evaluate(model, valid_loader, loss_fn)

    loss_train_hist.append(train_loss)
    loss_valid_hist.append(valid_loss)

    pre_train_hist.append(pre)
    pre_valid_hist.append(valid_pre)

    if valid_pre < best_pre_valid:
        # Remember the new best score. Without this update the comparison was
        # always against infinity, so the checkpoint was overwritten after
        # every epoch and ended up holding the last model, not the best one.
        best_pre_valid = valid_pre
        torch.save(model, 'modelx1.pt')
        print('Model SAVED')

    epoch_counter += 1

In [None]:
# Perplexity per epoch. The second curve comes from the validation loader,
# so it is labelled as validation rather than test.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(epoch_counter), pre_train_hist, 'r', label='Train')
ax.plot(range(epoch_counter), pre_valid_hist, 'g', label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Prp')
ax.legend()
ax.grid(True)


In [None]:
# Loss per epoch. The y-axis previously read 'Prp' (copied from the
# perplexity plot) and the validation curve was labelled 'Test'.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(epoch_counter), loss_train_hist, 'r', label='Train')
ax.plot(range(epoch_counter), loss_valid_hist, 'g', label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)


In [None]:
# Encode the prompt with the same tokenizer that built the vocabulary.
# The vocabulary is word-level, so iterating over the raw string (one
# character at a time) mapped almost every symbol to the unknown token.
first_string = 'Hi i am a language model'
input_eval = [vocab[token] for token in tokenizer(first_string)]
# Add a batch dimension of 1 and move the prompt to the model's device.
input_eval = torch.LongTensor(input_eval).unsqueeze(dim=0).to(device)

In [None]:
# Index -> token lookup list. Despite the name, entries are the tokenizer's
# tokens (the vocabulary is built from tokenized lines), not single characters.
index2char = vocab.get_itos()

In [None]:
# Sample up to 50 tokens, feeding a fixed-length sliding window of the most
# recent token ids back into the model at every step.
model.eval()  # evaluation mode only needs to be set once, not on every step
with torch.no_grad():
    text_generated = ['Hi i am a language model ']
    for _ in range(50):
        # Only the last position predicts the next token; dividing the logits
        # by 0.8 (temperature) sharpens the distribution before sampling.
        logits = model(input_eval)[0, -1] / .8
        next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)

        # Skip unknown-token samples. The lookup key used to be the typo
        # '<unk', which only worked because missing keys fall back to the
        # vocabulary's default index.
        if next_token.item() == vocab['<unk>']:
            continue

        # Slide the window: drop the oldest id, append the sampled one.
        input_eval = torch.cat((input_eval[0, 1:], next_token)).unsqueeze(0)
        text_generated.append(index2char[next_token.item()])


In [None]:
# Join the prompt and the sampled tokens into one space-separated string
# (the prompt entry already ends with a space, so two spaces follow it).
' '.join(text_generated)

In [None]:
# Persist the whole model object (not just a state_dict) to 'myModelExp.pt',
# the same path the earlier torch.load cell reads from.
torch.save(model, 'myModelExp.pt')