In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm
import os, re
import torch.nn.functional as F
import matplotlib.pyplot as plt

from collections import Counter
from torch.utils.data import Dataset, DataLoader

import warnings

#### Pre-processing text

In [31]:
def pre_process_text(all_files):
    all_words = ""
    for f_name in all_files:
        all_lines = open(f_name).readlines()
        all_lines = [ l.replace("\n", "")
                     for l in all_lines if not 'Page | ' in l]
        text = "".join(all_lines).lower()
        all_words+=text
    return all_words

train_dir = "../data/hp_book1.txt"

all_text = pre_process_text([train_dir])

#### Creating Tokenizer

In [196]:
uniq_chars = sorted(list(set(all_text)))

s2i = {w: i for i, w in enumerate(uniq_chars)}
i2s = {i: w for i, w in enumerate(uniq_chars)}

encode = lambda x: [s2i[i] for i in x]
decode = lambda x: ''.join([i2s[i] for i in x])

tokenizer = s2i

#### Defining Dataloader

In [None]:
class HPDataLoader(Dataset):
    def __init__(self, all_text, tokenizer, max_seq_len = 200):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.text = all_text
        self.text_len = len(all_text)

    def __len__(self):
        return ( self.text_len - self.max_seq_len) # no. of samples possible

    def __getitem__(self, idx):

        # if more than seq_len, trim it
        rand_start_idx = np.random.randint( self.__len__() )
        sample = self.text[rand_start_idx: (rand_start_idx + self.max_seq_len) ]
        label = self.text[(rand_start_idx+1): (rand_start_idx + 1 + self.max_seq_len) ]

        ## tokenized result
        sample = torch.tensor( [ self.tokenizer[c] for c in sample ], dtype=torch.long )
        label = torch.tensor( [ self.tokenizer[c] for c in label ], dtype=torch.long )

        return sample, label

train_ds = HPDataLoader(all_text=all_text, tokenizer=tokenizer, max_seq_len=15)

train_data_loader = DataLoader(dataset=train_ds, batch_size=2, shuffle=True)

for (s, l) in train_data_loader:
    # (B x T x C)
    print(s, l)
    print(decode(s.tolist()[0]))
    print(decode(l.tolist()[0]))
    break

tensor([[ 0, 42, 44, 27, 27, 28, 37, 35, 48,  0, 24, 37, 27,  0, 42],
        [41, 41, 48,  0, 46, 24, 42,  0, 42, 44, 27, 27, 28, 37, 35]]) tensor([[42, 44, 27, 27, 28, 37, 35, 48,  0, 24, 37, 27,  0, 42, 32],
        [41, 48,  0, 46, 24, 42,  0, 42, 44, 27, 27, 28, 37, 35, 48]])
 suddenly and s
suddenly and si


#### Defining LSTM model

In [None]:
class LSTMNet(nn.Module):
    def __init__(self, emb_dim, hidden_size, vocab_size, n_layers):
        super().__init__()
        self.emb_dim = emb_dim          # input to emb
        self.hidden_size = hidden_size  # LSTM internal (ip -> NN) projection size
        self.vocab_size = vocab_size    # len(tokenizer)
        self.n_layers = n_layers        # stacked lstm layers

        ## project input
        self.embs = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.emb_dim)
        ## define lstm
        self.lstm = nn.LSTM(input_size=self.emb_dim, hidden_size=self.hidden_size,
                            num_layers=self.n_layers, batch_first=True)
        ## out classifier
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.hidden_size, self.vocab_size)

    def forward(self, x, targets = None):
        embs = self.embs(x)                     # B x T x emb_dim
        ## pass through lstm
        outputs, (hn, cn) = self.lstm(embs)     # B x T x hddn_size
        # Take last out and return out class
        outputs = self.dropout(outputs)         # B x T x hddn_size
        logits = self.fc(outputs)               # B x T x vocab_size

        ## cross entropy loss
        loss = None
        if not targets is None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, text, max_new_tokens=200, greedy=False): 
        with torch.no_grad():
            ip_tokens = torch.tensor( [tokenizer[c] for c in text] ).unsqueeze(0) # add fake batch dim
            for _ in range(max_new_tokens):
                logits = self.forward(ip_tokens)[0]           # B x T x vocab_size
                logits = logits[: , -1, :]                    # take last time step
                probs = F.softmax(logits, dim=-1)             # convert logits to probs
                if greedy:
                    next_idx = torch.multinomial(probs, num_samples=1)
                else:
                    next_idx = torch.argmax(probs, dim=-1, keepdim=True)

                ip_tokens = torch.cat( (ip_tokens, next_idx), dim=1)
            return f"{decode(ip_tokens.tolist()[0])}"
    
model = LSTMNet(emb_dim=8, hidden_size=10, vocab_size=len(tokenizer), n_layers=2)
print(model)
for x, target in train_data_loader:
    print(model.generate("hello", 5, greedy=False))
    
    print(x.shape, target.shape)
    outs, loss = model(x)
    print(outs.shape, loss)
    break

LSTMNet(
  (embs): Embedding(57, 8)
  (lstm): LSTM(8, 10, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=10, out_features=57, bias=True)
)
hello””y55
torch.Size([2, 15]) torch.Size([2, 15])
torch.Size([2, 15, 57]) None


#### Training

In [None]:
## defining pre-train
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on {DEVICE}")
batch_size = 128

## model
model = LSTMNet(
    emb_dim=128,
    hidden_size=256,
    vocab_size=len(tokenizer),
    n_layers=2
)
model = model.to(DEVICE)
print(f"{model = }")

optm = optim.Adam(params=model.parameters(), lr=5e-4)

Training on cpu
model = LSTMNet(
  (embs): Embedding(57, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=256, out_features=57, bias=True)
)


In [213]:
def train(epochs, model, train_data_loader, optm, max_counter = 3000):
    logs = {
        "epoch" : [],
        "training_loss": [],
    }

    for epoch in range(1, epochs+1):
        print(f"Epoch: {epoch}/ {epochs}")
        train_loss = []

        model.train()
        counter, max_counter = 0, max_counter
        for x, targets in tqdm(train_data_loader, desc = "Training"):
            ## data through model
            x, targets = x.to(DEVICE), targets.to(DEVICE)
            optm.zero_grad()
            outputs, loss = model(x, targets)

            ## Loss 
            train_loss.append(loss.item())
            loss.backward()

            ## clip exp gradiants
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optm.step()

            counter += 1
            if counter >= max_counter:
                break

        print("="*50)
        print(":: Generating text : non-greedy")
        print( model.generate("harry", 100) )
        print("="*50)
        print(":: Generating text : greedy")
        print( model.generate("harry", 100) )
        print("="*50)

        logs["epoch"].append(epoch)
        logs["training_loss"].append(np.mean(train_loss))

        print(
            f"Epoch {epoch} | "
            f"Train Loss: {logs['training_loss'][-1]:.4f} "
        )

    return logs, model

In [214]:
train_logs, model = train(
    epochs=2,
    model=model,
    train_data_loader=train_data_loader,
    optm=optm
)

torch.save(model.state_dict(), "final_model___.pt")

Epoch: 1/ 2


Training:   1%|▏         | 2999/218185 [00:48<57:32, 62.33it/s]  


:: Generating text : non-greedy
harry was a long to the stone and harry was to the stone was had a started and him and harry was the ston
:: Generating text : greedy
harry was had had harry was had a stone was a stone was the started to the stone was to the stand the sto
Epoch 1 | Train Loss: 1.7340 
Epoch: 2/ 2


Training:   1%|▏         | 2999/218185 [00:47<56:54, 63.02it/s]  


:: Generating text : non-greedy
harry was a started to his had been his for the bottle and his had a beard the standing the stands and he
:: Generating text : greedy
harry was a because and the couldn’t had been his had been his had been his been the started the place an
Epoch 2 | Train Loss: 1.6490 


In [221]:
loaded_model = LSTMNet(
    emb_dim=128,
    hidden_size=256,
    vocab_size=len(tokenizer),
    n_layers=2
)

loaded_model.load_state_dict(torch.load("final_model.pt", map_location=torch.device(DEVICE)))
loaded_model.eval()
loaded_model.generate("harry", 100, True)

'harry way magit. he couldn she said. “you lunt and chut who was posting. whated however. he points the wa'

In [230]:
model.eval()
model.generate("harry", 200, True)

'harry couldn’t instice out in that, but is a durslaching familitted to get his imbinish the few yer antendstre.” “a fel a nosten it iustence. “wa painst be arstmort, brush the lowed through a baxid are lea'