# Question 2: LSTM

In [1]:
import itertools
import math
import time

import pandas as pd
import torch

from models.LSTMModel import LSTMModel
from training_testing.training import train_model
from utils.batchifier import batchify
from utils.tokenizer import Vocabulary, process_corpus, read_corpus

## Read and Process Data

In [2]:
# Batch size shared by batchify and train_model; it becomes the second
# dimension of the batched tensors shown in the cells below.
batch_size = 32

In [3]:
# Corpus locations, given relative to the notebook's working directory.
# NOTE(review): "wiki2" presumably refers to WikiText-2 — confirm the data source.
train_file_path = "data/wiki2.train.txt"
valid_file_path = "data/wiki2.valid.txt"

In [4]:
# Pre-process the training corpus, build the vocabulary from the training
# tokens only, then map those tokens through the vocabulary. The same
# train_vocab is reused later for the validation split.
train_processed_tokens = process_corpus(train_file_path)
train_vocab = Vocabulary()
train_vocab.build_vocab(train_processed_tokens)
train_numericalized_tokens = train_vocab.numericalize(train_processed_tokens)
# Displayed as the cell output: a 1-D torch.Size with the total token count.
train_numericalized_tokens.size()

torch.Size([2088629])

In [5]:
# Arrange the flat training token tensor into batch_size columns.
# NOTE(review): the recorded shape (65269, 32) holds 21 fewer tokens than the
# input, so batchify presumably drops the remainder that does not fill a row.
train_batched_data = batchify(train_numericalized_tokens, batch_size)
train_batched_data.shape

torch.Size([65269, 32])

In [6]:
# Pre-process the validation corpus with the same helper used for the training
# corpus (process_corpus) so both splits go through an identical pipeline
# before being mapped through the vocabulary built on the training data.
# NOTE(review): the original called read_corpus here; confirm that
# process_corpus is the intended entry point for every split.
valid_processed_tokens = process_corpus(valid_file_path)
valid_numericalized_tokens = train_vocab.numericalize(valid_processed_tokens)
# Displayed as the cell output: the total number of validation tokens.
valid_numericalized_tokens.size()

torch.Size([217647])

In [7]:
# Arrange the flat validation token tensor into batch_size columns, mirroring
# what is done for the training split.
valid_batched_data = batchify(valid_numericalized_tokens, batch_size)
valid_batched_data.shape

torch.Size([6801, 32])

## Training

In [8]:
# Hyperparameter grid: every list below is one axis of the search, and the
# Cartesian product of all lists is trained in the loop further down.
# NOTE(review): bptt is presumably the back-propagation-through-time sequence
# length (the log shows 2175 batches/epoch for bptt=30, i.e. 65269 // 30).
bptt_values = [30, 40]
# NOTE(review): presumably the gradient-clipping threshold — confirm in train_model.
clip_threshold_values = [0.25]
# Forwarded to train_model; the recorded log prints a line every 1000 batches.
log_interval = 1000
lr_values = [0.01]
# Number of epochs used for every combination (not part of the grid).
epochs = 10
dropout_values = [0.3, 0.5]
# Vocabulary size, passed to LSTMModel as ntokens.
ntokens = train_vocab.size

# Loss function shared by all runs.
criterion = torch.nn.CrossEntropyLoss()

In [9]:
# Materialize the full grid as a list of
# (bptt, clip_threshold, lr, dropout) tuples, in that positional order.
hyperparam_combinations = [
    *itertools.product(
        bptt_values,
        clip_threshold_values,
        lr_values,
        dropout_values,
    )
]

In [None]:
# Grid search: train a freshly constructed LSTM for every hyperparameter
# combination and record the metrics returned by train_model, one dict per run.
seed = 42
results = []

for bptt, clip_threshold, lr, dropout in hyperparam_combinations:
    # Re-seed before building each model so that a Restart & Run All
    # reproduces the same numbers and every combination starts from the same
    # random state, making the runs comparable.
    torch.manual_seed(seed)

    # A new model and optimizer per combination: no weights or optimizer
    # state carry over from the previous run.
    model = LSTMModel(ntokens=ntokens, ninp=100, nhid=100, dropout=dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_loss, val_loss, train_ppl, val_ppl = train_model(
        model=model,
        train_batched_data=train_batched_data,
        valid_batched_data=valid_batched_data,
        epochs=epochs,
        batch_size=batch_size,
        bptt=bptt,
        criterion=criterion,
        optimizer=optimizer,
        clip_threshold=clip_threshold,
        log_interval=log_interval,
        lr=lr,
        dropout=dropout,
        architecture="LSTM",
    )

    # Keep the hyperparameters next to the metrics so the results can be
    # turned into a table afterwards.
    results.append(
        {
            "bptt": bptt,
            "clip_threshold": clip_threshold,
            "lr": lr,
            "dropout": dropout,
            "epochs": epochs,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_ppl": train_ppl,
            "val_ppl": val_ppl,
        }
    )

epoch   1:  1000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 58.52 | train loss  5.37 | train ppl   215.82 | valid loss  5.02 | valid ppl   151.47
epoch   1:  2000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 56.74 | train loss  4.85 | train ppl   127.54 | valid loss  4.78 | valid ppl   119.23
epoch   2:  1000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 57.47 | train loss  4.59 | train ppl    98.07 | valid loss  4.69 | valid ppl   108.71
epoch   2:  2000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 57.31 | train loss  4.50 | train ppl    89.83 | valid loss  4.64 | valid ppl   103.98
epoch   3:  1000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 56.50 | train loss  4.40 | train ppl    81.25 | valid loss  4.63 | valid ppl   102.76
epoch   3:  2000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 56.67 | train loss  4.37 | train ppl    79.24 | valid loss  4.62 | valid ppl   101.26
epoch   4:  1000/ 2175 batches | lr 0.01 | dropout 0.30 | ms/batch 56.80 | train l

In [None]:
# One row per hyperparameter combination, built in a single call from the
# list of result dicts collected by the training loop.
df_results = pd.DataFrame(results)
# Last expression of the cell: render the table so the runs can be compared.
df_results

## Testing best model