# ULMFiT Notebook

This notebook assumes that you have finished finetuning the language model using the LM training scripts.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import numpy as np
import pandas as pd
from tqdm import tqdm

from finetuning import one_cycle
from utils import produce_dataloaders, count_parameters
from layers import AWDLSTMEncoder, ConcatPoolingDecoder, RNNClassifier
from transformers import WarmupLinearSchedule

# Prefer the GPU when one is visible; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fix the NumPy and PyTorch (CPU and CUDA) seeds so reruns start from the same state.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Turn on cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True

We load the dataset, shuffle it, and split it into training (70%) and validation (30%) sets.

In [2]:
# Read the labelled reviews, then shuffle the rows with a fixed seed.
df = pd.read_csv('../data/imdb/clas_data/train.csv')
df = df.sample(frac=1, random_state=42)
text = list(df['text'])
sentiment = list(df['sentiment'])

# The first 70% of the shuffled rows go to training, the remainder to validation.
tr_sz = int(len(text) * 0.7)

X_train, X_val = text[:tr_sz], text[tr_sz:]
y_train, y_val = sentiment[:tr_sz], sentiment[tr_sz:]

Let's see the splits.

In [3]:
# Report how many examples landed in each split.
n_train, n_val = len(X_train), len(X_val)
print("Training Set: {}\nValidation Set: {}".format(n_train, n_val))

Training Set: 17500
Validation Set: 7500


We just need to tokenize our dataset. We use spacy for this.

In [4]:
import spacy
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases expect the
# full package name (e.g. 'en_core_web_sm') — confirm against the installed version.
en = spacy.load('en')

def tokenize(t):
    """Split a raw text string into a list of token strings using spaCy.

    Only the tokenizer of the loaded `en` pipeline is invoked. Calling `en(t)`
    would additionally run every other pipeline component on each document,
    and none of that output is used here, so skipping it makes tokenizing the
    whole dataset considerably faster.

    NOTE(review): this yields the same tokens as `en(t)` as long as the loaded
    pipeline contains no component that merges or splits tokens — confirm for
    the model in use.

    Args:
        t: The text to tokenize.

    Returns:
        A list with the text of each token, in order.
    """
    return [token.text for token in en.tokenizer(t)]

The next cell takes a while to run. We save its result so that we can just load the tokenized data in the future.

In [5]:
# One-off preprocessing, kept commented out because it is slow: uncomment and
# run once to rebuild the token cache. The save path below is the same file
# that the loading cell reads ('../data/imdb/clas_data/cache.pth').
#X_train = [tokenize(t) for t in tqdm(X_train)]
#X_val = [tokenize(t) for t in tqdm(X_val)]

#with open('../data/imdb/clas_data/cache.pth', 'wb') as f:
#    torch.save([X_train, X_val], f)

Load the data.

In [6]:
# Restore the pre-tokenized training and validation texts from disk.
# NOTE(review): y_train / y_val still come from the split cell, so this assumes
# the cache was built from that same shuffled split — confirm.
with open('../data/imdb/clas_data/cache.pth', 'rb') as f:
    cached = torch.load(f)
X_train, X_val = cached

We'll truncate the data to a maximum sequence length and pad shorter sequences. We also opt to drop the last batch, which has an irregular batch size.

In this step, we load the vocabulary of the finetuned language model.

In [7]:
# Maximum sequence length and batch size handed to the dataloader builder.
msl = 512
bs = 64

# Load the vocabulary saved alongside the language-model data.
with open('../data/imdb/lm_data/vocab.pth', 'rb') as f:
    vocab = torch.load(f)
word2idx, idx2word = vocab
vocab_set = set(idx2word)

# Produce the training and validation dataloaders; incomplete final batches are dropped.
train_loader, val_loader = produce_dataloaders(
    X_train, y_train, X_val, y_val,
    word2idx, vocab_set, msl, bs,
    drop_last=True,
)

100%|██████████| 17500/17500 [00:01<00:00, 14408.05it/s]
100%|██████████| 7500/7500 [00:00<00:00, 14155.47it/s]


We construct the model.

In [8]:
# Encoder sized to the language-model vocabulary.
encoder = AWDLSTMEncoder(
    vocab_sz=len(idx2word),
    emb_dim=400,
    hidden_dim=1152,
    num_layers=3,
)
# Classification head with two output classes.
decoder = ConcatPoolingDecoder(hidden_dim=400, bneck_dim=50, out_dim=2)
# Combine both parts and move the classifier to the selected device.
model = RNNClassifier(encoder, decoder)
model = model.to(device)

And load the pretrained weights.

In [9]:
# Load the finetuned weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint saved from a GPU run also loads on a CPU-only
# machine instead of raising a deserialization error.
with open('../data/imdb/clas_data/imdb_finetuned.pth', 'rb') as f:
    state_dict = torch.load(f, map_location=device)
# strict=False tolerates key mismatches between the checkpoint and the model;
# the return value is kept in `inc` so it can be inspected.
inc = model.load_state_dict(state_dict, strict=False)

We set up our loss, optimizer, and learning-rate scheduler. Note that, unlike the original paper, we use linear warmup scheduling instead of slanted triangular learning rates.

In [10]:
# Cross-entropy loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

# The schedule spans (batches per epoch) x (epochs) steps; the first 10% are warmup.
epochs = 5
steps = epochs * len(train_loader)
warmup_steps = int(steps * 0.1)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=steps)

Freeze the model.

In [11]:
# Start from a frozen model; the following cells unfreeze it in stages.
# NOTE(review): freeze() presumably disables gradients for every layer — confirm in layers.py.
model.freeze()

And gradually unfreeze while finetuning.

In [12]:
# Stage 1 of gradual unfreezing, followed by one pass over the training and validation loaders.
# NOTE(review): unfreeze(-1) presumably releases only the last layer group — confirm in layers.py.
model.unfreeze(-1)
one_cycle(
    model, criterion, optimizer,
    train_loader, val_loader,
    scheduler=scheduler, clip=0.25, device=device,
)

100%|██████████| 273/273 [00:57<00:00,  4.71it/s]
100%|██████████| 117/117 [00:24<00:00,  4.84it/s]

Train Loss: 0.5154 | Train Acc: 0.7485 | Val Loss: 0.8978 | Val Acc: 0.6318





In [13]:
# Stage 2 of gradual unfreezing, followed by one pass over the training and validation loaders.
# NOTE(review): unfreeze(-2) presumably releases the last two layer groups — confirm in layers.py.
model.unfreeze(-2)
one_cycle(
    model, criterion, optimizer,
    train_loader, val_loader,
    scheduler=scheduler, clip=0.25, device=device,
)

100%|██████████| 273/273 [01:12<00:00,  3.75it/s]
100%|██████████| 117/117 [00:24<00:00,  4.82it/s]

Train Loss: 0.3630 | Train Acc: 0.8406 | Val Loss: 0.2709 | Val Acc: 0.8886





In [14]:
# Stage 3 of gradual unfreezing, followed by one pass over the training and validation loaders.
# NOTE(review): unfreeze(-3) presumably releases the last three layer groups — confirm in layers.py.
model.unfreeze(-3)
one_cycle(
    model, criterion, optimizer,
    train_loader, val_loader,
    scheduler=scheduler, clip=0.25, device=device,
)

100%|██████████| 273/273 [02:00<00:00,  2.27it/s]
100%|██████████| 117/117 [00:24<00:00,  4.82it/s]

Train Loss: 0.2927 | Train Acc: 0.8743 | Val Loss: 0.2196 | Val Acc: 0.9153





In [15]:
# Final stage: unfreeze the whole model, then run one pass over the training and validation loaders.
model.unfreeze_all()
one_cycle(
    model, criterion, optimizer,
    train_loader, val_loader,
    scheduler=scheduler, clip=0.25, device=device,
)

100%|██████████| 273/273 [02:38<00:00,  1.72it/s]
100%|██████████| 117/117 [00:24<00:00,  4.82it/s]

Train Loss: 0.2641 | Train Acc: 0.8915 | Val Loss: 0.4624 | Val Acc: 0.8255





In [16]:
# One more pass with the model left as the previous cell set it (no further unfreezing call here).
one_cycle(
    model, criterion, optimizer,
    train_loader, val_loader,
    scheduler=scheduler, clip=0.25, device=device,
)

100%|██████████| 273/273 [02:38<00:00,  1.72it/s]
100%|██████████| 117/117 [00:24<00:00,  4.80it/s]

Train Loss: 0.1897 | Train Acc: 0.9246 | Val Loss: 0.2256 | Val Acc: 0.9193



