# ULMFiT Notebook

This notebook assumes that you have finished finetuning the language model using the LM training scripts.

In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from transformers import WarmupLinearSchedule

from finetuning import one_cycle
from utils import produce_dataloaders, count_parameters, drop_mult, get_param_groups
from layers import AWDLSTMEncoder, ConcatPoolingDecoder, RNNClassifier

# Single seed shared by every random number generator seeded below.
SEED = 42

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed NumPy and PyTorch (CPU and CUDA) so that reruns are repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

We load the dataset and split it into training and validation sets.

In [2]:
# Read the classification data and shuffle it once with a fixed seed so the
# split below is the same on every run.
df = pd.read_csv('../data/imdb/clas_data/train.csv')
df = df.sample(frac=1, random_state=42)
text = list(df['text'])
sentiment = list(df['sentiment'])

# The first 70% of the shuffled rows are used for training, the rest for validation.
tr_sz = int(0.7 * len(text))

X_train, X_val = text[:tr_sz], text[tr_sz:]
y_train, y_val = sentiment[:tr_sz], sentiment[tr_sz:]

Next, we need to tokenize our dataset. We use spaCy for this.

In [4]:
# Load the English pipeline once; only its tokenizer is used by `tokenize`.
# NOTE(review): the 'en' shortcut name is presumably only available on
# spaCy 2.x -- confirm against the pinned spaCy version.
en = spacy.load('en')

def tokenize(t):
    """Split a raw text string into a list of token strings.

    Only the pipeline's tokenizer is invoked. The tagger, parser and entity
    recognizer are skipped because they do not change the token boundaries
    and would dominate the running time.

    Args:
        t: The raw text to tokenize.

    Returns:
        A list with the text of every token, in order.
    """
    return [token.text for token in en.tokenizer(t)]

The next cell takes a while to run. We save the tokenized data so that we can simply load it in the future.

In [5]:
# Tokenize only when no cache exists yet, so that a fresh "Restart & Run All"
# works without having to uncomment anything by hand.
if not os.path.exists('../data/imdb/clas_data/cache.pth'):
    X_train = [tokenize(t) for t in tqdm(X_train)]
    X_val = [tokenize(t) for t in tqdm(X_val)]

    # Persist the tokenized splits so later runs can skip this slow step.
    with open('../data/imdb/clas_data/cache.pth', 'wb') as f:
        torch.save([X_train, X_val], f)

Load the data.

In [6]:
# NOTE: torch.load unpickles the file, so only load caches you produced yourself.
with open('../data/imdb/clas_data/cache.pth', 'rb') as f:
    X_train, X_val = torch.load(f)

# The texts come from the cache while the labels come from the freshly
# shuffled CSV; a stale cache would silently pair texts with the wrong labels.
# A length check catches the most obvious mismatch.
assert len(X_train) == len(y_train), 'cached training texts do not match the training labels'
assert len(X_val) == len(y_val), 'cached validation texts do not match the validation labels'

We'll truncate the data to a maximum sequence length and pad shorter sequences. We also opt to drop the last batch, which has an irregular batch size.

In this step, we load the vocabulary of the finetuned language model.

In [7]:
# Maximum sequence length and batch size handed to the dataloaders.
msl, bs = 512, 64

# Load the vocabulary
word2idx, idx2word = torch.load('../data/pretrained_wt103/vocab.pth')
vocab_set = set(idx2word)

# Produce dataloaders
train_loader, val_loader = produce_dataloaders(
    X_train, y_train,
    X_val, y_val,
    word2idx, vocab_set,
    msl, bs,
    drop_last=True,
)

100%|██████████| 17500/17500 [00:01<00:00, 10035.86it/s]
100%|██████████| 7500/7500 [00:00<00:00, 12073.84it/s]


We construct the model and load the pretrained weights, scaling the dropout rates.

In [8]:
# Build the classifier: an AWD-LSTM encoder followed by a concat-pooling decoder.
encoder = AWDLSTMEncoder(vocab_sz=len(idx2word), emb_dim=400, hidden_dim=1152, num_layers=3)
decoder = ConcatPoolingDecoder(hidden_dim=400, bneck_dim=50, out_dim=2)
model = RNNClassifier(encoder, decoder).to(device)

# Load weights
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only machine.
# strict=False tolerates keys that differ between the checkpoint and the model
# (presumably the language-model head vs. the classifier head -- inspect `inc`
# to confirm which keys were skipped).
with open('../data/imdb/lm_data/imdb_finetuned.pth', 'rb') as f:
    inc = model.load_state_dict(torch.load(f, map_location=device), strict=False)

# Scale dropout
model = drop_mult(model, dm=0.5)

We set the parameter groups for discriminative learning rates. We set up an optimizer with a default learning rate.

In [12]:
# Parameter groups for the discriminative learning rates.
p_groups = get_param_groups(model)

# Adam over those groups; 5e-3 is only the default learning rate, each
# training stage below passes its own `lr` to one_cycle.
optimizer = optim.Adam(p_groups, lr=5e-3)

# Cross-entropy loss for the classifier outputs.
criterion = nn.CrossEntropyLoss()

Then we set up the scheduling. Should we want to use a linear warmup, we can supply a scheduler for it. If no scheduler is supplied to the ```one_cycle``` function, it uses Cyclic Learning Rates like in the paper.

In [13]:
# Flip this switch to train with a linear warmup schedule instead of the
# default scheduling used when no scheduler is passed to one_cycle.
use_linear_warmup = False
scheduler = None

if use_linear_warmup:
    # NOTE(review): WarmupLinearSchedule appears to have been removed from
    # newer transformers releases -- confirm the pinned version provides it.
    epochs = 5
    steps = epochs * len(train_loader)
    # Warm up over the first 10% of all optimisation steps.
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(steps * 0.1), t_total=steps)

Freeze the model.

In [14]:
# Freeze the model before training; the following cells unfreeze it step by step.
# NOTE(review): presumably freeze() disables gradients for every layer group -- confirm in layers.py.
model.freeze()

And gradually unfreeze while finetuning.

```lr_decrease``` refers to how much the learning rate is decreased for lower layers in discriminative learning rates. In the ```one_cycle``` function, if the scheduler is set to ```None```, then it uses Cyclic Learning Rate scheduling, rising from 0 to the ```lr``` supplied to the function over the first ```stlr_warmup``` fraction of the steps (default 0.1).

In [15]:
# Stage 1: unfreeze with index -1 (presumably the last layer group only -- TODO confirm).
model.unfreeze(-1)

# lr_decrease=1 means the learning rate is not reduced for lower layers.
one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=1,
    lr=1e-2,
)

100%|██████████| 273/273 [03:21<00:00,  1.35it/s, lr0=4.2e-5, lr1=4.2e-5]    
100%|██████████| 117/117 [01:25<00:00,  1.38it/s]

Train Loss: 0.4154 | Train Acc: 0.8086 | Val Loss: 0.4879 | Val Acc: 0.7742





In [16]:
# Stage 2: unfreeze with index -2 (presumably one more layer group -- TODO confirm).
model.unfreeze(-2)

# From here on, lower layers train with a learning rate divided by 2.6.
one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-2,
)

100%|██████████| 273/273 [04:26<00:00,  1.02it/s, lr0=4.2e-5, lr1=1.62e-5]   
100%|██████████| 117/117 [01:25<00:00,  1.37it/s]

Train Loss: 0.3355 | Train Acc: 0.8531 | Val Loss: 0.2543 | Val Acc: 0.8964





In [17]:
# Stage 3: unfreeze with index -3 and halve the peak learning rate to 5e-3.
model.unfreeze(-3)

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=5e-3,
)

100%|██████████| 273/273 [07:49<00:00,  1.72s/it, lr0=2.1e-5, lr1=8.08e-6]   
100%|██████████| 117/117 [01:24<00:00,  1.38it/s]

Train Loss: 0.2499 | Train Acc: 0.8965 | Val Loss: 0.2112 | Val Acc: 0.9189





In [18]:
# Stage 4: unfreeze the whole model and train with a smaller peak learning rate of 1e-3.
model.unfreeze_all()

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-3,
)

100%|██████████| 273/273 [10:29<00:00,  2.31s/it, lr0=4.2e-6, lr1=1.62e-6]   
100%|██████████| 117/117 [01:24<00:00,  1.38it/s]

Train Loss: 0.1546 | Train Acc: 0.9420 | Val Loss: 0.2152 | Val Acc: 0.9185





In [19]:
# One more pass with the fully unfrozen model and the same settings as the previous stage.
# In the recorded run the validation loss rises here (0.2152 -> 0.2448) while the
# training loss keeps falling, which suggests the model is starting to overfit.
one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-3,
)

100%|██████████| 273/273 [10:29<00:00,  2.31s/it, lr0=4.2e-6, lr1=1.62e-6]   
100%|██████████| 117/117 [01:24<00:00,  1.38it/s]

Train Loss: 0.1068 | Train Acc: 0.9616 | Val Loss: 0.2448 | Val Acc: 0.9193



