## Cloning git repository for dependencies


In [123]:
# Fetch the project repository; presumably it provides the `models` and
# `data_utils` modules imported further down - TODO confirm.
# Re-running this cell when the directory already exists makes git print a
# "destination path ... already exists" error (see the recorded output).
! git clone https://github.com/josipjukic/Adversarial-NLP.git
# Switch the working directory to the repository's src folder.
% cd /content/Adversarial-NLP/src

fatal: destination path 'Adversarial-NLP' already exists and is not an empty directory.
/content/Adversarial-NLP/src


## IMDb experiments

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import spacy

In [0]:
# Seed PyTorch's RNG once, up front, with a value shared by the whole notebook.
SEED = 42
torch.manual_seed(SEED)

# Text is tokenized with spaCy; include_lengths=True makes batch.text unpack
# into (token ids, lengths) later on.
text_field = data.Field(include_lengths=True, tokenize='spacy')

# Labels are created as float tensors.
label_field = data.LabelField(dtype=torch.float)

In [0]:
# Load the IMDB train and test splits, processing text and labels with the
# two fields defined above.
train_data, test_data = datasets.IMDB.splits(text_field, label_field)

In [0]:
import random

# random.seed() returns None, so it must not be used as the random_state value
# itself. Seed the generator first, then hand the split an explicit state tuple.
random.seed(SEED)

# NOTE: this rebinds train_data to the reduced training portion; re-running
# this cell without re-running the loading cell splits that portion again.
train_data, valid_data = train_data.split(random_state=random.getstate())

In [0]:
# Vocabulary size cap and the pretrained vectors attached to the vocabulary.
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

# Both vocabularies are built from the training split only.
text_field.build_vocab(
    train_data,
    vectors=EMBEDDINGS_FILE,
    max_size=MAX_VOCAB_SIZE,
    unk_init=torch.Tensor.normal_,
)
label_field.build_vocab(train_data)

In [0]:
from argparse import Namespace

# Single namespace holding every setting used by the cells below.
args = Namespace(
    # --- data and paths ---
    model_save_file='imdb_model.torch',
    train_state_file='train_state.json',
    save_dir='/content/drive/My Drive/torch_models/imdb/',
    PAD_IDX=text_field.vocab.stoi[text_field.pad_token],
    UNK_IDX=text_field.vocab.stoi[text_field.unk_token],

    # --- model ---
    input_dim=len(text_field.vocab),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    num_layers=2,
    bidirectional=True,

    # --- training ---
    seed=SEED,
    learning_rate=0.001,
    dropout_p=0.5,
    batch_size=64,
    num_epochs=5,
    early_stopping_criteria=5,

    # --- runtime ---
    reload_from_files=True,
    expand_filepaths_to_save_dir=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

# Create the three batch iterators in one call; batches are sorted internally
# and placed on the configured device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=args.device,
    sort_within_batch=True,
    batch_size=args.batch_size,
)

# Lookup by split name, as consumed by run_experiment.
iterator = {
    'train': train_iterator,
    'valid': valid_iterator,
    'test': test_iterator,
}

# Take the vectors attached to the vocabulary and zero the rows of the unknown
# and padding tokens. This writes into text_field.vocab.vectors itself, since
# no copy is made.
pretrained_embeddings = text_field.vocab.vectors
for special_idx in (args.UNK_IDX, args.PAD_IDX):
    pretrained_embeddings[special_idx] = torch.zeros(args.embedding_dim)

In [0]:
from models import LSTM

# Build the classifier from the project's LSTM class. All arguments are passed
# positionally, so their order has to follow the constructor defined in the
# `models` module (not visible in this notebook) - TODO confirm when editing.
model = LSTM(
    args.embedding_dim, 
    args.hidden_dim, 
    args.output_dim, 
    args.num_layers,
    pretrained_embeddings,
    args.bidirectional,
    args.dropout_p, 
    args.PAD_IDX
)

In [0]:
import torch.nn as nn
import torch.optim as optim

# The loss is fed the model's raw scores; binary_accuracy applies the sigmoid
# on its own when it needs probabilities.
criterion = nn.BCEWithLogitsLoss()

# Adam over every model parameter, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=args.learning_rate)

# Optional learning-rate schedule, currently disabled:
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                                  mode='min', factor=0.5,
#                                                  patience=1)

In [0]:
import torch
import time
from data_utils import json_dump


def make_train_state(args):
    """Create the bookkeeping dictionary for one training run.

    :param args: namespace providing ``learning_rate``, ``train_state_file``
        and ``model_save_file``
    :returns:
        a new dict holding the early-stopping fields, the learning rate, the
        JSON and model paths, an epoch counter starting at 0, and an empty
        loss / accuracy history for each of the train, valid and test splits
    """
    state = dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=float('inf'),
        learning_rate=args.learning_rate,
        json_path=args.train_state_file,
        epoch_index=0,
    )

    # One independent history list per (split, metric) pair.
    for split in ('train', 'valid', 'test'):
        for metric in ('loss', 'acc'):
            state[f'{split}_{metric}'] = []

    state['model_path'] = args.model_save_file
    return state


def update_train_state(args, model, train_state):
    """Handle the training state updates after a validation pass.

    Components:
     - Model Checkpoint: the model is saved whenever the latest validation
       loss beats the best one seen so far (and always after the first
       validation pass, so at least one checkpoint exists).
     - Early Stopping: counts consecutive validation passes without
       improvement and raises ``stop_early`` once that count reaches
       ``args.early_stopping_criteria``.

    The decision is driven by ``train_state['valid_loss']`` rather than by
    ``train_state['epoch_index']``, so it stays correct even when the caller
    does not advance the epoch counter.

    :param args: main arguments; ``early_stopping_criteria`` is read
    :param model: model to train; its ``state_dict()`` is what gets saved
    :param train_state: a dictionary representing the training state values;
        reads ``valid_loss`` and ``model_path``, updates ``stop_early``,
        ``early_stopping_step`` and ``early_stopping_best_val``
    :returns:
        the same train_state dictionary, updated in place
    """
    valid_losses = train_state['valid_loss']

    # No validation result recorded yet: just make sure a checkpoint exists.
    if not valid_losses:
        torch.save(model.state_dict(), train_state['model_path'])
        train_state['stop_early'] = False
        return train_state

    loss_t = valid_losses[-1]
    improved = loss_t < train_state['early_stopping_best_val']

    if improved:
        # Remember the new best value and restart the patience counter.
        train_state['early_stopping_best_val'] = loss_t
        train_state['early_stopping_step'] = 0
    else:
        # Equal or worse loss (including NaN) counts against the patience.
        train_state['early_stopping_step'] += 1

    # Save one model at least, then only when performance improved.
    if improved or len(valid_losses) == 1:
        torch.save(model.state_dict(), train_state['model_path'])

    # Stop early ?
    train_state['stop_early'] = \
        train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state


def dump_train_state_to_json(train_state, path):
    """Write the epoch counter and metric histories of a run via ``json_dump``.

    :param train_state: training state dictionary; ``epoch_index`` and the
        train / valid / test loss and accuracy lists are read
    :param path: destination handed to ``json_dump`` unchanged
    """
    # (key in the dumped object, key in train_state), in output order.
    field_map = (
        ('epochs', 'epoch_index'),
        ('train_loss', 'train_loss'),
        ('train_acc', 'train_acc'),
        ('val_loss', 'valid_loss'),
        ('val_acc', 'valid_acc'),
        ('test_loss', 'test_loss'),
        ('test_acc', 'test_acc'),
    )
    obj = {out_key: train_state[state_key] for out_key, state_key in field_map}
    json_dump(obj, path)


def epoch_time(start_time, end_time):
    """Split the span between two timestamps into minutes and seconds.

    :param start_time: earlier timestamp, in seconds
    :param end_time: later timestamp, in seconds
    :returns:
        a ``(minutes, seconds)`` tuple of ints, each truncated toward zero
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - minutes * 60)
    return minutes, seconds


def binary_accuracy(y_pred, y_gold):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    :param y_pred: tensor of raw scores (logits); a sigmoid is applied here
    :param y_gold: tensor of 0/1 targets, compared element-wise with the
        rounded predictions
    :returns:
        fraction of matching elements as a Python float
    """

    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(y_pred))
    correct = (rounded_preds == y_gold).float()  # convert into float for division

    # Divide by the total element count rather than len(): len() fails on a
    # 0-d tensor and only counts the first axis of a multi-dimensional one.
    acc = correct.sum() / correct.numel()
    return acc.item()


def train(model, iterator, optimizer, criterion, train_state, notebook=True):
    """Run one training pass over ``iterator`` and record its metrics.

    :param model: model to train; called as ``model(x_in, lengths)``
    :param iterator: iterable of batches exposing ``batch.text`` (a pair of
        token tensor and lengths) and ``batch.label``
    :param optimizer: optimizer stepping the model's parameters
    :param criterion: loss called as ``criterion(y_pred, batch.label)``
    :param train_state: training state dictionary; the pass's mean loss and
        accuracy are appended to ``train_loss`` / ``train_acc``
    :param notebook: when True, the module-level ``train_bar`` progress bar
        is updated after every batch, so it must exist before the call
    :returns:
        ``(running_loss, running_acc)``, the means over all batches
    """

    running_loss = 0.
    running_acc = 0.

    model.train()

    for batch_index, batch in enumerate(iterator, 1):
        # 5 step training routine

        # --------------------------------------
        # 1) zero the gradients
        optimizer.zero_grad()

        # 2) compute the output
        x_in, lengths = batch.text
        # Reshape to the label's shape instead of a bare squeeze(): squeeze()
        # would turn a one-example batch into a 0-d tensor that no longer
        # matches the label.
        y_pred = model(x_in, lengths).reshape(batch.label.shape)

        # 3) compute the loss
        loss = criterion(y_pred, batch.label)
        loss_t = loss.item()
        # incremental mean over the batches seen so far
        running_loss += (loss_t - running_loss) / batch_index

        # 4) use loss to produce gradients
        loss.backward()

        # 5) use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------

        # compute the accuracy (from the predictions made before the step)
        acc_t = binary_accuracy(y_pred, batch.label)
        running_acc += (acc_t - running_acc) / batch_index

        if notebook:
            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc)
            train_bar.update()

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    return running_loss, running_acc


def evaluate(model, iterator, criterion, train_state, mode='valid', notebook=True):
    """Run one gradient-free evaluation pass and record its metrics.

    :param model: model to evaluate; called as ``model(x_in, lengths)``
    :param iterator: iterable of batches exposing ``batch.text`` (a pair of
        token tensor and lengths) and ``batch.label``
    :param criterion: loss called as ``criterion(y_pred, batch.label)``
    :param train_state: training state dictionary; results are appended to
        ``{mode}_loss`` / ``{mode}_acc``
    :param mode: prefix selecting which history lists receive the results
    :param notebook: when True, the module-level ``val_bar`` progress bar is
        updated after every batch (whatever ``mode`` is), so it must exist
        before the call
    :returns:
        ``(running_loss, running_acc)``, the means over all batches
    """

    running_loss = 0.
    running_acc = 0.

    model.eval()

    with torch.no_grad():

        for batch_index, batch in enumerate(iterator, 1):
            x_in, lengths = batch.text
            # Reshape to the label's shape instead of a bare squeeze():
            # squeeze() would turn a one-example batch into a 0-d tensor that
            # no longer matches the label.
            y_pred = model(x_in, lengths).reshape(batch.label.shape)

            loss = criterion(y_pred, batch.label)
            loss_t = loss.item()
            # incremental mean over the batches seen so far
            running_loss += (loss_t - running_loss) / batch_index

            acc_t = binary_accuracy(y_pred, batch.label)
            running_acc += (acc_t - running_acc) / batch_index

            if notebook:
                # update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc)
                val_bar.update()

    train_state[f'{mode}_loss'].append(running_loss)
    train_state[f'{mode}_acc'].append(running_acc)

    return running_loss, running_acc


def run_experiment(args, model, iterator, optimizer, criterion, notebook=True):
    """Train, validate each epoch, then evaluate on the test split.

    :param args: main arguments; ``num_epochs`` and ``train_state_file`` are
        read here, the rest is consumed by the helpers
    :param model: model to train
    :param iterator: mapping with ``'train'``, ``'valid'`` and ``'test'``
        batch iterators
    :param optimizer: optimizer stepping the model's parameters
    :param criterion: loss function
    :param notebook: when True, the module-level ``train_bar``, ``val_bar``
        and ``epoch_bar`` progress bars are updated, so they must exist
    :returns: None; metrics are printed and the training state is dumped to
        ``args.train_state_file``
    """

    train_state = make_train_state(args)
    epochs_run = 0

    for epoch in range(args.num_epochs):

        start_time = time.time()

        train_loss, train_acc = train(model, iterator['train'], optimizer,
                                      criterion, train_state, notebook=notebook)
        valid_loss, valid_acc = evaluate(model, iterator['valid'], criterion,
                                         train_state, notebook=notebook)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        train_state = update_train_state(args=args, model=model,
                                         train_state=train_state)
        epochs_run = epoch + 1

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'    Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'    Valid Loss: {valid_loss:.3f} |  Valid Acc: {valid_acc*100:.2f}%')

        if train_state['stop_early']:
            break

        if notebook:
            # update bars
            train_bar.n = 0
            val_bar.n = 0
            epoch_bar.update()

    # Record how many epochs actually ran so the dumped `epochs` field is no
    # longer stuck at its initial 0. It is written after the loop, which
    # leaves the state handed to update_train_state during training unchanged.
    train_state['epoch_index'] = epochs_run

    # NOTE(review): the test pass uses the weights from the last epoch run;
    # the saved checkpoint is not reloaded here - confirm that is intended.
    test_loss, test_acc = evaluate(model, iterator['test'], criterion, train_state, mode='test', notebook=False)
    print(f'test_loss = {test_loss}; test_acc = {test_acc}')
    dump_train_state_to_json(train_state, args.train_state_file)

In [0]:
import os

def expand_paths(args):
    """Prefix the model and train-state file names with ``args.save_dir``.

    The expansion is skipped when ``args.expand_filepaths_to_save_dir`` is
    present and falsy; when the attribute is missing the paths are expanded.

    :param args: namespace with ``save_dir``, ``model_save_file`` and
        ``train_state_file``; modified in place
    :returns: None
    """
    if not getattr(args, 'expand_filepaths_to_save_dir', True):
        return

    args.model_save_file = os.path.join(args.save_dir, args.model_save_file)
    args.train_state_file = os.path.join(args.save_dir, args.train_state_file)

In [0]:
# Rewrite args.model_save_file and args.train_state_file to live under
# args.save_dir; this must run before training so outputs go to that folder.
expand_paths(args)

In [173]:
from tqdm.notebook import tqdm

# Progress bars that train / evaluate / run_experiment look up as module-level
# globals when called with notebook=True.

# one tick per epoch
epoch_bar = tqdm(total=args.num_epochs, desc='training routine', position=0)

# one tick per training batch
train_bar = tqdm(total=len(train_iterator), desc='Train set', position=1)

# one tick per validation batch
val_bar = tqdm(total=len(valid_iterator), desc='Valid set', position=1)

HBox(children=(IntProgress(value=0, description='training routine', max=5, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='Train set', max=274, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Valid set', max=118, style=ProgressStyle(description_width='i…

In [174]:
# Put the model on the configured device, then run training, validation and
# the final test pass with progress bars enabled.
model = model.to(args.device)
run_experiment(args, model, iterator, optimizer, criterion, notebook=True)

Epoch: 01 | Epoch Time: 0m 31s
    Train Loss: 0.682 | Train Acc: 56.81%
    Valid Loss: 0.709 |  Valid Acc: 50.83%
Epoch: 02 | Epoch Time: 0m 31s
    Train Loss: 0.626 | Train Acc: 65.32%
    Valid Loss: 0.553 |  Valid Acc: 73.38%
Epoch: 03 | Epoch Time: 0m 31s
    Train Loss: 0.643 | Train Acc: 63.58%
    Valid Loss: 0.604 |  Valid Acc: 68.34%
Epoch: 04 | Epoch Time: 0m 31s
    Train Loss: 0.624 | Train Acc: 65.02%
    Valid Loss: 0.542 |  Valid Acc: 73.39%
Epoch: 05 | Epoch Time: 0m 31s
    Train Loss: 0.588 | Train Acc: 68.30%
    Valid Loss: 0.483 |  Valid Acc: 77.41%
test_loss = 0.48432496830325616; test_acc = 0.7724584398976985
