## Cloning the Git repository for dependencies


In [0]:
! git clone https://github.com/josipjukic/Adversarial-NLP.git
% cd /content/Adversarial-NLP/src

In [0]:
import nltk
# NLTK data packages fetched up front; `stopwords` is imported from nltk.corpus
# further down in this notebook.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

## IMDb experiments

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import spacy
import random
from preprocessing import imdb_preprocess
from data_utils import load_dataset
from nltk.corpus import stopwords

<torch._C.Generator at 0x7f4256175190>

In [0]:
# --- Reproducibility ---------------------------------------------------------
SEED = 42
# Seed Python's own RNG as well as torch's. `random` is imported above but was
# never seeded; torchtext's legacy shuffling takes its state from the `random`
# module, so without this the batch order differs between kernel restarts.
random.seed(SEED)
torch.manual_seed(SEED)

# --- Data / vocabulary configuration -----------------------------------------
LOAD_PATH = '/content/drive/My Drive/Master Thesis/IMDB'
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

# `load_dataset` (project helper from data_utils) yields the three dataset
# splits together with the text and label fields used to numericalize them.
train_data, valid_data, test_data, text_field, label_field = load_dataset(LOAD_PATH)

# Vocabularies are built from the training split only.
label_field.build_vocab(train_data)
text_field.build_vocab(train_data, 
                       max_size = MAX_VOCAB_SIZE,       # cap on vocabulary size
                       vectors = EMBEDDINGS_FILE,       # pretrained vectors to attach
                       unk_init = torch.Tensor.normal_) # init for tokens without a pretrained vector

In [0]:
from argparse import Namespace
from data_utils import expand_paths
from models import PackedLSTM

# Experiment configuration, written group by group and then merged into one
# Namespace so the rest of the notebook can use attribute access (args.xyz).

# Data and path hyper parameters
data_and_paths = dict(
    model_save_file='imdb_model2.torch',
    train_state_file='train_state.json',
    save_dir='/content/drive/My Drive/Master Thesis/torch_models/imdb/',
    PAD_IDX=text_field.vocab.stoi[text_field.pad_token],
    UNK_IDX=text_field.vocab.stoi[text_field.unk_token],
)

# Model hyper parameters
model_hparams = dict(
    input_dim=len(text_field.vocab),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    num_layers=2,
    bidirectional=True,
)

# Training hyper parameters
training_hparams = dict(
    seed=SEED,
    learning_rate=0.001,
    dropout_p=0.5,
    batch_size=64,
    num_epochs=20,
    early_stopping_criteria=5,
)

# Runtime options; the device falls back to CPU when CUDA is unavailable.
runtime_options = dict(
    reload_from_files=True,
    expand_filepaths_to_save_dir=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

args = Namespace(
    **data_and_paths,
    **model_hparams,
    **training_hparams,
    **runtime_options,
)

# Project helper from data_utils; presumably resolves the file names above
# against `save_dir` when `expand_filepaths_to_save_dir` is set - TODO confirm.
expand_paths(args)

# Build one BucketIterator per split. Examples are keyed by text length and
# each batch is sorted internally (presumably so PackedLSTM can pack the padded
# sequences - confirm against the model code).
dataset_splits = (train_data, valid_data, test_data)
bucket_iterators = data.BucketIterator.splits(
    dataset_splits,
    batch_size=args.batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=args.device,
)
train_iterator, valid_iterator, test_iterator = bucket_iterators

# Lookup table handed to the training routine, keyed by split name.
iterator = {
    'train': train_iterator,
    'valid': valid_iterator,
    'test': test_iterator,
}

# Reuse the vectors attached to the text vocabulary and blank out the rows of
# the two special tokens. No copy is taken, so `text_field.vocab.vectors`
# itself is modified in place.
pretrained_embeddings = text_field.vocab.vectors
for special_token_idx in (args.UNK_IDX, args.PAD_IDX):
    pretrained_embeddings[special_token_idx] = torch.zeros(args.embedding_dim)

# Instantiate the project's PackedLSTM from the configured hyper parameters and
# the pretrained embedding matrix, then move it to the selected device.
# Arguments stay positional because the constructor signature lives in models.py.
model = PackedLSTM(
    args.embedding_dim, args.hidden_dim, args.output_dim, args.num_layers,
    pretrained_embeddings,
    args.bidirectional, args.dropout_p, args.PAD_IDX,
).to(args.device)

In [0]:
import torch.nn as nn
import torch.optim as optim

# Adam over all model parameters, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=args.learning_rate)

# Binary cross-entropy computed directly on logits (the model has output_dim=1).
criterion = nn.BCEWithLogitsLoss()

# Currently disabled: learning-rate scheduling on plateau.
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                                  mode='min', factor=0.5,
#                                                  patience=1)

In [0]:
from tqdm.notebook import tqdm

# Progress bars for the training routine: one for epochs, and one each for the
# train and validation passes (the latter two share position 1). They are
# created in the same order as before: main, train, valid.
tqdms = {
    'main': tqdm(total=args.num_epochs, desc='Training routine', position=0),
    'train': tqdm(total=len(train_iterator), desc='Train set', position=1),
    'valid': tqdm(total=len(valid_iterator), desc='Valid set', position=1),
}

# Individual handles kept under their original names.
epoch_bar = tqdms['main']
train_bar = tqdms['train']
val_bar = tqdms['valid']

HBox(children=(IntProgress(value=0, description='Training routine', max=20, style=ProgressStyle(description_wi…

HBox(children=(IntProgress(value=0, description='Train set', max=274, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Valid set', max=118, style=ProgressStyle(description_width='i…

In [0]:
from training import run_experiment

# Hand the configuration, model, per-split iterators, optimizer, loss and
# progress bars to the project's `run_experiment` helper (training module).
# NOTE(review): presumably this runs the train/validation loop and honours
# args.num_epochs / args.early_stopping_criteria - confirm in training.py.
run_experiment(args, model, iterator, optimizer, criterion, tqdms)