## Solving dependencies

### Git repo

In [0]:
# Fetch the project sources and make its `src` directory the working directory
# so that the local modules (preprocessing, data_utils, models, training) import.
# The clone targets an absolute path and is skipped when the checkout already
# exists, so re-running this cell (when the cwd is already `src`) neither fails
# nor creates a nested copy of the repository.
![ -d /content/Adversarial-NLP ] || git clone https://github.com/josipjukic/Adversarial-NLP.git /content/Adversarial-NLP
# Magics are written without a space after the prefix (`%cd`, not `% cd`).
%cd /content/Adversarial-NLP/src

Cloning into 'Adversarial-NLP'...
remote: Enumerating objects: 179, done.[K
remote: Counting objects: 100% (179/179), done.[K
remote: Compressing objects: 100% (126/126), done.[K
remote: Total 179 (delta 99), reused 101 (delta 42), pack-reused 0[K
Receiving objects: 100% (179/179), 61.96 KiB | 265.00 KiB/s, done.
Resolving deltas: 100% (99/99), done.
/content/Adversarial-NLP/src


### Embeddings

In [0]:
# Copy the cached GloVe tensor file from Drive into `.vector_cache` under the
# current working directory.
# `mkdir -p` keeps the cell re-runnable when the directory already exists.
!mkdir -p .vector_cache
!cp '/content/drive/My Drive/Master Thesis/glove/glove.6B.100d.txt.pt' .vector_cache/

### NLTK data

In [0]:
import nltk

# Download the 'stopwords' corpus and the 'punkt' tokenizer package into the
# local NLTK data directory. The value of the last call is shown as the cell output.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Dataset save/load

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import spacy
import random
from preprocessing import imdb_preprocess
from data_utils import load_dataset
from nltk.corpus import stopwords

# Single seed shared by every source of randomness configured in this notebook.
SEED = 42
# Seed Python's global RNG as well as torch's, so that code relying on the
# `random` module state is also seeded when the dataset is loaded from disk
# instead of being rebuilt.
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f4256175190>

In [0]:
# Restore the train/valid/test splits and both fields from LOAD_PATH, then
# rebuild the vocabularies from the training split.
LOAD_PATH = '/content/drive/My Drive/Master Thesis/IMDB'
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

(train_data, valid_data, test_data,
 text_field, label_field) = load_dataset(LOAD_PATH)

label_field.build_vocab(train_data)
text_field.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors=EMBEDDINGS_FILE,
    unk_init=torch.Tensor.normal_,
)

In [0]:
# Build the IMDB dataset from scratch: download it, split off a validation set
# and build the label/text vocabularies from the training split.
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])


def tokenizer(s):
  """Run `imdb_preprocess` on `s`, pass the result through `nlp`, and return the token texts as a list."""
  return [token.text for token in nlp(imdb_preprocess(s))]


# NOTE(review): the text field uses torchtext's built-in 'spacy' tokenizer, so
# `tokenizer` above is never called in this cell. Confirm whether
# `tokenize=tokenizer` was intended before changing it: the saved dataset and
# the vocabulary depend on the tokenization used here.
text_field = data.Field(tokenize='spacy', include_lengths=True)
label_field = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(text_field, label_field)

# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed first, then hand the RNG state
# over explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

label_field.build_vocab(train_data)
text_field.build_vocab(train_data,
                       max_size=MAX_VOCAB_SIZE,
                       vectors=EMBEDDINGS_FILE,
                       unk_init=torch.Tensor.normal_)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 9.03MB/s]


In [0]:
from data_utils import save_dataset

# Persist the three splits, keyed by split name, under SAVE_PATH.
SAVE_PATH = '/content/drive/My Drive/Master Thesis/IMDB'

dataset = {
    'train': train_data,
    'test': test_data,
    'valid': valid_data,
}
save_dataset(dataset, SAVE_PATH)

Saved data at /content/drive/My Drive/Master Thesis/IMDB/train.json.
Saved data at /content/drive/My Drive/Master Thesis/IMDB/test.json.
Saved data at /content/drive/My Drive/Master Thesis/IMDB/valid.json.


In [0]:
from argparse import Namespace
from data_utils import expand_paths
from models import PackedLSTM

# Experiment configuration, collected in one namespace that is handed to the
# model construction below and to the training routine.
args = Namespace(
    # --- Files and special vocabulary indices ---
    model_save_file='imdb_model2.torch',
    train_state_file='train_state.json',
    save_dir='/content/drive/My Drive/Master Thesis/torch_models/imdb/',
    PAD_IDX=text_field.vocab.stoi[text_field.pad_token],
    UNK_IDX=text_field.vocab.stoi[text_field.unk_token],
    # --- Model ---
    input_dim=len(text_field.vocab),  # vocabulary size
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    num_layers=2,
    bidirectional=True,
    # --- Training ---
    seed=SEED,
    learning_rate=0.001,
    dropout_p=0.5,
    batch_size=64,
    num_epochs=20,
    early_stopping_criteria=5,
    # --- Runtime ---
    reload_from_files=True,
    expand_filepaths_to_save_dir=True,
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

expand_paths(args)

# One BucketIterator per split; examples are keyed by text length and each
# batch is sorted internally (sort_within_batch=True).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=args.batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.text),
    device=args.device,
)
iterator = {
    'train': train_iterator,
    'valid': valid_iterator,
    'test': test_iterator,
}

# Take the vectors attached to the text vocabulary and overwrite the rows of
# the <unk> and <pad> tokens with zeros. This writes into
# `text_field.vocab.vectors` itself, not a copy.
pretrained_embeddings = text_field.vocab.vectors
pretrained_embeddings[args.UNK_IDX] = torch.zeros(args.embedding_dim)
pretrained_embeddings[args.PAD_IDX] = torch.zeros(args.embedding_dim)

# Build the classifier from the configured hyper-parameters and move it to the
# selected device in one step.
model = PackedLSTM(
    args.embedding_dim,
    args.hidden_dim,
    args.output_dim,
    args.num_layers,
    pretrained_embeddings,
    args.bidirectional,
    args.dropout_p,
    args.PAD_IDX,
).to(args.device)

In [0]:
import torch.nn as nn
import torch.optim as optim

# Loss on the model's raw logits, and an Adam optimizer over all model
# parameters at the configured learning rate.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=args.learning_rate)
# A ReduceLROnPlateau scheduler (mode='min', factor=0.5, patience=1) is left
# disabled here.

In [0]:
from tqdm.notebook import tqdm

# Progress bars: one for epochs, and one each for the batches of the training
# and validation iterators.
epoch_bar = tqdm(
    desc='Training routine',
    total=args.num_epochs,
    position=0,
)
train_bar = tqdm(
    desc='Train set',
    total=len(train_iterator),
    position=1,
)
val_bar = tqdm(
    desc='Valid set',
    total=len(valid_iterator),
    position=1,
)

# Bars are passed to the training routine under these keys.
tqdms = {'main': epoch_bar, 'train': train_bar, 'valid': val_bar}

HBox(children=(IntProgress(value=0, description='Training routine', max=20, style=ProgressStyle(description_wi…

HBox(children=(IntProgress(value=0, description='Train set', max=274, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Valid set', max=118, style=ProgressStyle(description_width='i…

In [0]:
from training import run_experiment

# Launch the run with the configuration, model, per-split iterators, optimizer,
# loss and progress bars defined in the cells above. The captured output shows
# per-epoch time plus train/validation loss and accuracy.
run_experiment(args, model, iterator, optimizer, criterion, tqdms)

Epoch: 01 | Epoch Time: 0m 31s
    Train Loss: 0.658 | Train Acc: 59.88%
    Valid Loss: 0.564 |  Valid Acc: 71.93%
Epoch: 02 | Epoch Time: 0m 31s
    Train Loss: 0.619 | Train Acc: 66.00%
    Valid Loss: 0.517 |  Valid Acc: 76.13%
Epoch: 03 | Epoch Time: 0m 32s
    Train Loss: 0.634 | Train Acc: 64.70%
    Valid Loss: 0.677 |  Valid Acc: 60.29%
Epoch: 04 | Epoch Time: 0m 32s
    Train Loss: 0.677 | Train Acc: 56.40%
    Valid Loss: 0.656 |  Valid Acc: 68.67%
Epoch: 05 | Epoch Time: 0m 33s
    Train Loss: 0.666 | Train Acc: 59.58%
    Valid Loss: 0.631 |  Valid Acc: 65.12%
Epoch: 06 | Epoch Time: 0m 33s
    Train Loss: 0.601 | Train Acc: 67.60%
    Valid Loss: 0.487 |  Valid Acc: 76.98%
Epoch: 07 | Epoch Time: 0m 33s
    Train Loss: 0.538 | Train Acc: 73.33%
    Valid Loss: 0.473 |  Valid Acc: 78.26%
Epoch: 08 | Epoch Time: 0m 33s
    Train Loss: 0.466 | Train Acc: 78.22%
    Valid Loss: 0.382 |  Valid Acc: 83.20%
Epoch: 09 | Epoch Time: 0m 33s
    Train Loss: 0.425 | Train Acc: 80.62%