## Solving dependencies

### Git repo

In [1]:
# Clone the project repository and make its src/ directory the working
# directory. Later cells import project-local modules (preprocessing,
# data_utils, models, training), presumably from this directory.
# NOTE(review): re-running this cell makes `git clone` fail because the
# target directory already exists; the `cd` still succeeds.
! git clone https://github.com/josipjukic/Adversarial-NLP.git
% cd /content/Adversarial-NLP/src

Cloning into 'Adversarial-NLP'...
remote: Enumerating objects: 224, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (151/151), done.[K
remote: Total 224 (delta 132), reused 133 (delta 62), pack-reused 0[K
Receiving objects: 100% (224/224), 75.47 KiB | 415.00 KiB/s, done.
Resolving deltas: 100% (132/132), done.
/content/Adversarial-NLP/src


### Embeddings

In [0]:
% mkdir .vector_cache
% cp '/content/drive/My Drive/Master Thesis/glove/glove.6B.100d.txt.pt' .vector_cache/

### NLTK data

In [3]:
import nltk

# Download the NLTK resources used by this notebook: the stop-word corpus
# (stopwords is imported from nltk.corpus further down) and the punkt
# tokenizer models. The value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Dataset save/load

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import spacy
import random
from nltk.corpus import stopwords

### Save dataset

In [7]:
from preprocessing import imdb_preprocess
from data_utils import save_dataset

# Build the IMDB train/valid/test splits once and persist them, so that
# later sessions can load the preprocessed data instead of rebuilding it.
SEED = 42
torch.manual_seed(SEED)
# NOTE(review): the recorded output of this cell reports files under
# '.../IMDB_plain/' while this path ends in 'IMDB' — confirm which one is
# current before relying on the saved files.
SAVE_PATH = '/content/drive/My Drive/Master Thesis/IMDB'

# Raw text is only run through the project's preprocessing function here;
# labels are stored as floats.
TEXT = data.RawField(preprocessing=imdb_preprocess)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so passing its result as `random_state` hands
# None to split() and the split is seeded only through a side effect.
# Seed first, then pass the resulting generator state explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

dataset = dict(train=train_data, test=test_data, valid=valid_data)
save_dataset(dataset, SAVE_PATH)

Saved data at /content/drive/My Drive/Master Thesis/IMDB_plain/train.json.
Saved data at /content/drive/My Drive/Master Thesis/IMDB_plain/test.json.
Saved data at /content/drive/My Drive/Master Thesis/IMDB_plain/valid.json.


### Load dataset

In [0]:
from data_utils import load_dataset

# --- configuration for loading the persisted dataset ---
SEED = 42
LOAD_PATH = '/content/drive/My Drive/Master Thesis/IMDB'
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

# Seed torch before anything random happens below (unk_init draws from a
# normal distribution when the vocabulary vectors are built).
torch.manual_seed(SEED)

# load_dataset hands back the dataset splits and the field objects;
# unpack both into the notebook-level names used by the following cells.
splits, fields = load_dataset(LOAD_PATH)
train_data, valid_data, test_data = splits
TEXT, LABEL, RAW, ID = fields

# Mark the two auxiliary fields as non-target fields.
RAW.is_target = False
ID.is_target = False

# Vocabularies are built from the training split only.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors=EMBEDDINGS_FILE,
    unk_init=torch.Tensor.normal_,
)

In [0]:
from argparse import Namespace
from data_utils import expand_paths
from models import PackedLSTM

# One namespace holding every setting that is passed around by the cells
# below and handed to the training routine.
args = Namespace(
    # -- output files and special-token indices --
    model_save_file='imdb_model.torch',
    train_state_file='train_state.json',
    save_dir='/content/drive/My Drive/Master Thesis/torch_models/imdb/',
    PAD_IDX=TEXT.vocab.stoi[TEXT.pad_token],
    UNK_IDX=TEXT.vocab.stoi[TEXT.unk_token],
    # -- network architecture --
    input_dim=len(TEXT.vocab),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    num_layers=2,
    bidirectional=True,
    # -- optimisation --
    seed=SEED,
    learning_rate=0.001,
    dropout_p=0.5,
    batch_size=64,
    num_epochs=20,
    early_stopping_criteria=5,
    # -- runtime switches --
    reload_from_files=True,
    expand_filepaths_to_save_dir=True,
    # Use the GPU when one is visible to torch, otherwise stay on the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

# Project helper; presumably resolves the file names above against save_dir
# (cf. expand_filepaths_to_save_dir) — confirm in data_utils.
expand_paths(args)

# One bucketed batch iterator per split. Examples are keyed by the length
# of their `text` attribute and sorted within each batch.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=args.batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.text),
    device=args.device,
)

# Same iterators, addressable by split name.
iterator = {
    'train': train_iterator,
    'valid': valid_iterator,
    'test': test_iterator,
}

# Take the vectors attached to the text vocabulary and overwrite the rows
# of the unknown and padding tokens with zeros. This writes into
# TEXT.vocab.vectors itself, since no copy is made.
pretrained_embeddings = TEXT.vocab.vectors
for special_idx in (args.UNK_IDX, args.PAD_IDX):
    pretrained_embeddings[special_idx] = torch.zeros(args.embedding_dim)

# Instantiate the project's PackedLSTM from the settings in `args` and the
# prepared embedding matrix, then move it to the selected device.
# Arguments stay positional, in the original order, because the
# constructor's parameter names are not visible from this notebook.
model = PackedLSTM(
    args.embedding_dim,
    args.hidden_dim,
    args.output_dim,
    args.num_layers,
    pretrained_embeddings,
    args.bidirectional,
    args.dropout_p,
    args.PAD_IDX,
    args.device,
).to(args.device)

In [0]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy computed directly on logits (the model is built with
# output_dim = 1), optimised with Adam at the configured learning rate.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=args.learning_rate)

In [0]:
from tqdm.notebook import tqdm

# Progress bars: one counting epochs, and one each for the batches of the
# training and validation iterators.
epoch_bar = tqdm(desc='Training routine', total=args.num_epochs, position=0)
train_bar = tqdm(desc='Train set', total=len(train_iterator), position=1)
val_bar = tqdm(desc='Valid set', total=len(valid_iterator), position=1)

# Bundle the bars under the keys passed on to the training routine.
tqdms = {'main': epoch_bar, 'train': train_bar, 'valid': val_bar}

In [0]:
from training import run_experiment

# Hand everything prepared above (settings, model, per-split iterators,
# optimizer, loss and progress bars) to the project's training entry point.
# What it does with them (e.g. checkpointing, early stopping) is defined in
# the training module and not visible here.
run_experiment(args, model, iterator, optimizer, criterion, tqdms)