In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!cp /content/drive/My\ Drive/imdb-review-dataset.zip .
!unzip imdb-review-dataset.zip

Archive:  imdb-review-dataset.zip
  inflating: imdb_master.csv         


In [0]:
!pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |████████████████████████████████| 266kB 2.7MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=700349 sha256=62438ce92ccaeadf8273eb015b749adc2e72abb6f318600f531bf88bce49745a
  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


*Yes, all the imports are hidden here*

In [0]:
import os
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
import pickle
import torchtext

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator

from glove import Corpus, Glove
from tqdm import tqdm_notebook
from sklearn.metrics import accuracy_score

SEED = 42
np.random.seed(SEED)



In [0]:
# facilitating pickle dumps and loads

def pickle_save(obj, filename):
  with open(filename, "wb") as outp:
    pickle.dump(obj, outp)

def pickle_load(filename):
  with open(filename, "rb") as inp:
    return pickle.load(inp)

# Assignment 5

*Build CNN model for sentiment analysis (binary classification) of IMDB Reviews (https://www.kaggle.com/utathya/imdb-review-dataset).
You can use data with label="unsup" for pretraining of embeddings. Here you are forbidden to use test dataset for pretraining of embeddings.  
Your quality metric is accuracy score on test dataset. Look at "type" column for  train/test split.  
You can use pretrained embeddings from external sources.  
You have to provide data for trials with different hyperparameter values.*

*You have to beat following baselines:  
[3 points] acc = 0.75  
[5 points] acc = 0.8  
[8 points] acc = 0.9  
[2 points] for using unsupervised data*

Let's load our data first and take a glance at it:

In [0]:
data = pd.read_csv("imdb_master.csv", encoding='cp1251')
data.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


Let's apply a couple of fixes and look at `unsup`:

In [0]:
data = data.drop("Unnamed: 0", axis=1)
data['review'] = data['review'].apply(lambda x: x.replace("<br />", "\n"))
data = data.sample(frac=1).reset_index(drop=True)

data.loc[data['label'] == "unsup"].sample(5)
len(data.loc[data['label'] == "unsup"])

50000

Alright, so, if we cannot use such a huge chunk of data, let's at least make something useful out of it — specifically, a GloVe model. But first, let's establish our preprocessing pipeline.

There are some changes as compared to the tokenizer from the seminar notebook. First, we'll use lemmas to merge different wordforms and get better classifying features. Secondly, we won't omit non-alphanumeric tokens, as they not only depict ends of sentences, but also contain some semantics (such as "?!!" are expected to be found in a negative review)

In [0]:
spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [t.lemma_.lower() if t.lemma_ != "-PRON-" else t.text.lower() for t in spacy_en(text)]

Finally, there will be one additional change for GloVe tokenizer, as spaCy handles sentences and it's beneficial for GloVe to know about sentence boundaries:

In [0]:
def glove_tokenizer(text): # create a tokenizer function
    return [[t.lemma_.lower() if t.lemma_ != "-PRON-" else t.text.lower() for t in sent] for sent in spacy_en(text).sents]

Let's create a corpus for GloVe:

In [0]:
%%time

unsup_list = list(data['review'].loc[data['label'] == 'unsup'])
lines = []

for text in unsup_list:
  lines += glove_tokenizer(text)

pickle_save(lines, "lines.pickle")
!cp lines.pickle /content/drive/My\ Drive

CPU times: user 36min 40s, sys: 16.6 s, total: 36min 57s
Wall time: 37min 2s


In [0]:
%%time

corpus = Corpus()
corpus.fit(lines, window=10)

CPU times: user 37.2 s, sys: 487 ms, total: 37.7 s
Wall time: 37.6 s


Ugh, it took some time. Now we finally train some embeddings:

In [0]:
%%time

glove = Glove(no_components=300, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

!cp glove.model /content/drive/My\ Drive

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
CPU times: user 48min 54s, sys: 1.94 s, total: 48min 56s
Wall time: 25min 22s


This gives us **2 points for using unsupervised data**. Now we'll embed it in our model!

Let's resave the dataset to adapt it for the seminar notebook pipeline:

In [0]:
train_df = data.loc[data["label"] != "unsup"]
test_df = train_df.loc[train_df["type"] == "test"]
train_df = train_df.loc[train_df["type"] == "train"]

train_df.to_csv('train.csv', index=False, encoding="utf-8")
test_df.to_csv('test.csv', index=False, encoding="utf-8")

In [0]:
!head train.csv

type,review,label,file
train,"I would like to say that curiosity got the best of me. If only I saw a trailer, I'd be able to tell you the whole plot of the movie; I could have saved myself the most pointless one hour and forty minutes in my entire life, and about twenty dollars. This movie was a disaster waiting to happen, and it is an embarrassment to Hollywood.

The movie displays a vivid ignorance of reality. For example, this kid's remote control race car goes all over the neighborhood, and even enters this house. It's even covered with clothes. Is it not rational to believe that a remote can no longer transmit a signal under those circumstances? Hollywood obviously did not believe so. Common logic and any concept of electronics dictates the opposite; I doubt the race car could even have reached the street, let alone a house across the street. Another unrealistic trait is the lack of intelligence the criminals possess. Why is it in all these movies, these criminals are rocket scien

One more change compared to the seminar: we will not use stopwords. The reason is all the words may contribute to text sentiment (especially ones like "not") and CNN is expected to learn the necesarry links between words.

In [0]:
# nltk.download("stopwords")

Here we at last wrap the text into the `torchtext` structures:

In [0]:
%%time

classes={
    'neg': 0,
    'pos': 1
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words={})

LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

train_ds = TabularDataset('train.csv', format='csv', 
                         fields=[(None, None), ('text', TEXT), ('label', LABEL), (None, None)], 
                         skip_header=True)

test_ds = TabularDataset('test.csv', format='csv', 
                         fields=[(None, None), ('text', TEXT), ('label', LABEL), (None, None)], 
                         skip_header=True)

CPU times: user 34min 58s, sys: 52.3 s, total: 35min 51s
Wall time: 35min 53s


It appears that `torchtext` doesn't like our `GloVe` format, so let's convert it:

In [0]:
def save_glove_as_txt(glove_obj, name="custom", dim=300):
  g = glove_obj
  outstr = ""
  for word in g["dictionary"]:
    outstr += word + " "
    outstr += " ".join("%.6f" % f for f in g["word_vectors"][g["dictionary"][word]])
    outstr += "\n"
  outstr = outstr[:-1]
  with open(".vector_cache/glove." + name + "." + str(dim) + "d.txt", "w", encoding="utf-8") as outt:
    outt.write(outstr)

In [0]:
del glove

g = pickle_load("glove.model")
forbidden = [el for el in g["dictionary"] if any([ch in el for ch in (" ", "\n")])]
for el in forbidden:
  del g["dictionary"][el]

os.makedirs(".vector_cache", exist_ok=True)
save_glove_as_txt(g)

!head .vector_cache/glove.custom.300d.txt

cavite 0.004345 0.067823 -0.018203 -0.030127 -0.033490 0.022754 -0.043621 0.091975 -0.043482 0.054051 0.072961 -0.034982 -0.000202 -0.014419 0.020709 -0.009462 -0.010213 0.054326 0.004695 -0.047178 0.002202 0.027078 0.000842 0.028927 0.004826 -0.016769 0.030443 0.070203 -0.012572 0.005306 -0.004863 0.034637 0.069278 0.070436 -0.016779 0.045282 0.011679 0.058843 0.037706 -0.000087 -0.008370 -0.027682 -0.115325 -0.014050 0.056489 -0.042219 -0.013099 0.001799 -0.014115 0.131562 0.063852 -0.039465 -0.020096 -0.059898 -0.044274 -0.015911 0.046886 0.063066 0.183428 -0.020958 0.017302 -0.022764 0.055734 0.037212 0.047577 0.003332 0.004364 -0.053257 -0.013122 -0.072277 -0.002235 -0.015804 0.001241 0.031335 -0.016428 -0.066727 0.050382 0.049320 -0.016915 -0.002434 0.039813 -0.098836 0.076063 0.034435 -0.021046 0.020323 -0.019163 -0.009857 0.000129 -0.026560 -0.073291 0.001687 -0.018827 0.007796 0.077340 0.001059 -0.001972 -0.004177 0.005773 -0.028583 0.002342 0.004392 0.037665 0.023967 0.057865

In [0]:
class CustomGloVe(torchtext.vocab.Vectors):
    def __init__(self, name='custom', dim=300, **kwargs):
        name = 'glove.{}.{}d.txt'.format(name, str(dim))
        super(CustomGloVe, self).__init__(name, url="", **kwargs)

Now we can use our vectors, although we still have to apply some workarounds to join our datasets:

In [0]:
glove_vectors = CustomGloVe()

TEXT.build_vocab(train_ds, min_freq=3, vectors=glove_vectors)
len(TEXT.vocab.itos)

31835

Standard checks here:

In [0]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', '<eos>', 'the', 'be', ',', '.', 'and', 'a', 'of']

Building label vocab:

In [0]:
LABEL.build_vocab(train_ds)

Train/test split. In our previous assignment, 90% train/validation split was beneficial, so we will apply the same here:

In [0]:
train, test = train_ds, test_ds
train, valid = train.split(0.9)

The model definition should be not much different from seminar. However, as we have to distinguish only two types of labels here, we should use binary cross-entropy loss:

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        
        self.fc = nn.Linear(hidden_size * len(kernels), 2)
        
    def forward(self, batch):
        x, _ = batch.text

        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.fc(x)
        return x

In [0]:
tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
                kernels=[2,3,4,5]
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
criterion = nn.CrossEntropyLoss()

Defining train and test:

In [0]:
def epoch_train(model, iterator, epoch, criterion, optimizer, scheduler,
                save_dir='model'):
    if not os.path.exists(save_dir):
      os.mkdir(save_dir)
    model.train()
    epoch_losses = []
    general_loss = 0
    iterator = tqdm_notebook(iterator, total=len(iterator),
                             desc='epoch %d' % (epoch + 1), leave=True)

    for batch in iterator:
        optimizer.zero_grad()
        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()
        curr_loss = loss.data.detach().item()
        epoch_losses.append(curr_loss)
        general_loss = np.mean(epoch_losses)
        iterator.set_postfix(loss='%.5f' % general_loss)
    
    scheduler.step(general_loss)

    state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict()}
    tt.save(state, save_dir+'/'+str(epoch+1))
    return general_loss

def epoch_eval(model, iterator, criterion):
    model.eval()
    general_loss = 0

    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            general_loss += loss.data.item()
    
    general_loss = general_loss / len(iterator)
    return general_loss


def train_net(model, train_iterator, valid_iterator, criterion, optimizer,
          num_epochs=100, scheduler=None, early_stopping=0):
    scores = []
    epochs_worse = 0

    for epoch in range(num_epochs):
        best_epoch = epoch + 1

        train_loss = epoch_train(model, train_iterator, epoch, criterion,
                                 optimizer, scheduler)
        valid_loss = epoch_eval(model, valid_iterator, criterion)

        print('Validation loss: %.4f' % valid_loss)
        scores.append(valid_loss)

        if not epoch:
          min_loss = train_loss

        if early_stopping and epoch:
            if valid_loss > min_loss:
                epochs_worse += 1
            else:
                epochs_worse = 0

            if epochs_worse >= early_stopping:
                best_epoch = scores.index(min(scores)) + 1
                print('Early stopping')
                print('Best epoch is #%d with validation loss %.4f' %
                      (best_epoch, min(scores)))
                break

            min_loss = min(min_loss, valid_loss)
    
    return best_epoch

As we are applying early stopping, we should be able to revert to older model states. Here is our technique:

In [0]:
def load_checkpoint(model, optimizer, save_dir='model', epoch=None):
    # Note: Input model & optimizer should be pre-defined.  This routine only updates their states.
    if epoch is None:
      epoch = max([int(f) for f in os.listdir(save_dir)])
    filename = 'model/'+str(epoch)
    start_epoch = 0
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(filename))
        checkpoint = tt.load(filename)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})"
                  .format(filename, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return model, optimizer, start_epoch

Let's train!

In [107]:
%%time

be = train_net(model, train_iterator, valid_iterator, criterion, optimizer,
               scheduler=scheduler, num_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 1', max=704, style=ProgressStyle(description_width='ini…

Validation loss: 0.5211


HBox(children=(IntProgress(value=0, description='epoch 2', max=704, style=ProgressStyle(description_width='ini…

Validation loss: 0.5387


HBox(children=(IntProgress(value=0, description='epoch 3', max=704, style=ProgressStyle(description_width='ini…

Validation loss: 0.5051


HBox(children=(IntProgress(value=0, description='epoch 4', max=704, style=ProgressStyle(description_width='ini…

Validation loss: 0.5510


HBox(children=(IntProgress(value=0, description='epoch 5', max=704, style=ProgressStyle(description_width='ini…

Validation loss: 0.6089
Early stopping
Best epoch is #3 with validation loss 0.5051
CPU times: user 56min 54s, sys: 9min 7s, total: 1h 6min 2s
Wall time: 1h 6min 29s


Preparing for testing...

In [108]:
device="cpu"

model, optimizer, start_epoch = load_checkpoint(model, optimizer, epoch=be)
model = model.to(device)
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, tt.Tensor):
            state[k] = v.to(device)

=> loading checkpoint 'model/3'
=> loaded checkpoint 'model/3' (epoch 3)


In [0]:
def test_net(model, test_iterator, device, metric=accuracy_score, verbose=True):
  model.eval()
  scores = []

  if verbose:
    test_iterator = tqdm_notebook(test_iterator)

  with tt.no_grad():
    for batch in test_iterator:
      X = batch
      y = batch.label
      y_pred = tt.argmax(model(X), dim=1)
      scores.append(accuracy_score(y, y_pred.cpu()))
  
  acc = np.mean(scores)
  print('Test accuracy score: %.4f' % acc)
  return acc

...and testing, at last:

In [110]:
test_net(model, test_iterator, tt.device('cpu'))

HBox(children=(IntProgress(value=0, max=782), HTML(value='')))

Test accuracy score: 0.8730


0.8730418797953964

That's it! This gives us **5 points** for beating *acc = 0.8*