# 1. Load config and variables

In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import os

import spacy, pickle

import torch

from torchtext import data
from torchtext import datasets

import random
import inspect

# Custom impport
from common.common_classes import TensorField

In [3]:
torch.cuda.is_available()

True

In [4]:
path = "./"
save_data_path = path + 'save_data/'
large_save_data_path = '/notebooks/large-storage/'
saved_models_path = '/notebooks/large-storage/saved-models/'
print(os.listdir(path))

['test.tsv', 'prepare-word-embedding-nlp.ipynb', 'tokenization.ipynb', 'test-batching-padding.ipynb', 'train.tsv', 'test-batching-padding-ok.ipynb', 'sampleSubmission.csv', 'save_data', '.ipynb_checkpoints', '__init__.py', 'README.md', '.gitignore', '.git', 'common', 'simple-GRU-implement.ipynb']


In [5]:
loaded_data = pickle.load(open(save_data_path + 'pre-processed-data.pkl', 'rb'))
loaded_kaggle_test = pickle.load(open(save_data_path + 'pre-processed-kaggle-test.pkl', 'rb'))
loaded_vocab = pickle.load(open(save_data_path + 'genereated-vocab.pkl', 'rb'))

In [6]:
loaded_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_length,Tokenized_phrase,Indexed_phrase
0,1,1,A series of escapades demonstrating the adage ...,1,188,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
1,2,1,A series of escapades demonstrating the adage ...,2,77,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
2,3,1,A series,2,8,"[xxbos, a, series, xxeos]","[2, 10, 341, 3]"
3,4,1,A,2,1,"[xxbos, a, xxeos]","[2, 10, 3]"
4,5,1,series,2,6,"[xxbos, series, xxeos]","[2, 341, 3]"


In [7]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

MAX_LABEL = 5

# 2. Encoding and prepraing batches

In [8]:
nlp = spacy.load(large_save_data_path + 'process-spacy-model')

In [9]:
nlp.vocab[BOS].vector

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [10]:
type(nlp.vocab.get_vector('test'))

numpy.ndarray

In [11]:
nlp.vocab.vectors.data.shape

(890280, 308)

In [12]:
PHRASE_ID = data.Field(use_vocab = False)
TEXT = TensorField(include_lengths = True, use_vocab = False, sequential = False, pad_token = nlp.vocab[PAD].vector, dtype=torch.float)
LABEL = data.LabelField(use_vocab = False, dtype=torch.long)

In [13]:
fields = [('id', PHRASE_ID), ('text', TEXT), ('label', LABEL)]

In [14]:
len(loaded_data['Phrase'])

156060

In [15]:
examples = []
length = len(loaded_data['Phrase'])
for i in range(length):
    embedded = []
    for j in range(len(loaded_data['Tokenized_phrase'][i])):
        if nlp.vocab.has_vector(loaded_data['Tokenized_phrase'][i][j]):
            embedded.append(nlp.vocab.get_vector(loaded_data['Tokenized_phrase'][i][j]))
        else:
            embedded.append(nlp.vocab.get_vector(UNK))
    
    examples.append(data.Example.fromlist([ [loaded_data['PhraseId'][i]], embedded, loaded_data['Sentiment'][i]], fields))
    
examples[:10]

[<torchtext.data.example.Example at 0x7f5e68218710>,
 <torchtext.data.example.Example at 0x7f5e68218780>,
 <torchtext.data.example.Example at 0x7f5e682187f0>,
 <torchtext.data.example.Example at 0x7f5e68218828>,
 <torchtext.data.example.Example at 0x7f5e68218860>,
 <torchtext.data.example.Example at 0x7f5e68218898>,
 <torchtext.data.example.Example at 0x7f5e682188d0>,
 <torchtext.data.example.Example at 0x7f5e68218908>,
 <torchtext.data.example.Example at 0x7f5e68218940>,
 <torchtext.data.example.Example at 0x7f5e68218978>]

In [16]:
examples[3].text

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

In [17]:
nlp.vocab.get_vector('a')

array([ 4.3798e-02,  2.4779e-02, -2.0937e-01,  4.9745e-01,  3.6019e-01,
       -3.7503e-01, -5.2078e-02, -6.0555e-01,  3.6744e-02,  2.2085e+00,
       -2.3389e-01, -6.8360e-02, -2.2355e-01, -5.3989e-02, -1.5198e-01,
       -1.7319e-01,  5.3355e-02,  1.6485e+00, -4.7991e-02, -8.5311e-02,
       -1.5712e-01, -6.4425e-01, -3.9819e-01,  2.7800e-01,  1.5364e-01,
        3.1678e-02,  5.5414e-02,  1.5939e-02,  3.1851e-01, -5.8979e-02,
        3.8584e-02,  1.0770e-01,  1.0410e-01, -7.7346e-02,  3.7396e-01,
       -2.1482e-01,  3.8320e-01, -2.7737e-01, -1.8352e-01, -8.3838e-01,
        3.4124e-01,  5.8164e-01,  1.8543e-01, -3.1028e-01,  1.7666e-01,
       -6.9421e-02, -3.4422e-01, -1.3665e-01, -1.0823e-01,  2.3637e-01,
       -3.2923e-01,  6.1348e-01,  1.9720e-01,  8.7123e-02,  1.0785e-01,
        3.0730e-01,  1.3757e-01,  3.0809e-01,  2.4331e-01, -2.9422e-01,
       -9.8214e-03,  5.5675e-01, -4.8880e-02,  9.9468e-02,  3.0543e-01,
       -3.7597e-01, -1.9525e-01,  4.6246e-02, -3.6675e-02,  3.40

In [18]:
examples[3].label

2

In [19]:
len(examples)

156060

In [20]:
data_set = data.Dataset(examples, fields)

In [21]:
data_set.sort_key = lambda x: len(x.text)

In [22]:
SEED = 9131

In [23]:
train_data, valid_data, test_data = data_set.split([0.8, 0.1, 0.1], random_state = random.seed(SEED))

In [24]:
train_data.sort_key

<function __main__.<lambda>(x)>

In [25]:
len(data_set)*0.1

15606.0

In [26]:
len(train_data)

124848

In [27]:
len(valid_data)

15606

In [28]:
len(test_data)

15606

In [29]:
inspect.getsource(train_data.sort_key)

'data_set.sort_key = lambda x: len(x.text)\n'

## Define Hyperparameter

In [30]:
BATCH_SIZE = 64

In [31]:
LABEL.build_vocab(train_data)

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [33]:
raw_train_iterator = iter(train_iterator)
batch = next(raw_train_iterator)

In [34]:
a, b = batch.text

In [35]:
b

tensor([9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.,
        9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.,
        9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.,
        9., 9., 9., 9., 9., 9., 9., 9., 9., 9.], device='cuda:0')

In [36]:
a.shape

torch.Size([64, 9, 308])

In [37]:
batch.label

tensor([2, 2, 0, 2, 2, 1, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 0, 2, 1, 4, 3, 2, 3, 2,
        2, 2, 1, 3, 1, 2, 2, 1, 3, 1, 2, 2, 1, 4, 2, 3, 2, 3, 2, 2, 1, 4, 2, 2,
        2, 3, 2, 3, 1, 2, 2, 3, 4, 2, 2, 4, 2, 2, 1, 4], device='cuda:0')

In [38]:
batch.id

tensor([[133918,  57783,  94049,  86671,  84086,  74128, 112683, 126943,  12599,
         138419,  47937,  62635, 104584,  86587, 113162,  81036,  62472, 147974,
          32209, 133654,  95241, 137072,  66415,  75555,  53039, 130485,  79837,
         152257,  39951, 136680, 121126, 102615, 101662, 119728, 108929, 145776,
          36689,  66928, 109732, 109843, 130506,  33026, 123268, 155495, 114095,
          27453, 122599,  82520,  21535, 104973,  91096,  97182,  84542, 119081,
         104435,  65401,  30863, 141808,  39997, 145932,  46149, 125167, 132902,
         117864]], device='cuda:0')

In [39]:
loaded_data['Sentiment'][117864-1]

4

In [40]:
torch.cuda.is_available()

True

## Define model

In [41]:
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        super().__init__()
        
        self.rnn = nn.GRU( embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)
        
        self.bidirectional = bidirectional
        
        if bidirectional:
            self.fc = nn.Linear(hidden_dim * 2, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [batch size, sent len, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(text, text_lengths, batch_first=True)
        
        packed_output, hidden = self.rnn(packed_embedded)
        
        #unpack sequence
        #output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        if self.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        
        #if self.bidirectional:
        #    hidden = [batch size, hid dim * num directions]
        #else:
        #    hidden = [batch size, hid dim]
        return self.fc(hidden)

## Define Hyperparameter and init model

In [42]:
EMBEDDING_DIM = 308
HIDDEN_DIM = 256
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

LEARNING_RATE = 0.001

N_EPOCHS = 60

MODEL_SAVE_FILE = 'simple-GRU_origin.pt'
model = GRU(EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT).to(device)

### Print out the number of parameters in our model.

In [43]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,054,661 trainable parameters


## Train the Model

In [44]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [45]:
criterion = torch.nn.CrossEntropyLoss().to(device)

### Train function

In [46]:
def train(model, iterator, optimizer, criterion, set_length):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths)
        loss = criterion(predictions, batch.label)
        
        epoch_acc += (predictions.argmax(1) == batch.label).sum().item()
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / set_length, epoch_acc / set_length

### Evaluate function

In [47]:
def evaluate(model, iterator, criterion, set_length):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths)
            
            loss = criterion(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += (predictions.argmax(1) == batch.label).sum().item()
        
    return epoch_loss / set_length, epoch_acc / set_length

### Define epoch time function

In [48]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [49]:
best_valid_loss = float('inf')
best_epoch = 0
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, len(train_data))
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, len(valid_data))
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch
        torch.save(model.state_dict(), saved_models_path + MODEL_SAVE_FILE)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

print(f'Best epoch: {epoch+1:02}')

Epoch: 01 | Epoch Time: 1m 8s
	Train Loss: 0.014 | Train Acc: 62.41%
	 Val. Loss: 0.013 |  Val. Acc: 64.65%
Epoch: 02 | Epoch Time: 1m 8s
	Train Loss: 0.013 | Train Acc: 66.21%
	 Val. Loss: 0.012 |  Val. Acc: 67.64%
Epoch: 03 | Epoch Time: 1m 8s
	Train Loss: 0.012 | Train Acc: 68.77%
	 Val. Loss: 0.012 |  Val. Acc: 68.31%
Epoch: 04 | Epoch Time: 1m 8s
	Train Loss: 0.011 | Train Acc: 70.90%
	 Val. Loss: 0.012 |  Val. Acc: 68.77%
Epoch: 05 | Epoch Time: 1m 8s
	Train Loss: 0.010 | Train Acc: 72.56%
	 Val. Loss: 0.012 |  Val. Acc: 68.61%
Epoch: 06 | Epoch Time: 1m 8s
	Train Loss: 0.010 | Train Acc: 74.11%
	 Val. Loss: 0.012 |  Val. Acc: 69.13%
Epoch: 07 | Epoch Time: 1m 8s
	Train Loss: 0.009 | Train Acc: 75.49%
	 Val. Loss: 0.012 |  Val. Acc: 69.22%
Epoch: 08 | Epoch Time: 1m 8s
	Train Loss: 0.009 | Train Acc: 76.97%
	 Val. Loss: 0.013 |  Val. Acc: 68.49%
Epoch: 09 | Epoch Time: 1m 8s
	Train Loss: 0.008 | Train Acc: 78.53%
	 Val. Loss: 0.013 |  Val. Acc: 67.98%
Epoch: 10 | Epoch Time: 1m 8

KeyboardInterrupt: 

In [50]:
len(test_data)

15606

In [52]:
best_epoch

3

## Test the Model

In [51]:
model.load_state_dict(torch.load(saved_models_path + MODEL_SAVE_FILE))

test_loss, test_acc = evaluate(model, test_iterator, criterion, len(test_data))

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.012 | Test Acc: 68.44%
