In [1]:
# To create python file:
# jupyter nbconvert --to=python lang_model_new.ipynb

import utils.data_import as data_import
import utils.imdb_data as imdb_data
import utils.ml_utils as ml_utils
import model.neural as neural

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data, vocab

import os, sys
import pdb
import pandas as pd
import numpy as np
import math
#from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm

import importlib

import settings

np.random.seed(123)

  from collections import Sequence


In [2]:
in_notebook = ml_utils.in_ipynb()
print('Running in notebook:', in_notebook)

Running in notebook: True


In [3]:
cuda = torch.cuda.is_available()
if cuda:
    print('Cuda is available!')
    print('Device:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('No cuda.')

if in_notebook:
    import matplotlib.pyplot as plt
    %matplotlib inline

No cuda.


In [4]:
corpus = imdb_data.ImdbCorpus(filename=settings.imdb_file, lines=settings.lines_imbd,
                              vocab_file='vocab.p')

Importing vocab from vocab.p... Done.
Imported vocab:  10,954
Read total of: 1,000 lines from imdb file.
Number of classes: 2: {'negative': 0, 'positive': 1}
Generated train: 700 lines
Generated valid: 150 lines
Generated test:  150 lines


In [5]:
print(corpus.vocab.most_frequent(to=20))
print(corpus.classes.most_frequent())

[('<unk>', 47973), ('the', 13016), ('.', 10666), ('a', 6505), ('and', 6322), ('of', 5754), ('to', 5312), ('is', 4103), ('in', 3815), ('i', 3004), ('it', 2912), ('this', 2911), ('that', 2742), ('as', 1918), ('was', 1914), ('for', 1784), ('with', 1740), ('but', 1623), ('movie', 1489), ('film', 1378)]
[('negative', 505), ('positive', 495)]


In [6]:
corpus.train.show_stoklist(corpus.vocab, 2)

## sentiment: 0
## text:      i firstly and completely and <unk> disagree with the <unk> who calls this a <unk> . <unk> <unk> is very serious about his film . he personally introduced the film at the <unk> i saw in chicago . he had worked on the film for years and it is the first in an intended <unk> . <unk> is <unk> is <unk> <unk> attempt at an art film in the vein of those he <unk> by <unk> <unk> etc <unk> i had heard rumor of this film years ago <unk> <unk> movie with all <unk> cast directed by <unk> <unk> . when it finally came out i watched the <unk> <unk> and read the <unk> and i was <unk> at the mouth with <unk> . . . <unk> went to chicago to see it and it was a major disappointment . if he took out the <unk> <unk> such as the <unk> <unk> and the dancing <unk> he would be left with something much <unk> but only about 10 minutes long <unk> in other words just watch the <unk> be <unk> and leave it at that . there are some striking images and fantastic <unk> and <unk> but its lack 

In [7]:
corpus.train.batchify(corpus.vocab, batch_size=settings.batch_size, 
                      seq_length=settings.window_size_imbd)

 Number of batches: 35
 Preserved reviews: 700
 Matrix size:       torch.Size([2835, 20])


In [8]:
corpus.batchify(batch_size=settings.batch_size, seq_length=settings.window_size_imbd)

Batch size:        20
Sequence length:   80
Batchifying train...
 Number of batches: 35
 Preserved reviews: 700
 Matrix size:       torch.Size([2835, 20])
Batchifying valid...
 Number of batches: 7
 Preserved reviews: 140
 Matrix size:       torch.Size([567, 20])
Batchifying test... 
 Number of batches: 7
 Preserved reviews: 140
 Matrix size:       torch.Size([567, 20])


In [9]:
corpus.train.batch_start_end

[[0, 81],
 [81, 162],
 [162, 243],
 [243, 324],
 [324, 405],
 [405, 486],
 [486, 567],
 [567, 648],
 [648, 729],
 [729, 810],
 [810, 891],
 [891, 972],
 [972, 1053],
 [1053, 1134],
 [1134, 1215],
 [1215, 1296],
 [1296, 1377],
 [1377, 1458],
 [1458, 1539],
 [1539, 1620],
 [1620, 1701],
 [1701, 1782],
 [1782, 1863],
 [1863, 1944],
 [1944, 2025],
 [2025, 2106],
 [2106, 2187],
 [2187, 2268],
 [2268, 2349],
 [2349, 2430],
 [2430, 2511],
 [2511, 2592],
 [2592, 2673],
 [2673, 2754],
 [2754, 2835]]

In [10]:
if in_notebook:
    corpus.train.show_itoklist(2)
    corpus.train.show_stoklist(corpus.vocab, 2)
    print(corpus.train.batch_matrix[:settings.window_size_imbd,0])
    #df = corpus.train.batch_stats()
    #df.hist()

## sentiment: 0
## text:      [1249, 9389, 38, 1384, 38, 2, 5207, 119, 18, 2, 74, 6466, 456, 28, 2, 16, 2, 2, 27, 254, 868, 681, 342, 3936, 16, 472, 2950, 4355, 18, 3936, 275, 18, 2, 1249, 5149, 44, 1678, 16, 472, 503, 657, 93, 18, 3936, 40, 1110, 38, 47, 27, 18, 60, 44, 145, 1202, 2, 16, 2, 27, 2, 27, 2, 2, 3552, 275, 145, 853, 3936, 44, 18, 6631, 17, 245, 472, 2, 36, 2, 2, 1522, 2, 1249, 503, 680, 1209, 17, 456, 3936, 1110, 7692, 2, 2, 3922, 119, 564, 2, 973, 938, 36, 2, 2, 16, 215, 47, 1743, 1157, 292, 1249, 10055, 18, 2, 2, 38, 5545, 18, 2, 38, 1249, 130, 2, 275, 18, 2500, 119, 2, 16, 16, 16, 2, 3583, 23, 1678, 23, 4400, 47, 38, 47, 130, 28, 519, 4290, 16, 293, 472, 582, 292, 18, 2, 2, 103, 24, 18, 2, 2, 38, 18, 7044, 2, 472, 165, 203, 885, 119, 294, 688, 2, 160, 269, 681, 1042, 2741, 1267, 2, 44, 219, 2220, 552, 825, 18, 2, 203, 2, 38, 2323, 47, 275, 149, 16, 247, 79, 239, 7832, 4662, 38, 8351, 2, 38, 2, 160, 55, 3735, 17, 4647, 6460, 23, 4290, 16, 1]

## sentiment: 0
## text:    

In [11]:
train_dl = imdb_data.ImdbTextDataset(corpus.train)
valid_dl = imdb_data.ImdbTextDataset(corpus.valid)
test_dl = imdb_data.ImdbTextDataset(corpus.test)

In [12]:
train = next(iter(train_dl))

In [13]:
corpus.train.batch_start_end[:5]

[[0, 81], [81, 162], [162, 243], [243, 324], [324, 405]]

In [14]:
train

(tensor([[1249, 4184,   44,  ..., 1249,    2,  456],
         [9389,   23,   28,  ..., 3276,  130, 3922],
         [  38,   55,  671,  ...,   23,  149, 9129],
         ...,
         [1522,   27, 2065,  ..., 2808, 1231, 1244],
         [   2,   28,  725,  ...,  825,    2, 2067],
         [1249, 3747,    2,  ...,   18, 7374,    2]]),
 tensor([0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [15]:
for idx, (x, y) in enumerate(train_dl):
    if idx > 5:
        break
    print(idx, len(x[:,1]))
    for i in x[:,1]:
        print(corpus.vocab.itos[i], end=' ')
    print('##', corpus.classes.itos[y[1].item()], '##')

0 80
contrary to its <unk> this film offers no <unk> and thus audience is subjected to a <unk> <unk> . all <unk> appears <unk> <unk> <unk> <unk> reminiscent of those <unk> plays available on <unk> including even the <unk> . everybody is <unk> shouting and doing odd things for no reason . the <unk> looks interesting as it is a straight lift from <unk> <unk> <unk> . john abraham who is so natural in almost all his films is a complete ## negative ##
1 80
<unk> of <unk> <unk> my bad <unk> of course only a movie starring <unk> simpson can include serious <unk> like this . . <unk> a norwegian and i felt <unk> and <unk> the makers of this movie did not take the time to do their research upon making this <unk> movie . even <unk> is more accurate when it comes to <unk> about this country <unk> so <unk> posting my <unk> out of my <unk> <unk> country is named <unk> ## negative ##
2 80
wonderful cast <unk> on <unk> script . ten or so adults <unk> at the summer camp they attended as juveniles . cou

In [16]:
model = neural.class_model_LSTM(vocab_dim=len(corpus.vocab),
                                emb_dim=settings.emb_dim,
                                hidden_dim=settings.hidden_dim,
                                n_layers=settings.num_layers,
                                dropout=settings.dropout,
                                n_classes=corpus.n_classes
                               )

In [17]:
if cuda:
    model = model.cuda()

In [18]:
print(model)

class_model_LSTM(
  (embedding): Embedding(10954, 50)
  (lstm): LSTM(50, 300, num_layers=2, dropout=0.4, bidirectional=True)
  (fc): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.4)
)


In [19]:
x, y = train
print(x.shape)
print(y.shape)
preds = model(x)
print(preds.shape)
print(y)
print(preds)
loss_func = nn.CrossEntropyLoss()
loss = loss_func(preds, y.long())

torch.Size([80, 20])
torch.Size([20])
torch.Size([20, 2])
tensor([0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([[ 0.0217, -0.0017],
        [-0.0211, -0.0044],
        [ 0.0170,  0.0029],
        [-0.0201,  0.0010],
        [-0.0045,  0.0019],
        [-0.0099,  0.0006],
        [ 0.0031, -0.0085],
        [-0.0003, -0.0038],
        [ 0.0141,  0.0025],
        [-0.0069, -0.0032],
        [-0.0059, -0.0099],
        [ 0.0072, -0.0012],
        [-0.0079,  0.0057],
        [-0.0145, -0.0085],
        [-0.0159, -0.0169],
        [-0.0124, -0.0074],
        [ 0.0136,  0.0067],
        [ 0.0081, -0.0019],
        [ 0.0072,  0.0081],
        [ 0.0068, -0.0018]], grad_fn=<ThAddmmBackward>)


In [20]:
#model = torch.load('model_weights.pt')

In [21]:
missclass = []
missclass_next = []
losses = []

In [22]:
def run_epochs(model, train_dl, valid_dl, epochs=settings.epochs,
               losses=[], missclass=[]):
    
    opt = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)
    #opt = optim.Adam(model.parameters(), lr=0.001)
    #loss_func = nn.NLLLoss()
    loss_func = nn.CrossEntropyLoss()
    
    scheduler = optim.lr_scheduler.StepLR(opt, step_size=20, gamma=0.8)
    
    try: # Allow for user interrupt
 
        for epoch in range(1, epochs + 1):
            scheduler.step()
            running_loss = 0.0
            running_corrects = 0
            model.train() # turn on training mode

            num_vals = 0
            num_correct = 0

            #pdb.set_trace()

            for x, y in tqdm(train_dl, desc='Train {}/{}'.format(epoch, epochs)):
                opt.zero_grad()
                
                if cuda:
                    x = x.cuda()
                    y = y.cuda()

                preds = model(x)
                loss = loss_func(preds, y.long())

                loss.backward()
                #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
                opt.step()

                running_loss += loss.item() * x.size(0) / x.size(1)

                _, y_preds = torch.max(preds, dim=1)
                num_correct += torch.sum(y == y_preds).item()
                num_vals += y.size(0)

            #pdb.set_trace()

            missclass_tr = 1 - num_correct / num_vals

            epoch_loss = running_loss / len(train_dl)

            num_vals = 0
            num_correct = 0

            # calculate the validation loss for this epoch
            val_loss = 0.0
            model.eval() # turn on evaluation mode
            
            with torch.no_grad():
                for x, y in tqdm(valid_dl, desc='Valid {}/{}'.format(epoch, epochs)):
                    if cuda:
                        x = x.cuda()
                        y = y.cuda()   
                        
                    preds = model(x)
                    loss = loss_func(preds, y.long())

                    val_loss += loss.item() * x.size(0) / x.size(1)

                    _, y_preds = torch.max(preds, dim=1)
                    num_correct += torch.sum(y == y_preds).item()
                    num_vals += y.size(0)

            #pdb.set_trace()

            missclass_te = 1 - num_correct / num_vals
            val_loss /= len(valid_dl)
            
            missclass.append((missclass_tr, missclass_te))
            losses.append((epoch_loss, val_loss))

            print('Epoch: {}/{}, Loss: [{:.4f}, {:.4f}], Ppl: [{:6.2f}, {:6.2f}], '
                  'Miss: [{:.2%}, {:.2%}]'\
                  .format(epoch, epochs, epoch_loss, val_loss, 
                          math.exp(epoch_loss), math.exp(val_loss), 
                          missclass_tr, missclass_te))
            sys.stdout.flush()

            print('Saving weights file...', end=' ', flush=True)
            torch.save(model, 'model_weights_imdb.pt')
            print('Done.', flush=True)
            #to load: model = torch.load('model_weights.pt')
            
    except KeyboardInterrupt:
        print('Stopping with latest weights.')
        
    return model, opt, losses, missclass

In [None]:
model, opt, losses, missclass = run_epochs(model, train_dl, valid_dl, epochs=settings.epochs,
                                           losses=losses, missclass=missclass)

Train 1/10: 100%|██████████| 35/35 [00:37<00:00,  1.06s/it]
Valid 1/10: 100%|██████████| 7/7 [00:01<00:00,  6.17it/s]

Epoch: 1/10, Loss: [4.0447, 3.2823], Ppl: [ 57.09,  26.64], Miss: [49.14%, 46.43%]
Saving weights file... Done.



Train 2/10: 100%|██████████| 35/35 [00:36<00:00,  1.04s/it]
Valid 2/10: 100%|██████████| 7/7 [00:01<00:00,  6.08it/s]

Epoch: 2/10, Loss: [3.1273, 2.7967], Ppl: [ 22.81,  16.39], Miss: [49.29%, 47.14%]
Saving weights file... Done.



Train 3/10: 100%|██████████| 35/35 [00:36<00:00,  1.05s/it]
Valid 3/10: 100%|██████████| 7/7 [00:01<00:00,  6.13it/s]

Epoch: 3/10, Loss: [3.0623, 2.8583], Ppl: [ 21.38,  17.43], Miss: [51.14%, 46.43%]
Saving weights file... Done.



Train 4/10: 100%|██████████| 35/35 [00:36<00:00,  1.05s/it]
Valid 4/10: 100%|██████████| 7/7 [00:01<00:00,  6.04it/s]

Epoch: 4/10, Loss: [2.9570, 2.8027], Ppl: [ 19.24,  16.49], Miss: [48.86%, 52.14%]
Saving weights file... Done.



Train 5/10: 100%|██████████| 35/35 [00:38<00:00,  1.09s/it]
Valid 5/10: 100%|██████████| 7/7 [00:01<00:00,  5.83it/s]

Epoch: 5/10, Loss: [2.8324, 2.9855], Ppl: [ 16.99,  19.80], Miss: [44.29%, 54.29%]
Saving weights file... Done.



Train 6/10: 100%|██████████| 35/35 [00:37<00:00,  1.08s/it]
Valid 6/10: 100%|██████████| 7/7 [00:01<00:00,  5.50it/s]

Epoch: 6/10, Loss: [2.9910, 2.9648], Ppl: [ 19.91,  19.39], Miss: [46.43%, 53.57%]
Saving weights file... Done.



Train 7/10: 100%|██████████| 35/35 [00:37<00:00,  1.06s/it]
Valid 7/10: 100%|██████████| 7/7 [00:01<00:00,  6.09it/s]

Epoch: 7/10, Loss: [2.9099, 2.9403], Ppl: [ 18.36,  18.92], Miss: [46.71%, 56.43%]
Saving weights file... Done.



Train 8/10: 100%|██████████| 35/35 [00:39<00:00,  1.12s/it]
Valid 8/10: 100%|██████████| 7/7 [00:01<00:00,  6.47it/s]

Epoch: 8/10, Loss: [2.8977, 3.1777], Ppl: [ 18.13,  23.99], Miss: [48.57%, 54.29%]
Saving weights file... Done.



Train 9/10:  89%|████████▊ | 31/35 [00:33<00:04,  1.08s/it]

In [None]:
x, y = next(iter(valid_dl))
if cuda:
    x = x.cuda()
    y = y.cuda()
loss_func = nn.CrossEntropyLoss()
preds = model(x)
_, y_preds = torch.max(preds, dim=1)
loss = loss_func(preds, y.long())
loss

In [None]:
print(y.shape)
print(y[:])

In [None]:
print(y_preds[:])
print(preds)

In [None]:
if in_notebook:
    plt.plot(losses)
    plt.legend(['train', 'valid'])

In [None]:
if in_notebook:
    plt.plot(missclass)
    plt.legend(['train', 'valid'])