In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from torch.nn.utils import clip_grad_norm_
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm import tqdm_notebook as tqdm
import torchnet as tnt
#from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline 
from deep_models import *

# One shared seed for every random number generator the notebook touches,
# so that repeated runs start from the same state.
random_seed = 12345

# Seed NumPy and PyTorch from the same value.
np.random.seed(seed=random_seed)
torch.manual_seed(seed=random_seed)

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

Using CNTK backend


using gpu


In [2]:
def train_model(model, data_iter, loss_fun, opt):
    """Run one training epoch and return the average batch loss.

    Args:
        model: module called as ``model(headlines, bodies)``; it must return a
            3-tuple whose first element holds the class scores.
        data_iter: iterable yielding ``(headlines, bodies, labels)`` batches as
            NumPy arrays.
        loss_fun: callable ``loss_fun(scores, labels)`` returning a scalar tensor.
        opt: optimizer that updates ``model``'s parameters.

    Returns:
        The first element of ``AverageValueMeter.value()`` after every batch
        loss has been added to the meter.
    """
    model.train()

    # Send each batch to the device the model already lives on instead of
    # assuming the default CUDA device. Fall back to the previous behaviour
    # (CUDA when present) only for a model without parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    meter = tnt.meter.AverageValueMeter()
    meter.reset()
    for headlines, bodies, labels in tqdm(data_iter):
        opt.zero_grad()
        headlines = torch.from_numpy(headlines).to(device).long()
        bodies = torch.from_numpy(bodies).to(device).long()
        labels = torch.from_numpy(labels).to(device).long()
        out, _, _ = model(headlines, bodies)
        loss = loss_fun(out, labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        meter.add(loss.item())
    return meter.value()[0]

def val_model(model, data_iter, loss_fun):
    """Evaluate ``model`` on ``data_iter`` and return ``(mean_loss, score)``.

    Args:
        model: module called as ``model(headlines, bodies)``; it must return a
            3-tuple whose first element holds the class scores.
        data_iter: iterable yielding ``(headlines, bodies, labels)`` batches as
            NumPy arrays.
        loss_fun: callable ``loss_fun(scores, labels)`` returning a scalar tensor.

    Returns:
        A tuple ``(loss, score)``. ``loss`` is the first element of
        ``AverageValueMeter.value()`` over the batch losses. ``score`` is
        ``get_score(y_true, y_pred)`` divided by the value ``get_score`` gives
        for a perfect prediction, ``get_score(y_true, y_true)``.

    The model is switched to eval mode for the pass and afterwards put back
    into whichever mode it was in when the function was called.
    """
    was_training = model.training
    model.eval()

    # Send each batch to the device the model already lives on instead of
    # assuming the default CUDA device. Fall back to the previous behaviour
    # (CUDA when present) only for a model without parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    meter = tnt.meter.AverageValueMeter()
    meter.reset()
    y_pred = []
    y_true = []
    try:
        with torch.no_grad():
            for headlines, bodies, labels in tqdm(data_iter):
                headlines = torch.from_numpy(headlines).to(device).long()
                bodies = torch.from_numpy(bodies).to(device).long()
                # Keep the raw labels for scoring before converting them.
                y_true.extend(labels)
                labels = torch.from_numpy(labels).to(device).long()
                out, _, _ = model(headlines, bodies)
                # Predicted class is the index of the highest score per row.
                _, index = torch.max(out, dim=1)
                y_pred.extend(index.cpu().numpy())
                loss = loss_fun(out, labels)
                meter.add(loss.item())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    score = get_score(y_true, y_pred) / get_score(y_true, y_true)
    return meter.value()[0], score

def test_model(model, data_iter):
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for headlines, bodies, labels in tqdm(data_iter):
            headlines = torch.from_numpy(headlines).cuda().long()
            bodies = torch.from_numpy(bodies).cuda().long()
            y_true.extend(labels)
            labels = torch.from_numpy(labels).cuda().long()
            out, _, _ = model(headlines, bodies)
            _, index = torch.max(out, dim=1)
            y_pred.extend(index.cpu().data.numpy())
    
    score = get_score(y_true, y_pred) / get_score(y_true, y_true)
    model.train()
    print('-' * 50)
    print('classification report:')
    print('accuracy: %.3f' % accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print('macro f1: %.3f' % f1_score(y_true, y_pred, average='macro'))
    print('score: %.3f' % score)
    return score
    
def test_get_batch(data_iter):
    for headlines, bodies, labels in tqdm(data_iter):
        #print(headlines.shape)
        print(bodies.shape)
        #print(labels.shape)
    
def my_plot(data):
    """Plot the training and validation curves and report the best validation epoch.

    ``data`` is a mapping with sequences under the keys ``'train'`` and
    ``'val'``. Both are drawn on the current matplotlib figure; afterwards the
    1-based position of the smallest ``'val'`` entry and that value are printed.
    """
    for split in ('train', 'val'):
        plt.plot(data[split])
    plt.legend(['train', 'val'])
    plt.show()

    val_curve = data['val']
    best_epoch = np.argmin(val_curve) + 1  # epochs are reported 1-based
    lowest_val = min(val_curve)
    print('best epoch num: %s loss: %.3f' % (best_epoch, lowest_val))

#test_get_batch(get_batch('./tmp/val_ids.pkl', batch_size=64, max_len_b=100))

In [3]:
# Locations of the pickled inputs used by the rest of the notebook.
pretrained_file_name = './tmp/pretrained.pkl'
train_filename = './tmp/train_ids.pkl'
val_filename = './tmp/val_ids.pkl'
test_filename = './tmp/test_ids.pkl'

# `vecs` is handed to the model constructors further down (presumably the
# pretrained embedding matrix — confirm against deep_models).
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project. The context manager closes the file handle, which
# the bare `pickle.load(open(...))` form left open.
with open(pretrained_file_name, 'rb') as pretrained_file:
    vecs = pickle.load(pretrained_file)

In [4]:
def get_batch_all(train_filename, val_filename, batch_size=64, max_len_h=50, max_len_b=100, data_aug=False):
    """Yield padded ``(headlines, bodies, labels)`` batches from the train and validation pickles combined.

    Both pickles must hold a mapping with the keys ``'h'`` (list of headline
    sequences), ``'b'`` (list of body sequences) and ``'y'`` (labels). The two
    files are concatenated, the samples are sorted by body length so that each
    batch contains bodies of similar length, and every batch is padded with
    ``pad_sequences``.

    Args:
        train_filename: path of the training pickle.
        val_filename: path of the validation pickle.
        batch_size: number of samples per batch (the last batch may be smaller).
        max_len_h: upper bound on the padded headline length.
        max_len_b: upper bound on the padded body length.
        data_aug: when true, a second pass over the same sorted data follows
            the first one. The first pass truncates over-long sequences at the
            end (keeping their beginning); the second pass truncates at the
            start (keeping their end). Sequences that are not over-long are
            therefore yielded twice unchanged.

    Yields:
        ``(batch_h, batch_b, batch_y)`` where the first two are the outputs of
        ``pad_sequences`` and ``batch_y`` is the matching slice of the labels.

    Raises:
        ValueError: if the numbers of headlines, bodies and labels differ.
    """
    def _load(path):
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _ragged(sequences):
        # Build a 1-D object array holding one sequence per slot. A plain
        # np.array(...) call raises on variable-length input with recent NumPy
        # releases and silently becomes 2-D when all lengths happen to match.
        arr = np.empty(len(sequences), dtype=object)
        for pos, seq in enumerate(sequences):
            arr[pos] = seq
        return arr

    # load headline, body, stance from both files
    train_data = _load(train_filename)
    val_data = _load(val_filename)
    headlines = _ragged(train_data['h'] + val_data['h'])
    bodies = _ragged(train_data['b'] + val_data['b'])
    y = np.array(list(train_data['y']) + list(val_data['y']))

    # Check the raw inputs: after fancy indexing all three would have the same
    # length regardless, so a mismatch could no longer be detected.
    if not (len(headlines) == len(bodies) == len(y)):
        raise ValueError(
            'mismatched sample counts: %d headlines, %d bodies, %d labels'
            % (len(headlines), len(bodies), len(y))
        )

    # sort by article body length
    indices = np.argsort([len(v) for v in bodies])
    sorted_h = headlines[indices]
    sorted_b = bodies[indices]
    sorted_y = y[indices]

    # First pass keeps the first max_len_* tokens of over-long sequences
    # ('post' truncation); the optional augmentation pass keeps the last
    # max_len_* tokens instead ('pre' truncation).
    truncation_modes = ['post', 'pre'] if data_aug else ['post']
    for truncating in truncation_modes:
        for i in range(0, len(sorted_h), batch_size):
            batch_h = sorted_h[i:i+batch_size]
            batch_b = sorted_b[i:i+batch_size]
            batch_y = sorted_y[i:i+batch_size]

            # pad only up to the longest sequence in the batch, capped at max_len_*
            maxlen1 = min(max_len_h, max(len(v) for v in batch_h))
            maxlen2 = min(max_len_b, max(len(v) for v in batch_b))

            batch_h = pad_sequences(batch_h, maxlen1, padding='pre', truncating=truncating)
            batch_b = pad_sequences(batch_b, maxlen2, padding='pre', truncating=truncating)
            yield (batch_h, batch_b, batch_y)

In [5]:
model_name = 'Esim'
if model_name == 'EmbeddingBag':
    model = EmbeddingBag(vecs).cuda()
elif model_name == 'Esim':
    model = Esim(vecs).cuda()
else:
    # Without this branch a mistyped name would either raise a NameError or,
    # in a live kernel, silently keep training whatever `model` was left over
    # from an earlier cell run.
    raise ValueError('unknown model_name: %r' % (model_name,))

print(model)
opt = Adam(model.parameters(), lr=1e-3, weight_decay=1e-8)
# Alternative that was tried: class-weighted cross entropy.
#weight = torch.Tensor([3, 3, 3, 1])
#loss_fun = nn.CrossEntropyLoss(weight)
# mode='max' because the scheduler is stepped with a score (higher is better);
# patience=0 halves the learning rate after any epoch without improvement.
scheduler = ReduceLROnPlateau(opt, mode='max', factor=0.5, patience=0, verbose=True, min_lr=1e-6)
loss_fun = FocalLoss(gamma=5)
loss_fun.cuda()
epochs = 5
# NOTE(review): `history` and `min_loss` are initialised here but never updated
# by the loop below.
history = {'train':[], 'val': []}
min_loss = 100.1
bst = 0

# NOTE(review): the model is trained on train+val (get_batch_all) and both the
# learning-rate schedule and the checkpoint choice are driven by the score on
# the test file, so the reported test score is not a held-out estimate.
for epoch in range(epochs):
    #print(f"epoch: {epoch + 1}/{epochs}")
    train_loss = train_model(model, get_batch_all(train_filename, val_filename, batch_size=64, max_len_b=100, data_aug=True), loss_fun, opt)
    # Despite its name, `acc` holds the value returned by test_model, i.e. the
    # normalised get_score ratio, not the plain accuracy.
    acc = test_model(model, get_batch(test_filename, batch_size=64, max_len_b=100, data_aug=False))
    scheduler.step(acc)
    if acc > bst:
        # Keep a checkpoint whenever the score improves on the best so far.
        torch.save(model.state_dict(), './models/%s_%s_acc_%.3f' % (model_name, epoch + 1, acc))
        bst = acc

Esim(
  (embedding): Embedding(399670, 100)
  (gru_enc1): LSTM(100, 100, batch_first=True, bidirectional=True)
  (gru_comp1): LSTM(800, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=800, out_features=100, bias=True)
  (last_layer): Linear(in_features=100, out_features=4, bias=True)
)


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



--------------------------------------------------
classification report:
accuracy: 0.821
             precision    recall  f1-score   support

          0       0.43      0.46      0.45      1903
          1       0.00      0.00      0.00       697
          2       0.56      0.74      0.64      4464
          3       0.95      0.91      0.93     18349

avg / total       0.82      0.82      0.82     25413

macro f1: 0.504
score: 0.762


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



--------------------------------------------------
classification report:
accuracy: 0.842
             precision    recall  f1-score   support

          0       0.49      0.45      0.47      1903
          1       0.00      0.00      0.00       697
          2       0.59      0.76      0.66      4464
          3       0.96      0.93      0.95     18349

avg / total       0.83      0.84      0.83     25413

macro f1: 0.520
score: 0.777


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



--------------------------------------------------
classification report:
accuracy: 0.843
             precision    recall  f1-score   support

          0       0.47      0.36      0.41      1903
          1       0.00      0.00      0.00       697
          2       0.61      0.74      0.67      4464
          3       0.94      0.95      0.94     18349

avg / total       0.82      0.84      0.83     25413

macro f1: 0.506
score: 0.759
Epoch     2: reducing learning rate of group 0 to 5.0000e-04.


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



--------------------------------------------------
classification report:
accuracy: 0.833
             precision    recall  f1-score   support

          0       0.35      0.75      0.48      1903
          1       0.00      0.00      0.00       697
          2       0.71      0.49      0.58      4464
          3       0.96      0.96      0.96     18349

avg / total       0.85      0.83      0.83     25413

macro f1: 0.505
score: 0.747
Epoch     3: reducing learning rate of group 0 to 2.5000e-04.


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



--------------------------------------------------
classification report:
accuracy: 0.837
             precision    recall  f1-score   support

          0       0.36      0.73      0.48      1903
          1       1.00      0.00      0.00       697
          2       0.73      0.49      0.59      4464
          3       0.95      0.96      0.96     18349

avg / total       0.87      0.84      0.83     25413

macro f1: 0.507
score: 0.743
Epoch     4: reducing learning rate of group 0 to 1.2500e-04.


In [11]:
model_name = 'Esim'
if model_name == 'EmbeddingBag':
    model = EmbeddingBag(vecs).cuda()
elif model_name == 'CE':
    model = CE(vecs).cuda()
elif model_name == 'IE':
    model = IE(vecs).cuda()
elif model_name == 'Esim':
    model = Esim(vecs).cuda()
elif model_name == 'Dattn':
    model = Dattn(vecs).cuda()
elif model_name == 'Declare':
    model = Declare(vecs).cuda()
else:
    # Without this branch a mistyped name would either raise a NameError or,
    # in a live kernel, silently evaluate whatever `model` was left over from
    # an earlier cell run.
    raise ValueError('unknown model_name: %r' % (model_name,))

# The checkpoint must have been saved from the architecture selected above;
# change both together.
# NOTE(review): the training log displayed in this notebook reports a score of
# 0.777 for epoch 2, so this file appears to come from a different run — confirm.
checkpoint_path = './models/Esim_2_acc_0.774'
model.load_state_dict(torch.load(checkpoint_path))
test_model(model, get_batch(test_filename, batch_size=64, max_len_b=100))


classification report:
accuracy: 0.843
             precision    recall  f1-score   support

          0       0.35      0.57      0.43      1903
          1       0.00      0.00      0.00       697
          2       0.66      0.60      0.63      4464
          3       0.97      0.96      0.96     18344

avg / total       0.84      0.84      0.84     25408

macro f1: 0.507
score: 0.762
