In [2]:
!pip install gensim==4.0.0

Collecting gensim==4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/c3/dd/5e00b6e788a9c522b48f9df10472b2017102ffa65b10bc657471e0713542/gensim-4.0.0-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 127kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0


In [1]:
# Standard library imports
import string
import time
import os
import warnings
warnings.filterwarnings('ignore')

# Third party imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import gensim
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Local application imports
from vectors import GoogleVec
from model_utils import *
from model import Model



In [3]:
class Dataset(data.Dataset):
    """Map-style dataset yielding (headline, article, stance) triples.

    The article bodies and the stance records are loaded once in the
    constructor by ``process_bodies`` and ``process_stances`` (helpers
    defined outside this cell).
    """

    def __init__(self,
                 bodies_path = './data/train_bodies.csv',
                 stances_path = './data/train_stances.csv'):
        """Load both CSV files.

        Args:
            bodies_path: Path handed to ``process_bodies``.
            stances_path: Path handed to ``process_stances``.
        """
        # assumes process_bodies returns a container indexable by body id and
        # process_stances a sequence of 3-item records -- TODO confirm
        self.bodies = process_bodies(bodies_path)
        self.stances = process_stances(stances_path)

    def __len__(self):
        """Return the number of stance records."""
        return len(self.stances)

    def __getitem__(self, idx):
        """Return the ``(headline, article, stance)`` triple stored at ``idx``.

        A tensor index is first converted to plain Python via ``tolist()``.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx

        record = self.stances[position]
        headline, body_id, stance = record

        # Look the article up through the body id stored in the stance record
        return headline, self.bodies[body_id], stance

In [5]:
def train(model, device, train_loader, criterion, optimizer, scheduler):
    """Run one optimisation pass over ``train_loader``.

    For every batch the model is run forward, the loss is back-propagated,
    and both ``optimizer`` and ``scheduler`` are stepped (the scheduler is
    advanced once per batch, not once per epoch).

    Args:
        model: Callable as ``model(headlines, articles)``.
        device: Device every batch tensor is moved to.
        train_loader: Iterable of ``(headlines, articles, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(output, labels)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped after each optimizer step.

    Returns:
        Tuple ``(avg_train_loss, avg_train_acc)``: the mean batch loss as a
        0-d tensor and the fraction of correctly predicted samples.
    """
    # Switch the model to training mode
    model.train()

    batch_losses = []   # one scalar loss per batch
    hits = []           # one boolean per sample: prediction == label

    for batch in tqdm(train_loader):

        # Unpack the batch and move every tensor to the target device
        headlines, articles, labels = batch
        headlines = headlines.to(device)
        articles = articles.to(device)
        labels = labels.to(device)

        # Forward pass; the predicted class is the highest-scoring column
        scores = model(headlines, articles)
        predicted = np.argmax(scores.cpu().detach().numpy(), axis = 1)
        hits.extend((predicted == labels.cpu().numpy()).tolist())

        # Clear stale gradients before computing new ones
        optimizer.zero_grad()

        # Loss and backward pass
        loss = criterion(scores, labels)
        batch_losses.append(loss.item())
        loss.backward()

        # Parameter update, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    # Aggregate over the whole pass
    avg_train_loss = torch.tensor(batch_losses).mean()
    avg_train_acc = sum(hits) / len(hits)

    print('Training set - Average loss = {:.4f}'.format(avg_train_loss))
    print('Training set - Accuracy = {:.4f}'.format(avg_train_acc))
    # presumably gives the progress bar time to flush before further output -- TODO confirm
    time.sleep(2)

    return avg_train_loss, avg_train_acc

In [6]:
def test(model, device, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` without updating parameters.

    The whole loop runs under ``torch.no_grad()`` so no autograd graph is
    built during evaluation.

    Args:
        model: Callable as ``model(headlines, articles)``.
        device: Device every batch tensor is moved to.
        test_loader: Iterable of ``(headlines, articles, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(output, labels)``.

    Returns:
        Tuple ``(avg_test_loss, avg_test_acc)``: the mean batch loss as a
        0-d tensor and the fraction of correctly predicted samples.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    # Put the model in evaluation mode
    model.eval()

    # List of per-batch test losses
    test_loss = []

    # Per-sample correctness flags used for the accuracy
    acc = []

    # Evaluation needs no gradients; disabling autograd saves memory and time
    with torch.no_grad():
        for batch in tqdm(test_loader):

            # Load the data, and move each tensor to the specified device
            headlines, articles, labels = batch
            headlines, articles, labels = headlines.to(device), articles.to(device), labels.to(device)

            # Forward pass; the predicted class is the highest-scoring column
            output = model(headlines, articles)
            predictions = np.argmax(output.cpu().detach().numpy(), axis = 1)
            acc.extend((predictions == labels.cpu().numpy()).tolist())

            # Compute the loss only; no backward pass is performed
            loss = criterion(output, labels)
            test_loss.append(loss.item())

    # Calculate average testing loss and accuracy
    avg_test_loss = torch.mean(torch.tensor(test_loss))
    avg_test_acc = sum(acc) / len(acc)

    print('Testing set - Average loss = {:.4f}'.format(avg_test_loss))
    print('Testing set - Accuracy = {:.4f}'.format(avg_test_acc))
    time.sleep(2)

    return avg_test_loss, avg_test_acc

In [8]:
# --- Data locations -------------------------------------------------------
train_bodies_path = './data/train_bodies.csv'
train_stances_path = './data/train_stances.csv'
test_bodies_path = './data/test_bodies.csv'
test_stances_path = './data/test_stances.csv'

# --- Model hyperparameters (forwarded to Model) ---------------------------
embedding_dim = 300
n_hidden = 256
pool_kernel_size = 2
dropout_rate = 0.5

# --- Training hyperparameters ---------------------------------------------
batch_size = 16
epochs = 2
learning_rate = 5e-4   # used as the AdamW lr and as the OneCycleLR max_lr

# --- Word vectors handed to collate_fn ------------------------------------
vecs = GoogleVec()

In [9]:
# Build the train and test datasets from their bodies/stances CSV pairs

print('Loading datasets')

train_dataset = Dataset(
    bodies_path = train_bodies_path,
    stances_path = train_stances_path,
)

test_dataset = Dataset(
    bodies_path = test_bodies_path,
    stances_path = test_stances_path,
)

Loading datasets


In [10]:
# Pick the compute device and the matching DataLoader keyword arguments
# NOTE(review): kwargs is built here but is not passed to the DataLoader
# constructors in the visible notebook cells -- confirm whether that is intended.

use_cuda = torch.cuda.is_available()

if use_cuda:
    device = torch.device('cuda')
    print('Using GPU')
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    device = torch.device('cpu')
    print('GPU not found')
    kwargs = {}

Using GPU


In [11]:
def _collate_batch(batch):
    """Collate one batch through ``collate_fn`` using the shared word vectors."""
    return collate_fn(batch, vecs = vecs)


# Training batches are reshuffled on every pass
train_loader = data.DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    collate_fn = _collate_batch,
    shuffle = True,
)

# Evaluation keeps the dataset order
test_loader = data.DataLoader(
    dataset = test_dataset,
    batch_size = batch_size,
    collate_fn = _collate_batch,
    shuffle = False,
)

In [12]:
# Instantiate the network, move it to the selected device, and report its size

print('Setting up the model')

model = Model(
    embedding_dim = embedding_dim,
    n_hidden = n_hidden,
    pool_kernel_size = pool_kernel_size,
    dropout_rate = dropout_rate,
)
model = model.to(device)

# Count every element of every parameter tensor
print('Total model parameters =', sum(p.nelement() for p in model.parameters()))

Setting up the model
Total model parameters = 6465796


In [13]:
# Optimisation set-up: AdamW, cross-entropy loss, one-cycle learning-rate schedule

print('Defining optimizer, loss criterion and learning rate scheduler')

optimizer = optim.AdamW(model.parameters(), lr = learning_rate)

# NOTE(review): nn.CrossEntropyLoss expects unnormalised scores; verify that
# Model.forward does not already apply a softmax to its output.
criterion = nn.CrossEntropyLoss()

# The schedule spans epochs * len(train_loader) steps, which matches train()
# calling scheduler.step() once per batch.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr = learning_rate,
    epochs = epochs,
    steps_per_epoch = len(train_loader),
    anneal_strategy = 'linear',
)

Defining optimizer, loss criterion and learning rate scheduler


In [14]:
# Run one training pass followed by one evaluation pass per epoch,
# recording the averaged loss and accuracy of each

print('Training')

train_losses, train_acc = [], []
test_losses, test_acc = [], []

for epoch in range(1, epochs + 1):

    print('\nEpoch', epoch)
    time.sleep(2)

    # Optimise on the training set
    avg_train_loss, avg_train_acc = train(model = model,
                                          device = device,
                                          train_loader = train_loader,
                                          criterion = criterion,
                                          optimizer = optimizer,
                                          scheduler = scheduler)
    train_losses.append(avg_train_loss)
    train_acc.append(avg_train_acc)

    # Measure performance on the test set
    avg_test_loss, avg_test_acc = test(model = model,
                                       device = device,
                                       test_loader = test_loader,
                                       criterion = criterion)
    test_losses.append(avg_test_loss)
    test_acc.append(avg_test_acc)

Training

Epoch 1


100%|██████████| 3928/3928 [15:07<00:00,  4.33it/s]


Training set - Average loss = 1.0135
Training set - Accuracy = 0.7329


100%|██████████| 784/784 [02:50<00:00,  4.61it/s]


Testing set - Average loss = 1.0392
Testing set - Accuracy = 0.7047

Epoch 2


100%|██████████| 3928/3928 [15:08<00:00,  4.32it/s]


Training set - Average loss = 1.0108
Training set - Accuracy = 0.7329


100%|██████████| 784/784 [02:48<00:00,  4.65it/s]


Testing set - Average loss = 1.0392
Testing set - Accuracy = 0.7047


In [15]:
# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step and tolerates a directory that already exists
os.makedirs('model_states', exist_ok = True)

# Parameters only
torch.save(model.state_dict(), './model_states/state_dict.pt')

# Whole pickled module; loading it back requires the Model class to be importable
torch.save(model, './model_states/model.pt')