In [2]:
!pip install gensim==4.0.0

Collecting gensim==4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/c3/dd/5e00b6e788a9c522b48f9df10472b2017102ffa65b10bc657471e0713542/gensim-4.0.0-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 127kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import gensim
import os
os.chdir('/content/drive/MyDrive')
from tqdm import tqdm
import time
import string
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")



In [2]:
def process_bodies(file_name):

    df = pd.read_csv(file_name, header = 0)
    dct = {row['Body ID']: row['Article Body'] for _, row in df.iterrows()}

    return dct

def process_stances(file_name):

    df = pd.read_csv(file_name, header = 0)
    lst = [tuple(row.tolist()) for _, row in df.iterrows()]

    return lst

output_mapper = {"agree": 0, "disagree": 1, "discuss":2, "unrelated": 3}

def collate_fn(data, vecs = None):
    
    headlines = [x[0] for x in data]
    articles = [x[1] for x in data]
    stances = [x[2] for x in data]
    
    headlines_transformed = vecs.transform(headlines)
    articles_transformed = vecs.transform(articles)
    stances_transformed = torch.tensor([output_mapper[stance] for stance in stances])
    
    return headlines_transformed, articles_transformed, stances_transformed

In [3]:
class Dataset(data.Dataset):
    
    def __init__(self,                               \
                 bodies_path = './train_bodies.csv', \
                 stances_path = './train_stances.csv'):
        
        self.bodies = process_bodies(bodies_path)
        self.stances = process_stances(stances_path)
    
    def __len__(self):
        
        return len(self.stances)

    def __getitem__(self, idx):
                 
        if torch.is_tensor(idx):
            idx = idx.tolist()

        headline, body_id, stance = self.stances[idx]
        article = self.bodies[body_id]

        return headline, article, stance

In [4]:
class Model(nn.Module):
    
    
    def __init__(self, embedding_dim = 300,  \
                       n_hidden = 256,       \
                       pool_kernel_size = 2, \
                       dropout_rate = 0.5):
        
        super(Model, self).__init__()
        
        in_channels = [embedding_dim, n_hidden, n_hidden, n_hidden*2, n_hidden*2]
        out_channels = [n_hidden, n_hidden, n_hidden*2, n_hidden*2, n_hidden*3]
        filter_sizes = [3, 3, 3, 3, 3]
        strides = [1, 1, 1, 1, 1]
        paddings = [3, 3, 3, 1, 1]
        dense_sizes = [n_hidden*3*2, n_hidden*4, n_hidden*4, n_hidden*4, 4]
        
        conv_layers = [nn.Conv1d(*params) for params in \
                       zip(in_channels, out_channels, filter_sizes, strides, paddings)]
        pool_layers = [nn.MaxPool1d(pool_kernel_size) for _ in range(3)]
        relu = nn.ReLU()
        dropout = nn.Dropout(dropout_rate)
        
        self.cnn = nn.Sequential(conv_layers[0], relu, pool_layers[0], \
                                 conv_layers[1], relu, pool_layers[1], \
                                 conv_layers[2], relu, pool_layers[2], \
                                 conv_layers[3],                       \
                                 conv_layers[4])
        
        self.dnn = nn.Sequential(*[nn.Linear(in_size, out_size) \
                                   for in_size, out_size in     \
                                   zip(dense_sizes[:-1], dense_sizes[1:])])
        
        self.softmax = nn.Softmax(dim = -1)

    def forward(self, headlines, articles):
        
        headlines = self.cnn(headlines)
        headlines, _ = torch.max(headlines, axis = -1)
        
        articles = self.cnn(articles)
        articles, _ = torch.max(articles, axis = -1)
        
        dense_vec = torch.cat((headlines, articles), axis = -1)
        out = self.dnn(dense_vec)
        probs = self.softmax(out)
        
        return probs

In [5]:
def train(model, device, train_loader, criterion, optimizer, scheduler):
    
    # Put the model in training mode
    model.train()

    # List of train losses
    train_loss = []

    # Accuracy
    acc = []
    
    for data in tqdm(train_loader):
        
        # Load the data, and convert the tensor with the specified device
        headlines, articles, labels = data
        headlines, articles, labels = headlines.to(device), articles.to(device), labels.to(device)

        # Forward pass
        output = model(headlines, articles)
        predictions = np.argmax(output.cpu().detach().numpy(), axis = 1)
        acc.extend((predictions == labels.cpu().numpy()).tolist())

        # Set the gradients to 0
        optimizer.zero_grad()
        
        # Compute the loss, and with it, the gradients
        loss = criterion(output, labels)
        train_loss.append(loss.item())
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        scheduler.step()

    # Calculate average training loss and accuracy
    avg_train_loss = torch.mean(torch.tensor(train_loss))
    avg_train_acc = sum(acc) / len(acc)

    print("Training set - Average loss = {:.4f}".format(avg_train_loss))
    print("Training set - Accuracy = {:.4f}".format(avg_train_acc))
    time.sleep(2)
    
    return avg_train_loss, avg_train_acc

In [6]:
def test(model, device, test_loader, criterion):
    
    # Put the model in training mode
    model.eval()

    # List of train losses
    test_loss = []

    # Accuracy
    acc = []
    
    for data in tqdm(test_loader):
        
        # Load the data, and convert the tensor with the specified device
        headlines, articles, labels = data
        headlines, articles, labels = headlines.to(device), articles.to(device), labels.to(device)

        # Forward pass
        output = model(headlines, articles)
        predictions = np.argmax(output.cpu().detach().numpy(), axis = 1)
        acc.extend((predictions == labels.cpu().numpy()).tolist())
        
        # Compute the loss, but not the gradients
        loss = criterion(output, labels)
        test_loss.append(loss.item())

    # Calculate average testing loss and accuracy
    avg_test_loss = torch.mean(torch.tensor(test_loss))
    avg_test_acc = sum(acc) / len(acc)

    print("Testing set - Average loss = {:.4f}".format(avg_test_loss))
    print("Testing set - Accuracy = {:.4f}".format(avg_test_acc))
    time.sleep(2)
    
    return avg_test_loss, avg_test_acc

In [7]:
class GoogleVec(object):
    

    def __init__(self, path = './GoogleNews-vectors-negative300.bin'):

        self.model = gensim.models.KeyedVectors.load_word2vec_format(path, unicode_errors = 'ignore', binary = True)
        self.vocab = set(self.model.index_to_key)
        self.dct = {c: '' for c in string.punctuation.replace('.', '')}

    def transform(self, X, pad = 0):

        embeddings = []
        max_len = 0

        for x in X:

            table = x.maketrans(self.dct)
            x = x.translate(table)
            v = self.vectorize(x)
            max_len = max(max_len, v.shape[-1])
            embeddings.append(v)

        padded_embeddings = torch.full(size = (len(X), self.model.vector_size, max_len), \
                                       fill_value = pad,                                 \
                                       dtype = torch.float)

        for i, v in enumerate(embeddings):

            padded_embeddings[i, :, :v.shape[-1]] = v
            
        return padded_embeddings

    def vectorize(self, text):
    
        words = [word for word in text.split() if word]
        w_idx = 0
        vectorized = []

        while w_idx < len(words):

            w0 = words[w_idx]
            w1 = words[w_idx+1] if w_idx+1 < len(words) else False
            w2 = words[w_idx+2] if w_idx+2 < len(words) else False

            if w2:
                s = '_'.join([w0, w1, w2])
                if s in self.vocab:
                    vectorized.append(self.model[s])
                    w_idx += 3
                    continue

            if w1:
                s = '_'.join([w0, w1])
                if s in self.vocab:
                    vectorized.append(self.model[s])
                    w_idx += 2
                    continue

            if w0 in self.vocab:
                vectorized.append(self.model[w0])
            elif w0.lower() in self.vocab:
                vectorized.append(self.model[w0.lower()])
            else:
                vectorized.append(self.model['</s>'])
            w_idx += 1
        
        return torch.tensor(vectorized, dtype = torch.float).transpose(0, 1)

In [8]:
train_bodies_path = "./train_bodies.csv"  
train_stances_path = "./train_stances.csv"
test_bodies_path = "./test_bodies.csv"    
test_stances_path = "./test_stances.csv"  
embedding_dim = 300                       
n_hidden = 256                            
pool_kernel_size = 2                      
dropout_rate = 0.5                       
batch_size = 16                      
epochs = 2                                
learning_rate = 5e-4
vecs = GoogleVec()

In [9]:
# Importing data
print("Loading datasets")

train_dataset = Dataset(bodies_path = train_bodies_path, \
                        stances_path = train_stances_path)

test_dataset = Dataset(bodies_path = test_bodies_path, \
                       stances_path = test_stances_path)

Loading datasets


In [10]:
# kwargs for using GPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Using GPU"  if use_cuda else "GPU not found")
kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

Using GPU


In [11]:
train_loader = data.DataLoader(dataset = train_dataset,                           \
                               batch_size = batch_size,                           \
                               collate_fn = lambda x: collate_fn(x, vecs = vecs), \
                               shuffle = True)

test_loader = data.DataLoader(dataset = test_dataset,                            \
                              batch_size = batch_size,                           \
                              collate_fn = lambda x: collate_fn(x, vecs = vecs), \
                              shuffle = False)

In [12]:
# Setting up the model
print("Setting up the model")

model = Model(embedding_dim = embedding_dim,       \
              n_hidden = n_hidden,                 \
              pool_kernel_size = pool_kernel_size, \
              dropout_rate = dropout_rate).to(device)

print("Total model parameters =", sum([param.nelement() for param in model.parameters()]))

Setting up the model
Total model parameters = 6465796


In [13]:
# Optimizer, loss criterion, and learning rate scheduler
print("Defining optimizer, loss criterion and learning rate scheduler")

optimizer = optim.AdamW(model.parameters(), learning_rate)
criterion = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.OneCycleLR(optimizer,                           \
                                          max_lr = learning_rate,              \
                                          steps_per_epoch = len(train_loader), \
                                          epochs = epochs,                     \
                                          anneal_strategy = "linear")

Defining optimizer, loss criterion and learning rate scheduler


In [14]:
# Training the model
print("Training")

train_losses, test_losses, train_acc, test_acc = [], [], [], []

for epoch in range(1, epochs+1):

    print("\nEpoch", epoch)
    time.sleep(2)

    avg_train_loss, avg_train_acc = train(model, device, train_loader, criterion, optimizer, scheduler)
    train_losses.append(avg_train_loss)
    train_acc.append(avg_train_acc)

    avg_test_loss, avg_test_acc = test(model, device, test_loader, criterion)
    test_losses.append(avg_test_loss)
    test_acc.append(avg_test_acc)

Training

Epoch 1


100%|██████████| 3928/3928 [15:07<00:00,  4.33it/s]


Training set - Average loss = 1.0135
Training set - Accuracy = 0.7329


100%|██████████| 784/784 [02:50<00:00,  4.61it/s]


Testing set - Average loss = 1.0392
Testing set - Accuracy = 0.7047

Epoch 2


100%|██████████| 3928/3928 [15:08<00:00,  4.32it/s]


Training set - Average loss = 1.0108
Training set - Accuracy = 0.7329


100%|██████████| 784/784 [02:48<00:00,  4.65it/s]


Testing set - Average loss = 1.0392
Testing set - Accuracy = 0.7047


In [15]:
torch.save(model.state_dict, 'model_state_dict.pt')
torch.save(model, 'model.pt')