In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import os
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [42]:
class Dataset(data.Dataset):
    
    def __init__(self, bodies, stances):
        
        self.bodies = bodies
        self.stances = stances
    
    def __len__(self):
        
        return len(self.stances)
    
    def collate_fn(headline, article, stance, num_sentences = 7, transformer = "msmarco-distilbert-base-v2"):

        try:
            encoder = SentenceTransformer(transformer)
        except:
            raise ValueError("Transformer not identified.")

        tweet_encoding = encoder.encode(headline)
        sentences = article.split(", ")
        encoded_input = np.array([encoder.encode(sentence) for sentence in sentences] + tweet_encoding)
        assert encoded_input.shape == (num_sentences + 1, 768)

        output_mapper = {"agree": 0, "disagree": 1, "discuss":2, "unrelated": 3}
        encoded_output = output_mapper[stance]

        return encoded_input, encoded_output

    def __getitem__(self, idx):
                 
        if torch.is_tensor(idx):
            idx = idx.tolist()

        headline, body_id, stance = self.stances.loc[idx, :]
        article = self.bodies.loc[body_id, "articleBody"]

        return self.collate_fn(headline = headline, article = article, stance = stance)

In [37]:
class Model(nn.Module):
    
    
    def __init__(self, n_features = 8*768, 
                       n_hidden_layers = 2, 
                       hidden_layer_sizes = [1024, 64], 
                       num_classes = 4, 
                       dropout_rate = 0.1):
        
        super(Model, self).__init__()
        
        assert n_hidden_layers == len(hidden_layer_sizes)
        
        self.layers = [nn.Sequential(nn.Linear(n_features if n == 0 else hidden_layer_sizes[n-1], hidden_layer_sizes[n]),
                                     nn.GELU(),
                                     nn.Dropout(dropout_rate)) 
                       for n in range(n_hidden_layers)]
        
        self.output_layer = nn.Sequential(nn.Linear(hidden_layer_sizes[-1], num_classes),
                                          nn.Softmax(dim = 0))
        
    def forward(self, x):
        
        for layer in self.layers:
            x = layer(x)
            
        return self.output_layer(x)

In [38]:
def train(model, device, train_loader, criterion, optimizer, scheduler):
    
    # Put the model in training mode
    model.train()

    # List of train losses
    train_loss = []
    
    for data in tqdm(train_loader):
        
        # Load the data, and convert the tensor with the specified device
        encodings, labels = data
        encodings, labels = encodings.to(device), labels.to(device)

        # Forward pass
        output = model(encodings)

        # Set the gradients to 0
        optimizer.zero_grad()
        
        # Compute the loss, and with it, the gradients
        loss = criterion(outputs, labels)
        train_loss.append(loss.item())
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        scheduler.step()

    # Calculate average character error rate and average word error rate
    avg_train_loss = torch.mean(torch.tensor(train_loss))

    print("Training set - Average loss = {:.4f}".format(avg_train_loss))
    time.sleep(2)
    
    return avg_train_loss

In [39]:
def test(model, device, test_loader, criterion):
    
    # Put the model in training mode
    model.eval()

    # List of train losses
    train_loss = []
    
    for data in tqdm(test_loader):
        
        # Load the data, and convert the tensor with the specified device
        encodings, labels = data
        encodings, labels = encodings.to(device), labels.to(device)

        # Forward pass
        output = model(encodings)
        
        # Compute the loss, and with it, the gradients
        loss = criterion(outputs, labels)
        test_loss.append(loss.item())

    # Calculate average character error rate and average word error rate
    avg_test_loss = torch.mean(torch.tensor(test_loss))

    print("Training set - Average loss = {:.4f}".format(avg_test_loss))
    time.sleep(2)
    
    return avg_test_loss

In [40]:
def main(train_bodies_path = "./data/processed/train_bodies.csv", 
         train_stances_path = "./data/processed/train_stances.csv", 
         test_bodies_path = "./data/processed/test_bodies.csv",
         test_stances_path = "./data/processed/test_stances.csv",
         num_sentences = 7,
         n_hidden_layers = 2,
         hidden_layer_sizes = [1024, 64],
         num_classes = 4,
         epochs = 10,
         batch_size = 32,
         learning_rate = 5e-4, 
         dropout_rate = 0.1):

    # If current directory is not data, make one called data
    
    if not os.path.isdir("./data"):
        os.makedirs("./data")

    # Importing data
    
    print("Loading datasets")
    
    train_bodies = pd.read_csv(train_bodies_path, header = 0, index_col = 0)
    train_stances = pd.read_csv(train_stances_path, header = 0)
    train_dataset = Dataset(train_bodies, train_stances)
    
    test_bodies = pd.read_csv(test_bodies_path, header = 0, index_col = 0)
    test_stances = pd.read_csv(test_stances_path, header = 0)
    test_dataset = Dataset(test_bodies, test_stances)  
    
    # kwargs for using GPU
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Using GPU"  if use_cuda else "GPU not found")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    
    # Loading train data
    
    print("Loading train data")
    
    train_loader = data.DataLoader(dataset = train_dataset,
                                   batch_size = batch_size,
                                   shuffle = True)
    
    # Loading test data
    
    print("Loading test data")
    
    test_loader = data.DataLoader(dataset = test_dataset,
                                  batch_size = batch_size,
                                  shuffle = False)

    # Setting up the model
    
    print("Setting up the model")
    
    model = Model(n_features = (num_sentences+1)*768, 
                  n_hidden_layers = n_hidden_layers, 
                  hidden_layer_sizes = hidden_layer_sizes, 
                  num_classes = num_classes, 
                  dropout_rate = dropout_rate).to(device)

    print("Total model parameters =", sum([param.nelement() for param in model.parameters()]))

    # Optimizer, loss criterion, and learning rate scheduler
    
    print("Defining optimizer, loss criterion and learning rate scheduler")
    
    optimizer = optim.AdamW(model.parameters(), learning_rate)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, 
                                              max_lr = learning_rate, 
                                              steps_per_epoch = len(train_loader),
                                              epochs = epochs,
                                              anneal_strategy = "linear")
    
    # Training the model
    
    print("\nTraining")
    
    train_losses, test_losses = [], []
    
    for epoch in range(1, epochs+1):
        
        print("\nEpoch", epoch)
        
        time.sleep(2)
        train_losses.append(train(model, device, train_loader, criterion, optimizer, scheduler))
        test_losses.append(test(model, device, test_loader, criterion))

    return model, train_losses, test_losses

In [43]:
model, train_losses, test_losses = main()

Loading datasets
GPU not found
Loading train data
Loading test data
Setting up the model
Total model parameters = 260
Defining optimizer, loss criterion and learning rate scheduler

Training

Epoch 1


  0%|                                                                                         | 0/1691 [00:00<?, ?it/s]


TypeError: collate_fn() got multiple values for argument 'headline'