# Imports

In [None]:
%%capture
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch.utils.data as data_utils
import torch.optim as optim
import gc #garbage collector for gpu memory 
from tqdm import tqdm
import json

from transformers import BertForSequenceClassification, BertTokenizer, DistilBertModel, DistilBertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig, AutoModel
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from google.colab import drive
from google.colab import files
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Loading Data

In [None]:
# Load the cleaned article dumps. The context managers close each file as soon
# as it has been parsed instead of leaving the handles open for the whole session.
with open("gdrive/MyDrive/BT4222/Data/politifact_clean.json", "r") as politifact_file:
    politifact_data = json.load(politifact_file)
with open("gdrive/MyDrive/BT4222/Data/gossipcop_clean.json", "r") as gossipcop_file:
    gossipcop_data = json.load(gossipcop_file)

# Convert list of json objects to dataframe
politifact_df = pd.DataFrame(politifact_data)
gossipcop_df = pd.DataFrame(gossipcop_data)

# Convert labels to integers: 1 when the label is 'real', 0 for anything else
politifact_df['target'] = (politifact_df['label'] == 'real').astype(int)
gossipcop_df['target'] = (gossipcop_df['label'] == 'real').astype(int)

# Label to differentiate politifact (1) and gossipcop (0)
politifact_df['is_pf'] = 1
gossipcop_df['is_pf'] = 0

# Concatenate the two datasets (politifact rows first, original row labels kept)
concat_df = pd.concat([politifact_df,gossipcop_df])

# Tokenizing

In [None]:
def tokenize(df, index_padded=None):
    """Turn the article text in ``df`` into fixed-length DistilBERT inputs.

    Every text is word-piece tokenized, truncated so that it fits between a
    leading ``[CLS]`` and a trailing ``[SEP]`` token, converted to vocabulary
    ids and right-padded with id 0 up to 512 entries.

    Args:
        df: DataFrame providing the ``text``, ``target`` and ``is_pf`` columns.
        index_padded: Unused. Kept only so existing calls that pass it keep
            working; the padded ids are always recomputed from ``df``.

    Returns:
        A 4-tuple ``(index_padded, mask_variable, target_variable, article_flag)``:
        ``index_padded`` is an integer array with one row of 512 ids per text,
        ``mask_variable`` is a list of lists of floats (1.0 where the id is
        greater than 0, else 0.0), ``target_variable`` is ``df['target'].values``
        and ``article_flag`` is the single-column DataFrame ``df[['is_pf']]``.
    """
    totalpadlength = 512
    # Two positions are reserved for the [CLS] and [SEP] markers.
    max_body_tokens = totalpadlength - 2

    # Get tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Tokenize text
    print("Tokenizing")
    tokenized_df = [
        ['[CLS]'] + tokenizer.tokenize(text)[:max_body_tokens] + ['[SEP]']
        for text in tqdm(df['text'])
    ]

    # Get token index
    indexed_tokens = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_df]

    # Pad every sequence on the right with id 0
    index_padded = np.array([ids + [0] * (totalpadlength - len(ids)) for ids in indexed_tokens])

    target_variable = df['target'].values
    article_flag = df[['is_pf']]

    # Attention mask: 1.0 for every id above 0, 0.0 for the padding id.
    # Computed in one vectorized comparison instead of a Python loop over every
    # token; .tolist() keeps the original list-of-lists-of-floats return type.
    # NOTE(review): this treats id 0 as the padding id of the tokenizer's vocabulary.
    mask_variable = (index_padded > 0).astype(float).tolist()

    return index_padded, mask_variable, target_variable, article_flag

def format_tensors(text_data, mask, labels, batch_size, flag, shuffle=False):
    """Bundle the model inputs into a ``DataLoader``.

    Args:
        text_data: NumPy array of token ids, one row per sample.
        mask: Attention mask aligned with ``text_data`` (nested list or array).
        labels: NumPy array of integer class labels, one per sample.
        batch_size: Number of samples per batch.
        flag: NumPy array with one numeric flag per sample.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to ``False``, i.e. samples are served in the given order.

    Returns:
        A ``DataLoader`` whose batches are ``(token_ids, mask, flag, labels)``
        tuples; ids, flags and labels are int64 tensors, the mask is float32.
    """
    X = torch.from_numpy(text_data).long()
    # Pin the dtype so a NumPy float64 mask and a nested list of Python floats
    # both end up as the same float32 tensor.
    mask = torch.as_tensor(mask, dtype=torch.float32)
    y = torch.from_numpy(labels).long()
    numerical_data = torch.from_numpy(flag).long()

    tensordata = data_utils.TensorDataset(X, mask, numerical_data, y)
    return data_utils.DataLoader(tensordata, batch_size=batch_size, shuffle=shuffle)

def train_validation_test(index_padded, mask_variable, target_variable, article_flag, BATCH_SIZE = 8):
    """Split the tokenized data 70/15/15 and wrap each part in a DataLoader.

    Args:
        index_padded: Array of padded token ids, one row per sample.
        mask_variable: Attention masks aligned with ``index_padded``.
        target_variable: Array of integer labels, one per sample.
        article_flag: DataFrame with an ``is_pf`` column, one row per sample.
        BATCH_SIZE: Batch size used for all three loaders.

    Returns:
        ``(trainloader, validationloader, testloader)`` built by
        ``format_tensors``.
    """
    # First cut: 70% train, 30% held out. Passing every array to a single
    # train_test_split call applies one shared permutation, so ids, masks,
    # flags and labels are guaranteed to stay row-aligned.
    (X_train, X_rest,
     train_masks, rest_masks,
     X_flag_train, X_flag_rest,
     y_train, y_rest) = train_test_split(
        index_padded, mask_variable, article_flag, target_variable,
        test_size=0.3, random_state=42)

    # Second cut: halve the held-out part into validation and test sets.
    (X_val, X_test,
     val_masks, test_masks,
     X_flag_val, X_flag_test,
     y_val, y_test) = train_test_split(
        X_rest, rest_masks, X_flag_rest, y_rest,
        test_size=0.5, random_state=42)

    # The flag is carried as a one-column DataFrame; reduce it to a 1-D array.
    X_flag_train = X_flag_train.is_pf.to_numpy()
    X_flag_val = X_flag_val.is_pf.to_numpy()
    X_flag_test = X_flag_test.is_pf.to_numpy()

    trainloader = format_tensors(X_train, train_masks, y_train, BATCH_SIZE, X_flag_train)
    validationloader = format_tensors(X_val, val_masks, y_val, BATCH_SIZE, X_flag_val)
    testloader = format_tensors(X_test, test_masks, y_test, BATCH_SIZE, X_flag_test)

    return trainloader, validationloader, testloader

In [None]:
# Folder (relative to the working directory) for this experiment.
# NOTE(review): base_path is not referenced anywhere else in the visible cells;
# presumably it is meant for saving artifacts — confirm before removing.
base_path = "gdrive/MyDrive/BT4222/Code/machine_learning/xp/combined/"
# Tokenize data and obtain necessary input features
# (padded token ids, attention masks, labels and the is_pf flag frame)
index_padded, mask_variable, target_variable, article_flag = tokenize(concat_df)
# Create train validation test loaders (default batch size of 8)
trainloader, validationloader, testloader = train_validation_test(index_padded, mask_variable, target_variable, article_flag)

Tokenizing


# Model Creation

In [None]:
class BertAndFlag(torch.nn.Module):
    """
    Transformer backbone with a linear classification head that also sees
    extra per-sample features.

    The first-token embedding of the backbone's last hidden state is
    concatenated with the extra features and mapped to two logits.
    """
    def __init__(self, *, num_extra_dims=1):
        """
        num_extra_dims: number of extra feature columns appended to the
        embedding (keyword-only; defaults to the single flag used here).
        """
        super().__init__()
        self.transformer = AutoModel.from_pretrained('distilbert-base-uncased') #Article transformer
        # Hidden size is read from the backbone config so the head follows the model.
        num_hidden_size = self.transformer.config.hidden_size
        self.classifier = torch.nn.Linear(num_hidden_size + num_extra_dims, 2)

    def forward(self, input_ids, extra_data, attention_mask=None, labels=None):
        """
        input_ids: token ids of shape [batch_size, sequence length]
        extra_data: extra features, either of shape [batch_size] (one value
            per sample) or [batch_size, dim]; dim must match the
            num_extra_dims the model was built with
        attention_mask: optional mask forwarded to the backbone
        labels: accepted for call compatibility, not used

        Returns the logits of shape [batch_size, 2].
        """
        hidden_states = self.transformer(input_ids=input_ids, attention_mask=attention_mask) # [batch size, sequence length, hidden size]
        cls_embeds = hidden_states.last_hidden_state[:, 0, :] # [batch size, hidden size]

        # A flat vector holds one feature per sample; give it a feature axis.
        if extra_data.dim() == 1:
            extra_data = extra_data.unsqueeze(dim=-1)
        # Integer flags must match the embedding's dtype/device before the
        # concatenation instead of relying on implicit type promotion.
        extra_data = extra_data.to(device=cls_embeds.device, dtype=cls_embeds.dtype)

        concat = torch.cat((cls_embeds, extra_data), dim=-1) # [batch size, hidden size+num extra dims]
        return self.classifier(concat)

In [None]:
def weighted_loss(loss_function, outputs, labels, is_pf, weights):
    '''
    Loss in which every sample is weighted by the dataset it comes from.

    loss_function: criterion called as loss_function(outputs, labels)
    outputs: model outputs of shape [batch_size, 2]
    labels: 1D tensor of shape [batch_size] with the class indices
    is_pf: 1D tensor of shape [batch_size], 1 for pf samples and 0 for gc
    weights: 1D tensor of shape [2] where first value corresponds to weight
    for pf and second for gc

    Returns a scalar tensor. When the criterion can provide one loss per
    sample (it exposes reduction == 'mean', or it already returns unreduced
    losses) the result is mean(weight_i * loss_i). For any other criterion
    that returns a scalar, the scalar is scaled by the mean weight of the batch.
    '''
    reduction = getattr(loss_function, 'reduction', None)
    if reduction == 'mean':
        # Ask the criterion for unreduced losses so each sample can receive
        # its own weight; the original setting is always restored.
        loss_function.reduction = 'none'
        try:
            loss = loss_function(outputs, labels)
        finally:
            loss_function.reduction = reduction
    else:
        loss = loss_function(outputs, labels)

    # Select weights[0] for pf rows and weights[1] for gc rows.
    weight_vec = (is_pf*weights[0] + (1-is_pf)*weights[1]).to(loss.dtype)

    if loss.dim() == 0:
        # Already reduced to a scalar: per-sample weighting is impossible,
        # so scale the batch loss by the average weight.
        return weight_vec.mean()*loss
    return (weight_vec*loss).mean()

In [None]:
def _subset_metrics(truth, preds, pred_proba, loss):
    """Build the accuracy / ROC AUC / F1 / loss dictionary for one group of samples."""
    nan = float('nan')
    if len(truth) == 0:
        return {'accuracy': nan, 'roc_auc': nan, 'f1': nan, 'loss': loss}
    return {
        'accuracy': accuracy_score(truth, preds),
        # ROC AUC is undefined (and roc_auc_score raises) with a single class.
        'roc_auc': roc_auc_score(truth, pred_proba) if len(np.unique(truth)) > 1 else nan,
        'f1': f1_score(truth, preds),
        'loss': loss
    }


def _print_metrics(label, values):
    """Print one metrics dictionary; all scores are fractions in [0, 1]."""
    print(f'{label} Accuracy: {values["accuracy"]:.2f}')
    print(f'{label} ROC AUC: {values["roc_auc"]:.2f}')
    print(f'{label} F1: {values["f1"]:.2f}')
    print(f'{label} loss: {values["loss"]}\n')


def _run_pass(model, loader, device, loss_function, optimizer=None, weight=None):
    """Run the model once over ``loader``; trains when an optimizer is given.

    Returns a dictionary with the per-batch losses and per-sample arrays of
    predictions, labels, class-1 probabilities, cross-entropy losses and flags.
    """
    training = optimizer is not None
    model.train(training)

    batch_losses = []
    preds, truth, pred_proba, sample_loss, flag = [], [], [], [], []
    with torch.set_grad_enabled(training):
        for batch in tqdm(loader):
            token_ids, masks, numerical, labels = tuple(t.to(device) for t in batch)
            if training:
                optimizer.zero_grad()
            outputs = model(input_ids=token_ids, extra_data=numerical, attention_mask=masks, labels=labels)
            if training:
                loss = weighted_loss(loss_function, outputs, labels, numerical, weight)
                loss.backward()
                optimizer.step()
            else:
                loss = loss_function(outputs, labels)
            batch_losses.append(float(loss.item()))

            # Metrics for batch. The model emits two logits trained with
            # cross-entropy, so the class-1 probability is the softmax over
            # both logits, not a sigmoid of the second logit alone.
            logits = outputs.detach()
            prediction_proba = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            pred_proba.extend(prediction_proba)
            preds.extend((prediction_proba > 0.5).astype(int))
            truth.extend(labels.long().cpu().numpy().astype(int))
            flag.extend(numerical.long().cpu().numpy().astype(int))
            # One unweighted loss per sample, so that per-dataset losses can
            # be averaged over exactly the samples of that dataset.
            sample_loss.extend(nn.functional.cross_entropy(logits, labels, reduction='none').cpu().numpy())

    # Release cached GPU memory once per pass; doing a full garbage collection
    # after every batch only slowed the loop down.
    torch.cuda.empty_cache() #memory
    gc.collect() #memory

    return {
        'batch_losses': batch_losses,
        'preds': np.asarray(preds),
        'truth': np.asarray(truth),
        'pred_proba': np.asarray(pred_proba),
        'sample_loss': np.asarray(sample_loss),
        'flag': np.asarray(flag),
    }


def _log_phase(epoch, phase, key, results, metrics):
    """Print and store the overall and per-dataset metrics of one pass."""
    nan = float('nan')
    batch_losses = results['batch_losses']
    # Overall loss: mean of the batch losses that were actually optimised/evaluated.
    overall_loss = sum(batch_losses)/len(batch_losses) if batch_losses else nan
    overall = _subset_metrics(results['truth'], results['preds'], results['pred_proba'], overall_loss)
    print(f'Epoch {epoch}:')
    _print_metrics(phase, overall)
    metrics[key] = overall

    # Per-dataset breakdown, selected through the is_pf flag of every sample.
    for source, flag_value in (('Politifact', 1), ('Gossipcop', 0)):
        selected = results['flag'] == flag_value
        subset_loss = float(results['sample_loss'][selected].mean()) if selected.any() else nan
        subset = _subset_metrics(
            results['truth'][selected],
            results['preds'][selected],
            results['pred_proba'][selected],
            subset_loss,
        )
        _print_metrics(f'{source} {phase}', subset)
        metrics[f'{source.lower()}_{key}'] = subset


def train_model(epochs, model, learning_rate, start_from_epoch=1):
    """Fine-tune ``model`` and report train/validation metrics after every epoch.

    Reads the notebook-level ``trainloader``, ``validationloader``,
    ``concat_df``, ``politifact_df`` and ``gossipcop_df`` and calls
    ``weighted_loss`` for the training objective. Validation uses the plain
    (unweighted) cross-entropy.

    Args:
        epochs: Number of the last epoch to run (inclusive).
        model: Module called as ``model(input_ids=..., extra_data=...,
            attention_mask=..., labels=...)`` that returns two logits per sample.
        learning_rate: Learning rate for the Adam optimizer.
        start_from_epoch: Number of the first epoch to run. A fresh optimizer
            is created regardless of this value.

    Returns:
        A list with one dictionary per epoch holding ``'epoch'`` plus the
        metric dictionaries ``'train'``, ``'politifact_train'``,
        ``'gossipcop_train'``, ``'validation'``, ``'politifact_validation'``
        and ``'gossipcop_validation'`` (each with accuracy, roc_auc, f1, loss).
        The overall loss is the mean batch loss; the per-dataset loss is the
        mean per-sample cross-entropy of that dataset's samples. Metrics that
        cannot be computed (no samples, or a single class for ROC AUC) are NaN.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache() #memory
    gc.collect() #memory
    loss_function = nn.CrossEntropyLoss()
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Loss weight: inverse-frequency weights so both datasets contribute
    # equally. Index 0 is politifact, index 1 is gossipcop.
    total_rows = concat_df.shape[0]
    weight = torch.tensor([
        total_rows/(2*politifact_df.shape[0]),
        total_rows/(2*gossipcop_df.shape[0]),
    ]).to(device)

    history = []
    for epoch in range(start_from_epoch, epochs+1):
        metrics = {}

        # Training
        train_results = _run_pass(model, trainloader, device, loss_function, optimizer=optimizer, weight=weight)
        _log_phase(epoch, 'Training', 'train', train_results, metrics)

        # Validation
        validation_results = _run_pass(model, validationloader, device, loss_function)
        _log_phase(epoch, 'Validation', 'validation', validation_results, metrics)

        history.append({'epoch': epoch, **metrics})

    return history


In [None]:
# BertAndFlag's constructor accepts no positional arguments; passing
# torch.nn.Module to it raises a TypeError.
model = BertAndFlag()

In [None]:
# Fine-tune for epochs 1 through 10 with a learning rate of 1e-5.
# NOTE(review): the saved output below only shows epochs 4-6 and ends in a
# KeyboardInterrupt, so it does not appear to come from a fresh run of this call.
train_model(10, model, 1e-5, start_from_epoch=1)

100%|██████████| 1838/1838 [38:19<00:00,  1.25s/it]


Epoch 4:
Training Accuracy: 0.92%
Training ROC AUC: 0.96%
Training F1: 0.95%
Training loss: 0.15875153909990933%

Politifact Training Accuracy: 0.86%
Politifact Training ROC AUC: 0.98%
Politifact Training F1: 0.88%
Politifact Training loss: 0.1529859317190669%

Gossipcop Training Accuracy: 0.92%
Gossipcop Training ROC AUC: 0.95%
Gossipcop Training F1: 0.95%
Gossipcop Training loss: 0.15901734114763172%



100%|██████████| 394/394 [03:19<00:00,  1.97it/s]


Epoch 4:
Validation Accuracy: 0.83%
Validation ROC AUC: 0.84%
Validation F1: 0.89%
Validation loss: 0.5000742741855722%

Politifact Validation Accuracy: 0.79%
Politifact Validation ROC AUC: 0.82%
Politifact Validation F1: 0.86%
Politifact Validation loss: 0.3651072084903717%

Gossipcop Validation Accuracy: 0.86%
Gossipcop Validation ROC AUC: 0.86%
Gossipcop Validation F1: 0.91%
Gossipcop Validation loss: 0.5050467450269743%



100%|██████████| 1838/1838 [38:18<00:00,  1.25s/it]


Epoch 5:
Training Accuracy: 0.94%
Training ROC AUC: 0.97%
Training F1: 0.96%
Training loss: 0.12275895621724223%

Politifact Training Accuracy: 0.91%
Politifact Training ROC AUC: 0.98%
Politifact Training F1: 0.92%
Politifact Training loss: 0.137535247106657%

Gossipcop Training Accuracy: 0.94%
Gossipcop Training ROC AUC: 0.97%
Gossipcop Training F1: 0.96%
Gossipcop Training loss: 0.12207774986434378%



100%|██████████| 394/394 [03:20<00:00,  1.96it/s]


Epoch 5:
Validation Accuracy: 0.84%
Validation ROC AUC: 0.85%
Validation F1: 0.90%
Validation loss: 0.5581347529875672%

Politifact Validation Accuracy: 0.71%
Politifact Validation ROC AUC: 0.78%
Politifact Validation F1: 0.82%
Politifact Validation loss: 0.37202364176378716%

Gossipcop Validation Accuracy: 0.86%
Gossipcop Validation ROC AUC: 0.86%
Gossipcop Validation F1: 0.91%
Gossipcop Validation loss: 0.5649914781379171%



100%|██████████| 1838/1838 [38:19<00:00,  1.25s/it]


Epoch 6:
Training Accuracy: 0.95%
Training ROC AUC: 0.98%
Training F1: 0.97%
Training loss: 0.10548447246436965%

Politifact Training Accuracy: 0.93%
Politifact Training ROC AUC: 0.98%
Politifact Training F1: 0.93%
Politifact Training loss: 0.11124854854934699%

Gossipcop Training Accuracy: 0.95%
Gossipcop Training ROC AUC: 0.98%
Gossipcop Training F1: 0.97%
Gossipcop Training loss: 0.10521874101139118%



100%|██████████| 394/394 [03:23<00:00,  1.94it/s]


Epoch 6:
Validation Accuracy: 0.84%
Validation ROC AUC: 0.83%
Validation F1: 0.90%
Validation loss: 0.628752222431228%

Politifact Validation Accuracy: 0.71%
Politifact Validation ROC AUC: 0.70%
Politifact Validation F1: 0.82%
Politifact Validation loss: 0.3798285395439182%

Gossipcop Validation Accuracy: 0.86%
Gossipcop Validation ROC AUC: 0.84%
Gossipcop Validation F1: 0.91%
Gossipcop Validation loss: 0.6379230949586552%



 19%|█▉        | 350/1838 [07:21<31:14,  1.26s/it]


KeyboardInterrupt: ignored