# Imports

In [None]:
%%capture
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch.utils.data as data_utils
import torch.optim as optim
import gc #garbage collector for gpu memory 
from tqdm import tqdm
import json

from transformers import BertForSequenceClassification, BertTokenizer, DistilBertModel, DistilBertTokenizer
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from google.colab import drive
from google.colab import files
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Loading Data

In [None]:
# Read the cleaned article dumps. The `with` blocks close each file handle as
# soon as it has been parsed, and the encoding is pinned so the result does not
# depend on the platform default.
with open("gdrive/MyDrive/BT4222/Data/politifact_clean.json", "r", encoding="utf-8") as politifact_file:
    politifact_data = json.load(politifact_file)
with open("gdrive/MyDrive/BT4222/Data/gossipcop_clean.json", "r", encoding="utf-8") as gossipcop_file:
    gossipcop_data = json.load(gossipcop_file)

# Convert list of json objects to dataframe
politifact_df = pd.DataFrame(politifact_data)
gossipcop_df = pd.DataFrame(gossipcop_data)

# Convert labels to integers: 'real' -> 1, every other label -> 0
politifact_df['target'] = politifact_df['label'].eq('real').astype(int)
gossipcop_df['target'] = gossipcop_df['label'].eq('real').astype(int)

# Tokenizing

In [None]:
def tokenize(df, max_length=100):
    """Turn the `title` column of `df` into fixed-length DistilBERT inputs.

    Each title is split into word pieces, wrapped in '[CLS]' ... '[SEP]',
    converted to vocabulary ids and right-padded to `max_length`.

    Args:
        df: DataFrame with a 'title' column (text) and a 'target' column.
        max_length: Length of every returned id sequence, special tokens
            included. Must be at least 2.

    Returns:
        index_padded: numpy array of token ids, one row of `max_length` ids
            per title.
        mask_variable: list of lists of floats, 1.0 where the id is a real
            token and 0.0 where it is padding.
        target_variable: `df['target'].values`.

    Raises:
        ValueError: if `max_length` leaves no room for the two special tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit [CLS] and [SEP]")

    # Get tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    pad_id = tokenizer.pad_token_id

    # Reserve two slots for [CLS] and [SEP]; truncating to the full length
    # first would let long titles overflow the padded width and produce
    # ragged rows.
    max_word_pieces = max_length - 2

    # tokenize text
    print("Tokenizing")
    tokenized_df = [
        ['[CLS]'] + tokenizer.tokenize(title)[:max_word_pieces] + ['[SEP]']
        for title in tqdm(df['title'])
    ]

    # Get token index
    indexed_tokens = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_df]

    # Pad tokens
    index_padded = np.array(
        [ids + [pad_id] * (max_length - len(ids)) for ids in indexed_tokens]
    )
    target_variable = df['target'].values

    # Mask: attend to everything that is not the padding id
    mask_variable = (index_padded != pad_id).astype(float).tolist()

    return index_padded, mask_variable, target_variable

def format_tensors(text_data, mask, labels, batch_size):
    """Wrap token ids, attention masks and labels in a sequential DataLoader.

    Args:
        text_data: numpy array of token ids; converted to an int64 tensor.
        mask: attention mask values accepted by `torch.tensor`.
        labels: numpy array of labels; converted to an int64 tensor.
        batch_size: number of rows per batch.

    Returns:
        A DataLoader over (token ids, mask, labels) that iterates in the
        original row order (no shuffling).
    """
    token_ids = torch.from_numpy(text_data).long()
    attention = torch.tensor(mask)
    targets = torch.from_numpy(labels).long()

    dataset = data_utils.TensorDataset(token_ids, attention, targets)
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(index_padded, mask_variable, target_variable, BATCH_SIZE = 8):
    """Split the tokenized data 70/15/15 and wrap each part in a DataLoader.

    Args:
        index_padded: padded token-id rows, one per example.
        mask_variable: attention-mask rows aligned with `index_padded`.
        target_variable: labels aligned with `index_padded`.
        BATCH_SIZE: batch size used for all three loaders.

    Returns:
        (trainloader, validationloader, testloader)
    """
    # Split ids, masks and labels in ONE call so they are guaranteed to stay
    # row-aligned, instead of relying on separate calls that merely share a seed.
    X_train, X_rest, train_masks, rest_masks, y_train, y_rest = train_test_split(
        index_padded, mask_variable, target_variable, test_size=0.3, random_state=42
    )

    # Halve the held-out 30% into validation and test sets
    X_val, X_test, val_masks, test_masks, y_val, y_test = train_test_split(
        X_rest, rest_masks, y_rest, test_size=0.5, random_state=42
    )

    trainloader = format_tensors(X_train, train_masks, y_train, BATCH_SIZE)
    validationloader = format_tensors(X_val, val_masks, y_val, BATCH_SIZE)
    testloader = format_tensors(X_test, test_masks, y_test, BATCH_SIZE)

    return trainloader, validationloader, testloader

# Choose type of article to run

In [None]:
# Choose gossipcop or politifact
article = "gossipcop"
index_padded, mask_variable, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(index_padded, mask_variable, target_variable)

Tokenizing


100%|██████████| 20049/20049 [00:12<00:00, 1640.36it/s]


# Model Creation

In [None]:
def validation_metrics(model, device, dataloader):
    """Evaluate `model` on `dataloader` and return (accuracy, ROC AUC, F1).

    Args:
        model: classifier whose output exposes 'logits' with at least two
            columns; column 1 is treated as the positive class.
        device: torch device the batches are moved to. The model itself is
            not moved here.
        dataloader: yields (token_ids, attention_mask, labels) batches.

    Returns:
        Tuple of `accuracy_score`, `roc_auc_score` and `f1_score` over every
        example in `dataloader`.
    """
    model.eval()
    preds, truth, pred_proba = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            token_ids, masks, labels = tuple(t.to(device) for t in batch)
            logits = model(input_ids=token_ids, attention_mask=masks)['logits']

            # The class-1 probability has to be normalised against the other
            # logit(s); a sigmoid of column 1 alone ignores the class-0 logit.
            prediction_proba = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            prediction = (prediction_proba > 0.5).astype(int)
            baseline = labels.long().cpu().numpy().astype(int)

            preds.extend(prediction)
            pred_proba.extend(prediction_proba)
            truth.extend(baseline)
            del token_ids, masks, labels #memory

    torch.cuda.empty_cache() #memory
    gc.collect() # memory
    return accuracy_score(truth, preds), roc_auc_score(truth, pred_proba), f1_score(truth, preds)
        

In [None]:
def _run_epoch(model, loader, device, optimizer=None):
    """Make one pass over `loader`; train if `optimizer` is given, else evaluate.

    Returns a dict with 'accuracy', 'roc_auc', 'f1' and 'loss', where 'loss'
    is the mean of the per-batch losses reported by the model.
    """
    training = optimizer is not None
    model.train(training)

    epoch_loss = 0.0
    num_batches = 0
    preds, truth, pred_proba = [], [], []

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader):
            num_batches += 1
            token_ids, masks, labels = tuple(t.to(device) for t in batch)

            if training:
                optimizer.zero_grad()
            outputs = model(input_ids=token_ids, attention_mask=masks, labels=labels)
            loss = outputs['loss']
            logits = outputs['logits']
            if training:
                loss.backward()
                optimizer.step()

            # Metrics for batch. The class-1 probability is the softmax over
            # the logits; a sigmoid of column 1 alone ignores the class-0 logit.
            epoch_loss += float(loss.item())
            prediction_proba = torch.softmax(logits.detach(), dim=1)[:, 1].cpu().numpy()
            prediction = (prediction_proba > 0.5).astype(int)
            baseline = labels.long().cpu().numpy().astype(int)
            preds.extend(prediction)
            pred_proba.extend(prediction_proba)
            truth.extend(baseline)

            del token_ids, masks, labels #memory

    # Release cached GPU memory once per pass rather than after every batch,
    # which only slows the loop down.
    torch.cuda.empty_cache() #memory
    gc.collect() #memory

    if num_batches == 0:
        raise ValueError("Cannot compute metrics: the data loader produced no batches")

    return {
        'accuracy': accuracy_score(truth, preds),
        'roc_auc': roc_auc_score(truth, pred_proba),
        'f1': f1_score(truth, preds),
        'loss': epoch_loss / num_batches,
    }


def _print_metrics(split_name, metrics):
    """Print one split's metrics; only accuracy is rendered as a percentage."""
    print(f"{split_name} Accuracy: {metrics['accuracy']:.2%}")
    print(f"{split_name} ROC AUC: {metrics['roc_auc']:.2f}")
    print(f"{split_name} F1: {metrics['f1']:.2f}")
    print(f"{split_name} loss: {metrics['loss']:.4f}\n")


def train_model(epochs, model, learning_rate, start_from_epoch=1,
                train_loader=None, validation_loader=None):
    """Fine-tune `model` with Adam and report train/validation metrics per epoch.

    Args:
        epochs: number of the last epoch to run (inclusive).
        model: classifier that accepts `input_ids`, `attention_mask` and
            `labels` and exposes 'loss' and 'logits' on its output.
        learning_rate: learning rate passed to `optim.Adam`.
        start_from_epoch: number of the first epoch to run. This only shifts
            the epoch counter; the optimizer is always created fresh here.
        train_loader: training DataLoader. Defaults to the notebook-level
            `trainloader`.
        validation_loader: validation DataLoader. Defaults to the
            notebook-level `validationloader`.

    Returns:
        A list with one dict per epoch:
        {'epoch': int, 'train': {...}, 'validation': {...}}, where each inner
        dict holds 'accuracy', 'roc_auc', 'f1' and 'loss'.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Fall back to the loaders built earlier in the notebook when none are passed.
    if train_loader is None:
        train_loader = trainloader
    if validation_loader is None:
        validation_loader = validationloader

    torch.cuda.empty_cache() #memory
    gc.collect() #memory

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    history = []
    for epoch in range(start_from_epoch, epochs + 1):
        # Training
        train_metrics = _run_epoch(model, train_loader, device, optimizer)
        print(f'Epoch {epoch}:')
        _print_metrics('Training', train_metrics)

        # Validation
        validation_metrics_for_epoch = _run_epoch(model, validation_loader, device)
        _print_metrics('Validation', validation_metrics_for_epoch)

        history.append({
            'epoch': epoch,
            'train': train_metrics,
            'validation': validation_metrics_for_epoch,
        })

    return history

In [None]:
# Pretrained DistilBERT encoder with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

In [None]:
# Fine-tune through epoch 10 at a learning rate of 1e-5, counting epochs from 1.
train_model(epochs=10, model=model, learning_rate=1e-5, start_from_epoch=1)

100%|██████████| 1755/1755 [12:39<00:00,  2.31it/s]


Epoch 1:
Training Accuracy: 0.82%
Training ROC AUC: 0.83%
Training F1: 0.89%
Training loss: 0.406923114952029%



100%|██████████| 376/376 [01:48<00:00,  3.45it/s]


Validation Accuracy: 0.85%
Validation ROC AUC: 0.88%
Validation F1: 0.90%
Validation loss: 0.3495739454879089%



100%|██████████| 1755/1755 [12:51<00:00,  2.27it/s]


Epoch 2:
Training Accuracy: 0.88%
Training ROC AUC: 0.92%
Training F1: 0.92%
Training loss: 0.2937586132681098%



100%|██████████| 376/376 [01:52<00:00,  3.34it/s]


Validation Accuracy: 0.85%
Validation ROC AUC: 0.89%
Validation F1: 0.90%
Validation loss: 0.35457066210244403%



100%|██████████| 1755/1755 [12:52<00:00,  2.27it/s]


Epoch 3:
Training Accuracy: 0.92%
Training ROC AUC: 0.96%
Training F1: 0.95%
Training loss: 0.20758854888347733%



100%|██████████| 376/376 [01:45<00:00,  3.57it/s]


Validation Accuracy: 0.83%
Validation ROC AUC: 0.88%
Validation F1: 0.89%
Validation loss: 0.42230669840516405%



100%|██████████| 1755/1755 [12:24<00:00,  2.36it/s]


Epoch 4:
Training Accuracy: 0.95%
Training ROC AUC: 0.98%
Training F1: 0.97%
Training loss: 0.13685281862096166%



100%|██████████| 376/376 [01:46<00:00,  3.53it/s]


Validation Accuracy: 0.83%
Validation ROC AUC: 0.87%
Validation F1: 0.88%
Validation loss: 0.5059392544604402%



100%|██████████| 1755/1755 [12:21<00:00,  2.37it/s]


Epoch 5:
Training Accuracy: 0.96%
Training ROC AUC: 0.99%
Training F1: 0.98%
Training loss: 0.09991935837444746%



100%|██████████| 376/376 [01:45<00:00,  3.57it/s]


Validation Accuracy: 0.85%
Validation ROC AUC: 0.87%
Validation F1: 0.90%
Validation loss: 0.5723193709508199%



100%|██████████| 1755/1755 [12:21<00:00,  2.37it/s]


Epoch 6:
Training Accuracy: 0.97%
Training ROC AUC: 0.99%
Training F1: 0.98%
Training loss: 0.0806501167557372%



100%|██████████| 376/376 [01:44<00:00,  3.59it/s]


Validation Accuracy: 0.85%
Validation ROC AUC: 0.87%
Validation F1: 0.90%
Validation loss: 0.612450985575901%



100%|██████████| 1755/1755 [12:22<00:00,  2.36it/s]


Epoch 7:
Training Accuracy: 0.97%
Training ROC AUC: 1.00%
Training F1: 0.98%
Training loss: 0.06772559544032086%



100%|██████████| 376/376 [01:45<00:00,  3.55it/s]


Validation Accuracy: 0.83%
Validation ROC AUC: 0.86%
Validation F1: 0.89%
Validation loss: 0.6247016264112082%



100%|██████████| 1755/1755 [12:26<00:00,  2.35it/s]


Epoch 8:
Training Accuracy: 0.98%
Training ROC AUC: 1.00%
Training F1: 0.98%
Training loss: 0.061120408397468746%



100%|██████████| 376/376 [01:45<00:00,  3.58it/s]


Validation Accuracy: 0.83%
Validation ROC AUC: 0.85%
Validation F1: 0.89%
Validation loss: 0.6670782530693922%



100%|██████████| 1755/1755 [12:20<00:00,  2.37it/s]


Epoch 9:
Training Accuracy: 0.98%
Training ROC AUC: 1.00%
Training F1: 0.99%
Training loss: 0.05244513199957515%



100%|██████████| 376/376 [01:45<00:00,  3.55it/s]


Validation Accuracy: 0.84%
Validation ROC AUC: 0.86%
Validation F1: 0.90%
Validation loss: 0.6684694772199518%



100%|██████████| 1755/1755 [12:23<00:00,  2.36it/s]


Epoch 10:
Training Accuracy: 0.98%
Training ROC AUC: 1.00%
Training F1: 0.99%
Training loss: 0.04929513858344029%



100%|██████████| 376/376 [01:44<00:00,  3.59it/s]


Validation Accuracy: 0.84%
Validation ROC AUC: 0.86%
Validation F1: 0.90%
Validation loss: 0.6812157819931212%

