# Imports

In [None]:
%%capture
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import torch.utils.data as data_utils
import torch.optim as optim
import gc #garbage collector for gpu memory 
from tqdm import tqdm
import json
import datetime as dt

from transformers import BertForSequenceClassification, BertTokenizer, DistilBertModel, DistilBertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig, AutoModel
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from google.colab import drive
from google.colab import files
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Loading Data

In [None]:
# Read the cleaned article dumps. The `with` blocks close each file handle as
# soon as it has been parsed (the bare open() calls used before never closed them).
with open("gdrive/MyDrive/BT4222/Data/politifact_clean.json", "r") as politifact_file:
    politifact_data = json.load(politifact_file)
with open("gdrive/MyDrive/BT4222/Data/gossipcop_clean.json", "r") as gossipcop_file:
    gossipcop_data = json.load(gossipcop_file)

# Convert list of json objects to dataframe
politifact_df = pd.DataFrame(politifact_data)
gossipcop_df = pd.DataFrame(gossipcop_data)

# Convert labels to integers: 1 for 'real', 0 for every other label value
politifact_df['target'] = politifact_df['label'].apply(lambda x: 1 if x=='real' else 0)
gossipcop_df['target'] = gossipcop_df['label'].apply(lambda x: 1 if x=='real' else 0)

# Dataset-of-origin flag: 1 = politifact, 0 = gossipcop
politifact_df['is_pf'] = 1
gossipcop_df['is_pf'] = 0

# Combined frame; built here, so it only carries the columns that exist at this point
concat_df = pd.concat([politifact_df,gossipcop_df])

In [None]:
def _month_of(timestamp):
    """Return the zero-padded month ("01".."12") of a POSIX timestamp, or '0' when it is missing."""
    if pd.isna(timestamp):
        return '0'
    # fromtimestamp() without a tz argument converts to the machine's local time zone
    return dt.datetime.fromtimestamp(timestamp).strftime("%m")

for _frame in (politifact_df, gossipcop_df):
    _frame['parsed_month'] = _frame['publish_date'].apply(_month_of)

In [None]:
def _hour_of(timestamp):
    """Return the zero-padded hour ("00".."23") of a POSIX timestamp, or '0' when it is missing."""
    if pd.isna(timestamp):
        return '0'
    # fromtimestamp() without a tz argument converts to the machine's local time zone
    return dt.datetime.fromtimestamp(timestamp).strftime("%H")

for _frame in (politifact_df, gossipcop_df):
    _frame['parsed_hour'] = _frame['publish_date'].apply(_hour_of)

In [None]:
# Replace missing publishers with the literal string 'None' in both frames
for _frame in (politifact_df, gossipcop_df):
    _frame['publisher'] = _frame['publisher'].fillna('None')

In [None]:
# Dataset selector for the single-dataset experiments below: the value
# "politifact" picks politifact_df, any other value picks gossipcop_df.
article = 'gossipcop'

### Processing Other Features

We one-hot encode the publication hour, the publication month, and the publisher. Missing publisher values are filled with the string 'None'.

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Take the metadata columns from whichever dataset `article` selects;
# .copy() keeps later edits to X_extra from touching the source frame.
_extra_columns = ['parsed_hour','parsed_month', 'publisher']
_source_df = politifact_df if article == "politifact" else gossipcop_df
X_extra = _source_df[_extra_columns].copy()

In [None]:
# Cast every metadata column to str so the encoder treats all of them as categories
for _column in ('parsed_hour', 'parsed_month', 'publisher'):
    X_extra[_column] = X_extra[_column].astype('str')

In [None]:
# Fit the one-hot encoder on the metadata columns and densify the result.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros.
enc = OneHotEncoder(handle_unknown = 'ignore')
X_extra_enc = enc.fit_transform(X_extra).toarray()

# Experiment 1

### Tokenizing

In [None]:
def tokenize(df, index_padded=None):
    """Build padded token ids, attention masks, labels and dataset flags for `df`.

    Args:
        df: DataFrame with 'target' and 'is_pf' columns. A 'text' column is also
            read when `index_padded` is not supplied.
        index_padded: optional precomputed 2-D array of padded token ids, one row
            per row of `df`. When None, `df['text']` is tokenized with the
            DistilBERT tokenizer, truncated to 510 word pieces plus [CLS]/[SEP],
            and right-padded with 0 to length 512.

    Returns:
        (index_padded, mask_variable, target_variable, article_flag) where
        mask_variable is a list of lists holding 1.0 for ids > 0 and 0.0 for
        padding, target_variable is `df['target'].values` and article_flag is
        the single-column frame `df[['is_pf']]`.
    """
    print("Tokenizing")
    # The previous check `type(index_padded) == 'NoneType'` compared a type with a
    # string, so it was never true and the default (None) path crashed further down.
    if index_padded is None:
        # The tokenizer is only fetched when there are no cached ids to reuse.
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        tokenized_df = [['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'] for t in tqdm(df['text'])]
        # Get token index
        indexed_tokens = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_df]
        # Pad tokens
        totalpadlength = 512
        index_padded = np.array([xi+[0]*(totalpadlength-len(xi)) for xi in indexed_tokens])

    target_variable = df['target'].values
    article_flag = df[['is_pf']]

    # Mask: 1.0 for real tokens, 0.0 for padding (id 0)
    mask_variable = (np.asarray(index_padded) > 0).astype(float).tolist()

    return index_padded, mask_variable, target_variable, article_flag

def format_tensors(text_data, mask, labels, batch_size, flag):
    """Wrap numpy inputs in a non-shuffling DataLoader.

    Each batch yields (token ids, mask, flag, labels); ids, flag and labels are
    cast to int64, and the mask keeps whatever dtype torch.tensor infers.
    """
    token_ids = torch.from_numpy(text_data).long()
    attention = torch.tensor(mask)
    targets = torch.from_numpy(labels).long()
    dataset_flag = torch.from_numpy(flag).long()

    dataset = data_utils.TensorDataset(token_ids, attention, dataset_flag, targets)
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(index_padded, mask_variable, target_variable, article_flag, BATCH_SIZE = 8):
    """Split the inputs 70/15/15 and return (trainloader, validationloader, testloader).

    `article_flag` must expose an `is_pf` column; it is flattened to a 1-D numpy
    array before being handed to format_tensors.
    """
    # A single call applies the same shuffled indices (random_state=42) to every array.
    (X_train, X_rest,
     train_masks, rest_masks,
     flag_train, flag_rest,
     y_train, y_rest) = train_test_split(
        index_padded, mask_variable, article_flag, target_variable,
        test_size=0.3, random_state=42)

    # Halve the held-out 30% into validation and test.
    (X_val, X_test,
     val_masks, test_masks,
     flag_val, flag_test,
     y_val, y_test) = train_test_split(
        X_rest, rest_masks, flag_rest, y_rest,
        test_size=0.5, random_state=42)

    partitions = (
        (X_train, train_masks, y_train, flag_train),
        (X_val, val_masks, y_val, flag_val),
        (X_test, test_masks, y_test, flag_test),
    )
    return tuple(
        format_tensors(ids, masks, labels, BATCH_SIZE, flags.is_pf.to_numpy().reshape(-1))
        for ids, masks, labels, flags in partitions
    )

In [None]:
base_path = "gdrive/MyDrive/BT4222/Code/machine_learning/xp/combined/"
# np.load takes the path itself and closes the file once the array is read; the
# earlier open(...) handle was never closed. base_path already ends with "/".
cached_index_padded = np.load(f'{base_path}index_padded.npy')
index_padded, mask_variable, target_variable, article_flag = tokenize(concat_df, cached_index_padded)
trainloader, validationloader, testloader = train_validation_test(index_padded, mask_variable, target_variable, article_flag)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenizing


### Model creation and test

In [None]:
class BertAndFlag(torch.nn.Module):
    """
    DistilBERT encoder with a linear classification head that also receives one
    extra scalar per example (the dataset flag) next to the first-token embedding.
    """

    def __init__(self):
        super().__init__()

        # Article transformer
        self.transformer = AutoModel.from_pretrained('distilbert-base-uncased')

        # Width of the encoder output, read from the model config; the head takes
        # one more input for the flag and emits two logits.
        encoder_width = self.transformer.config.hidden_size
        self.classifier = torch.nn.Linear(encoder_width + 1, 2)

    def forward(self, input_ids, extra_data, attention_mask=None, labels=None):
        """
        Return logits of shape [batch size, 2].

        extra_data holds one value per example (shape [batch size]); it gains a
        trailing axis here before concatenation. `labels` is accepted but unused.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Embedding at sequence position 0: [batch size, hidden size]
        first_token = encoded.last_hidden_state[:, 0, :]

        flag_column = extra_data.unsqueeze(dim=-1)  # [batch size, 1]
        features = torch.cat((first_token, flag_column), dim=-1)  # [batch size, hidden size + 1]
        return self.classifier(features)

In [None]:
def weighted_loss(loss_function, outputs, labels, is_pf, weights):
    '''
    Scale the batch loss by the mean per-example dataset weight.

    loss_function: callable applied as loss_function(outputs, labels)
    outputs: model outputs for the batch
    labels: targets for the batch
    is_pf: 1D tensor with 1 for politifact rows and 0 for gossipcop rows
    weights: indexable pair; weights[0] applies to politifact rows and
    weights[1] to gossipcop rows
    '''
    base_loss = loss_function(outputs, labels)

    # Each example picks up the weight of the dataset it came from
    not_pf = 1 - is_pf
    per_example_weight = is_pf * weights[0] + not_pf * weights[1]

    return per_example_weight.mean() * base_loss

In [None]:
# Loss weights: total rows / (2 * rows of that dataset), index 0 = politifact, 1 = gossipcop
weight = [0,0]
weight[0] = concat_df.shape[0]/(2*politifact_df.shape[0])
weight[1] = concat_df.shape[0]/(2*gossipcop_df.shape[0])
weight = torch.tensor(weight).to(device)

model = BertAndFlag()
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well
checkpoint = torch.load("gdrive/MyDrive/BT4222/Code/machine_learning/xp/Expt1/model_epoch3.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()


def _classification_metrics(y_true, y_pred, y_score):
    """Return accuracy, ROC AUC and F1 for one group of predictions."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_score),
        'f1': f1_score(y_true, y_pred),
    }


epoch_loss = 0.0
preds, truth, pred_proba, flag = [],[],[],[]
iteration = 0
with torch.no_grad():
    # Everything below is reported as "Test", so iterate the held-out test split
    # (the loop previously walked trainloader).
    for batch in tqdm(testloader):
        iteration += 1
        token_ids, masks, numerical, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=token_ids, extra_data=numerical, attention_mask=masks, labels=labels)
        loss = weighted_loss(loss_function, outputs, labels, numerical, weight)
        # Accumulate once per batch; it used to be added twice, doubling the average
        epoch_loss += float(loss.item())

        # The head emits two logits, so the class-1 probability is the softmax over
        # both of them; sigmoid of logit 1 alone ignores logit 0.
        prediction_proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        # Local name chosen so the `article_flag` frame built during tokenizing is not overwritten
        batch_flag = numerical.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)
        flag.extend(batch_flag)

        del token_ids, masks, numerical, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

# Metrics are fractions in [0, 1], so they are printed without a percent sign.
metrics = {}
metrics['Test'] = _classification_metrics(truth, preds, pred_proba)
metrics['Test']['loss'] = epoch_loss/float(iteration)
print(f"Test Accuracy: {metrics['Test']['accuracy']:.2f}")
print(f"Test ROC AUC: {metrics['Test']['roc_auc']:.2f}")
print(f"Test F1: {metrics['Test']['f1']:.2f}")
print(f"Test loss: {metrics['Test']['loss']}\n")

epoch_results = list(zip(preds, truth, pred_proba, flag))

# Per-dataset breakdown: flag 1 = politifact rows, flag 0 = gossipcop rows
for dataset_name, flag_value in (('Politifact', 1), ('Gossipcop', 0)):
    subset = [row for row in epoch_results if row[3] == flag_value]
    subset_metrics = _classification_metrics(
        [row[1] for row in subset],  # truth
        [row[0] for row in subset],  # hard predictions
        [row[2] for row in subset],  # class-1 probabilities
    )
    print(f"{dataset_name} Test Accuracy: {subset_metrics['accuracy']:.2f}")
    print(f"{dataset_name} Test ROC AUC: {subset_metrics['roc_auc']:.2f}")
    print(f"{dataset_name} Test F1: {subset_metrics['f1']:.2f}")
    metrics[f'{dataset_name.lower()}_test'] = subset_metrics


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 394/394 [03:38<00:00,  1.80it/s]


Test Accuracy: 0.84%
Test ROC AUC: 0.86%
Test F1: 0.89%
Test loss: 0.7754806158496871%

Politifact Test Accuracy: 0.79%
Politifact Test ROC AUC: 0.85%
Politifact Test F1: 0.83%
Gossipcop Test Accuracy: 0.84%
Gossipcop Test ROC AUC: 0.85%
Gossipcop Test F1: 0.89%


# Experiment 2

### Tokenizing

In [None]:
def tokenize(df):
    """Tokenize df['text'] with the DistilBERT tokenizer.

    Returns (index_padded, mask_variable, target_variable): token ids truncated
    to 510 word pieces plus [CLS]/[SEP] and right-padded with 0 to length 512,
    a list-of-lists mask with 1.0 where the id is > 0, and df['target'].values.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    print("Tokenizing")

    totalpadlength = 512
    padded_rows = []
    for text in tqdm(df['text']):
        # Two slots are reserved for the [CLS] and [SEP] markers
        pieces = ['[CLS]'] + tokenizer.tokenize(text)[:totalpadlength - 2] + ['[SEP]']
        ids = tokenizer.convert_tokens_to_ids(pieces)
        padded_rows.append(ids + [0] * (totalpadlength - len(ids)))
    index_padded = np.array(padded_rows)

    # Mask: 1.0 for real tokens, 0.0 for padding
    mask_variable = [[float(token_id > 0) for token_id in row] for row in index_padded]

    target_variable = df['target'].values
    return index_padded, mask_variable, target_variable

def format_tensors(text_data, mask, labels, batch_size):
    """Wrap numpy inputs in a non-shuffling DataLoader yielding (ids, mask, labels) batches.

    Ids and labels are cast to int64; the mask keeps the dtype torch.tensor infers.
    """
    token_ids = torch.from_numpy(text_data).long()
    attention = torch.tensor(mask)
    targets = torch.from_numpy(labels).long()
    dataset = data_utils.TensorDataset(token_ids, attention, targets)
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(index_padded, mask_variable, target_variable, BATCH_SIZE = 8):
    """Split the inputs 70/15/15 and return (trainloader, validationloader, testloader)."""
    # A single call applies the same shuffled indices (random_state=42) to every array.
    X_train, X_rest, train_masks, rest_masks, y_train, y_rest = train_test_split(
        index_padded, mask_variable, target_variable, test_size=0.3, random_state=42)

    # Halve the held-out 30% into validation and test.
    X_val, X_test, val_masks, test_masks, y_val, y_test = train_test_split(
        X_rest, rest_masks, y_rest, test_size=0.5, random_state=42)

    partitions = (
        (X_train, train_masks, y_train),
        (X_val, val_masks, y_val),
        (X_test, test_masks, y_test),
    )
    return tuple(format_tensors(ids, masks, labels, BATCH_SIZE) for ids, masks, labels in partitions)

In [None]:
# Choose gossipcop or politifact
index_padded, mask_variable, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(index_padded, mask_variable, target_variable)

Tokenizing


100%|██████████| 20049/20049 [05:41<00:00, 58.67it/s]


### Model creation and test

In [None]:
# Load the fine-tuned sequence-classification checkpoint for the selected dataset
model = AutoModelForSequenceClassification.from_pretrained(f"gdrive/MyDrive/BT4222/Code/machine_learning/xp/{article}/Expt2/model_epoch2")
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [],[],[]
iteration = 0
# One no_grad context is enough; the extra set_grad_enabled(False) wrapper was redundant
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        token_ids, masks, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=token_ids, attention_mask=masks, labels=labels)
        loss = outputs['loss']
        yhat = outputs['logits']

        # Metrics for batch
        epoch_loss += float(loss.item())
        # Class-1 probability is the softmax over the logits; sigmoid of the
        # class-1 logit alone ignores the class-0 logit.
        prediction_proba = torch.softmax(yhat, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del token_ids, masks, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss/float(iteration)
# Metrics are fractions in [0, 1], so they are printed without a percent sign
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 376/376 [03:38<00:00,  1.72it/s]


Test Accuracy: 0.87%
Test ROC AUC: 0.90%
Test F1: 0.92%
Test loss: 0.3166738252522067%






# Experiment 3

### Tokenizing

In [None]:
def tokenize(df):
    """Tokenize df['title'] (padded to 100) and df['text'] (padded to 512) with DistilBERT.

    Returns (title_index_padded, title_mask_variable, article_index_padded,
    article_mask_variable, target_variable). Ids are right-padded with 0, masks
    are lists of lists with 1.0 where the id is > 0, and target_variable is
    df['target'].values.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def _encode(texts, totalpadlength):
        # Two slots of every sequence are reserved for the [CLS] and [SEP] markers
        padded_rows = []
        for text in tqdm(texts):
            pieces = ['[CLS]'] + tokenizer.tokenize(text)[:totalpadlength - 2] + ['[SEP]']
            ids = tokenizer.convert_tokens_to_ids(pieces)
            padded_rows.append(ids + [0] * (totalpadlength - len(ids)))
        padded = np.array(padded_rows)
        mask = [[float(token_id > 0) for token_id in row] for row in padded]
        return padded, mask

    print("Title Tokenizing")
    title_index_padded, title_mask_variable = _encode(df['title'], 100)

    print("Article Tokenizing")
    article_index_padded, article_mask_variable = _encode(df['text'], 512)

    # Target Variable
    target_variable = df['target'].values

    return title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable

def format_tensors(article_data, article_mask, title_data, title_mask, labels, batch_size):
    """Wrap the inputs in a non-shuffling DataLoader.

    Each batch yields (article ids, article mask, title ids, title mask, labels);
    ids and labels are cast to int64.
    """
    article_ids = torch.from_numpy(article_data).long()
    article_attention = torch.tensor(article_mask)

    title_ids = torch.from_numpy(title_data).long()
    title_attention = torch.tensor(title_mask)

    targets = torch.from_numpy(labels).long()

    dataset = data_utils.TensorDataset(article_ids, article_attention, title_ids, title_attention, targets)
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable, BATCH_SIZE = 8):
    """Split title and article inputs 70/15/15 and return (trainloader, validationloader, testloader)."""
    # A single call applies the same shuffled indices (random_state=42) to every array.
    (X_train_title, X_rest_title,
     train_masks_title, rest_masks_title,
     X_train_article, X_rest_article,
     train_masks_article, rest_masks_article,
     y_train, y_rest) = train_test_split(
        title_index_padded, title_mask_variable,
        article_index_padded, article_mask_variable,
        target_variable, test_size=0.3, random_state=42)

    # Halve the held-out 30% into validation and test.
    (X_val_title, X_test_title,
     val_masks_title, test_masks_title,
     X_val_article, X_test_article,
     val_masks_article, test_masks_article,
     y_val, y_test) = train_test_split(
        X_rest_title, rest_masks_title,
        X_rest_article, rest_masks_article,
        y_rest, test_size=0.5, random_state=42)

    partitions = (
        (X_train_article, train_masks_article, X_train_title, train_masks_title, y_train),
        (X_val_article, val_masks_article, X_val_title, val_masks_title, y_val),
        (X_test_article, test_masks_article, X_test_title, test_masks_title, y_test),
    )
    return tuple(format_tensors(*partition, BATCH_SIZE) for partition in partitions)

In [None]:
# Choose gossipcop or politifact
title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable)

Title Tokenizing


100%|██████████| 20049/20049 [00:08<00:00, 2330.77it/s]


Article Tokenizing


100%|██████████| 20049/20049 [05:48<00:00, 57.58it/s]


### Model creation and test

In [None]:
# NOTE(review): torch.load unpickles a whole model object here, which executes
# code stored in the file -- only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well.
model = torch.load(f"gdrive/MyDrive/BT4222/Code/machine_learning/neil/{article}/Expt3/model_epoch6", map_location=device)
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [],[],[]
iteration = 0
# One no_grad context is enough; the extra set_grad_enabled(False) wrapper was redundant
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        article_token_ids, article_masks, title_token_ids, title_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model(article_token_ids, article_masks, title_token_ids, title_masks, labels)
        loss = loss_function(outputs, labels)
        yhat = outputs

        # Metrics for batch
        epoch_loss += float(loss.item())
        # Class-1 probability is the softmax over the logits; sigmoid of the
        # class-1 logit alone ignores the class-0 logit.
        prediction_proba = torch.softmax(yhat, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del article_token_ids, article_masks, title_token_ids, title_masks, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss/float(iteration)
# Metrics are fractions in [0, 1], so they are printed without a percent sign
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 376/376 [03:52<00:00,  1.62it/s]



Test Accuracy: 0.89%
Test ROC AUC: 0.92%
Test F1: 0.93%
Test loss: 0.33495429144154226%



# Experiment 4

### Tokenizing

In [None]:
def tokenize(df):
    """Tokenize df['title'] (padded to 100) and df['text'] (padded to 512) with DistilBERT.

    Returns (title_index_padded, title_mask_variable, article_index_padded,
    article_mask_variable, target_variable). Ids are right-padded with 0, masks
    are lists of lists with 1.0 where the id is > 0, and target_variable is
    df['target'].values.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def _encode(texts, totalpadlength):
        # Two slots of every sequence are reserved for the [CLS] and [SEP] markers
        padded_rows = []
        for text in tqdm(texts):
            pieces = ['[CLS]'] + tokenizer.tokenize(text)[:totalpadlength - 2] + ['[SEP]']
            ids = tokenizer.convert_tokens_to_ids(pieces)
            padded_rows.append(ids + [0] * (totalpadlength - len(ids)))
        padded = np.array(padded_rows)
        mask = [[float(token_id > 0) for token_id in row] for row in padded]
        return padded, mask

    print("Title Tokenizing")
    title_index_padded, title_mask_variable = _encode(df['title'], 100)

    print("Article Tokenizing")
    article_index_padded, article_mask_variable = _encode(df['text'], 512)

    # Target Variable
    target_variable = df['target'].values

    return title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable

def format_tensors(article_data, article_mask, title_data, title_mask, extra_features, labels, batch_size):
    """Wrap the inputs in a non-shuffling DataLoader.

    Each batch yields (article ids, article mask, title ids, title mask,
    extra features, labels); ids, extra features and labels are cast to int64.
    """
    article_ids = torch.from_numpy(article_data).long()
    article_attention = torch.tensor(article_mask)

    title_ids = torch.from_numpy(title_data).long()
    title_attention = torch.tensor(title_mask)

    extras = torch.from_numpy(extra_features).long()
    targets = torch.from_numpy(labels).long()

    dataset = data_utils.TensorDataset(
        article_ids, article_attention, title_ids, title_attention, extras, targets)
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, X_extra, target_variable, BATCH_SIZE = 8):
    """Split title, article and extra-feature inputs 70/15/15.

    Returns (trainloader, validationloader, testloader).
    """
    # A single call applies the same shuffled indices (random_state=42) to every array.
    (X_train_title, X_rest_title,
     train_masks_title, rest_masks_title,
     X_train_article, X_rest_article,
     train_masks_article, rest_masks_article,
     X_train_extra, X_rest_extra,
     y_train, y_rest) = train_test_split(
        title_index_padded, title_mask_variable,
        article_index_padded, article_mask_variable,
        X_extra, target_variable, test_size=0.3, random_state=42)

    # Halve the held-out 30% into validation and test.
    (X_val_title, X_test_title,
     val_masks_title, test_masks_title,
     X_val_article, X_test_article,
     val_masks_article, test_masks_article,
     X_val_extra, X_test_extra,
     y_val, y_test) = train_test_split(
        X_rest_title, rest_masks_title,
        X_rest_article, rest_masks_article,
        X_rest_extra, y_rest, test_size=0.5, random_state=42)

    partitions = (
        (X_train_article, train_masks_article, X_train_title, train_masks_title, X_train_extra, y_train),
        (X_val_article, val_masks_article, X_val_title, val_masks_title, X_val_extra, y_val),
        (X_test_article, test_masks_article, X_test_title, test_masks_title, X_test_extra, y_test),
    )
    return tuple(format_tensors(*partition, BATCH_SIZE) for partition in partitions)

In [None]:
# Tokenize whichever dataset `article` selects, then build the three loaders
# together with the one-hot encoded metadata in X_extra_enc
selected_df = politifact_df if article == "politifact" else gossipcop_df
title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable = tokenize(selected_df)
trainloader, validationloader, testloader = train_validation_test(
    title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, X_extra_enc, target_variable)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Title Tokenizing


100%|██████████| 20049/20049 [00:09<00:00, 2212.37it/s]


Article Tokenizing


100%|██████████| 20049/20049 [05:55<00:00, 56.35it/s]


### Model creation and test

In [None]:
# NOTE(review): torch.load unpickles a whole model object here, which executes
# code stored in the file -- only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well.
model = torch.load(f"gdrive/MyDrive/BT4222/Code/machine_learning/neil/{article}/Expt4/model_epoch7", map_location=device)
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [],[],[]
iteration = 0
# One no_grad context is enough; the extra set_grad_enabled(False) wrapper was redundant
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels = tuple(t.to(device) for t in batch)
        outputs = model(article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels)

        loss = loss_function(outputs, labels)
        epoch_loss += float(loss.item())
        yhat = outputs
        # Class-1 probability is the softmax over the logits; sigmoid of the
        # class-1 logit alone ignores the class-0 logit.
        prediction_proba = torch.softmax(yhat, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss/float(iteration)
# Metrics are fractions in [0, 1], so they are printed without a percent sign
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 376/376 [03:55<00:00,  1.59it/s]



Test Accuracy: 0.89%
Test ROC AUC: 0.92%
Test F1: 0.93%
Test loss: 0.3363721717825219%



# Experiment 5

### Summarizing

In [None]:
%%capture
!pip install sumy
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from gensim.summarization import summarize
from sumy.utils import get_stop_words
from sumy.nlp.tokenizers import Tokenizer as sumytoken
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.luhn import LuhnSummarizer

# For TextRank
import nltk
nltk.download('punkt') # one time execution

In [None]:
def luhn_summarizer(text, LANGUAGE, SENTENCES_COUNT):
    """Summarise `text` with the module-level `summarizer_luhn`.

    The text is parsed with a sumy tokenizer for LANGUAGE, the summarizer is
    asked for SENTENCES_COUNT sentences, and the sentences it returns are
    joined with single spaces.
    """
    document = PlaintextParser.from_string(text, sumytoken(LANGUAGE)).document
    picked = summarizer_luhn(document, SENTENCES_COUNT)
    return " ".join(str(sentence) for sentence in picked)

In [None]:
# Language passed to the sumy tokenizer, stemmer and stop-word list
LANGUAGE = "english"
# Number of sentences requested per summary. TODO: this value has not been calibrated yet.
SENTENCES_COUNT = 15
stemmer = Stemmer(LANGUAGE)
# Luhn summarizer shared by luhn_summarizer(); it stems words and skips stop words
summarizer_luhn = LuhnSummarizer(stemmer)
summarizer_luhn.stop_words = get_stop_words(LANGUAGE)

In [None]:
# Add a Luhn summary of every article as 'text_summarised'.
# Only PolitiFact is processed; the GossipCop line below is disabled, so this cell
# does not create 'text_summarised' on gossipcop_df.
#gossipcop_df['text_summarised'] = gossipcop_df['text'].apply(lambda x: luhn_summarizer(x, LANGUAGE,SENTENCES_COUNT))
politifact_df['text_summarised'] = politifact_df['text'].apply(lambda x: luhn_summarizer(x, LANGUAGE,SENTENCES_COUNT))

In [None]:
# Length of each summary in whitespace-separated words.
politifact_df['text_summarised_len'] = politifact_df['text_summarised'].str.split().str.len()

# GossipCop counterpart, disabled together with its summary column.
#gossipcop_df['text_summarised_len'] = gossipcop_df['text_summarised'].str.split().str.len()

### Tokenizing

In [None]:
def tokenize(df):
    """Encode titles and summarised articles as padded DistilBERT token ids with masks.

    Reads df['title'], df['text_summarised'] and df['target'].

    Returns a 5-tuple:
        title ids    -- numpy array; each row is [CLS] + up to 98 word pieces + [SEP],
                        right-padded with 0 to 100 positions
        title mask   -- nested lists holding 1.0 where the id is > 0 and 0.0 elsewhere
        article ids  -- same layout with up to 510 word pieces, padded to 512 positions
        article mask -- nested lists built the same way as the title mask
        targets      -- df['target'].values
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    print("Title Tokenizing")
    title_max_len = 100
    title_pieces = [['[CLS]'] + bert_tokenizer.tokenize(title)[:98] + ['[SEP]'] for title in tqdm(df['title'])]
    title_ids = [bert_tokenizer.convert_tokens_to_ids(pieces) for pieces in title_pieces]
    # Right-pad with id 0 so every row has the same length.
    title_index_padded = np.array([ids + [0] * (title_max_len - len(ids)) for ids in title_ids])
    title_mask_variable = [[float(token_id > 0) for token_id in row] for row in title_index_padded]

    print("Article Tokenizing")
    article_max_len = 512
    article_pieces = [['[CLS]'] + bert_tokenizer.tokenize(body)[:510] + ['[SEP]'] for body in tqdm(df['text_summarised'])]
    article_ids = [bert_tokenizer.convert_tokens_to_ids(pieces) for pieces in article_pieces]
    article_index_padded = np.array([ids + [0] * (article_max_len - len(ids)) for ids in article_ids])
    article_mask_variable = [[float(token_id > 0) for token_id in row] for row in article_index_padded]

    target_variable = df['target'].values

    return title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable

def format_tensors(article_data, article_mask, title_data, title_mask, extra_features, labels, batch_size):
    """Pack the inputs into a TensorDataset and return an unshuffled DataLoader over it.

    article_data, title_data, extra_features and labels are numpy arrays; each is
    converted with torch.from_numpy and cast to int64. The two masks are passed
    through torch.tensor unchanged in value.

    Every batch is (article ids, article mask, title ids, title mask,
    extra features, labels), with at most `batch_size` rows.
    """
    def as_long(array):
        # numpy array -> int64 tensor
        return torch.from_numpy(array).long()

    dataset = data_utils.TensorDataset(
        as_long(article_data),
        torch.tensor(article_mask),
        as_long(title_data),
        torch.tensor(title_mask),
        as_long(extra_features),
        as_long(labels),
    )
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, X_extra, target_variable, BATCH_SIZE = 8):
    """Split all inputs into train / validation / test parts and build one DataLoader per part.

    30% of the rows are held out first (random_state=42); that hold-out is then
    halved into validation and test (random_state=42 again).

    Returns (trainloader, validationloader, testloader), each produced by
    format_tensors with batch size BATCH_SIZE.
    """
    # One train_test_split call per stage: the same row selection is applied to
    # every array passed in, so ids, masks, extra features and labels stay aligned.
    (X_train_title, X_rest_title,
     train_masks_title, rest_masks_title,
     X_train_article, X_rest_article,
     train_masks_article, rest_masks_article,
     X_train_extra, X_rest_extra,
     y_train, y_rest) = train_test_split(
        title_index_padded, title_mask_variable,
        article_index_padded, article_mask_variable,
        X_extra, target_variable,
        test_size=0.3, random_state=42)

    # Second stage: halve the held-out rows into validation and test.
    (X_val_title, X_test_title,
     val_masks_title, test_masks_title,
     X_val_article, X_test_article,
     val_masks_article, test_masks_article,
     X_val_extra, X_test_extra,
     y_val, y_test) = train_test_split(
        X_rest_title, rest_masks_title,
        X_rest_article, rest_masks_article,
        X_rest_extra, y_rest,
        test_size=0.5, random_state=42)

    trainloader = format_tensors(X_train_article, train_masks_article, X_train_title, train_masks_title, X_train_extra, y_train, BATCH_SIZE)
    validationloader = format_tensors(X_val_article, val_masks_article, X_val_title, val_masks_title, X_val_extra, y_val, BATCH_SIZE)
    testloader = format_tensors(X_test_article, test_masks_article, X_test_title, test_masks_title, X_test_extra, y_test, BATCH_SIZE)

    return trainloader, validationloader, testloader

In [None]:
# Tokenize the dataset selected by `article`, then build the three loaders.
# `article` and `X_extra_enc` are defined outside this section of the notebook.
# NOTE(review): in this section 'text_summarised' is only created for politifact_df
# (the GossipCop line is commented out), so the GossipCop branch presumably fails on
# the missing column unless it is created elsewhere — confirm.
title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, X_extra_enc, target_variable)

Title Tokenizing


100%|██████████| 954/954 [00:00<00:00, 2762.37it/s]


Article Tokenizing


100%|██████████| 954/954 [00:07<00:00, 136.08it/s]


### Model creation and test

In [None]:
# Evaluate the saved Experiment 5 checkpoint on `testloader` and collect the
# metrics in `test_metrics`.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
model = torch.load(f"gdrive/MyDrive/BT4222/Code/machine_learning/neil/{article}/Expt5/model_epoch3", map_location=device)
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [], [], []
iteration = 0
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels = tuple(t.to(device) for t in batch)
        outputs = model(article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels)

        loss = loss_function(outputs, labels)
        epoch_loss += float(loss.item())

        # CrossEntropyLoss scores `outputs` as one logit per class, so the class-1
        # probability is the softmax over the class dimension. A sigmoid of the
        # class-1 logit alone ignores the class-0 logit and is not that probability.
        prediction_proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del article_token_ids, article_masks, title_token_ids, title_masks, extra_features, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss / iteration  # mean of the per-batch losses

# All four values are plain numbers (the scores are fractions in [0, 1]), so they
# are printed without a '%' sign.
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 18/18 [00:11<00:00,  1.50it/s]


Test Accuracy: 0.89%
Test ROC AUC: 0.96%
Test F1: 0.91%
Test loss: 0.3030630453593201%






# Experiment 6

### Tokenizing - TF-idf

We don't use CV because it performs much worse.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(df):
    """Encode titles as padded DistilBERT token ids and return the raw article text alongside.

    Reads df['title'], df['target'] and df.text_clean.

    Returns a 4-tuple:
        title ids    -- numpy array; each row is [CLS] + up to 98 word pieces + [SEP],
                        right-padded with 0 to 100 positions
        title mask   -- nested lists holding 1.0 where the id is > 0 and 0.0 elsewhere
        article text -- df.text_clean.values, left untokenized here
        targets      -- df['target'].values
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    print("Title Tokenizing")
    title_max_len = 100
    title_pieces = [['[CLS]'] + bert_tokenizer.tokenize(title)[:98] + ['[SEP]'] for title in tqdm(df['title'])]
    title_ids = [bert_tokenizer.convert_tokens_to_ids(pieces) for pieces in title_pieces]
    # Right-pad with id 0 so every row has the same length.
    title_index_padded = np.array([ids + [0] * (title_max_len - len(ids)) for ids in title_ids])
    title_mask_variable = [[float(token_id > 0) for token_id in row] for row in title_index_padded]

    target_variable = df['target'].values
    article_text = df.text_clean.values

    return title_index_padded, title_mask_variable, article_text, target_variable

def format_tensors(article_text, title_data, title_mask, extra_features, labels, batch_size):
    """Pack the inputs into a TensorDataset and return an unshuffled DataLoader over it.

    article_text, title_data, extra_features and labels are numpy arrays; each is
    converted with torch.from_numpy and cast to int64. title_mask is passed
    through torch.tensor.

    Every batch is (article features, title ids, title mask, extra features,
    labels), with at most `batch_size` rows.
    """
    def as_long(array):
        # numpy array -> int64 tensor; fractional values are truncated toward zero
        return torch.from_numpy(array).long()

    # NOTE(review): train_validation_test in this cell passes a dense TF-IDF matrix as
    # article_text, so the int64 cast turns every weight below 1.0 into 0. Confirm this
    # matches how the saved checkpoint was trained before changing the dtype.
    dataset = data_utils.TensorDataset(
        as_long(article_text),
        as_long(title_data),
        torch.tensor(title_mask),
        as_long(extra_features),
        as_long(labels),
    )
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(title_index_padded, title_mask_variable, article_text, X_extra, target_variable, BATCH_SIZE = 8):
    """Split all inputs into train / validation / test parts, vectorise them and build the loaders.

    30% of the rows are held out first (random_state=42); that hold-out is then
    halved into validation and test (random_state=42 again). Article text is
    turned into dense TF-IDF matrices and X_extra is one-hot encoded; both
    transformers are fitted on the training part only.

    Returns (trainloader, validationloader, testloader), each produced by
    format_tensors with batch size BATCH_SIZE.
    """
    # One train_test_split call per stage: the same row selection is applied to
    # every array passed in, so titles, masks, text, extra features and labels stay aligned.
    (X_train_title, X_rest_title,
     train_masks_title, rest_masks_title,
     X_train_article, X_rest_article,
     X_train_extra, X_rest_extra,
     y_train, y_rest) = train_test_split(
        title_index_padded, title_mask_variable, article_text, X_extra, target_variable,
        test_size=0.3, random_state=42)

    # Second stage: halve the held-out rows into validation and test.
    (X_val_title, X_test_title,
     val_masks_title, test_masks_title,
     X_val_article, X_test_article,
     X_val_extra, X_test_extra,
     y_val, y_test) = train_test_split(
        X_rest_title, rest_masks_title, X_rest_article, X_rest_extra, y_rest,
        test_size=0.5, random_state=42)

    # ARTICLE: TF-IDF limited to 20000 features, fitted on the training text only.
    tfidf = TfidfVectorizer(max_features=20000)
    X_train_article_dtm = tfidf.fit_transform(X_train_article).toarray()
    X_test_article_dtm = tfidf.transform(X_test_article).toarray()
    X_val_article_dtm = tfidf.transform(X_val_article).toarray()

    # EXTRA FEATURES: categories absent from the training part encode as all zeros.
    encoder = OneHotEncoder(handle_unknown = 'ignore')
    encoder.fit(X_train_extra)
    X_train_extra_enc = encoder.transform(X_train_extra).toarray()
    X_test_extra_enc = encoder.transform(X_test_extra).toarray()
    X_val_extra_enc = encoder.transform(X_val_extra).toarray()

    trainloader = format_tensors(X_train_article_dtm, X_train_title, train_masks_title, X_train_extra_enc, y_train, BATCH_SIZE)
    validationloader = format_tensors(X_val_article_dtm, X_val_title, val_masks_title, X_val_extra_enc, y_val, BATCH_SIZE)
    testloader = format_tensors(X_test_article_dtm, X_test_title, test_masks_title, X_test_extra_enc, y_test, BATCH_SIZE)

    return trainloader, validationloader, testloader

In [None]:
# Tokenize the dataset selected by `article`, then build the three loaders.
# `article` and `X_extra` are defined outside this cell.
title_index_padded, title_mask_variable, article_text, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(title_index_padded, title_mask_variable, article_text, X_extra, target_variable)

Title Tokenizing


100%|██████████| 20049/20049 [00:11<00:00, 1746.04it/s]


### Model creation and test

In [None]:
# Evaluate the saved Experiment 6 checkpoint on `testloader` and collect the
# metrics in `test_metrics`.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
model = torch.load(f"gdrive/MyDrive/BT4222/Code/machine_learning/neil/{article}/Expt6/model_epoch5", map_location=device)
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [], [], []
iteration = 0
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        article_input, title_input_ids, title_attention_mask, extra_features, labels = tuple(t.to(device) for t in batch)
        outputs = model(article_input, title_input_ids, title_attention_mask, extra_features)

        loss = loss_function(outputs, labels)
        epoch_loss += float(loss.item())

        # CrossEntropyLoss scores `outputs` as one logit per class, so the class-1
        # probability is the softmax over the class dimension. A sigmoid of the
        # class-1 logit alone ignores the class-0 logit and is not that probability.
        prediction_proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del article_input, title_input_ids, title_attention_mask, extra_features, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss / iteration  # mean of the per-batch losses

# All four values are plain numbers (the scores are fractions in [0, 1]), so they
# are printed without a '%' sign.
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 376/376 [01:48<00:00,  3.47it/s]


Test Accuracy: 0.84%
Test ROC AUC: 0.88%
Test F1: 0.90%
Test loss: 0.4561593482834956%






# Experiment 7

### Loading Data

In [None]:
# Load the combined datasets. Each file is opened in a `with` block so its
# handle is closed as soon as the JSON has been parsed.
with open("gdrive/MyDrive/BT4222/Data/politifact_combined.json", "r") as politifact_file:
    politifact_data = json.load(politifact_file)
with open("gdrive/MyDrive/BT4222/Data/gossipcop_combined.json", "r") as gossipcop_file:
    gossipcop_data = json.load(gossipcop_file)

In [None]:
# Build one DataFrame per dataset from the loaded JSON data.
politifact_df = pd.DataFrame(politifact_data)
gossipcop_df = pd.DataFrame(gossipcop_data)

In [None]:
# Binary target: 1 when the label is 'real', 0 for any other value.
politifact_df['target'] = politifact_df['label'].apply(lambda x: 1 if x=='real' else 0)
gossipcop_df['target'] = gossipcop_df['label'].apply(lambda x: 1 if x=='real' else 0)

In [None]:
# Publication month as a two-digit string ('01'-'12'); rows with a missing publish_date get '0'.
# NOTE(review): fromtimestamp() is called without a tz argument, so the conversion uses the
# local timezone of the machine running the notebook.
politifact_df['parsed_month'] = politifact_df['publish_date'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime("%m") if not pd.isna(x) else '0')
gossipcop_df['parsed_month'] = gossipcop_df['publish_date'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime("%m") if not pd.isna(x) else '0')

In [None]:
# Publication hour as a two-digit string ('00'-'23'); rows with a missing publish_date get '0'.
# NOTE(review): fromtimestamp() is called without a tz argument, so the hour depends on the
# local timezone of the machine running the notebook.
politifact_df['parsed_hour'] = politifact_df['publish_date'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime("%H") if not pd.isna(x) else '0')
gossipcop_df['parsed_hour'] = gossipcop_df['publish_date'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime("%H") if not pd.isna(x) else '0')

In [None]:
# Replace missing publishers with the literal string 'None'.
politifact_df['publisher'] = politifact_df['publisher'].fillna('None')
gossipcop_df['publisher'] = gossipcop_df['publisher'].fillna('None')

In [None]:
# Pick the extra (non-text) feature columns from the dataset selected by `article`
# (`article` is set outside this section of the notebook).
if article == "politifact":
  X_extra = politifact_df[['parsed_hour','parsed_month', 'publisher']]
else:
  X_extra = gossipcop_df[['parsed_hour','parsed_month', 'publisher']]

In [None]:
# Cast the three extra-feature columns to str in one astype call. Rebinding X_extra
# to the returned frame, instead of assigning columns on a slice of the source
# DataFrame, avoids pandas' SettingWithCopyWarning.
X_extra = X_extra.astype({'parsed_hour': 'str', 'parsed_month': 'str', 'publisher': 'str'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Tokenizing - TF-idf

We don't use CV because it performs much worse.

In [None]:
def tokenize(df):
    """Encode titles and articles as padded DistilBERT token ids with masks; pass tweets through raw.

    Reads df['title'], df['text'], df.tweets_text and df['target'].

    Returns a 6-tuple:
        title ids    -- numpy array; each row is [CLS] + up to 98 word pieces + [SEP],
                        right-padded with 0 to 100 positions
        title mask   -- nested lists holding 1.0 where the id is > 0 and 0.0 elsewhere
        article ids  -- same layout with up to 510 word pieces, padded to 512 positions
        article mask -- nested lists built the same way as the title mask
        tweets text  -- df.tweets_text.values, left untokenized here
        targets      -- df['target'].values
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    print("Title Tokenizing")
    title_max_len = 100
    title_pieces = [['[CLS]'] + bert_tokenizer.tokenize(title)[:98] + ['[SEP]'] for title in tqdm(df['title'])]
    title_ids = [bert_tokenizer.convert_tokens_to_ids(pieces) for pieces in title_pieces]
    # Right-pad with id 0 so every row has the same length.
    title_index_padded = np.array([ids + [0] * (title_max_len - len(ids)) for ids in title_ids])
    title_mask_variable = [[float(token_id > 0) for token_id in row] for row in title_index_padded]

    print("Article Tokenizing")
    article_max_len = 512
    article_pieces = [['[CLS]'] + bert_tokenizer.tokenize(body)[:510] + ['[SEP]'] for body in tqdm(df['text'])]
    article_ids = [bert_tokenizer.convert_tokens_to_ids(pieces) for pieces in article_pieces]
    article_index_padded = np.array([ids + [0] * (article_max_len - len(ids)) for ids in article_ids])
    article_mask_variable = [[float(token_id > 0) for token_id in row] for row in article_index_padded]

    # Raw tweet text
    tweets_text = df.tweets_text.values

    target_variable = df['target'].values

    return title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, tweets_text, target_variable

def format_tensors(article_data, article_mask, title_data, title_mask, extra_features, tweets_text, labels, batch_size):
    """Pack the inputs into a TensorDataset and return an unshuffled DataLoader over it.

    article_data, title_data, extra_features, tweets_text and labels are numpy
    arrays; each is converted with torch.from_numpy and cast to int64. The two
    masks are passed through torch.tensor.

    Every batch is (article ids, article mask, title ids, title mask,
    extra features, tweet features, labels), with at most `batch_size` rows.
    """
    def as_long(array):
        # numpy array -> int64 tensor; fractional values are truncated toward zero
        return torch.from_numpy(array).long()

    # NOTE(review): train_validation_test in this cell passes a dense TF-IDF matrix as
    # tweets_text, so the int64 cast turns every weight below 1.0 into 0. Confirm this
    # matches how the saved checkpoint was trained before changing the dtype.
    dataset = data_utils.TensorDataset(
        as_long(article_data),
        torch.tensor(article_mask),
        as_long(title_data),
        torch.tensor(title_mask),
        as_long(extra_features),
        as_long(tweets_text),
        as_long(labels),
    )
    return data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=False)

def train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, tweets_text, target_variable, BATCH_SIZE = 8):
    """Split all inputs into train / validation / test parts, vectorise them and build the loaders.

    30% of the rows are held out first (random_state=42); that hold-out is then
    halved into validation and test (random_state=42 again). Tweet text is
    turned into dense TF-IDF matrices and the extra features are one-hot
    encoded; both transformers are fitted on the training part only.

    The extra features are not a parameter: `X_extra` is read from the
    enclosing notebook scope and must have one row per target.

    Returns (trainloader, validationloader, testloader), each produced by
    format_tensors with batch size BATCH_SIZE.
    """
    # One train_test_split call per stage: the same row selection is applied to
    # every array passed in, so ids, masks, tweets, extra features and labels stay aligned.
    (X_train_title, X_rest_title,
     train_masks_title, rest_masks_title,
     X_train_article, X_rest_article,
     train_masks_article, rest_masks_article,
     X_train_tweet, X_rest_tweet,
     X_train_extra, X_rest_extra,
     y_train, y_rest) = train_test_split(
        title_index_padded, title_mask_variable,
        article_index_padded, article_mask_variable,
        tweets_text, X_extra, target_variable,
        test_size=0.3, random_state=42)

    # Second stage: halve the held-out rows into validation and test.
    (X_val_title, X_test_title,
     val_masks_title, test_masks_title,
     X_val_article, X_test_article,
     val_masks_article, test_masks_article,
     X_val_tweet, X_test_tweet,
     X_val_extra, X_test_extra,
     y_val, y_test) = train_test_split(
        X_rest_title, rest_masks_title,
        X_rest_article, rest_masks_article,
        X_rest_tweet, X_rest_extra, y_rest,
        test_size=0.5, random_state=42)

    # TWEETS: TF-IDF limited to 10000 features, fitted on the training tweets only.
    tfidf = TfidfVectorizer(max_features=10000)
    X_train_tweet_dtm = tfidf.fit_transform(X_train_tweet).toarray()
    X_test_tweet_dtm = tfidf.transform(X_test_tweet).toarray()
    X_val_tweet_dtm = tfidf.transform(X_val_tweet).toarray()

    # EXTRA FEATURES: categories absent from the training part encode as all zeros.
    encoder = OneHotEncoder(handle_unknown = 'ignore')
    encoder.fit(X_train_extra)
    X_train_extra_enc = encoder.transform(X_train_extra).toarray()
    X_test_extra_enc = encoder.transform(X_test_extra).toarray()
    X_val_extra_enc = encoder.transform(X_val_extra).toarray()

    trainloader = format_tensors(X_train_article, train_masks_article, X_train_title, train_masks_title, X_train_extra_enc, X_train_tweet_dtm, y_train, BATCH_SIZE)
    validationloader = format_tensors(X_val_article, val_masks_article, X_val_title, val_masks_title, X_val_extra_enc, X_val_tweet_dtm, y_val, BATCH_SIZE)
    testloader = format_tensors(X_test_article, test_masks_article, X_test_title, test_masks_title, X_test_extra_enc, X_test_tweet_dtm, y_test, BATCH_SIZE)

    return trainloader, validationloader, testloader

In [None]:
# Tokenize the dataset selected by `article`, then build the three loaders.
# The extra features are not passed here: train_validation_test reads the
# notebook-level X_extra directly.
title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, tweets_text, target_variable = tokenize(politifact_df) if article == "politifact" else tokenize(gossipcop_df)
trainloader, validationloader, testloader = train_validation_test(title_index_padded, title_mask_variable, article_index_padded, article_mask_variable, tweets_text, target_variable)

Title Tokenizing


100%|██████████| 954/954 [00:00<00:00, 1860.01it/s]


Article Tokenizing


100%|██████████| 954/954 [00:40<00:00, 23.54it/s]


### Model creation and test

In [None]:
# Evaluate the saved Experiment 7 checkpoint on `testloader` and collect the
# metrics in `test_metrics`.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
model = torch.load(f"gdrive/MyDrive/BT4222/Code/machine_learning/neil/{article}/Expt7/model_epoch4", map_location=device)
loss_function = nn.CrossEntropyLoss()
model.to(device)
model.eval()

epoch_loss = 0.0
preds, truth, pred_proba = [], [], []
iteration = 0
with torch.no_grad():
    for batch in tqdm(testloader):
        iteration += 1
        article_token_ids, article_masks, title_token_ids, title_masks, extra_features, tweet_text, labels = tuple(t.to(device) for t in batch)
        outputs = model(article_token_ids, article_masks, title_token_ids, title_masks, extra_features, tweet_text)

        loss = loss_function(outputs, labels)
        epoch_loss += float(loss.item())

        # CrossEntropyLoss scores `outputs` as one logit per class, so the class-1
        # probability is the softmax over the class dimension. A sigmoid of the
        # class-1 logit alone ignores the class-0 logit and is not that probability.
        prediction_proba = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        prediction = (prediction_proba > 0.5).astype(int)
        baseline = labels.long().cpu().numpy().astype(int)
        preds.extend(prediction)
        pred_proba.extend(prediction_proba)
        truth.extend(baseline)

        del article_token_ids, article_masks, title_token_ids, title_masks, extra_features, tweet_text, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() #memory

avg_accuracy = accuracy_score(truth, preds)
avg_roc_auc = roc_auc_score(truth, pred_proba)
avg_f1 = f1_score(truth, preds)
avg_loss = epoch_loss / iteration  # mean of the per-batch losses

# All four values are plain numbers (the scores are fractions in [0, 1]), so they
# are printed without a '%' sign.
print(f'\nTest Accuracy: {avg_accuracy:.2f}')
print(f'Test ROC AUC: {avg_roc_auc:.2f}')
print(f'Test F1: {avg_f1:.2f}')
print(f'Test loss: {avg_loss}\n')
test_metrics = {
    'accuracy':avg_accuracy,
    'roc_auc':avg_roc_auc,
    'f1':avg_f1,
    'loss':avg_loss
}

100%|██████████| 18/18 [00:09<00:00,  1.86it/s]


Test Accuracy: 0.88%
Test ROC AUC: 0.93%
Test F1: 0.90%
Test loss: 0.32402483601537013%




