## Model 3: Fine-tuning a transformer (DistilBERT) on the `metadata_sentence` text

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Load the section3 data. The cells below read three columns from it:
# 'statement', 'metadata_sentence' and 'label'.
section3 = pd.read_csv('section3.csv')

# Fail fast with a clear message if the CSV does not carry the columns the
# rest of the notebook relies on, instead of failing later mid-pipeline.
required_columns = {'statement', 'metadata_sentence', 'label'}
missing_columns = required_columns - set(section3.columns)
if missing_columns:
    raise KeyError(f"section3.csv is missing required column(s): {sorted(missing_columns)}")

# Train-test split: hold out 20% of the rows; the fixed random_state keeps
# the split identical across runs.
train_df, test_df = train_test_split(section3, test_size=0.2, random_state=42)


In [None]:
# Tokenizer that matches the 'distilbert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Settings shared by both splits: pad/truncate every sentence to max_length=128
tokenizer_kwargs = {'padding': 'max_length', 'truncation': True, 'max_length': 128}

# Encode the metadata sentences of each split with identical settings
train_encodings = tokenizer(list(train_df['metadata_sentence']), **tokenizer_kwargs)
test_encodings = tokenizer(list(test_df['metadata_sentence']), **tokenizer_kwargs)


In [None]:
# Define a custom Dataset class that serves tokenized inputs and labels, then create the train and test datasets


import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Parameters
    ----------
    encodings : mapping of str -> sequence
        Each value holds one entry per example (e.g. the lists stored under
        'input_ids' and 'attention_mask').
    labels : iterable
        One label per example, in the same order as the encodings. It is
        copied into a plain list so that lookups are always positional; a
        pandas Series with a shuffled index would otherwise be indexed by
        label and return the wrong row or raise KeyError.

    Raises
    ------
    ValueError
        If any encoding field does not have exactly one entry per label.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = list(labels)

        # Catch misaligned inputs here rather than as an IndexError (or a
        # silently shortened dataset) in the middle of training.
        expected = len(self.labels)
        for key, values in self.encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} entries "
                    f"but {expected} labels were provided"
                )

    def __getitem__(self, idx):
        """Return the tensors for example `idx`, with its label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Labels are cast to long (bool labels become 0/1)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Pull the labels out as plain Python lists, one per split
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()

# Pair each split's encodings with its labels
train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)


In [None]:
from transformers import AutoModelForSequenceClassification
# Pre-trained DistilBERT checkpoint, configured for two output classes
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "accuracy" (the key returned by compute_metrics)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    `pred` must expose `predictions` (per-class scores, one row per example)
    and `label_ids` (the true labels). The predicted class is the index of
    the highest score along the last axis.

    Returns a dict with a single key, "accuracy".
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}


In [8]:
from transformers import Trainer

# NOTE(review): test_dataset doubles as the evaluation set during training.
# Because training_args reloads the best checkpoint by accuracy, the final
# model is selected on the test split and the reported test metrics are
# optimistic. Consider carving a validation split off the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [9]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell output
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6652,0.660511,0.60375
2,0.6216,0.658146,0.6225
3,0.6556,0.652143,0.62625


TrainOutput(global_step=600, training_loss=0.6502100642522176, metrics={'train_runtime': 949.3968, 'train_samples_per_second': 10.112, 'train_steps_per_second': 0.632, 'total_flos': 317921756774400.0, 'train_loss': 0.6502100642522176, 'epoch': 3.0})

In [None]:
# Save the trainer's model under 'section3_transformer_model'
trainer.save_model('section3_transformer_model')

In [None]:
# Run the trained model on the test set; `.predictions` of the result is
# converted to probabilities in the next cell
predictions_output = trainer.predict(test_dataset)

In [None]:
import torch
# Per-class scores returned by the model for every test example
test_scores = torch.tensor(predictions_output.predictions)
# Normalise across the class dimension so each row sums to 1
class_probabilities = torch.nn.functional.softmax(test_scores, dim=1)
# Keep only the probability of class 1, as a NumPy array
probs = class_probabilities[:, 1].numpy()

In [None]:
# Creating a DataFrame with test statements, true labels, and predicted probabilities,saving to a CSV file

import pandas as pd
# Online code used
# One row per test example: the statement, its true label and the model's score
prediction_columns = {
    'statement': test_df['statement'].values,
    'label': test_df['label'].values,
    # Softmax probability of class 1, i.e. a value in [0, 1] rather than a 0-100 percentage
    'predicted_percentage_true': probs,
}
final_df = pd.DataFrame(prediction_columns)

# Write the table without the index column, then preview the first rows
final_df.to_csv('section3_predictions.csv', index=False)
print(final_df.head())


                                           statement  label  \
0  Obama says Iran is a 'tiny' country, 'doesn't ...  False   
1  Says Republican candidates in Oregon are advoc...  False   
2  Rob Cornilles has disputed for two years a new...  False   
3  I dont own a single stock or bond I have no sa...   True   
4  Regarding the Iraq War, I was the one that sai...  False   

   predicted_percentage_true  
0                   0.423365  
1                   0.742093  
2                   0.381915  
3                   0.671264  
4                   0.178504  


In [None]:
#printing the performance
from sklearn.metrics import accuracy_score

# Predict class 1 whenever its probability is above 0.5
predicted_labels = (probs > 0.5).astype(int)

# Compare against the true labels of the test split
true_labels = test_df['label'].values
test_accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy based on 0.5 threshold: {test_accuracy:.4f}")


Test Accuracy based on 0.5 threshold: 0.6262


In [None]:
# Reuse the test-set predictions already computed into `predictions_output`
# instead of running a second, identical inference pass over test_dataset.
predictions = predictions_output

In [None]:
# Printing the metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True labels, and predicted classes taken as the highest-scoring column per row
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# Evaluate every metric on the same (y_true, y_pred) pair
accuracy, precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")

print(f"F1 Score : {f1:.4f}")


Accuracy : 0.6262
Precision: 0.6583
Recall   : 0.7705
F1 Score : 0.7100
