In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import seaborn as sns
import shap
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from wordcloud import WordCloud

In [None]:
# Read the raw CSV from the Colab workspace into a DataFrame.
data_path = '/content/twinviews-13k.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Reshape the paired columns into long form: one row per statement, keeping
# its topic, with label 0 for rows taken from 'left' and 1 for rows from 'right'.
left_df = (
    df[['left', 'topic']]
    .rename(columns={'left': 'text'})
    .assign(label=0)
)
right_df = (
    df[['right', 'topic']]
    .rename(columns={'right': 'text'})
    .assign(label=1)
)
# Stack both halves and renumber the rows 0..n-1.
stance_df = pd.concat([left_df, right_df], ignore_index=True)

In [None]:
# Shared 5-fold splitter; rows are shuffled with a fixed seed so the folds
# are reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Per-fold results are collected here as (model name, score) tuples,
# one independent list per metric.
accuracies, f1_scores, precision_scores, recall_scores = [], [], [], []

In [None]:
# Classical baselines, keyed by the display name used when reporting scores.
# Insertion order is the order in which they are trained and printed.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Naive Bayes', MultinomialNB()),
    ('SVM', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
])

In [None]:
# Cross-validate every classical model: for each fold, fit TF-IDF on the
# training rows, then train, predict and score each model on the held-out rows.
for train_index, test_index in kf.split(stance_df):
    # Slice the fold rows once, then pull out the text and label columns.
    train_fold = stance_df.iloc[train_index]
    test_fold = stance_df.iloc[test_index]
    X_train, y_train = train_fold['text'], train_fold['label']
    X_test, y_test = test_fold['text'], test_fold['label']

    # The vocabulary is learned from the training split only and then
    # applied unchanged to the test split.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    for name, model in models.items():
        # fit() returns the estimator itself, so training and prediction chain.
        y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

        accuracy, f1, precision, recall = (
            metric_fn(y_test, y_pred)
            for metric_fn in (accuracy_score, f1_score, precision_score, recall_score)
        )

        # Record each score under the model's name in the matching result list.
        for store, value in (
            (accuracies, accuracy),
            (f1_scores, f1),
            (precision_scores, precision),
            (recall_scores, recall),
        ):
            store.append((name, value))

        print(f"{name} Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


Logistic Regression Accuracy: 0.9796, F1-score: 0.9797, Precision: 0.9785, Recall: 0.9809
Naive Bayes Accuracy: 0.9749, F1-score: 0.9751, Precision: 0.9704, Recall: 0.9799
SVM Accuracy: 0.9866, F1-score: 0.9867, Precision: 0.9877, Recall: 0.9856
Decision Tree Accuracy: 0.9598, F1-score: 0.9596, Precision: 0.9664, Recall: 0.9529
Logistic Regression Accuracy: 0.9829, F1-score: 0.9828, Precision: 0.9797, Recall: 0.9858
Naive Bayes Accuracy: 0.9769, F1-score: 0.9768, Precision: 0.9716, Recall: 0.9822
SVM Accuracy: 0.9895, F1-score: 0.9895, Precision: 0.9859, Recall: 0.9931
Decision Tree Accuracy: 0.9664, F1-score: 0.9662, Precision: 0.9634, Recall: 0.9691
Logistic Regression Accuracy: 0.9811, F1-score: 0.9809, Precision: 0.9825, Recall: 0.9793
Naive Bayes Accuracy: 0.9780, F1-score: 0.9779, Precision: 0.9779, Recall: 0.9779
SVM Accuracy: 0.9875, F1-score: 0.9875, Precision: 0.9866, Recall: 0.9884
Decision Tree Accuracy: 0.9648, F1-score: 0.9646, Precision: 0.9648, Recall: 0.9644
Logistic R

In [None]:
# Print the average of each metric per model across all folds.
#
# Each result list holds one (model name, score) tuple per fold and model.
# The scores are grouped by model name first so that every model is reported
# exactly once per metric; iterating over the raw list would print the same
# average once for every fold.
print("\nAverage Performance Metrics:")
for metric, scores in [("Accuracy", accuracies), ("F1-score", f1_scores), ("Precision", precision_scores), ("Recall", recall_scores)]:
    print(f"{metric}:")

    # dict preserves insertion order, so models appear in training order.
    scores_by_model = {}
    for name, score in scores:
        scores_by_model.setdefault(name, []).append(score)

    for name, model_scores in scores_by_model.items():
        print(f"{name}: {np.mean(model_scores):.4f}")
    print()



Average Performance Metrics:
Accuracy:
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643

F1-score:
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643
Logistic Regression: 0.9812
Naive Bayes: 0.9768
SVM: 0.9883
Decision Tree: 0.9643

Precision:
Logistic Regression: 0.9806
Naive Bayes: 0.9733
SVM: 0.9881
Decision Tree: 0.9653
Logistic Regression: 0.9806
Naive B

In [None]:
# Fine-tune and evaluate BERT with the same K-fold splitter used for the
# classical models, collecting one score per fold for each metric.

# Specify the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tunable settings for the BERT run.
BERT_MODEL_NAME = 'bert-base-uncased'
BERT_MAX_LENGTH = 128
BERT_BATCH_SIZE = 32
BERT_EPOCHS = 3
BERT_LEARNING_RATE = 2e-5

# Seed torch so the classifier-head initialisation and batch shuffling are
# repeatable, matching the fixed seed used for the fold split.
torch.manual_seed(42)

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

# Tokenize the whole corpus once (the encoding of a text does not depend on
# the fold). Calling the tokenizer directly also returns the attention mask,
# which is 0 on padding positions so padded tokens are not attended to.
encodings = tokenizer(
    stance_df['text'].tolist(),
    padding='max_length',
    max_length=BERT_MAX_LENGTH,
    truncation=True,
    return_tensors='pt',
)
all_input_ids = encodings['input_ids']
all_attention_mask = encodings['attention_mask']
all_labels = torch.tensor(stance_df['label'].values, dtype=torch.long)

# K-Fold cross-validation for BERT
bert_accuracies = []
bert_f1_scores = []
bert_precision_scores = []
bert_recall_scores = []

for train_index, test_index in kf.split(stance_df):
    # kf.split yields positional row numbers, so index the tensors by position.
    train_rows = torch.as_tensor(train_index, dtype=torch.long)
    test_rows = torch.as_tensor(test_index, dtype=torch.long)

    # Mini-batches keep GPU memory bounded; the tensors stay on the CPU and
    # only the current batch is moved to the device.
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(
            all_input_ids[train_rows],
            all_attention_mask[train_rows],
            all_labels[train_rows],
        ),
        batch_size=BERT_BATCH_SIZE,
        shuffle=True,
    )
    test_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(
            all_input_ids[test_rows],
            all_attention_mask[test_rows],
        ),
        batch_size=BERT_BATCH_SIZE,
        shuffle=False,
    )

    # Start every fold from the pretrained checkpoint. Reusing one model
    # across folds would evaluate on rows it was trained on in earlier folds.
    model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=2)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=BERT_LEARNING_RATE)

    # Train the BERT model
    model.train()
    for epoch in range(BERT_EPOCHS):
        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluate the BERT model batch by batch, gathering predictions on the CPU.
    model.eval()
    fold_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask in test_loader:
            logits = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
            ).logits
            fold_predictions.append(torch.argmax(logits, dim=1).cpu())

    # test_loader is not shuffled, so predictions line up with test_rows.
    y_pred = torch.cat(fold_predictions).numpy()
    y_test = all_labels[test_rows].numpy()

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    bert_accuracies.append(accuracy)
    bert_f1_scores.append(f1)
    bert_precision_scores.append(precision)
    bert_recall_scores.append(recall)
    print(f"BERT Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 8.12 GiB. GPU 0 has a total capacity of 14.75 GiB of which 5.55 GiB is free. Process 3565 has 9.20 GiB memory in use. Of the allocated memory 9.01 GiB is allocated by PyTorch, and 73.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and report the choice.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# Rebind stance_df to a copy whose index is a fresh 0..n-1 range; drop=True
# discards the old index instead of keeping it as a column.
# NOTE(review): stance_df is built with ignore_index=True, so this looks like
# a no-op unless rows were filtered in between — confirm it is still needed.
stance_df = stance_df.reset_index(drop=True)

In [None]:
# Report the mean of each BERT metric over the per-fold scores.
print("\nAverage BERT Performance Metrics:")
for label, fold_scores in (
    ("Accuracy", bert_accuracies),
    ("F1-score", bert_f1_scores),
    ("Precision", bert_precision_scores),
    ("Recall", bert_recall_scores),
):
    print(f"{label}: {np.mean(fold_scores):.4f}")