# Utility

In [None]:
!pip install -q gdown
!pip install peft

In [3]:
import gdown
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.corpus import stopwords
import warnings
import string
import torch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm import tqdm
from sklearn.metrics import classification_report
import torch.nn.functional as F
import matplotlib.pyplot as plt
import gc
import os
from peft import LoraModel, LoraConfig, get_peft_model

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

In [6]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [7]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words):
        self.stop_words = stop_words

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Preprocess each document: lowercase, remove stopwords, keep only words
        X_processed = X.applymap(lambda text: ' '.join(
            word for word in re.findall(r'\b[a-zA-Z]+\b', text.lower())
            if word not in self.stop_words
        ))

        X_combined = X_processed.apply(lambda row: ' '.join(row), axis=1)

        return pd.DataFrame(X_combined, columns=["processed_text"])

# Data Download & Review

In [11]:
balanced_sample = pd.read_csv("/content/balanced_sample.csv")

# Data Split

In [13]:
X = balanced_sample[['headline', 'body']]
y = balanced_sample['political_leaning_encoded']
del balanced_sample
gc.collect()

131

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
stop_words = set(stopwords.words('english'))

preprocessor = Pipeline([
    ('selector', DataFrameSelector(['headline', 'body'])),
    ('text_preprocessor', TextPreprocessor(stop_words))
])

In [16]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
X_train_transformed = X_train_transformed['processed_text'].tolist()
X_val_transformed = X_val_transformed['processed_text'].tolist()
del X_train, X_val
gc.collect()

9

# Pre-Trained Model Loading

In [17]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenization

In [18]:
train_encodings = tokenizer(X_train_transformed, truncation=True, padding=True, max_length=128, return_tensors="pt")
del X_train_transformed

In [19]:
val_encodings = tokenizer(X_val_transformed, truncation=True, padding=True, max_length=128, return_tensors="pt")
del X_val_transformed
gc.collect()

0

In [20]:
train_dataset = CustomDataset(train_encodings, y_train.tolist())
val_dataset = CustomDataset(val_encodings, y_val.tolist())

In [37]:
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

# Fine-Tuning using LoRA

In [21]:
# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

In [22]:
# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [24]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=None
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [26]:
# Train the model
try:
    trainer.train()
except TypeError as e:
    print(f"Encountered a TypeError during training: {e}")


Epoch,Training Loss,Validation Loss
1,0.9574,No log
2,0.8905,No log
3,0.8519,No log


In [27]:
torch.save(model.state_dict(), 'lora_model.pt')

In [28]:
file_size_model = os.path.getsize('/content/lora_model.pt')

file_size_model = file_size_model / (1024 * 1024)
print(f"The size of the fine-tuned model is: {file_size_model:.2f} MB")

The size of the fine-tuned model is: 420.03 MB


# Real Life Example

In [29]:
headline = 'For Russia, Nuclear Weapons Are the Ultimate Bargaining Chip'

In [30]:
body = """On the 1,000th day of the war in Ukraine, President Volodymyr Zelensky took advantage of Washington’s new willingness to allow long-range missiles to be shot deep into Russia. Until this weekend, President Biden had declined to allow such strikes using American weapons, out of fear they could prompt World War III.

On the same day, Russia formally announced a new nuclear doctrine that it had signaled two months ago, declaring for the first time that it would use nuclear weapons not only in response to an attack that threatened its survival, but also in response to any attack that posed a “critical threat” to its sovereignty and territorial integrity — a situation very similar to what was playing out in the Kursk region, as American-made ballistic missiles struck Russian weapons arsenals.

And there was another wrinkle to Russia’s guidelines for nuclear use: For the first time, it declared the right to use nuclear weapons against a state that only possesses conventional arms — if it is backed by a nuclear power. Ukraine, backed by the United States, Britain and France — three of the five original nuclear-armed states — seems to be the country Russia’s president, Vladimir V. Putin, had in mind.

Yet it was telling that the reaction in Washington on Tuesday was just short of a yawn. Officials dismissed the doctrine as the nothingburger of nuclear threats. Instead, the city was rife with speculation over who would prevail as Treasury secretary, or whether Matt Gaetz, a former congressman surrounded by sex-and-drug allegations though never charged, could survive the confirmation process to become attorney general.

The Ukraine war has changed many things: It has ended hundreds of thousands of lives and shattered millions, it has shaken Europe, and it has deepened the enmity between Russia and the United States. But it has also inured Washington and the world to the renewed use of nuclear weapons as the ultimate bargaining chip. The idea that one of the nine countries now in possession of nuclear weapons — with Iran on the threshold of becoming the tenth — might press the button is more likely to evoke shrugs than a convening of the United Nations Security Council.

“This is a signaling exercise, trying to scare audiences in Europe — and to a lesser extent, the United States — into falling off support for Ukraine,” said Matthew Bunn, a Harvard professor who has tracked nuclear risks for decades. “The actual short-term probability of Russian nuclear use hasn’t increased. The long-term probability of nuclear war has probably increased slightly — because U.S. willingness to support strikes deep into Russia is reinforcing Putin’s hatred and fear of the West, and will likely provoke Russian responses that will increase Western fear and hatred of Russia.”"""


In [31]:
data = [f'{headline} {body}']
encodings = tokenizer(data, truncation=True, padding=True, max_length=128, return_tensors="pt")
encodings = {key: val.to('cpu') for key, val in encodings.items()}

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to('cpu')

predictions = []

with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits

    # Get predictions
    predicted_class = torch.argmax(logits, dim=-1)
    predictions.extend(predicted_class.cpu().numpy())
print(logits)
print("Predictions:", predictions)

probabilities = F.softmax(logits, dim=1)
percentages = probabilities * 100
print(percentages)

tensor([[0.1449, 0.0454, 0.5270]])
Predictions: [2]
tensor([[29.6674, 26.8588, 43.4739]])


In [None]:
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to('cpu') for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

In [42]:
print(classification_report(all_labels, all_preds, target_names=['Center','Left','Right']))

              precision    recall  f1-score   support

      Center       0.76      0.58      0.66      2939
        Left       0.71      0.59      0.65      2867
       Right       0.55      0.78      0.65      2837

    accuracy                           0.65      8643
   macro avg       0.67      0.65      0.65      8643
weighted avg       0.67      0.65      0.65      8643

