In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from torch.utils.data import Dataset
import torch
import logging
import warnings
warnings.filterwarnings('ignore')

# Constants
MASKED_MODEL = "distilbert-base-multilingual-cased"
CAUSAL_MODEL = "EleutherAI/gpt-neo-125M"


class ParliamentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def load_data():
    """Load the datasets for the two tasks."""
    try:
        orientation_data = pd.read_csv("orientation-fr-train.tsv", sep="\t")
        orientation_data.drop("id", axis=1, inplace=True)
        orientation_data.dropna(inplace=True)

        power_data = pd.read_csv("power-fr-train.tsv", sep="\t")
        power_data.drop("id", axis=1, inplace=True)
        power_data.dropna(inplace=True)

        return orientation_data, power_data
    except FileNotFoundError as e:
        print(f"Error: {e}")
        raise

def prepare_data(df):
    """Prepare data for training with stratified split"""
    train_data, test_data = train_test_split(
        df,
        test_size=0.1,
        stratify=df['label'],
        random_state=42
    )
    return train_data, test_data

def compute_metrics(pred):
    """Compute metrics for model evaluation"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)

    return {
        'accuracy': accuracy,
        'f1': f1
    }

def fine_tune_masked_model(train_data, test_data, text_column, model_path):
    """Fine-tune masked language model for classification"""
    tokenizer = AutoTokenizer.from_pretrained(MASKED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        MASKED_MODEL,
        num_labels=2
    )

    # Tokenize data
    train_encodings = tokenizer(
        list(train_data[text_column]),
        truncation=True,
        padding=True,
        max_length=512
    )
    test_encodings = tokenizer(
        list(test_data[text_column]),
        truncation=True,
        padding=True,
        max_length=512
    )

    train_dataset = ParliamentDataset(train_encodings, train_data['label'].values)
    test_dataset = ParliamentDataset(test_encodings, test_data['label'].values)

    # Calculate class weights
    class_weights = compute_class_weight(
        'balanced',
        classes=np.unique(train_data['label']),
        y=train_data['label']
    )

    training_args = TrainingArguments(
        output_dir=model_path,
        learning_rate=0.1,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    return model, tokenizer

def zero_shot_inference(texts, task_type):
    """Perform zero-shot inference using causal language model"""
    classifier = pipeline(
        "zero-shot-classification",
        model=CAUSAL_MODEL,
        device=0 if torch.cuda.is_available() else -1
    )

    if task_type == "orientation":
        candidate_labels = ["left-wing", "right-wing"]
    else:  # power
        candidate_labels = ["governing party", "opposition party"]

    results = []
    for text in texts:
        result = classifier(text, candidate_labels)
        label = 1 if result['labels'][0] == candidate_labels[1] else 0
        results.append(label)

    return results

def main():
    # Load datasets
    orientation_data, power_data = load_data()

    # Task 1: Political Orientation (Fine-tune on "text_en")
    print("\nTask 1: Political Orientation Classification")
    train_data, test_data = prepare_data(orientation_data)

    print("\nFine-tuning masked model on English text ('text_en')...")
    masked_model_en, tokenizer_en = fine_tune_masked_model(
        train_data,
        test_data,
        "text_en",  # Use "text_en" for fine-tuning
        "./orientation_model_en"
    )

    # Zero-shot inference for both languages
    print("\nPerforming zero-shot inference for orientation...")
    zero_shot_preds_en = zero_shot_inference(test_data['text_en'], "orientation")
    print("\nZero-shot Classification Report (English):")
    print(classification_report(test_data['label'], zero_shot_preds_en))

    zero_shot_preds_orig = zero_shot_inference(test_data['text'], "orientation")
    print("\nZero-shot Classification Report (Original Language):")
    print(classification_report(test_data['label'], zero_shot_preds_orig))

    # Task 2: Power Classification (Fine-tune on "text")
    print("\nTask 2: Power Classification")
    train_data, test_data = prepare_data(power_data)

    print("\nFine-tuning masked model on original text ('text')...")
    masked_model_orig, tokenizer_orig = fine_tune_masked_model(
        train_data,
        test_data,
        "text",  # Use original "text" for fine-tuning
        "./power_model_orig"
    )

    # Zero-shot inference for both languages
    print("\nPerforming zero-shot inference for power...")
    zero_shot_preds_en = zero_shot_inference(test_data['text_en'], "power")
    print("\nZero-shot Classification Report (English):")
    print(classification_report(test_data['label'], zero_shot_preds_en))

    zero_shot_preds_orig = zero_shot_inference(test_data['text'], "power")
    print("\nZero-shot Classification Report (Original Language):")
    print(classification_report(test_data['label'], zero_shot_preds_orig))

if __name__ == "__main__":
    main()



Task 1: Political Orientation Classification

Fine-tuning masked model on English text ('text_en')...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6141,0.611984,0.698895,0.575026
2,41.8717,0.61181,0.698895,0.575026
3,0.6397,0.614061,0.698895,0.575026



Performing zero-shot inference for orientation...


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Zero-shot Classification Report (English):
              precision    recall  f1-score   support

           0       0.28      0.43      0.34       109
           1       0.68      0.52      0.59       253

    accuracy                           0.49       362
   macro avg       0.48      0.47      0.46       362
weighted avg       0.56      0.49      0.51       362



Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`



Zero-shot Classification Report (Original Language):
              precision    recall  f1-score   support

           0       0.18      0.06      0.08       109
           1       0.69      0.89      0.77       253

    accuracy                           0.64       362
   macro avg       0.43      0.47      0.43       362
weighted avg       0.53      0.64      0.57       362


Task 2: Power Classification

Fine-tuning masked model on original text ('text')...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6553,0.659828,0.629328,0.486156
2,0.673,0.659428,0.629328,0.486156
3,0.6582,0.659833,0.629328,0.486156



Performing zero-shot inference for power...


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`



Zero-shot Classification Report (English):
              precision    recall  f1-score   support

           0       0.69      0.31      0.43       618
           1       0.39      0.76      0.52       364

    accuracy                           0.48       982
   macro avg       0.54      0.54      0.47       982
weighted avg       0.58      0.48      0.46       982



Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`



Zero-shot Classification Report (Original Language):
              precision    recall  f1-score   support

           0       0.64      0.91      0.75       618
           1       0.43      0.11      0.18       364

    accuracy                           0.62       982
   macro avg       0.53      0.51      0.46       982
weighted avg       0.56      0.62      0.54       982

