In [None]:
# !pip install transformers datasets torch scikit-learn gdown evaluate

In [None]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report, roc_auc_score, brier_score_loss, f1_score, precision_recall_fscore_support
from datasets import Dataset
import numpy as np

import os
import torch
import evaluate
import matplotlib.pyplot as plt

In [None]:
# target_dir = "/home/zs07752/sub_task2/data"
# os.makedirs(target_dir, exist_ok=True)
# !gdown "https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK" -O "/home/zs07752/sub_task2/data/train.jsonl"
# !gdown "https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-" -O "/home/zs07752/sub_task2/data/val.jsonl"

Downloading...
From (original): https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK
From (redirected): https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK&confirm=t&uuid=5438989c-e954-4e26-8ab0-8db2e9df8151
To: /home/zs07752/sub_task2/data/train.jsonl
100%|████████████████████████████████████████| 662M/662M [09:32<00:00, 1.16MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-
From (redirected): https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-&confirm=t&uuid=52c9a6ad-4353-45ce-83e8-b0ad5754920d
To: /home/zs07752/sub_task2/data/val.jsonl
100%|████████████████████████████████████████| 140M/140M [01:59<00:00, 1.17MB/s]


In [None]:
# Function to load and extract required fields
def load_data(file_path):
    """Read a JSON Lines file and keep only the fields used for training.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON object per
            line. Each object must provide the keys ``text``, ``language``
            and ``label``; any additional keys are ignored.

    Returns:
        pandas.DataFrame with the columns ``text``, ``language`` and
        ``label`` (in that order), one row per non-blank input line. An
        input without records yields an empty frame that still has these
        columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    columns = ["text", "language", "label"]
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            entry = json.loads(line)
            data.append({column: entry[column] for column in columns})
    # Passing the column list keeps the schema stable even when no rows were read.
    return pd.DataFrame(data, columns=columns)

# Read the training split first, then the validation split, into DataFrames.
train_df, val_df = (
    load_data(split_path)
    for split_path in (
        "/home/zs07752/sub_task2/data/train.jsonl",
        "/home/zs07752/sub_task2/data/val.jsonl",
    )
)

# Preview the first rows of each split under its own heading.
for heading, frame in (("Training Data:", train_df), ("\nValidation Data:", val_df)):
    print(heading)
    print(frame.head())

# NOTE(review): presumably the full set of fields in each raw record:
# text, language, label, source_dataset, model, label_text

Training Data:
                                                text language  label
0  Have you ever had to wait for something for a ...  English      4
1  But now, things were not so simple._SEP_The gi...  English      3
2  Dear Editor,  I am writing to express my opini...  English      4
3  Humans once wielded formidable magical power. ...  English      4
4  Here is a way that I had to be patient, and we...  English      4

Validation Data:
                                                text language  label
0  In a brief speech with a main theme of comprom...  English      0
1  HOW TO MAKE: Fall Squash Vegetarian Brioche\nI...  English      0
2  The paper models the relation extraction probl...  English      3
3  This paper considers the problem of model-base...  English      3
4  Paper Summary:\n\nAuthors investigate identity...  English      3


In [None]:
# Wrap both pandas frames as Hugging Face `Dataset` objects (train first, then validation).
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

## **Tokenization**

In [None]:
# Class id -> human-readable class name for the six target categories.
id2label = {
    0: "fully human-written",
    1: "human-written, then machine-polished",
    2: "machine-written, then machine-humanized",
    3: "human-initiated, then machine-continued",
    4: "deeply-mixed text; where some parts are written by a human and some are generated by a machine",
    5: "machine-written, then human-edited",
}
# Reverse lookup (class name -> class id), built by inverting the table above
# so the two mappings cannot drift apart.
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    # Allow loading even though the checkpoint's classifier head does not have
    # one output per label defined here; the mismatched weights are re-initialised.
    ignore_mismatched_sizes=True,
)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings to
            encode (used with ``Dataset.map(..., batched=True)`` in this notebook).

    Returns:
        The tokenizer's output for those strings, requested with
        ``padding="max_length"`` and ``truncation=True``.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True, padding="max_length")
    return encoded

# Tokenize each split in batches and drop the raw "text" column in the same
# pass; it is not needed once the tokenizer outputs have been added.
train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
val_dataset = val_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at tabularisai/multilingual-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/288918 [00:00<?, ? examples/s]

Map:   0%|          | 0/72661 [00:00<?, ? examples/s]

## **Training Arguments**

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/home/zs07752/sub_task2/results",
    logging_dir="/home/zs07752/sub_task2/logs",
    # Evaluate and checkpoint on the same per-epoch schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyper-parameters.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)



## **Metrics**

In [None]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics for the multi-class classifier and plot them.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            raw per-class scores with one row per sample and one column per
            class; ``labels`` holds the integer class id of each sample.

    Returns:
        dict with the keys ``roc_auc`` (macro one-vs-rest), ``brier_score``
        (multiclass Brier score, lower is better, range [0, 2]), ``c_at_1``
        (fraction of correct predictions), ``f0.5u`` (macro F-beta with
        beta=0.5) and ``f1_macro``.

    Side effects:
        Prints the per-class classification report and shows a horizontal
        bar chart of the returned metrics.
    """
    logits, labels = eval_pred
    # NOTE(review): if the model emits several outputs the predictions arrive
    # as a tuple; this assumes the class scores come first — confirm.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Softmax in float64 so each row sums to 1 within the tolerance that
    # roc_auc_score enforces for multi-class probability inputs.
    predictions_probs = torch.nn.functional.softmax(
        torch.tensor(logits, dtype=torch.float64), dim=1
    ).numpy()
    predictions = np.argmax(logits, axis=1)

    # Fix the class order explicitly so reports stay aligned with id2label
    # even when a class is absent from this evaluation set.
    class_ids = sorted(id2label)
    class_names = [id2label[class_id] for class_id in class_ids]

    # Classification Report
    report = classification_report(
        labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0,
    )
    print(report)

    # ROC-AUC (macro, one-vs-rest)
    roc_auc = roc_auc_score(labels, predictions_probs, multi_class='ovr', labels=class_ids)

    # Multiclass Brier score: mean squared distance between the predicted
    # distribution and the one-hot truth. Computed by hand because
    # sklearn's brier_score_loss rejects non-binary targets when given
    # a 1-D probability vector.
    one_hot = np.zeros_like(predictions_probs)
    one_hot[np.arange(len(labels)), labels] = 1.0
    brier = float(np.mean(np.sum((predictions_probs - one_hot) ** 2, axis=1)))

    # C@1 metric (ratio of correctly predicted instances; no sample is left unanswered)
    c_at_1 = float(np.mean(predictions == labels))

    # F0.5u score (weighted towards precision); computed here as macro F-beta with beta=0.5
    _, _, f05u, _ = precision_recall_fscore_support(
        labels, predictions, beta=0.5, average='macro', zero_division=0
    )

    # F1-score (macro)
    f1 = f1_score(labels, predictions, average='macro', zero_division=0)

    results = {
        "roc_auc": float(roc_auc),
        "brier_score": brier,
        "c_at_1": c_at_1,
        "f0.5u": float(f05u),
        "f1_macro": float(f1)
    }

    # Plot Metrics
    metrics_names = list(results.keys())
    metrics_values = list(results.values())

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(metrics_names, metrics_values, color=['blue', 'green', 'red', 'purple', 'orange'])
    ax.set_xlabel("Score")
    ax.set_title("Model Performance Metrics")
    # The multiclass Brier score can exceed 1, so widen the axis when needed.
    ax.set_xlim(0, max(1.0, max(metrics_values)))
    plt.show()
    # This function runs on every evaluation; close the figure so they do not pile up.
    plt.close(fig)

    return results

## **Trainer**

In [None]:
# Assemble the Trainer from the model, the training arguments, both dataset
# splits and the custom metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # custom metrics, called during evaluation
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

## **Train the Model**

In [None]:
!wandb login d263ae15255e15e9e2e1943f80a700ad7d7a2c6c

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Train the model: runs the Trainer's training loop with the arguments,
# datasets and metric function it was constructed with. The call's return
# value is the last expression in the cell, so the notebook displays it.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msamiyaalizaidi[0m ([33msamiyaalizaidi-habib-university[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


## **Evaluate the Model**

In [None]:
# Run one evaluation pass with the Trainer (its eval dataset is the validation
# split) and print the returned metrics.
results = trainer.evaluate()
print(results)

## **Saving the Model**

In [None]:
# Persist the model and the tokenizer to two separate directories under the
# results folder so they can be reloaded later with `from_pretrained`.
model.save_pretrained("/home/zs07752/sub_task2/results/st2model_test")
tokenizer.save_pretrained("/home/zs07752/sub_task2/results/st2tokenizer_test")