In [None]:
# Cell 1: Environment Setup
!pip install --upgrade transformers datasets peft accelerate bitsandbytes scikit-learn evaluate

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48

In [None]:
# Cell 2: Import Libraries and Mount Google Drive
import os
import glob
import pandas as pd
import numpy as np
import torch
from google.colab import drive

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType

# Mount Google Drive at /content/drive. Both the CSV input folder and the
# checkpoint output folder used later in this notebook live under
# /content/drive/MyDrive, so this must succeed before any other cell runs.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cell 3: Data Loading and Cleaning (Corrected)

import os
import glob
import pandas as pd

# --- Configuration ---
DATA_DIR = "/content/drive/MyDrive/CSCI594Project/"

# --- Loading ---
# glob returns paths in no guaranteed order; sorting fixes the row order of the
# concatenated frame so the seeded splits built from it later are repeatable.
csv_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
if not csv_files:
    raise ValueError(f"No CSV files found in the specified directory: {DATA_DIR}")

df_list = [pd.read_csv(file) for file in csv_files]
full_df = pd.concat(df_list, ignore_index=True)

# Fail early with a readable message instead of a bare KeyError further down.
missing_columns = [col for col in ('text', 'label') if col not in full_df.columns]
if missing_columns:
    raise KeyError(
        f"Required column(s) {missing_columns} not found; "
        f"available columns: {full_df.columns.tolist()}"
    )

print(f"Loaded {len(full_df)} rows from {len(csv_files)} files.")
print("Original DataFrame columns:", full_df.columns.tolist())
print("\nOriginal label distribution:\n", full_df['label'].value_counts())

# --- Cleaning & Normalization ---
# Label mappings. id2label is derived from label2id so the two cannot drift apart.
label2id = {'NORMAL': 0, 'OFFENSIVE': 1, 'HATE_SPEECH': 2}
id2label = {label_id: label_name for label_name, label_id in label2id.items()}

# 1. Normalize labels: cast to string, trim surrounding whitespace, upper-case.
#    Missing labels become the string 'NAN' here and are rejected by the mapping.
normalized_labels = full_df['label'].astype(str).str.strip().str.upper()

# Bookkeeping for the drop report printed below.
has_valid_label = normalized_labels.isin(list(label2id))
n_invalid_label = int((~has_valid_label).sum())
n_missing_text = int((has_valid_label & full_df['text'].isna()).sum())

# NOTE(review): in the last recorded run 4218 rows carried a valid label but only
# 2760 survived cleaning, and the raw frame also has a 'text_redacted' column.
# Confirm whether the dropped rows keep their text there before treating them
# as unusable.

# 2-4. Keep only the needed columns, map labels to integer ids (anything outside
#      label2id, e.g. 'API_FAILURE', becomes NaN), drop rows missing either the
#      text or a valid label, then cast the label column to int.
cleaned_df = (
    full_df[['text', 'label']]
    .assign(label=normalized_labels.map(label2id))
    .dropna(subset=['text', 'label'])
    .astype({'label': int})
)

print(
    f"\nDropped {n_invalid_label} rows with an unrecognised label and "
    f"{n_missing_text} validly-labelled rows with missing text."
)
print(f"\nFinal DataFrame shape: {cleaned_df.shape}")
print("Final label distribution:\n", cleaned_df['label'].value_counts())
print("\nSample of cleaned data:")
print(cleaned_df.head())

Loaded 4220 rows from 7 files.
Original DataFrame columns: ['author_hash', 'text_redacted', 'label', 'comment', 'text', 'language_category', 'user']

Original label distribution:
 label
OFFENSIVE      1383
NORMAL         1063
normal          841
HATE_SPEECH     787
offensive       106
hate_speech      38
API_FAILURE       1
label             1
Name: count, dtype: int64

Final DataFrame shape: (2760, 2)
Final label distribution:
 label
1    1268
0     922
2     570
Name: count, dtype: int64

Sample of cleaned data:
                                                  text  label
473  Осы қазақтар неге сонша тентек? Бәрі бірдей ма...      2
474  Сенің анаңды шешеңмен бірге... п...ц, неге осы...      1
475  Алматыда жаңа кафе ашылған екен, барып көрген ...      0
476  Уйгурлар кетсін, біздің жерімізді басып алды, ...      2
477   Админ ты шо, совсем ебанутый? Пост өшірдің неге?      1


In [None]:
# Cell 4: Convert to Hugging Face Dataset and Split

# Convert pandas DataFrame to Hugging Face Dataset.
# preserve_index=False keeps the DataFrame's (non-contiguous, post-filtering)
# index from being stored as an extra '__index_level_0__' feature.
dataset = Dataset.from_pandas(cleaned_df, preserve_index=False)

# Split the dataset into training (90%) and testing (10%); the fixed seed makes
# the shuffle repeatable for a given row order.
train_test_split_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

# Create a DatasetDict exposing the two splits under explicit names
hf_dataset = DatasetDict({
    'train': train_test_split_dataset['train'],
    'test': train_test_split_dataset['test']
})

print("\nHugging Face Dataset structure:")
print(hf_dataset)


Hugging Face Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2484
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 276
    })
})


In [None]:
# Cell 5: Setup & Configuration (Upgraded Model)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType

# --- Model & Tokenizer Config (UPGRADED) ---
# Checkpoint id shared by the tokenizer here and by every model built later.
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- LoRA Config (STRENGTHENED) ---
# A wider set of target modules than query/value alone, so that more
# parameters are trainable.
_lora_kwargs = dict(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value", "key", "dense"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
lora_config = LoraConfig(**_lora_kwargs)

# --- Tokenize Function ---
def tokenize_function(examples, max_length=128):
    """Tokenize the "text" field of a batch of examples.

    Sequences longer than ``max_length`` tokens are truncated. No padding is
    applied here: the notebook's ``DataCollatorWithPadding`` pads each batch to
    its own longest sequence at collation time, so padding every example to
    ``max_length`` up front would only add wasted computation.

    Args:
        examples: mapping with a "text" entry, as passed by ``Dataset.map``.
        max_length: truncation limit in tokens (default 128).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# --- Metrics Function (macro-averaged precision / recall / F1) ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Turn one evaluation pass into accuracy and macro-averaged P/R/F1.

    Args:
        eval_pred: pair that unpacks to (predictions, labels). The predicted
            class is the argmax over axis 1 of the predictions array
            (assumes one row of per-class scores per example).

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1".
    """
    scores_per_class, references = eval_pred
    predicted_ids = np.argmax(scores_per_class, axis=1)
    shared_args = {"predictions": predicted_ids, "references": references}

    results = {"accuracy": accuracy_metric.compute(**shared_args)["accuracy"]}
    # Same three metrics as before, each averaged with equal weight per class.
    macro_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for metric_name, metric in macro_metrics:
        results[metric_name] = metric.compute(average="macro", **shared_args)[metric_name]

    return results

# --- Training Arguments (STABLE) ---
# Checkpoints go to the same Drive folder the CSV files are read from.
OUTPUT_DIR = "/content/drive/MyDrive/CSCI594Project"

_training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep a single checkpoint and reload the best one when training ends.
    # No metric_for_best_model is given; in the recorded run the reloaded
    # checkpoint was the epoch with the lowest validation loss.
    save_total_limit=1,
    load_best_model_at_end=True,
    push_to_hub=False,
)
training_args = TrainingArguments(**_training_kwargs)

# --- Data Collator ---
# Pads the examples of each batch to a common length using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Cell 6: Stratified K-Fold Cross-Validation Training
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from transformers import Trainer

# --- Custom Trainer: class-weighted cross-entropy, computed in float32 ---
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by 'balanced' class weights.

    The weights come from the label column of ``self.train_dataset`` via
    scikit-learn's ``compute_class_weight(class_weight='balanced')``. They are
    computed once, on the first loss call, and cached on the trainer instance,
    rather than being rebuilt from the full label column at every step.
    The cache is not refreshed if ``train_dataset`` is swapped afterwards;
    create a new trainer in that case.
    """

    # Cached float32 CPU tensor with one weight per label id; None until built.
    _balanced_class_weights = None

    def _get_class_weights(self, num_labels):
        """Return the cached per-class weight tensor, building it on first use.

        Args:
            num_labels: width of the logits; the weight vector has this length.

        Returns:
            A float32 tensor of length ``num_labels``, or None when the trainer
            has no training dataset (the loss is then unweighted).
        """
        if self.train_dataset is None:
            return None
        if self._balanced_class_weights is None:
            train_labels = np.asarray(list(self.train_dataset["label"]))
            present_classes = np.unique(train_labels)
            balanced = compute_class_weight(
                class_weight='balanced',
                classes=present_classes,
                y=train_labels
            )
            # Classes missing from the training labels keep a neutral weight of
            # 1.0 so the vector always matches the number of logits.
            weights = np.ones(num_labels, dtype=np.float32)
            weights[present_classes] = balanced
            self._balanced_class_weights = torch.tensor(weights, dtype=torch.float32)
        return self._balanced_class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; its "labels" entry is removed before the
                forward pass and used as the loss target.
            return_outputs: when True, return ``(loss, outputs)``.
            **kwargs: extra arguments supplied by the base Trainer; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        num_labels = logits.shape[-1]

        class_weights = self._get_class_weights(num_labels)
        if class_weights is not None:
            class_weights = class_weights.to(logits.device)

        # Compute the loss in float32 regardless of the logits' dtype; for
        # float32 logits this is identical to using them directly.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.float().reshape(-1, num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# --- K-Fold Setup ---
import gc  # used at the end of each fold to release objects before clearing the CUDA cache

N_SPLITS = 5  # 5 folds = 80% train, 20% validation each time
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Use the master 'cleaned_df'
X = cleaned_df['text']
y = cleaned_df['label']

all_fold_metrics = []  # one trainer.evaluate() dict per fold

# NOTE(review): every fold writes its checkpoints to the same
# training_args.output_dir, and each fold's reported metrics are measured on the
# same validation fold that load_best_model_at_end used to pick the checkpoint,
# so the averages may be optimistic.

# --- THE K-FOLD LOOP ---
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n--- FOLD {fold+1}/{N_SPLITS} ---")

    # 1. Create train/validation dataframes for this fold (positional indices)
    train_df = cleaned_df.iloc[train_index]
    val_df = cleaned_df.iloc[val_index]

    # 2. Convert to Hugging Face Datasets without carrying the pandas index
    #    along as an extra '__index_level_0__' column
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

    # 3. Tokenize, then drop the raw text column
    tokenized_train = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
    tokenized_val = val_dataset.map(tokenize_function, batched=True).remove_columns(["text"])

    # 4. Re-initialize the model FOR EACH FOLD so that no weights trained on a
    #    previous fold's data carry over into this one.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )
    peft_model = get_peft_model(model, lora_config)

    # 5. Initialize our Custom Weighted Trainer.
    #    processing_class replaces the deprecated tokenizer= argument.
    trainer = WeightedTrainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print(f"Starting training for fold {fold+1}...")
    trainer.train()

    # 7. Evaluate and save metrics
    print(f"Evaluating fold {fold+1}...")
    metrics = trainer.evaluate()
    all_fold_metrics.append(metrics)
    print(f"Fold {fold+1} Metrics: {metrics}")

    # 8. Clean up to free VRAM: drop the references, collect them, and only
    #    then ask CUDA to release its cached blocks.
    del model, peft_model, trainer
    gc.collect()
    torch.cuda.empty_cache()


--- FOLD 1/5 ---


Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Starting training for fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0694,0.927586,0.581522,0.589793,0.635047,0.583567
2,0.7999,0.717501,0.612319,0.633751,0.669068,0.624637
3,0.6418,0.655726,0.677536,0.689411,0.704058,0.696089
4,0.5952,0.639773,0.70471,0.713402,0.726739,0.719451
5,0.5628,0.644958,0.701087,0.707371,0.72661,0.714985


Evaluating fold 1...


Fold 1 Metrics: {'eval_loss': 0.6397729516029358, 'eval_accuracy': 0.7047101449275363, 'eval_precision': 0.7134017292553878, 'eval_recall': 0.726738820118881, 'eval_f1': 0.7194511038454218, 'eval_runtime': 4.6086, 'eval_samples_per_second': 119.775, 'eval_steps_per_second': 7.594, 'epoch': 5.0}

--- FOLD 2/5 ---


Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Starting training for fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0846,0.998133,0.543478,0.563144,0.620408,0.546351
2,0.8324,0.648207,0.692029,0.698978,0.743882,0.707881
3,0.6514,0.588314,0.733696,0.736761,0.76585,0.748537
4,0.609,0.558347,0.757246,0.772294,0.787903,0.778418
5,0.5951,0.553905,0.744565,0.755194,0.782211,0.762834


Evaluating fold 2...


Fold 2 Metrics: {'eval_loss': 0.5539053082466125, 'eval_accuracy': 0.7445652173913043, 'eval_precision': 0.7551943620754549, 'eval_recall': 0.7822110408395746, 'eval_f1': 0.7628338920100161, 'eval_runtime': 4.5813, 'eval_samples_per_second': 120.49, 'eval_steps_per_second': 7.64, 'epoch': 5.0}

--- FOLD 3/5 ---


Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Starting training for fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0652,0.919191,0.583333,0.608533,0.651863,0.586294
2,0.7558,0.655528,0.70471,0.701729,0.736409,0.713793
3,0.614,0.647093,0.70471,0.700777,0.736794,0.712375
4,0.5808,0.635614,0.722826,0.720673,0.747807,0.731294
5,0.5745,0.630017,0.724638,0.726464,0.749119,0.735995


Evaluating fold 3...


Fold 3 Metrics: {'eval_loss': 0.6300174593925476, 'eval_accuracy': 0.7246376811594203, 'eval_precision': 0.7264641626284957, 'eval_recall': 0.7491188529923462, 'eval_f1': 0.7359946063968182, 'eval_runtime': 4.802, 'eval_samples_per_second': 114.951, 'eval_steps_per_second': 7.289, 'epoch': 5.0}

--- FOLD 4/5 ---


Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Starting training for fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0686,0.910689,0.57971,0.596282,0.647099,0.583162
2,0.7723,0.701022,0.653986,0.664306,0.712256,0.664743
3,0.6547,0.604782,0.726449,0.740184,0.75787,0.747541
4,0.6204,0.595828,0.710145,0.71821,0.750524,0.728353
5,0.5974,0.582942,0.722826,0.735809,0.760716,0.743295


Evaluating fold 4...


Fold 4 Metrics: {'eval_loss': 0.582942008972168, 'eval_accuracy': 0.7228260869565217, 'eval_precision': 0.7358086054107148, 'eval_recall': 0.7607155106010941, 'eval_f1': 0.7432951437438805, 'eval_runtime': 5.3872, 'eval_samples_per_second': 102.464, 'eval_steps_per_second': 6.497, 'epoch': 5.0}

--- FOLD 5/5 ---


Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Starting training for fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0804,0.967085,0.605072,0.644269,0.675431,0.611747
2,0.7848,0.647395,0.692029,0.694576,0.738625,0.706737
3,0.6462,0.583025,0.744565,0.754441,0.774104,0.762803
4,0.6126,0.577177,0.728261,0.732878,0.767397,0.744179
5,0.5925,0.558992,0.742754,0.754345,0.77972,0.761721


Evaluating fold 5...


Fold 5 Metrics: {'eval_loss': 0.558991551399231, 'eval_accuracy': 0.7427536231884058, 'eval_precision': 0.7543448609224731, 'eval_recall': 0.7797203902467059, 'eval_f1': 0.7617207238176285, 'eval_runtime': 5.4488, 'eval_samples_per_second': 101.306, 'eval_steps_per_second': 6.423, 'epoch': 5.0}


In [None]:
# Cell 7: Report Final Cross-Validation Results

print("\n--- Cross-Validation Complete ---")


def _mean_over_folds(metric_key):
    """Arithmetic mean of one evaluation metric across all recorded folds."""
    return np.mean([fold_result[metric_key] for fold_result in all_fold_metrics])


# Average the metrics across all folds
avg_accuracy = _mean_over_folds('eval_accuracy')
avg_recall = _mean_over_folds('eval_recall')
avg_precision = _mean_over_folds('eval_precision')
avg_f1 = _mean_over_folds('eval_f1')

summary_lines = [
    f"\n--- Final Average Metrics ({N_SPLITS}-Fold) ---",
    f"Average Macro F1: {avg_f1:.4f}",
    f"Average Macro Precision: {avg_precision:.4f}",
    f"Average Macro Recall: {avg_recall:.4f}",
    f"Average Accuracy: {avg_accuracy:.4f}",
]
print("\n".join(summary_lines))

# One line per fold, numbered from 1
print("\nFull metrics per fold:")
for fold_number, fold_result in enumerate(all_fold_metrics, start=1):
    print(f"Fold {fold_number}: F1={fold_result['eval_f1']:.4f}, Precision={fold_result['eval_precision']:.4f}, Recall={fold_result['eval_recall']:.4f}")


--- Cross-Validation Complete ---

--- Final Average Metrics (5-Fold) ---
Average Macro F1: 0.7447
Average Macro Precision: 0.7370
Average Macro Recall: 0.7597
Average Accuracy: 0.7279

Full metrics per fold:
Fold 1: F1=0.7195, Precision=0.7134, Recall=0.7267
Fold 2: F1=0.7628, Precision=0.7552, Recall=0.7822
Fold 3: F1=0.7360, Precision=0.7265, Recall=0.7491
Fold 4: F1=0.7433, Precision=0.7358, Recall=0.7607
Fold 5: F1=0.7617, Precision=0.7543, Recall=0.7797
