<a href="https://colab.research.google.com/github/jaynejy/UM-ATLAS-Multilingual-Sentiment-Analysis/blob/main/(Ver_2)_UM_ATLAS_Sentiment_Analysis_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**UM-ATLAS Sentiment Analysis Pipeline**
1) Upload raw_mixed_feedback.xlsx (no labels) -> clean -> export clean_mixed_feedback.xlsx
2) Upload labelled_mixed_feedback.xlsx (annotated clean_mixed_feedback.xlsx)
3) Run Stage B (training) of this notebook:
   - Load labelled_mixed_feedback.xlsx
   - Validate labels (sentiment or numeric label)
   - Split into train/val/test (non-overlapping, stratified)
   - Train XLM-R, evaluate on val+test, save reports + model zip

In [1]:
# 0) Install dependencies
!pip -q install -U transformers datasets evaluate scikit-learn numpy openpyxl "pandas==2.2.2"

In [2]:
# 1) Imports
import os
import re
import json
import shutil
import random
import unicodedata
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import evaluate

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)

# Disable Weights & Biases prompts via environment variable. This is an extra
# safeguard: TrainingArguments in train_model also passes report_to="none".
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# 2) CONFIG
# Module-level settings read by the helper functions and cells below.

# Files (absolute paths under /content/)
RAW_INPUT_XLSX = "/content/raw_mixed_feedback.xlsx"
CLEANED_EXPORT_XLSX = "/content/clean_mixed_feedback.xlsx"
# NOTE: this path includes the "(2000) " prefix, so the uploaded labelled file
# must carry exactly this name for load_validate_clean_labelled to find it.
LABELLED_INPUT_XLSX = "/content/(2000) labelled_mixed_feedback.xlsx"

# -------------------------
# Auto column detection
# -------------------------
# When True, columns are looked up (case-insensitively) from the candidate
# lists below; when False, the manual names further down are used as-is.
AUTO_DETECT_COLUMNS = True

# Candidate names, tried in order; the first one present in the sheet wins.
TEXT_COL_CANDIDATES = ["clean_text", "text", "comment", "comments", "feedback", "review", "reviews", "ulasan", "cadangan"]
SENTIMENT_COL_CANDIDATES = ["sentiment", "sentiment_label"]
LABEL_COL_CANDIDATES = ["label", "labels"]

# Used only when AUTO_DETECT_COLUMNS=False:
TEXT_COL = "cadangan"         # raw text column in the Excel sheet (example)
SENTIMENT_COL = "sentiment"   # sentiment string column (positive/neutral/negative)
LABEL_COL = "label"           # numeric label column (0,1,2)

# -------------------------
# Cleaning controls
# -------------------------
# Drop rows whose cleaned text matches FILLER_PATTERN (e.g. "no comment").
REMOVE_FILLER_COMMENTS = True

# Training de-duplication (training stage only)
# - "none": keep all rows
# - "by_text_label": drop exact duplicates of (clean_text, label)  [recommended]
# - "by_text": drop duplicates of clean_text only
DEDUP_STRATEGY = "by_text_label"

# -------------------------
# Split ratios
# -------------------------
# The three ratios must sum to 1.0 (checked in split_train_val_test).
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1
SEED = 42

# -------------------------
# Model / training
# -------------------------
MODEL_NAME = "xlm-roberta-base"
MAX_LEN = 256         # tokenizer truncation length passed to SentimentDataset

EPOCHS = 3
TRAIN_BS = 8
EVAL_BS = 8
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

EVAL_STEPS = 100      # consider 250 once the dataset is larger
SAVE_STEPS = 100      # consider 500 once the dataset is larger
LOGGING_STEPS = 100   # consider 500 once the dataset is larger
EARLY_STOP_PATIENCE = 2

# fp16 is requested only when this flag is True AND CUDA is available
# (train_model combines it with torch.cuda.is_available()).
ENABLE_FP16 = True

# -------------------------
# Output paths
# -------------------------
# Split CSVs written after splitting the labelled data.
OUT_TRAIN = "/content/train_data.csv"
OUT_VAL   = "/content/val_data.csv"
OUT_TEST  = "/content/test_data.csv"

# Trainer checkpoints, final saved model directory, and the zip of that directory.
OUTPUT_DIR = "/content/xlmr_results"
SAVE_DIR = "/content/models/xlmr-base"
ZIP_OUT_PATH = "/content/xlmr-base.zip"

# Evaluation artefacts written by evaluate_and_save / save_test_predictions.
EVAL_VAL_JSON = "/content/eval_results_val.json"
EVAL_TEST_JSON = "/content/eval_results_test.json"
CM_VAL_CSV = "/content/confusion_matrix_val.csv"
CM_TEST_CSV = "/content/confusion_matrix_test.csv"
REPORT_VAL_TXT = "/content/classification_report_val.txt"
REPORT_TEST_TXT = "/content/classification_report_test.txt"
TEST_PRED_CSV = "/content/test_predictions.csv"

In [4]:
# 3) Dictionaries
# Contraction correction dictionary: apostrophe-less spelling -> corrected form.
# fix_contractions applies each entry as a case-insensitive whole-word replacement.
# NOTE(review): "ill", "wont" and "cant" are also ordinary English words, so
# sentences such as "I was ill" are rewritten too — confirm this is acceptable.
contractions: Dict[str, str] = {
    "dont": "don't",
    "doesnt": "doesn't",
    "cant": "can't",
    "wont": "won't",
    "im": "I'm",
    "ive": "I've",
    "isnt": "isn't",
    "arent": "aren't",
    "wasnt": "wasn't",
    "werent": "weren't",
    "shouldnt": "shouldn't",
    "couldnt": "couldn't",
    "wouldnt": "wouldn't",
    "didnt": "didn't",
    "hadnt": "hadn't",
    "hasnt": "hasn't",
    "havent": "haven't",
    "youre": "you're",
    "theyre": "they're",
    "thats": "that's",
    "theres": "there's",
    "whos": "who's",
    "whats": "what's",
    "ill": "I'll",
    "youll": "you'll",
    "theyll": "they'll",
    "youve": "you've",
    "weve": "we've",
}

# Sentiment label encoding (string -> class id) and its inverse (class id -> string).
LABEL_ENCODING: Dict[str, int] = {"positive": 0, "neutral": 1, "negative": 2}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL_ENCODING.items()}

# Filler/no-content comment pattern (English and Malay variants).
# It is anchored with ^...$ and is matched against the lower-cased, stripped
# clean_text, so only comments consisting solely of a filler phrase are dropped.
FILLER_PATTERN = r"^\s*(-|\.|no\.?|none|n/a|na|x|xde|no comment|not have|nothing|tak ada|takde cadangan|tak ada cadangan|tiada cadangan|x ada|xde komen|no suggestion|none suggestion)\s*\.?$"

In [5]:
# 4) Helper functions
# A) Reproducibility
def set_seed(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Value applied to every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only seeded when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Trade some speed for run-to-run stability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# B) Column detection
def _pick_first_existing(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Pick the first existing column name in df."""
    lower_map = {c.lower(): c for c in df.columns}
    for cand in candidates:
        if cand.lower() in lower_map:
            return lower_map[cand.lower()]
    return None


def detect_text_column(df: pd.DataFrame) -> str:
    """Resolve the text column only (Stage A: raw -> clean export).

    Returns TEXT_COL unchanged when auto-detection is switched off; otherwise
    returns the first TEXT_COL_CANDIDATES entry present in df.

    Raises:
        ValueError: Auto-detection is on and no candidate column exists.
    """
    if not AUTO_DETECT_COLUMNS:
        return TEXT_COL

    detected = _pick_first_existing(df, TEXT_COL_CANDIDATES)
    if detected is not None:
        return detected

    message = (
        "Could not detect a TEXT column.\n"
        f"Available columns: {list(df.columns)}\n"
        f"Expected something like: {TEXT_COL_CANDIDATES}\n"
        "Fix: set AUTO_DETECT_COLUMNS=False and set TEXT_COL manually."
    )
    raise ValueError(message)


def detect_columns(df: pd.DataFrame, require_label: bool = True) -> Tuple[str, Optional[str], Optional[str]]:
    """Resolve the text, sentiment and numeric-label columns for Stage B training.

    Args:
        df: Labelled frame to inspect.
        require_label: When True, at least one of the sentiment/label columns
            must be present.

    Returns:
        (text_col, sent_col, lab_col). sent_col / lab_col are None when the
        corresponding column is not present in df.

    Raises:
        ValueError: No text column is found (auto mode), or require_label is
            True and neither a sentiment nor a label column is present.
    """
    if AUTO_DETECT_COLUMNS:
        text_col = _pick_first_existing(df, TEXT_COL_CANDIDATES)
        sent_col = _pick_first_existing(df, SENTIMENT_COL_CANDIDATES)
        lab_col = _pick_first_existing(df, LABEL_COL_CANDIDATES)

        if text_col is None:
            raise ValueError(
                "Could not detect a TEXT column.\n"
                f"Available columns: {list(df.columns)}\n"
                f"Expected something like: {TEXT_COL_CANDIDATES}\n"
                "Fix: set AUTO_DETECT_COLUMNS=False and set TEXT_COL manually."
            )
        fix_hint = "Fix: set AUTO_DETECT_COLUMNS=False and set SENTIMENT_COL or LABEL_COL manually."
    else:
        # Manual mode: TEXT_COL is returned as configured (callers that already
        # have a clean_text column ignore it). Sentiment/label names are only
        # reported when the column really exists, so a file that has just one
        # of the two does not fail later with a KeyError on the missing one.
        text_col = TEXT_COL
        sent_col = SENTIMENT_COL if SENTIMENT_COL in df.columns else None
        lab_col = LABEL_COL if LABEL_COL in df.columns else None
        fix_hint = "Fix: set SENTIMENT_COL or LABEL_COL to a column that exists in the file."

    if require_label and sent_col is None and lab_col is None:
        raise ValueError(
            "Could not detect SENTIMENT/LABEL columns.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Expected sentiment like: {SENTIMENT_COL_CANDIDATES} or label like: {LABEL_COL_CANDIDATES}\n"
            f"{fix_hint}"
        )

    return text_col, sent_col, lab_col


# C) Text cleaning
def fix_contractions(text: str) -> str:
    """Restore apostrophes in common contractions, matching whole words only.

    Every key of the module-level ``contractions`` mapping is replaced by its
    value, ignoring case, one entry after another.
    """
    fixed = text
    for misspelt, proper in contractions.items():
        whole_word = re.compile(r"\b" + re.escape(misspelt) + r"\b", re.IGNORECASE)
        fixed = whole_word.sub(proper, fixed)
    return fixed


def clean_text(text) -> str:
    """Clean and normalize ONE comment string.

    Steps: Unicode NFKC normalization, removal of leading ``=``/``+``/``-``
    characters, contraction repair, 'SHE course/subject' -> ``SHE_COURSE``
    normalization, removal of symbols outside the allowed punctuation set,
    and whitespace collapsing.

    Args:
        text: Raw cell value; anything that is not a ``str`` is treated as empty.

    Returns:
        The cleaned text, or "" when the input is not a string or contains no
        ASCII letter, digit or CJK ideograph after cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Normalize Unicode to reduce weird variations
    text = unicodedata.normalize("NFKC", text)

    # Remove leading Excel auto-insert characters (and any whitespace before
    # them). The substitution is a no-op when the text does not start that way.
    text = re.sub(r"^\s*[=+-]+", "", text)

    # Fix common contractions
    text = fix_contractions(text)

    # Normalize all variants of 'she course/subject' to a stable token
    # NOTE(review): the first pattern has no trailing word boundary, so it also
    # rewrites e.g. "she subjected"; left unchanged because it additionally
    # catches plurals ("SHE courses") — confirm which behavior is wanted.
    text = re.sub(r"(['\"‘’]?)\bshe\b['\"‘’]?\s+(course|subject)", "SHE_COURSE", text, flags=re.IGNORECASE)
    text = re.sub(r"\bSHE (course|subject)\b", "SHE_COURSE", text, flags=re.IGNORECASE)

    # Keep basic punctuation; remove odd symbols.
    # The hyphen sits at the END of the class so it is a literal '-'. It used
    # to sit between '"' and '%', which made the class a range (", #, $, %)
    # and silently stripped hyphens ("well-organized" -> "wellorganized").
    # '#' and '$' are listed explicitly so text that was kept before stays kept.
    text = re.sub(r"[^\w\s.,!?;:()\'\"#$%><=-]", "", text)

    # Normalize spacing AFTER symbol removal so that dropping a symbol between
    # two spaces cannot leave a double space behind.
    text = re.sub(r"\s+", " ", text)

    # Trim whitespace
    text = text.strip()

    # Drop rows with no real content (only symbols / punctuation).
    if not re.search(r"[A-Za-z0-9\u4e00-\u9fff]", text):
        return ""

    return text


def apply_dataset_cleaning(df: pd.DataFrame, text_col: str) -> pd.DataFrame:
    """Add a 'clean_text' column and drop blank / filler rows.

    Args:
        df: Input frame (not modified; a copy is returned).
        text_col: Name of the column holding the raw text.

    Returns:
        A copy with 'clean_text' added, blank rows removed, filler rows
        removed when REMOVE_FILLER_COMMENTS is set, and a fresh 0..n-1 index.
    """
    cleaned = df.copy()
    cleaned["clean_text"] = cleaned[text_col].apply(clean_text)

    # Keep only rows whose cleaned text is non-empty.
    has_content = cleaned["clean_text"].astype(str).str.strip() != ""
    cleaned = cleaned[has_content].copy()

    # Optionally discard comments that are nothing but a filler phrase.
    if REMOVE_FILLER_COMMENTS:
        is_filler = cleaned["clean_text"].str.lower().str.strip().str.match(FILLER_PATTERN)
        cleaned = cleaned[~is_filler].copy()

    return cleaned.reset_index(drop=True)


# D) Label validation
def validate_and_build_label(df: pd.DataFrame, sent_col: Optional[str], lab_col: Optional[str]) -> pd.DataFrame:
    """
    Ensure df has integer class ids as column 'label'.

    Supports:
    - numeric label column (takes precedence when lab_col is given) OR
    - sentiment string column (positive/neutral/negative) mapped using LABEL_ENCODING

    Args:
        df: Labelled frame (not modified; a copy is returned).
        sent_col: Name of the sentiment string column, or None.
        lab_col: Name of the numeric label column, or None.

    Returns:
        A copy of df with an integer 'label' column.

    Raises:
        ValueError: Neither column is given; a label is non-numeric or not a
            whole number; a sentiment string is unmapped; or a label id is
            outside the ids defined by LABEL_ENCODING.
    """
    if lab_col is None and sent_col is None:
        raise ValueError("Cannot build labels: neither a sentiment column nor a label column was provided.")

    out = df.copy()

    if lab_col is not None:
        numeric = pd.to_numeric(out[lab_col], errors="coerce")
        if numeric.isna().any():
            bad = out.loc[numeric.isna(), lab_col].astype(str).unique().tolist()
            raise ValueError(f"Found non-numeric labels in '{lab_col}': {bad[:20]} (showing up to 20)")

        # Reject values such as 1.5 instead of letting astype(int) silently
        # truncate them to a (wrong) valid class id.
        not_whole = (numeric % 1) != 0
        if not_whole.any():
            bad = out.loc[not_whole, lab_col].astype(str).unique().tolist()
            raise ValueError(f"Found non-integer labels in '{lab_col}': {bad[:20]} (showing up to 20)")

        out["label"] = numeric.astype(int)
    else:
        sent_norm = out[sent_col].astype(str).str.strip().str.lower()
        out["label"] = sent_norm.map(LABEL_ENCODING)
        if out["label"].isna().any():
            bad = sorted(sent_norm[out["label"].isna()].unique().tolist())
            raise ValueError(
                f"Found unmapped sentiment values in '{sent_col}': {bad}\n"
                f"Expected only: {list(LABEL_ENCODING.keys())}"
            )
        out["label"] = out["label"].astype(int)

    valid = set(LABEL_ENCODING.values())
    if not set(out["label"].unique()).issubset(valid):
        raise ValueError(f"Invalid labels found. Expected subset of {valid}. Got: {sorted(out['label'].unique())}")

    return out


# E) Dataset utilities (dedup, split, distributions)
def deduplicate(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicate rows according to DEDUP_STRATEGY.

    "none" returns df untouched; "by_text" and "by_text_label" drop duplicates
    on the corresponding columns and reset the index.

    Raises:
        ValueError: DEDUP_STRATEGY is not one of the supported values.
    """
    if DEDUP_STRATEGY == "none":
        return df

    key_columns = {
        "by_text": ["clean_text"],
        "by_text_label": ["clean_text", "label"],
    }
    if DEDUP_STRATEGY not in key_columns:
        raise ValueError(f"Unknown DEDUP_STRATEGY='{DEDUP_STRATEGY}'. Use: none/by_text/by_text_label")

    unique_rows = df.drop_duplicates(subset=key_columns[DEDUP_STRATEGY])
    return unique_rows.reset_index(drop=True)


def print_label_distribution(df: pd.DataFrame, title: str) -> None:
    """Print per-label counts and percentages so imbalance is easy to spot.

    Args:
        df: Frame with a 'label' column.
        title: Heading printed above the distribution.
    """
    print(f"\n=== {title} ===")
    counts = df["label"].value_counts().sort_index()
    n_rows = len(df)
    for label_id, count in counts.items():
        # Guard against division by zero on an empty frame.
        share = (count / n_rows) * 100 if n_rows else 0.0
        name = ID2LABEL.get(int(label_id), 'unknown')
        print(f"label={label_id} ({name}): {count} ({share:.2f}%)")
    print(f"Total rows: {n_rows}")


def split_train_val_test(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Produce disjoint, label-stratified train/val/test frames.

    Only the 'clean_text' and 'label' columns are carried into the splits.
    The split is done in two stages: train vs. (val+test), then val vs. test.

    Raises:
        ValueError: TRAIN_RATIO + VAL_RATIO + TEST_RATIO does not sum to 1.0.
    """
    ratio_sum = TRAIN_RATIO + VAL_RATIO + TEST_RATIO
    if not np.isclose(ratio_sum, 1.0):
        raise ValueError("TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1.0")

    frame = df[["clean_text", "label"]].reset_index(drop=True)
    shared_opts = {"random_state": SEED, "shuffle": True}

    # Stage 1: carve off the combined val+test portion.
    holdout_ratio = VAL_RATIO + TEST_RATIO
    train_part, holdout = train_test_split(
        frame,
        test_size=holdout_ratio,
        stratify=frame["label"],
        **shared_opts,
    )

    # Stage 2: divide the holdout between val and test.
    val_part, test_part = train_test_split(
        holdout,
        test_size=TEST_RATIO / holdout_ratio,
        stratify=holdout["label"],
        **shared_opts,
    )

    return train_part, val_part, test_part


# F) Output helpers (JSON + confusion matrix CSV)
def save_json(path: str, obj: dict) -> None:
    """Write obj to path as UTF-8 JSON with two-space indentation."""
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(obj, fp=handle, indent=2)


def save_confusion_matrix_csv(path: str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
    """Save the confusion matrix as CSV with readable row/column labels.

    Args:
        path: Destination CSV path.
        y_true: True class ids.
        y_pred: Predicted class ids.
    """
    # Pin the label set so the matrix always has one row/column per class in
    # ID2LABEL, even when a class appears in neither y_true nor y_pred
    # (otherwise the matrix shrinks and no longer fits the axis names below).
    label_ids = sorted(ID2LABEL)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    cm_df = pd.DataFrame(
        cm,
        index=[f"true_{ID2LABEL[i]}" for i in label_ids],
        columns=[f"pred_{ID2LABEL[i]}" for i in label_ids],
    )
    cm_df.to_csv(path, encoding="utf-8-sig", index=True)

## 5) Stage A: clean the raw feedback and export it for annotation

In [6]:
# Export cleaned Excel for annotation
def export_cleaned_for_annotation() -> None:
    """RAW_INPUT_XLSX -> create clean_text -> export CLEANED_EXPORT_XLSX.

    Reads the raw workbook, adds 'clean_text', drops blank/filler rows, adds
    an empty 'sentiment' column when none exists, and writes the result.

    Raises:
        FileNotFoundError: RAW_INPUT_XLSX does not exist.
        ValueError: No text column could be detected (from detect_text_column).
    """
    if not os.path.exists(RAW_INPUT_XLSX):
        raise FileNotFoundError(
            f"File not found: {RAW_INPUT_XLSX}\n"
            "Upload raw_mixed_feedback.xlsx to /content/ (Colab) first."
        )

    df = pd.read_excel(RAW_INPUT_XLSX)
    text_col = detect_text_column(df)

    # Reuse the shared cleaning helper so Stage A and Stage B apply exactly the
    # same blank/filler filtering (it also honours REMOVE_FILLER_COMMENTS).
    df_out = apply_dataset_cleaning(df, text_col=text_col)

    # Add an empty sentiment column if not present (makes annotation easier).
    # str() guards against non-string column headers read from Excel.
    if not any(str(c).lower() == "sentiment" for c in df_out.columns):
        df_out["sentiment"] = ""

    df_out.to_excel(CLEANED_EXPORT_XLSX, index=False)
    print(f"✅ Exported cleaned file for annotation: {CLEANED_EXPORT_XLSX} (rows={len(df_out)})")
    print("\nNext step:")
    print("1) Download clean_mixed_feedback.xlsx")
    print("2) Fill sentiment with: positive / neutral / negative")
    print("3) Save as labelled_mixed_feedback.xlsx and upload back to /content/")

# Run Stage A: reads RAW_INPUT_XLSX and writes CLEANED_EXPORT_XLSX.
export_cleaned_for_annotation()

✅ Exported cleaned file for annotation: /content/clean_mixed_feedback.xlsx (rows=9956)

Next step:
1) Download clean_mixed_feedback.xlsx
2) Fill sentiment with: positive / neutral / negative
3) Save as labelled_mixed_feedback.xlsx and upload back to /content/


## 6) Stage B: load labelled data, split, train, evaluate, and save the model

In [7]:
# Load labelled data
def load_validate_clean_labelled() -> pd.DataFrame:
    """Load the labelled workbook and ensure 'clean_text' + 'label' exist.

    An existing clean_text column is reused (only stripped) so the text stays
    identical to what was annotated; otherwise the detected raw text column is
    cleaned with apply_dataset_cleaning. Blank and filler rows are removed and
    the frame is de-duplicated according to DEDUP_STRATEGY.

    Returns:
        The validated, filtered, de-duplicated frame.

    Raises:
        FileNotFoundError: LABELLED_INPUT_XLSX does not exist.
        ValueError: Columns cannot be detected or labels are invalid.
    """
    if not os.path.exists(LABELLED_INPUT_XLSX):
        raise FileNotFoundError(
            f"File not found: {LABELLED_INPUT_XLSX}\n"
            "Upload labelled_mixed_feedback.xlsx to /content/ first."
        )

    df = pd.read_excel(LABELLED_INPUT_XLSX)

    # If clean_text exists, reuse it to avoid mismatch vs your annotation.
    # str() guards against non-string column headers read from Excel.
    clean_col = next((c for c in df.columns if str(c).lower() == "clean_text"), None)
    has_clean_text = clean_col is not None

    detected_text_col, sent_col, lab_col = detect_columns(df, require_label=True)
    text_col = clean_col if has_clean_text else detected_text_col

    # Build numeric label column
    df = validate_and_build_label(df, sent_col=sent_col, lab_col=lab_col)

    # Ensure clean_text exists + apply filtering
    if not has_clean_text:
        df = apply_dataset_cleaning(df, text_col=text_col)
    else:
        df = df.copy()
        # Empty Excel cells are read as NaN; fill them first, otherwise
        # astype(str) turns them into the literal text "nan", which would pass
        # the blank filter and end up in the training data.
        df["clean_text"] = df[text_col].fillna("").astype(str).str.strip()
        df = df[df["clean_text"] != ""].copy()
        if REMOVE_FILLER_COMMENTS:
            df = df[~df["clean_text"].str.lower().str.strip().str.match(FILLER_PATTERN)].copy()
        df = df.reset_index(drop=True)

    # De-duplication (training-only convenience)
    df = deduplicate(df)

    print(f"✅ Loaded labelled dataset: rows={len(df)}")
    # Note: label ids follow LABEL_ENCODING = {"positive": 0, "neutral": 1, "negative": 2}
    print_label_distribution(df, "Full labelled dataset distribution")
    return df

# Load the labelled dataset once; later cells split and train from df_labelled.
df_labelled = load_validate_clean_labelled()
# Preview the first rows as the cell output.
df_labelled.head()

✅ Loaded labelled dataset: rows=2000

=== Full labelled dataset distribution ===
label=0 (positive): 811 (40.55%)
label=1 (neutral): 750 (37.50%)
label=2 (negative): 439 (21.95%)
Total rows: 2000


Unnamed: 0,course_code,course_name,occ,reviews,clean_text,sentiment,label
0,ACC 610,Public Accounting Practice,2,Saya rasa kursus ini berguna dari segi strateg...,Saya rasa kursus ini berguna dari segi strateg...,neutral,1
1,ACC 611,External Reporting,2,"This course is like AFM291, 391, and 491 on st...","This course is like AFM291, 391, and 491 on st...",neutral,1
2,ACC 623,Business Technology Law,2,Saya gagal melihat bagaimana kursus ini sesuai...,Saya gagal melihat bagaimana kursus ini sesuai...,negative,2
3,ACC 650,Assurance and Governance,2,"In our year, the lectures were mostly designed...","In our year, the lectures were mostly designed...",positive,0
4,ACC 685,Performance Management,1,"Jika anda fikir AFM433 adalah mengarut (bs), m...","Jika anda fikir AFM433 adalah mengarut (bs), m...",negative,2



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [10]:
# Split the labelled data into train/val/test and persist each split as CSV
train_df, val_df, test_df = split_train_val_test(df_labelled)

# Save to CSV (so the splits can be reused without re-reading Excel)
for _split_frame, _split_path in ((train_df, OUT_TRAIN), (val_df, OUT_VAL), (test_df, OUT_TEST)):
    _split_frame.to_csv(_split_path, index=False, encoding="utf-8-sig")

print("✅ Saved:")
print(f" - {OUT_TRAIN} (rows={len(train_df)})")
print(f" - {OUT_VAL}   (rows={len(val_df)})")
print(f" - {OUT_TEST}  (rows={len(test_df)})")

# Show the class balance of every split
for _split_title, _split_frame in (
    ("Train distribution", train_df),
    ("Val distribution", val_df),
    ("Test distribution", test_df),
):
    print_label_distribution(_split_frame, _split_title)

✅ Saved:
 - /content/train_data.csv (rows=1600)
 - /content/val_data.csv   (rows=200)
 - /content/test_data.csv  (rows=200)

=== Train distribution ===
label=0 (positive): 649 (40.56%)
label=1 (neutral): 600 (37.50%)
label=2 (negative): 351 (21.94%)
Total rows: 1600

=== Val distribution ===
label=0 (positive): 81 (40.50%)
label=1 (neutral): 75 (37.50%)
label=2 (negative): 44 (22.00%)
Total rows: 200

=== Test distribution ===
label=0 (positive): 81 (40.50%)
label=1 (neutral): 75 (37.50%)
label=2 (negative): 44 (22.00%)
Total rows: 200


In [11]:
# Build Dataset + Train (XLM-R Trainer)
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one comment per item.

    Items are returned unpadded; batching/padding is left to the data collator
    (DataCollatorWithPadding in this notebook).
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 256):
        """Store texts/labels from df's 'clean_text' and 'label' columns."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df["clean_text"].astype(str).tolist()
        self.labels = df["label"].astype(int).tolist()

    def __len__(self) -> int:
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize sample idx (truncated to max_length) and attach its label."""
        sample = self.tokenizer(self.texts[idx], truncation=True, max_length=self.max_length)
        sample["labels"] = int(self.labels[idx])
        return sample


def train_model(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Trainer:
    """Fine-tune MODEL_NAME on train_df, validating on val_df.

    Evaluation and checkpointing happen every EVAL_STEPS / SAVE_STEPS steps;
    the checkpoint with the best macro F1 on val_df is restored at the end,
    and training stops early after EARLY_STOP_PATIENCE evaluations without
    improvement.

    Args:
        train_df: Frame with 'clean_text' and 'label' columns used for training.
        val_df: Frame with the same columns used for validation.

    Returns:
        The Trainer after training has finished.
    """
    # Standard-library import kept local so this cell needs no extra import cell.
    import inspect

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABEL_ENCODING))

    # Store label mappings in config (helps safe save/load later)
    model.config.label2id = LABEL_ENCODING
    model.config.id2label = ID2LABEL

    train_ds = SentimentDataset(train_df, tokenizer, max_length=MAX_LEN)
    val_ds = SentimentDataset(val_df, tokenizer, max_length=MAX_LEN)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")

    def compute_metrics(eval_pred):
        """Compute accuracy + weighted F1 + macro F1."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
        f1_weighted = f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
        f1_macro = f1.compute(predictions=preds, references=labels, average="macro")["f1"]
        return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro}

    # Safe auto fp16: only when requested AND a CUDA device is present.
    use_fp16 = bool(ENABLE_FP16 and torch.cuda.is_available())

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,

        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=EVAL_STEPS,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,

        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,

        save_total_limit=2,
        seed=SEED,

        # No external trackers
        report_to="none",

        fp16=use_fp16,
    )

    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)],
    )
    # Trainer's `tokenizer` argument is deprecated in favour of
    # `processing_class`; use the new name when the installed version has it
    # and fall back to the old one otherwise.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_arg] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    print("CUDA available:", torch.cuda.is_available())
    print("fp16 enabled:", use_fp16)
    print("\n🚀 Training started...")
    trainer.train()
    print("✅ Training finished.")
    return trainer

# Seed all RNGs right before model creation/training, then run the fine-tuning.
set_seed(SEED)
trainer = train_model(train_df, val_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


CUDA available: True
fp16 enabled: True

🚀 Training started...


Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
100,1.0651,0.917864,0.625,0.555875,0.472585
200,0.7566,0.625548,0.73,0.708514,0.716179
300,0.5571,0.63326,0.745,0.733632,0.730144
400,0.6199,0.492926,0.8,0.794104,0.793784
500,0.4025,0.490712,0.835,0.831549,0.828562
600,0.3904,0.534477,0.82,0.815115,0.814497


✅ Training finished.


In [12]:
# Evaluate (VAL + TEST) and save reports
def evaluate_and_save(trainer: Trainer, df_eval: pd.DataFrame, split_name: str,
                      out_json: str, out_cm_csv: str, out_report_txt: str) -> None:
    """Evaluate on a split and save JSON + confusion matrix + classification report.

    Args:
        trainer: Trained Trainer.
        df_eval: Frame with 'clean_text' and 'label' columns.
        split_name: Label used in the printed heading (e.g. "VAL").
        out_json: Path for the metrics JSON.
        out_cm_csv: Path for the confusion-matrix CSV.
        out_report_txt: Path for the text classification report.
    """
    # Trainer.tokenizer is deprecated in favour of Trainer.processing_class;
    # prefer the new attribute and fall back for older versions.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer
    eval_ds = SentimentDataset(df_eval, tokenizer, max_length=MAX_LEN)

    eval_results = trainer.evaluate(eval_dataset=eval_ds)
    print(f"\n📊 {split_name} evaluation results:")
    for k, v in eval_results.items():
        try:
            print(f"{k}: {float(v):.4f}")
        except (TypeError, ValueError):
            # Non-numeric metric value: print it as-is.
            print(f"{k}: {v}")

    pred = trainer.predict(eval_ds)
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    save_json(out_json, {k: (float(v) if isinstance(v, (int, float, np.floating)) else v) for k, v in eval_results.items()})
    save_confusion_matrix_csv(out_cm_csv, y_true, y_pred)

    # Pin the label set so the report still works (and keeps its three rows)
    # when a class is missing from both y_true and y_pred; without `labels`,
    # the number of target names would not match the classes found.
    label_ids = sorted(ID2LABEL)
    report_txt = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=[ID2LABEL[i] for i in label_ids],
    )
    with open(out_report_txt, "w", encoding="utf-8") as f:
        f.write(report_txt)

    print(f"✅ Saved: {out_json}")
    print(f"✅ Saved: {out_cm_csv}")
    print(f"✅ Saved: {out_report_txt}")


def save_test_predictions(trainer: Trainer, test_df: pd.DataFrame) -> None:
    """Export test predictions CSV (true label + predicted label + confidence).

    Writes TEST_PRED_CSV containing the columns of test_df plus
    'true_sentiment', 'pred_label', 'pred_sentiment' and 'pred_confidence'
    (the softmax probability of the predicted class).

    Args:
        trainer: Trained Trainer.
        test_df: Frame with 'clean_text' and 'label' columns.
    """
    # Trainer.tokenizer is deprecated in favour of Trainer.processing_class;
    # prefer the new attribute and fall back for older versions.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer
    test_ds = SentimentDataset(test_df, tokenizer, max_length=MAX_LEN)

    pred = trainer.predict(test_ds)
    logits = pred.predictions
    y_true = pred.label_ids
    y_pred = np.argmax(logits, axis=1)

    # Convert logits to probabilities; the row maximum is the model's confidence.
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    max_prob = probs.max(axis=1)

    out = test_df.copy()
    out["true_sentiment"] = [ID2LABEL[int(i)] for i in y_true]
    out["pred_label"] = y_pred.astype(int)
    out["pred_sentiment"] = [ID2LABEL[int(i)] for i in y_pred]
    out["pred_confidence"] = max_prob

    out.to_csv(TEST_PRED_CSV, index=False, encoding="utf-8-sig")
    print(f"✅ Saved test predictions: {TEST_PRED_CSV} (rows={len(out)})")


# Evaluate on the validation and test splits; each call writes a metrics JSON,
# a confusion-matrix CSV and a classification-report text file.
evaluate_and_save(trainer, val_df,  "VAL",  EVAL_VAL_JSON,  CM_VAL_CSV,  REPORT_VAL_TXT)
evaluate_and_save(trainer, test_df, "TEST", EVAL_TEST_JSON, CM_TEST_CSV, REPORT_TEST_TXT)

# Save detailed per-row test predictions to TEST_PRED_CSV
save_test_predictions(trainer, test_df)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



📊 VAL evaluation results:
eval_loss: 0.4907
eval_accuracy: 0.8350
eval_f1: 0.8315
eval_f1_macro: 0.8286
eval_runtime: 0.6961
eval_samples_per_second: 287.3010
eval_steps_per_second: 35.9130
epoch: 3.0000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


✅ Saved: /content/eval_results_val.json
✅ Saved: /content/confusion_matrix_val.csv
✅ Saved: /content/classification_report_val.txt



📊 TEST evaluation results:
eval_loss: 0.5903
eval_accuracy: 0.7850
eval_f1: 0.7847
eval_f1_macro: 0.7698
eval_runtime: 0.6043
eval_samples_per_second: 330.9420
eval_steps_per_second: 41.3680
epoch: 3.0000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


✅ Saved: /content/eval_results_test.json
✅ Saved: /content/confusion_matrix_test.csv
✅ Saved: /content/classification_report_test.txt


✅ Saved test predictions: /content/test_predictions.csv (rows=200)


In [13]:
# Save model + zip
def save_model_and_zip(trainer: Trainer) -> None:
    """Save model/tokenizer to SAVE_DIR, zip it, and download if running in Colab.

    Args:
        trainer: Trained Trainer whose model and tokenizer are saved.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Trainer.tokenizer is deprecated in favour of Trainer.processing_class;
    # prefer the new attribute and fall back for older versions.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer
    tokenizer.save_pretrained(SAVE_DIR)
    trainer.save_model(SAVE_DIR)
    print(f"✅ Model saved to: {SAVE_DIR}")

    # make_archive appends ".zip" itself, so strip only a trailing ".zip"
    # extension (str.replace would also remove ".zip" elsewhere in the path).
    root, ext = os.path.splitext(ZIP_OUT_PATH)
    base_no_ext = root if ext.lower() == ".zip" else ZIP_OUT_PATH
    # Use the path make_archive reports so the message and the download always
    # refer to the file that was actually written.
    zip_path = shutil.make_archive(base_no_ext, "zip", SAVE_DIR)
    print(f"✅ Model zipped to: {zip_path}")

    # Colab download helper (deliberately best-effort: any failure, including
    # the import failing outside Colab, only prints a note).
    try:
        from google.colab import files
        files.download(zip_path)
    except Exception:
        print("Note: files.download works only in Google Colab (safe to ignore).")

# Persist the fine-tuned model to SAVE_DIR and create the zip archive.
save_model_and_zip(trainer)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


✅ Model saved to: /content/models/xlmr-base
✅ Model zipped to: /content/xlmr-base.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>