# Starter Notebook

Install and import required libraries

In [None]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
# !pip install nvidia-ml-py3

In [None]:
import os
import pandas as pd
import torch
import transformers
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

In [None]:
import torch
# Report whether PyTorch can see a CUDA device and which device type that implies.
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

## Load Tokenizer and Preprocess Data

In [None]:
# Checkpoint name reused by the tokenizer and model `from_pretrained` calls below.
base_model = 'roberta-base'

# Only the 'train' split of AG News is loaded; the evaluation set is carved out of it later.
dataset = load_dataset('ag_news', split='train')

import pandas as pd
# Quick look at the whitespace-separated word count per text.
df = pd.DataFrame(dataset[:])
df["length"] = df["text"].apply(lambda x: len(x.split()))
df["length"].describe()

In [None]:
# Hold out 640 examples for evaluation with a fixed seed; the remainder is the training set.
split_raw = dataset.train_test_split(test_size=640, seed=42)
train_raw = split_raw["train"]
eval_raw = split_raw["test"]

In [None]:
# def is_valid(example):
#     text = example["text"]
#     # filter text by length
#     if len(text.split()) < 5 or len(text.split()) > 256:
#         return False
#     # filter all scrambled text
#     if text.strip() == "" or sum(c.isalnum() for c in text) / len(text) < 0.3:
#         return False
#     return True

# train_filtered = train_raw.filter(is_valid)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize a batch of examples for use with `Dataset.map(..., batched=True)`.

    Args:
        examples: Mapping with a 'text' entry holding the raw strings of the batch.

    Returns:
        The tokenizer output for the batch (truncated, unpadded).
    """
    # Padding is deliberately left out: padding inside a batched `map` pads every example
    # to the longest text of that map batch and stores the padding in the dataset.
    # The notebook's DataCollatorWithPadding pads each mini-batch when it is collated.
    return tokenizer(examples['text'], truncation=True)

# Earlier approach (tokenize the full dataset, split afterwards), kept for reference:
# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Tokenize the train split, drop the raw text and rename the target column to "labels".
tokenized_train = train_raw.map(preprocess, batched=True, remove_columns=["text"])
tokenized_train = tokenized_train.rename_column("label", "labels")

# Same processing for the held-out evaluation split.
tokenized_eval = eval_raw.map(preprocess, batched=True, remove_columns=["text"])
tokenized_eval = tokenized_eval.rename_column("label", "labels")

# Final datasets
train_dataset = tokenized_train
eval_dataset = tokenized_eval

In [None]:
# Extract the number of classes and their names from the dataset's label feature
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping (integer class id -> class name).
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


## Load Pre-trained Model
Set up the config for the pretrained model and download it from Hugging Face.

In [None]:
# Load the pretrained checkpoint with a sequence-classification head and attach the
# id -> label-name mapping built above.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression: displays the module structure as the cell output.
model

## Anything from here on can be modified

In [None]:
# # Split the original training set
# split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
# train_dataset = split_datasets['train']
# eval_dataset = split_datasets['test']

## Setup LoRA Config
Set up the PEFT config and wrap the base model as a PEFT model for fine-tuning.

In [None]:
# PEFT Config
# Previous, smaller configuration (query projection only), kept for reference:
# peft_config = LoraConfig(
#     r=2,
#     lora_alpha=4,
#     lora_dropout=0.05,
#     bias = 'none',
#     target_modules = ['query'],
#     task_type="SEQ_CLS",
# )

# Active configuration: rank-7 LoRA with alpha 16 and adapter dropout 0.4, no bias training,
# for a sequence-classification task.
# NOTE(review): "dense" is a generic module name; confirm which layers it matches by
# inspecting the trainable-parameter listing printed further down.
peft_config = LoraConfig(
    r=7,
    lora_alpha=16,
    lora_dropout=0.4,
    bias="none",
    target_modules=["query", "key", "value", "dense"],
    task_type="SEQ_CLS"
)

In [None]:
# Wrap the base classifier according to peft_config.
peft_model = get_peft_model(model, peft_config)
# Bare expression: displays the wrapped module structure as the cell output.
peft_model

In [None]:
# List the name of every parameter that has requires_grad set.
print("Trainable parameters:")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(name)

In [None]:
# Summary of trainable parameters as reported by the PEFT wrapper.
print('PEFT Model')
peft_model.print_trainable_parameters()

## Training Setup

In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    # The remaining scores share the same call shape and all use macro averaging.
    for key, scorer in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
        metrics[key] = scorer(y_true, y_pred, average='macro')
    return metrics

In [None]:
import sys
# Show which Python interpreter this kernel is running.
print(sys.executable)

In [None]:
from transformers import EarlyStoppingCallback
# Setup Training args
# Directory handed to TrainingArguments; also reused later for the inference CSV.
output_dir = "results"
# Previous configuration, kept for reference:
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     report_to=None,
#     eval_strategy='steps',
#     logging_steps=100,
#     learning_rate=5e-6,
#     num_train_epochs=1,
#     max_steps=1200,
#     use_cpu=False,
#     dataloader_num_workers=4,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     optim="sgd",
#     gradient_checkpointing=False,
#     gradient_checkpointing_kwargs={'use_reentrant':True}
# )

# Active configuration: step-based evaluation, AdamW, gradient checkpointing, logging to wandb.
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to="wandb",
    eval_strategy='steps',
    # NOTE(review): both max_steps and num_train_epochs are set. The TrainingArguments
    # documentation states that a positive max_steps takes precedence - confirm that
    # 4400 steps is the intended training budget.
    max_steps=4400,
    logging_steps=100,
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # NOTE(review): get_trainer attaches an EarlyStoppingCallback, whose documentation ties
    # it to load_best_model_at_end - confirm early stopping behaves as intended with False.
    load_best_model_at_end=False, 
    metric_for_best_model="accuracy",
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': True}
)

def get_trainer(model, train_data=None, eval_data=None):
    """Build a `Trainer` for `model` using the notebook-level training configuration.

    Args:
        model: The model to train.
        train_data: Optional training dataset. Defaults to the notebook-level
            `train_dataset`, so `get_trainer(model)` behaves as before.
        eval_data: Optional evaluation dataset. Defaults to the notebook-level
            `eval_dataset`.

    Returns:
        A `Trainer` wired with `training_args`, `compute_metrics`, `data_collator`
        and a fresh `EarlyStoppingCallback` (patience 5).
    """
    # Accepting the datasets here avoids patching trainer attributes after construction
    # when training on an augmented set.
    return Trainer(
        model=model,
        args=training_args,
        # A new callback instance per trainer, so patience counters are never shared.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if train_data is None else train_data,
        eval_dataset=eval_dataset if eval_data is None else eval_data,
        data_collator=data_collator,
    )

### Start Training

In [None]:
import torch
# Print the PyTorch, CUDA and cuDNN versions of this environment.
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())

In [None]:
# Build a trainer around the LoRA-wrapped model and run fine-tuning.
peft_lora_finetuning_trainer = get_trainer(peft_model)

# The value returned by train() is kept in `result` for later inspection.
result = peft_lora_finetuning_trainer.train()

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [None]:
def classify(model, tokenizer, text):
    """Classify a single text, print the result and return the predicted label name.

    Args:
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer used to encode `text` into PyTorch tensors.
        text: The input string.

    Returns:
        The label name looked up in the notebook-level `id2label` mapping.
    """
    # Use the device the model's weights actually live on, so inputs and model can
    # never end up on different devices (e.g. CUDA available but model still on CPU).
    device = next(model.parameters()).device
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)

    # Inference has to be deterministic: disable dropout via eval mode and skip autograd
    # bookkeeping. The previous train/eval mode is restored afterwards so calling this
    # between training runs does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [None]:
# Classify a sample news snippet with the fine-tuned model.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")

In [None]:
# In a normal Python string literal "\b" is the backspace escape, so "dwindling\band"
# silently put a control character into the model input. The backslash is escaped here
# to keep it literal (presumably how the raw news text reads - verify against the dataset).
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
# from torch.utils.data import DataLoader
# import evaluate
# from tqdm import tqdm

# def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
#     """
#     Evaluate a PEFT model on a dataset.

#     Args:
#         inference_model: The model to evaluate.
#         dataset: The dataset (Hugging Face Dataset) to run inference on.
#         labelled (bool): If True, the dataset includes labels and metrics will be computed.
#                          If False, only predictions will be returned.
#         batch_size (int): Batch size for inference.
#         data_collator: Function to collate batches. If None, the default collate_fn is used.

#     Returns:
#         If labelled is True, returns a tuple (metrics, predictions)
#         If labelled is False, returns the predictions.
#     """
#     # Create the DataLoader
#     eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     inference_model.to(device)
#     inference_model.eval()

#     all_predictions = []
#     if labelled:
#         metric = evaluate.load('accuracy')

#     # Loop over the DataLoader
#     for batch in tqdm(eval_dataloader):
#         # Move each tensor in the batch to the device
#         batch = {k: v.to(device) for k, v in batch.items()}
#         with torch.no_grad():
#             outputs = inference_model(**batch)
#         predictions = outputs.logits.argmax(dim=-1)
#         all_predictions.append(predictions.cpu())

#         if labelled:
#             # Expecting that labels are provided under the "labels" key.
#             references = batch["labels"]
#             metric.add_batch(
#                 predictions=predictions.cpu().numpy(),
#                 references=references.cpu().numpy()
#             )

#     # Concatenate predictions from all batches
#     all_predictions = torch.cat(all_predictions, dim=0)

#     if labelled:
#         eval_metric = metric.compute()
#         print("Evaluation Metric:", eval_metric)
#         return eval_metric, all_predictions
#     else:
#         return all_predictions

from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm
import torch
import pandas as pd

def evaluate_model(inference_model, dataset, tokenizer, labelled=True, batch_size=8, data_collator=None):
    """Run batched inference over `dataset` and optionally score it.

    Args:
        inference_model: Model to evaluate; it is moved to the selected device and put
            into eval mode.
        dataset: Dataset fed to a `DataLoader`; batches must contain 'input_ids' and,
            when `labelled` is True, 'labels'.
        tokenizer: Used to decode each batch's 'input_ids' back into text.
        labelled: If True, compute accuracy and return a per-example DataFrame.
        batch_size: Batch size for the `DataLoader`.
        data_collator: Collate function passed to the `DataLoader`.

    Returns:
        `(eval_metric, df)` when `labelled` is True, where `df` has the columns
        'text', 'true_label' and 'predicted_label'; otherwise the list of predicted
        class ids.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    inference_model.to(device)
    inference_model.eval()

    texts, gold, predicted = [], [], []
    metric = evaluate.load('accuracy') if labelled else None

    for batch in tqdm(loader):
        # Recover readable text from the token ids for later error analysis.
        texts.extend(tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True))

        on_device = {key: tensor.to(device) for key, tensor in batch.items()}
        with torch.no_grad():
            logits = inference_model(**on_device).logits

        batch_preds = logits.argmax(dim=-1).cpu()
        predicted.extend(batch_preds.tolist())

        if not labelled:
            continue

        batch_refs = on_device["labels"].cpu().tolist()
        gold.extend(batch_refs)
        metric.add_batch(predictions=batch_preds.numpy(), references=batch_refs)

    if not labelled:
        return predicted

    eval_metric = metric.compute()
    print("Evaluation Metric:", eval_metric)

    # Build a DataFrame for error analysis
    frame = pd.DataFrame({
        "text": texts,
        "true_label": gold,
        "predicted_label": predicted
    })
    return eval_metric, frame

In [None]:
# Evaluate on the held-out set; `error_df` holds one row per example with the decoded
# text, the true label and the predicted label.
metric, error_df = evaluate_model(peft_model, eval_dataset, tokenizer, labelled=True, batch_size=8, data_collator=data_collator)

from collections import defaultdict

# Multiple inference rounds + record misclassified samples
def run_multiple_inferences(model, dataset, tokenizer, rounds=5, seed_start=42, batch_size=8):
    """Evaluate `model` several times and tally how often each text is misclassified.

    Each round seeds torch with `seed_start + round_index` and calls `evaluate_model`
    with the notebook-level `data_collator`.

    NOTE(review): `evaluate_model` switches the model to eval mode, so the rounds may
    well produce identical predictions - confirm that repeated rounds add information.

    Args:
        model: Model to evaluate.
        dataset: Labelled dataset passed to `evaluate_model`.
        tokenizer: Tokenizer passed to `evaluate_model`.
        rounds: Number of evaluation rounds.
        seed_start: Seed of the first round; later rounds increment it by one.
        batch_size: Batch size passed to `evaluate_model`.

    Returns:
        `(error_text_counts, error_records)`: a defaultdict mapping text to its number
        of misclassifications, and a list with one DataFrame of errors per round.
    """
    error_text_counts = defaultdict(int)
    error_records = []

    for round_no, seed in enumerate(range(seed_start, seed_start + rounds), start=1):
        print(f"\n== Inference Round {round_no} ==")
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        _, results = evaluate_model(
            model, dataset, tokenizer,
            labelled=True, batch_size=batch_size, data_collator=data_collator,
        )
        mismatch = results['true_label'] != results['predicted_label']
        misclassified = results[mismatch]
        error_records.append(misclassified)

        for wrong_text in misclassified["text"]:
            error_text_counts[wrong_text] += 1

    return error_text_counts, error_records


# Analyze overlapping misclassified samples
def get_common_errors(error_text_counts, threshold=2):
    """Return the texts misclassified at least `threshold` times, most frequent first.

    Args:
        error_text_counts: Mapping of text -> number of misclassifications.
        threshold: Minimum count (inclusive) for a text to be kept.

    Returns:
        DataFrame with the columns 'text' and 'error_count', sorted by 'error_count'
        in descending order.
    """
    frequent = []
    for text, count in error_text_counts.items():
        if count < threshold:
            continue
        frequent.append((text, count))

    table = pd.DataFrame(frequent, columns=["text", "error_count"])
    return table.sort_values(by="error_count", ascending=False)


# === Run multi-round evaluation and find frequent error samples ===
error_text_counts, error_records = run_multiple_inferences(peft_model, eval_dataset, tokenizer, rounds=5)

# Samples misclassified at least 3 times (the threshold is inclusive)
common_errors_df = get_common_errors(error_text_counts, threshold=3)

# Display top 10 frequently misclassified texts
print("\nFrequently misclassified texts (error ≥ 3 times):")
display(common_errors_df.head(10))

# Show misclassified samples (predicted ≠ actual label) from the single evaluation
# pass stored in `error_df`
wrong_predictions = error_df[error_df['true_label'] != error_df['predicted_label']]
wrong_predictions


In [None]:
def analyze_text_properties(common_df, reference_df):
    """Print length statistics and a confusion table for frequently misclassified texts.

    Args:
        common_df: DataFrame with a 'text' column identifying the texts of interest.
        reference_df: DataFrame with 'text', 'true_label' and 'predicted_label' columns.

    Returns:
        The inner join of `reference_df` and `common_df` on 'text', with an added
        'length' column holding the whitespace-separated word count.
    """
    def _word_count(text):
        return len(text.split())

    # Inner join keeps only the reference rows whose text appears in common_df.
    merged = pd.merge(reference_df, common_df, on="text", how="inner")
    merged["length"] = merged["text"].apply(_word_count)

    print("Length distribution of misclassified texts:")
    print(merged["length"].describe())

    confusion = pd.crosstab(
        merged["true_label"],
        merged["predicted_label"],
        rownames=["True"],
        colnames=["Pred"],
    )
    print("Confusion matrix of misclassified samples:")
    print(confusion)

    return merged

# Join the frequent-error texts with their true/predicted labels and print summary statistics.
merged_common_errors = analyze_text_properties(common_errors_df, error_df)

In [None]:
# Label-specific hint keywords (adjust according to your label order).
# Keys are integer label ids; insert_label_hint samples from the list stored under a
# text's label id.
label_hint = {
    0: ["government", "UN", "diplomacy", "conflict", "international"],
    1: ["tournament", "score", "championship", "athlete", "league"],
    2: ["market", "stock", "economy", "investment", "corporation"],
    3: ["technology", "cybersecurity", "software", "gadget", "digital"]
}

In [None]:
import random
from nltk.corpus import wordnet

def synonym_replace(text):
    """Replace one word of `text` with a WordNet lemma from that word's first synset.

    Words are tried in random order. For the first word that has a synset with at least
    one lemma different from the word itself (case-insensitive), the first occurrence of
    that word in the text is replaced and the search stops. If no word qualifies, the
    text is returned re-joined with single spaces.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The whitespace-joined text with at most one word replaced.
    """
    tokens = text.split()
    candidates = list(tokens)
    random.shuffle(candidates)

    for candidate in candidates:
        synsets = wordnet.synsets(candidate)
        if not synsets:
            continue

        # Lemma names use underscores for multi-word expressions; show them as spaces.
        lemma_names = (lemma.name().replace('_', ' ') for lemma in synsets[0].lemmas())
        alternatives = [name for name in lemma_names if name.lower() != candidate.lower()]
        if not alternatives:
            continue

        tokens[tokens.index(candidate)] = random.choice(alternatives)
        break

    return " ".join(tokens)

In [None]:
def insert_label_hint(text, label_id):
    """Append a sentence naming up to two random hint keywords for `label_id`.

    Args:
        text: The text to extend.
        label_id: Key into the notebook-level `label_hint` mapping.

    Returns:
        `text` followed by the hint sentence, or `text` unchanged when no hints are
        registered for `label_id`.
    """
    hints = label_hint.get(label_id, [])
    if hints:
        chosen = random.sample(hints, min(2, len(hints)))
        return text + " This article involves " + ", ".join(chosen) + "."
    return text

In [None]:
import nltk
# Fetch the WordNet corpus that synonym_replace queries through nltk.corpus.wordnet.
nltk.download('wordnet')

In [None]:
# Example: Use the first frequently misclassified sample
sample_text = common_errors_df.iloc[0]['text']
# Look up that sample's true label in the merged error table (first match).
true_label = merged_common_errors[merged_common_errors["text"] == sample_text]["true_label"].values[0]

print("Original text：")
print(sample_text)

# Apply synonym replacement first, then append the label hint sentence.
augmented = insert_label_hint(synonym_replace(sample_text), true_label)

print("Augmented text：")
print(augmented)

In [None]:
# Batch augment frequently misclassified texts and inject them back into the training set 
from datasets import Dataset, concatenate_datasets, Value

# Define batch augmentation function
def batch_augment_from_common_errors(common_df, reference_df, method=synonym_replace, hint_inserter=insert_label_hint):
    """Create one augmented example per frequently misclassified text.

    Args:
        common_df: DataFrame with a 'text' column naming the texts to augment.
        reference_df: DataFrame with 'text' and 'true_label' columns.
        method: Callable applied to each text first (default: synonym replacement).
        hint_inserter: Callable `(text, label)` applied to the result of `method`.

    Returns:
        A `Dataset` with the columns 'text' (augmented text) and 'label' (true label).
    """
    merged = reference_df.merge(common_df, on="text", how="inner")

    augmented = {"text": [], "label": []}
    for _, row in merged.iterrows():
        label = row["true_label"]
        rewritten = hint_inserter(method(row["text"]), label)
        augmented["text"].append(rewritten)
        augmented["label"].append(label)

    return Dataset.from_pandas(pd.DataFrame(augmented))

# Generate augmented sample dataset
# NOTE(review): `error_df` was produced from `eval_dataset`, so the augmented samples are
# derived from evaluation texts. They are added to the training data below and the model
# is then scored on the same `eval_dataset` - the reported numbers are likely optimistic.
augmented_dataset = batch_augment_from_common_errors(common_errors_df, error_df)

# Tokenize the augmented samples
tokenized_augmented = augmented_dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_augmented = tokenized_augmented.rename_column("label", "labels")

# Concatenate augmented data with original training set
# (the original "labels" column is cast to int64 before concatenating)
train_dataset_casted = train_dataset.cast_column("labels", Value("int64"))
train_dataset_with_aug = concatenate_datasets([train_dataset_casted, tokenized_augmented])

# Re-train LoRA model: training continues from the already fine-tuned `peft_model`,
# with the trainer's dataset swapped for the augmented one.
peft_aug_trainer = get_trainer(peft_model)
peft_aug_trainer.train_dataset = train_dataset_with_aug

# Start re-training
result_aug = peft_aug_trainer.train()

# Evaluate the new model
metric_aug, error_df_aug = evaluate_model(peft_model, eval_dataset, tokenizer, labelled=True, batch_size=8, data_collator=data_collator)

# Display number of misclassified samples in the new model
print(f"Number of misclassified samples (augmented model): {len(error_df_aug[error_df_aug['true_label'] != error_df_aug['predicted_label']])}")


In [None]:
# !pip install --upgrade pip setuptools wheel
# !pip install sentencepiece --prefer-binary

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch
import random
import pandas as pd
from datasets import Dataset, concatenate_datasets, Value
from tqdm import tqdm

# Load translation models (English -> German and German -> English) with their tokenizers
en2de_model_name = "Helsinki-NLP/opus-mt-en-de"
de2en_model_name = "Helsinki-NLP/opus-mt-de-en"
en2de_tokenizer = MarianTokenizer.from_pretrained(en2de_model_name)
en2de_model = MarianMTModel.from_pretrained(en2de_model_name)
de2en_tokenizer = MarianTokenizer.from_pretrained(de2en_model_name)
de2en_model = MarianMTModel.from_pretrained(de2en_model_name)

# Move to GPU if available; `device` is also passed to backtranslate_batch below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
en2de_model.to(device)
de2en_model.to(device)

# Define batched backtranslation function
def backtranslate_batch(text_list, device="cuda"):
    """Back-translate a batch of texts English -> German -> English.

    Uses the notebook-level Marian tokenizers and models. Any exception is reported
    with a printed message and the input texts are returned unchanged (best effort).

    Args:
        text_list: List of English strings.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        The back-translated strings, or `text_list` itself if translation failed.
    """
    def _translate(texts, mt_tokenizer, mt_model):
        # One translation hop: encode, generate, decode.
        encoded = mt_tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
        generated = mt_model.generate(**encoded)
        return mt_tokenizer.batch_decode(generated, skip_special_tokens=True)

    try:
        # English → German
        german_texts = _translate(text_list, en2de_tokenizer, en2de_model)
        # German → English
        return _translate(german_texts, de2en_tokenizer, de2en_model)
    except Exception as e:
        print(f"Batch backtranslation failed: {e}")
        return text_list

# Backtranslate 30% of train_raw
subset_size = int(0.3 * len(train_raw))
# Draw the subset from a dedicated, seeded generator so the augmented training set is
# the same on every run (the global `random` state depends on earlier cells).
subset_rng = random.Random(42)
subset_indices = subset_rng.sample(range(len(train_raw)), subset_size)
subset = train_raw.select(subset_indices)

# Apply backtranslation in batches
bt_texts, bt_labels = [], []
batch_size = 16

print("Backtranslating subset (batched)...")
for i in tqdm(range(0, len(subset), batch_size), desc="Backtranslating"):
    batch = subset[i:i+batch_size]  # this is a dict
    texts = batch['text']
    labels = batch['label']

    # On failure backtranslate_batch returns the original texts, so the text and label
    # lists always stay aligned.
    bt_batch = backtranslate_batch(texts, device=device)
    bt_texts.extend(bt_batch)
    bt_labels.extend(labels)

# Create dataset from backtranslated texts
bt_df = pd.DataFrame({'text': bt_texts, 'label': bt_labels})
bt_dataset = Dataset.from_pandas(bt_df)

# Tokenize backtranslated data
tokenized_bt = bt_dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_bt = tokenized_bt.rename_column("label", "labels")

# Concatenate with original train_dataset
# (the original "labels" column is cast to int64 before concatenating)
train_dataset_casted_bt = train_dataset.cast_column("labels", Value("int64"))
train_dataset_backtranslated = concatenate_datasets([train_dataset_casted_bt, tokenized_bt])

print(f"Backtranslation complete. New training set size: {len(train_dataset_backtranslated)}")


In [None]:
# Check evaluation accuracy
# evaluate_model takes the tokenizer as its third positional parameter; the remaining
# options are passed by keyword so they cannot land in the wrong slot.
_, _ = evaluate_model(peft_model, eval_dataset, tokenizer, labelled=True, batch_size=8, data_collator=data_collator)

### Run Inference on unlabelled dataset

In [None]:
# Load your unlabelled data
# NOTE(review): read_pickle executes pickle deserialization - only load files from a
# trusted source.
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
# Presumably the pickle holds a datasets.Dataset with a "text" column (it is used with
# .map(..., batched=True) and remove_columns=["text"]) - verify.
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

In [None]:
# Run inference and save predictions
# evaluate_model takes the tokenizer as its third positional parameter; with
# labelled=False it returns a plain list of predicted class ids.
preds = evaluate_model(peft_model, test_dataset, tokenizer, labelled=False, batch_size=8, data_collator=data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds
})
# Make sure the output directory exists even if no training run created it.
os.makedirs(output_dir, exist_ok=True)
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")