In [None]:
# !pip install -U datasets
# !pip install transformers==4.47.0
# !pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Environment Setup

In [None]:
# General imports
import numpy as np
import pandas as pd
import json
import torch
import random
import os
import warnings
import re
import string
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import ParameterGrid
import wandb
from datasets import Dataset
from huggingface_hub import HfFolder

# Transformers and Hugging Face imports
import transformers
from transformers import (
    set_seed,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    pipeline,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)

# PEFT (Parameter-Efficient Fine-Tuning) imports
from peft import (
    get_peft_config,
    get_peft_model,
    PromptTuningInit,
    PromptTuningConfig,
    TaskType,
    PeftType,
    PromptEncoderConfig
)

# Kaggle secrets
# from kaggle_secrets import UserSecretsClient

# Report the library versions so results can be tied to a concrete environment
print(f"torch version: {torch.__version__}")
print(f"transformers version: {transformers.__version__}")

# Suppress library warnings and select the plotting theme for later figures
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
def set_random_seed(seed: int):
    """Seed every random number generator used in this notebook.

    Calls, in order, the `transformers` seeding helper, Python's `random`,
    NumPy and PyTorch (CPU), then all CUDA devices when CUDA is available.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    for seeder in (set_seed, random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Data Loading

In [None]:
# Read the annotated PHEME stance spreadsheet from the Kaggle input directory
data = pd.read_excel("/kaggle/input/ci-in6229/PHEME_stance_topic1234_hilight.xlsx")
pheme_df = pd.DataFrame(data)

# Stance classes kept for the experiment
valid_categories = ["favor", "neutral", "against"]

# Drop every row whose stance is not one of the classes above, then preview
has_valid_stance = pheme_df['stance'].isin(valid_categories)
pheme_df = pheme_df[has_valid_stance]
pheme_df.head(5)

# Preprocessing

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')


# preprocessing function
def preprocessing(text, do_lowercase=False, remove_punctuation=False, remove_stopwords=False,
                  stemming=False, lemmatization=True, remove_numbers=False, remove_extra_spaces=False):

    # 1. Convert text to lowercase
    if do_lowercase:
        text = text.lower()

    # 2. Remove punctuation
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. Remove stopwords (common words with little semantic meaning)
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

    # 4. Apply stemming (reducing words to their root form)
    if stemming:
        stemmer = PorterStemmer()
        text = ' '.join([stemmer.stem(word) for word in text.split()])

    # 5. Apply lemmatization (getting the base form of words)
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # 6. Remove numbers
    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    # 7. Remove extra spaces (e.g., multiple spaces between words)
    if remove_extra_spaces:
        text = ' '.join(text.split())

    return text

In [None]:
# Select required columns and rename them.
# A non-inplace `rename` returns a new frame, so the column assignments below
# do not write into a slice of `pheme_df`.
df = pheme_df[['comments', 'content', 'entity', 'stance']].rename(
    columns={'content': 'text1', 'entity': 'text2', 'comments': 'text3'}
)

# Label mappings: a stance's position in `valid_categories` is its integer id.
# Built once here and reused for both the label column and the model config.
label2id = {category: idx for idx, category in enumerate(valid_categories)}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)

# Map stance to labels
df['labels'] = df['stance'].map(label2id)

# Data preprocessing (element-wise; `Series.map` replaces the deprecated
# `DataFrame.applymap`)
text_columns = ['text1', 'text2', 'text3']
df[text_columns] = df[text_columns].apply(lambda column: column.map(preprocessing))

# Perform data sampling: keep at most `max_count` rows per stance.
# Iterating the groups explicitly keeps the 'stance' column in the result
# regardless of how `groupby.apply` treats grouping columns.
max_count = 1000
df = pd.concat(
    [
        group.sample(min(len(group), max_count), random_state=42)
        for _stance, group in df.groupby('stance')
    ]
)

# Print the count of each stance category after sampling
print("\n============= stance_count =================")
print(df['stance'].value_counts())
print("==============================================\n")

# Split the dataset: 10% of all rows are held out as the test set
dataset = df.drop(columns=['stance'])
dataset = Dataset.from_pandas(dataset, preserve_index=False)
split_dataset = dataset.train_test_split(test_size=0.1, seed=2025)

# Further split the remaining 90% into train and eval.
# 0.125 * 0.9 = 0.1125, so eval is 11.25% and train 78.75% of the full dataset.
train_eval_split = split_dataset['train'].train_test_split(test_size=0.125, seed=2025)
split_dataset['train'] = train_eval_split['train']
split_dataset['eval'] = train_eval_split['test']

print("\n============= split_dataset =================")
print(split_dataset)
print("==============================================\n")

print("\n============= label2id =================")
print(label2id)
print("==============================================\n")


# Tokenizer

In [None]:
# Checkpoint to fine-tune. Alternatives kept for reference:
# model_id = "Qwen/Qwen2.5-0.5B-Instruct"
# model_id = "answerdotai/ModernBERT-base"
# model_id = "FacebookAI/roberta-base"
# model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_id = "google/flan-t5-large"

# Load the tokenizer that belongs to the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# When the tokenizer defines no padding id, pad with the end-of-sequence id
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Disabled experiment: dedicated marker tokens for the three text segments
# special_tokens = ['<｜target_of_sentence｜>', '<｜context_of_sentence｜>', '<｜comment_of_sentence｜>']
# tokenizer.add_tokens(special_tokens)
# tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

# Show which special tokens the tokenizer defines
print(
    "\n============= special_tokens_map =================",
    tokenizer.special_tokens_map,
    "==============================================\n",
    sep="\n",
)

In [None]:
# Tokenize function
def combine_text(t1, t2, t3):
    """Join context, target and comment into the single string fed to the tokenizer.

    Args:
        t1: Context text (the source post).
        t2: Target the stance is expressed toward.
        t3: Comment whose stance is being classified.

    Returns:
        "Context: <t1> <sep> Target: <t2> <sep> Comment: <t3>" when the
        tokenizer defines a separator token, otherwise the same three labelled
        segments joined by single spaces.
    """
    # Some tokenizers (e.g. T5's) define no separator token; `sep_token` is
    # then None, and interpolating it directly would write the literal text
    # "None" into every example.
    separator = getattr(tokenizer, "sep_token", None)
    segments = [f"Context: {t1}", f"Target: {t2}", f"Comment: {t3}"]
    if not separator:
        return " ".join(segments)
    return f" {separator} ".join(segments)

# Alternative instruction-style prompt (disabled):
#     return f"""You are given a Context, a Target topic, and a user Comment.
# Your task is to determine the stance expressed in the Comment **toward the specific Target**.
# Please classify the stance as one of the following: **FAVOR**, **AGAINST**, or **NEUTRAL**.
# - Context: {t1}
# - Target: {t2}
# - Comment: {t3}"""

def tokenize(batch):
    """Tokenize a batch of examples for the classifier.

    Args:
        batch: Mapping with parallel sequences under 'text1', 'text2' and
            'text3'.

    Returns:
        Whatever the tokenizer returns for the combined strings, padded and
        truncated to exactly 128 tokens per example.
    """
    # One combined string per row; `map` stops at the shortest column, like `zip`
    merged = list(map(combine_text, batch['text1'], batch['text2'], batch['text3']))
    return tokenizer(merged, padding="max_length", truncation=True, max_length=128)

In [None]:
# Tokenize every split in batches and drop the raw text columns afterwards
tokenized_dataset = split_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text1", "text2", "text3"]  # Remove raw text column after tokenization
)
print("\n============= tokenized_dataset =================")
print(tokenized_dataset)
print("==============================================\n")

print("\n============= dataset[0] =================")

# Fetch the first training row once instead of pulling four whole columns
# just to read element 0 of each.
first_example = tokenized_dataset['train'][0]

print("📌 Attention Mask:", first_example['attention_mask'])
print("🎯 Labels:", first_example['labels'])
print("🔢 Input IDs:", first_example['input_ids'])
print("🔡 Decoded Tokens:", tokenizer.convert_ids_to_tokens(first_example['input_ids']))
print("==============================================\n")

# Utils

In [None]:
# compute metrics

from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and per-class metrics for an evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` is either the
            logits array or a tuple whose first element is the logits array;
            `labels` holds the integer class ids.

    Returns:
        Dict with "accuracy", "f1" (weighted) and one
        "<label>_<precision|recall|f1-score>" entry per class that appears in
        the classification report.
    """
    preds, labels = eval_pred

    # Models that return extra outputs next to the logits deliver a tuple here.
    # Detect that from the data instead of from one hard-coded model id, so
    # switching `model_id` does not break metric computation.
    if isinstance(preds, tuple):
        preds = preds[0]

    predictions = np.argmax(preds, axis=1)

    # Report metrics under readable class names rather than integer ids
    predictions = [id2label[i] for i in predictions]
    labels = [id2label[i] for i in labels]

    f1 = f1_score(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)

    report = classification_report(labels, predictions, output_dict=True)

    # Flatten the per-class part of the report; aggregate rows are skipped
    # because accuracy and weighted F1 are already reported above.
    class_metrics = {
        f"{label}_{metric}": float(value)
        for label, scores in report.items()
        if label not in ["accuracy", "macro avg", "weighted avg"]
        for metric, value in scores.items()
        if metric in ["precision", "recall", "f1-score"]
    }

    return {
        "accuracy": float(acc),
        "f1": float(f1),
        **class_metrics
    }


In [None]:
# Training_args
from transformers import TrainerCallback

class SaveEpochCallback(TrainerCallback):
    """Trainer callback that saves the model and tokenizer at the end of one chosen epoch.

    Args:
        save_epoch: Epoch number (counted from 1) after which to save.
        save_dir: Directory that receives the model and tokenizer files.

    Both arguments have defaults, so the class can still be constructed with
    no arguments; passing the bare class in `callbacks=[SaveEpochCallback]`
    relies on that.
    """

    def __init__(self, save_epoch=6, save_dir="model_epoch6"):
        self.save_epoch = save_epoch
        self.save_dir = save_dir

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save the model and the notebook's global tokenizer when the target epoch ends."""
        # `state.epoch` is a float; compare with a tolerance instead of exact
        # equality so rounding in the epoch counter cannot skip the save.
        if state.epoch is None or abs(state.epoch - self.save_epoch) > 1e-6:
            return
        print(f"Saving model at epoch {self.save_epoch}...")
        kwargs["model"].save_pretrained(self.save_dir)
        tokenizer.save_pretrained(self.save_dir)
            
def get_trainer(model,
                dataset,
                output_dir="/kaggle/working",
                learning_rate=5e-5,
                num_train_epochs=1,
                eval_batch_size=2,
                train_batch_size=8,
                do_train=True,
                do_eval=True,
                evaluation_strategy="epoch"
               ):
    """Build a `Trainer` for `model` with this notebook's fixed training setup.

    Args:
        model: Model to train or evaluate.
        dataset: Mapping with "train" and "eval" splits.
        output_dir: Directory passed to `TrainingArguments`.
        learning_rate: Optimizer learning rate.
        num_train_epochs: Number of training epochs.
        eval_batch_size: Per-device evaluation batch size.
        train_batch_size: Per-device training batch size.
        do_train: Forwarded to `TrainingArguments.do_train`.
        do_eval: Forwarded to `TrainingArguments.do_eval`.
        evaluation_strategy: When to evaluate (e.g. "epoch").

    Returns:
        A `Trainer` wired with `compute_metrics` and `SaveEpochCallback`.
        Checkpoint saving by the Trainer itself is disabled.
    """
    # Newer transformers releases name this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Use whichever field the installed
    # `TrainingArguments` defines.
    strategy_key = (
        "eval_strategy"
        if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
        else "evaluation_strategy"
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,

        # Disabled: keeping checkpoints runs out of disk space
        # save_total_limit=2,
        # load_best_model_at_end=True,

        # bf16=True,
        optim="adamw_torch_fused",
        warmup_steps=100,
        weight_decay=0.01,

        metric_for_best_model="f1",
        gradient_accumulation_steps=2,
        do_train=do_train,
        do_eval=do_eval,

        save_strategy="no",
        push_to_hub=False,
        report_to="none",
        **{strategy_key: evaluation_strategy},
    )

    # Initialize the Trainer with the model and training parameters
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["eval"],
        compute_metrics=compute_metrics,
        callbacks=[SaveEpochCallback]
    )
    return trainer

# Train

In [None]:
# Hyperparameter search space: each key lists the candidate values to try
param_grid = dict(
    learning_rate=[5e-5],
    train_batch_size=[2],
    num_train_epochs=[10],
)

In [None]:
from sklearn.model_selection import ParameterSampler

# Hyperparameter search: train one model per sampled parameter set and remember
# the parameters and metrics of the run with the highest eval F1.
best_model_path = None
best_results = None
best_params = None

# Sample hyperparameter combinations from param_grid (n_iter is the number of
# draws requested from the sampler, not the size of the grid)
n_iter = 4
for params in ParameterSampler(param_grid, n_iter=n_iter, random_state=42):
    # Release cached GPU memory before building the next model
    torch.cuda.empty_cache()

    # Re-seed before every run so each combination starts from the same random state
    set_random_seed(42)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        return_dict=True,
        pad_token_id=tokenizer.pad_token_id,
        # torch_dtype=torch.float16,  # Uncomment if using mixed precision
        device_map="auto",  # Automatically allocate model layers across available devices
    )
    # model.resize_token_embeddings(len(tokenizer))  # Adjust model embeddings for the tokenizer's vocabulary size

    trainer = get_trainer(model,
                          tokenized_dataset,
                          learning_rate=params['learning_rate'],
                          num_train_epochs=params['num_train_epochs'],
                          eval_batch_size=2,
                          train_batch_size=params['train_batch_size'],
                          do_train=True,
                          do_eval=True,
                          evaluation_strategy="epoch")

    # Name identifying this parameter combination. It is only recorded in
    # best_model_path; nothing is written to it because the save calls further
    # down are commented out.
    output_dir = f"lr_{params['learning_rate']}_epochs_{params['num_train_epochs']}_batch_{params['train_batch_size']}"

    # Start model training
    trainer.train()

    # Evaluate the model on the evaluation dataset
    eval_results = trainer.evaluate()

    # Print the results of the current hyperparameter combination
    print("\n============= params_results =================")
    print(f"Params: {params}")
    print(f"Eval Results: {eval_results}")
    print("==============================================\n")

    # If this run has the best eval F1 so far, remember its results, name and parameters
    if best_results is None or eval_results['eval_f1'] > best_results['eval_f1']:
        best_results = eval_results
        best_model_path = output_dir
        best_params = params

        # Disabled: save the trained model and tokenizer to the output directory
        # trainer.save_model(output_dir)
        # tokenizer.save_pretrained(output_dir)

    # Drop the references so the model and trainer can be freed before the next run
    del model
    del trainer

# Print the best hyperparameters and corresponding results
print(f"Best Hyperparameters: {json.dumps(best_params, indent=4, ensure_ascii=False)}\n")
print(f"Best Results: {json.dumps(best_results, indent=4, ensure_ascii=False)}\n")

## Evaluate

In [None]:
# Re-seed so the evaluation below is repeatable
set_random_seed(42)

# Reload the checkpoint stored in the "model_epoch6" directory of the Kaggle
# working directory
best_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/model_epoch6")

# Wrap the reloaded model in a Trainer configured with the best hyperparameters
best_trainer = get_trainer(
    model=best_model,
    dataset=tokenized_dataset,
    learning_rate=best_params['learning_rate'],
    num_train_epochs=best_params['num_train_epochs'],
    train_batch_size=best_params['train_batch_size'],
    eval_batch_size=2,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate the reloaded model on the held-out test split
test_dataset = tokenized_dataset["test"]

predictions_output = best_trainer.predict(test_dataset)
# (The original message ended with a stray ")" that was printed after the metrics.)
print(f"\nTest Metrics (trainer.predict): {json.dumps(predictions_output.metrics, indent=4, ensure_ascii=False)}\n")

# Models that return extra outputs next to the logits deliver a tuple whose
# first element is the logits. Detect that from the data instead of from one
# hard-coded model id.
logits = predictions_output.predictions
if isinstance(logits, tuple):
    logits = logits[0]
labels = predictions_output.label_ids
preds = np.argmax(logits, axis=1)

# Convert integer ids to class names for the report and the plot
labels = [id2label[label] for label in labels]
preds = [id2label[pred] for pred in preds]

# Only classes that occur in the gold labels or the predictions are reported
unique_labels = sorted(set(labels + preds))
print(classification_report(labels, preds, digits=4, labels=unique_labels))

# Calculate confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(labels, preds, labels=unique_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=unique_labels, yticklabels=unique_labels)

plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

In [None]:
# Show, for every class, the first correctly and the first incorrectly
# classified test example (decoded back to text).
input_ids = test_dataset["input_ids"]

print("\n📌 correct/incorrect examples：\n")

for label in unique_labels:
    # Test rows whose gold label is `label`, in dataset order
    rows_for_label = [
        (ids, l, p) for ids, l, p in zip(input_ids, labels, preds)
        if l == label
    ]

    # Only the first hit of each kind is shown, so stop searching at the first match
    correct = next((row for row in rows_for_label if row[2] == label), None)
    incorrect = next((row for row in rows_for_label if row[2] != label), None)

    for marker, example in (("✅", correct), ("❌", incorrect)):
        if example is None:
            continue
        ids, l, p = example
        text = tokenizer.decode(ids, skip_special_tokens=True)
        print(f"{marker} [{l} → {p}] - \"{text}\"")


# Inference

In [None]:
# Import necessary libraries
import torch

# Set the device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def inference_test(text1, text2, text3):
    """Classify one (context, target, comment) triple and print the ranked class probabilities.

    Args:
        text1: Context text.
        text2: Target the stance refers to.
        text3: Comment whose stance is predicted.

    Prints one line per class, ordered from most to least probable. Uses the
    notebook-level `best_model`, `tokenizer`, `combine_text` and `id2label`.
    """
    # Combine the input texts into the same single-string format used for training
    text = combine_text(text1, text2, text3)

    # Send the inputs to the device the model's weights are on. `best_model` is
    # never explicitly moved to the global `device` in this notebook, so
    # assuming they match can raise a device-mismatch error.
    model_device = getattr(best_model, "device", device)
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(model_device)

    # Make sure dropout and similar layers are in inference mode
    best_model.eval()

    # Perform inference without gradient computation
    with torch.no_grad():
        logits = best_model(**inputs).logits  # Raw, unnormalized class scores

    # Softmax over the class dimension; [0] selects the single input example
    probabilities = torch.softmax(logits, dim=1)[0]

    # Pair every class index with its label name (or a generic fallback name)
    results = [
        (id2label.get(idx, f"Label_{idx}"), prob.item())
        for idx, prob in enumerate(probabilities)
    ]

    # Sort the results by probability score in descending order
    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

    # Print the ranked categories with their corresponding probabilities
    print("============= ranked_categories =================")
    for rank, (label, score) in enumerate(sorted_results, start=1):
        print(f"Rank {rank:>2}: {label:<20} | Score: {score:.6f}")
    print("==================================================\n")

In [None]:
# Run the classifier on one hand-written example that is expected to be neutral
print("Neutral Example")
text1, text2, text3 = (
    "BREAKING: Ottawa Police confirm that a member of the Canadian Forces has succumbed to injuries in hospital following shooting...",
    "Ottawa Polic",
    "My prayers and heart goes out for the family of the slain soldier. God Bless his family.",
)
inference_test(text1, text2, text3)