Hi there!
This is the code template for CW2 task3 of COMP34711 2025/26.

- <span style="color:red; font-size:1em">First of all, please rename the notebook into "{your_student_id}_CW2_task{your_task_number}.ipynb", for example "12345678_CW2_task3.ipynb".</span>

- In this template, we only provide the minimal structure for your coursework.
  
- Please carefully read and organize your code in the template we provided.

## Constants

In [6]:
#Please keep only necessary information in this cell.

#----------------------Please keep all following constants unchanged.----------------------------------------
NUM_ROWS_VALIDATION = 1031 # Number of rows in validation set
NUM_ROWS_TEST = 1053 # Number of rows in test set

#----------------------Please modify the following constants to fit your actual value.-----------------------
STUDENT_ID = 'your_student_id'  # Replace with your actual 8-digit student ID (the evaluation script searches the output filename for 8 consecutive digits)
TRAINING_SET = './data/CW2_training_dataset.csv' # Replace with the actual path to your training dataset csv file
VALIDATION_SET = './data/CW2_validation_dataset.csv'  # Replace with the actual path to your validation dataset csv file
VALIDATION_SET_OUTPUT = f'./data/{STUDENT_ID}_CW2_task3_validation_results.csv'  # Replace with the actual path to your validation prediction csv file
TEST_SET_INPUT = './data/CW2_test_dataset.csv'  # Replace with the actual path to your test dataset csv file

#----------------------Your constants------------------------------------------------
# By adding more constants here, you can help improve the clarity and maintainability of your code and make the reviewing easier for TAs.
BATCH_SIZE = 16  # Mini-batch size used by every DataLoader in this notebook

## Installations

In [7]:
# Install required packages for the coursework
# Uncomment and run the following lines if needed

# %pip install pandas scikit-learn torch transformers sentencepiece --quiet

## Imports

In [13]:
#Please keep all imports of your code cells in this cell

#---------------------Required imports----------------------
import pandas as pd
import re
import sys
import os.path
import csv
from sklearn.metrics import f1_score
#----------------------Your imports-------------------------
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import DebertaV2Model, DebertaV2Tokenizer
from sklearn.metrics import f1_score
import numpy as np

## Start of your code cells

- The code cells below show a demo layout that helps TAs quickly locate your implementation.

- You are free to add, delete, or edit the titles and code in the following cells.

- Please follow this genre order: "comedy, cult, flashback, historical, revenge, romantic, scifi, violence".

### Data Loading

In [14]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training data; the genre columns listed below are read as the label matrix.
df = pd.read_csv(TRAINING_SET)

# Label order required by the coursework instructions.
GENRE_COLS = [
    "comedy", "cult", "flashback", "historical",
    "revenge", "romantic", "scifi", "violence"
]

# Label matrix extracted from the training frame: one row per movie, one column per genre.
Y_train = df[GENRE_COLS].values   # shape (num_training_rows, 8)

# Count positives and negatives per label.
pos_counts = Y_train.sum(axis=0)
neg_counts = Y_train.shape[0] - pos_counts

# pos_weight = neg / pos, the per-label weight that BCEWithLogitsLoss applies to positive targets.
# The denominator is floored at 1 so a genre with no positive example yields a finite weight
# instead of inf (which would turn the loss into NaN).
pos_weight = torch.tensor(
    neg_counts / np.maximum(pos_counts, 1), dtype=torch.float32
).to(device)

print(pos_weight)

tensor([ 4.7943,  2.9572,  2.5724, 36.3141,  3.2423,  2.5760, 33.4300,  1.3506],
       device='cuda:0')


### Tokenization

In [15]:
# Tokenizer for the "microsoft/deberta-v3-base" checkpoint. DebertaV2Tokenizer requires the
# `sentencepiece` package; if this line raises ImportError, install it and restart the kernel.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")


ImportError: 
DebertaV2Tokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
class MovieDataset(Dataset):
    """Dataset that tokenizes one movie (title + plot synopsis) per item.

    Args:
        df: DataFrame with ``title`` and ``plot_synopsis`` columns. The eight genre
            columns are optional; when any of them is absent (e.g. an unlabeled
            test set) the items simply carry no ``"labels"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=..., truncation=True,
            padding='max_length', return_tensors='pt')``. Its result must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            dimension of size 1, which is squeezed away.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.genre_cols = [
            "comedy", "cult", "flashback", "historical",
            "revenge", "romantic", "scifi", "violence"
        ]
        self.max_length = max_length
        # Labels are only produced when every genre column is present, so the same
        # dataset class works for training/validation and for label-free inference.
        self.has_labels = all(col in df.columns for col in self.genre_cols)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors (plus labels when available)."""
        row = self.df.iloc[idx]
        # Key lookup instead of attribute access: it cannot be shadowed by a Series attribute.
        text = f"{row['title']} [SEP] {row['plot_synopsis']}"

        enc = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }
        if self.has_labels:
            # Float targets, as expected by BCEWithLogitsLoss.
            item["labels"] = torch.tensor(
                row[self.genre_cols].values.astype(float), dtype=torch.float
            )
        return item


### Model and Training

In [None]:
class DistilBERTClassifier(nn.Module):
    """Multi-label genre classifier: transformer encoder + masked mean pooling + MLP head.

    The class name is kept for the cells that instantiate it. The encoder is loaded
    with ``DebertaV2Model`` (the model class this notebook imports), and the default
    checkpoint matches the "microsoft/deberta-v3-base" tokenizer created in the
    tokenization cell, so token ids produced there are valid for this encoder.

    Args:
        num_labels: Number of output logits (one per genre).
        hidden_dim: Width of the hidden layer in the classification head.
        pretrained_model: Checkpoint name or path loadable by ``DebertaV2Model``;
            it must share its vocabulary with the tokenizer used for the data.
    """

    def __init__(self, num_labels=8, hidden_dim=128, pretrained_model="microsoft/deberta-v3-base"):
        super().__init__()
        self.transformer = DebertaV2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(self.transformer.config.hidden_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape [batch, num_labels] (no sigmoid applied)."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden]

        # Mean pooling over real (non-padding) tokens only.
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        summed = (last_hidden * mask).sum(dim=1)
        # Floor the token count at 1 so a fully masked row yields zeros instead of NaN.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / token_counts

        x = self.dropout(pooled)
        x = torch.relu(self.fc1(x))
        logits = self.fc2(x)
        return logits  # return raw logits for BCEWithLogitsLoss

In [None]:
# Build the classifier and move it to the device chosen in the data-loading cell.
model = DistilBERTClassifier()
model.to(device)

# Shuffle the training set each epoch so mini-batches do not follow the CSV row order.
X_train = MovieDataset(df, tokenizer)
train_loader = DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True)

# Validation keeps file order so the collected probabilities line up with val_df rows.
val_df = pd.read_csv(VALIDATION_SET)
X_val = MovieDataset(val_df, tokenizer)
val_loader = DataLoader(X_val, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)
# pos_weight (negatives / positives per genre, computed in the data-loading cell)
# up-weights the positive targets of rare genres in the loss.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
epochs = 3

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_loader):.4f}")

    # ---- validation pass: collect sigmoid probabilities and gold labels on the CPU ----
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"]

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).cpu()
            all_probs.append(probs)
            all_labels.append(labels)

    # probs_val / y_val from the final epoch are reused by the threshold-search cell.
    probs_val = torch.cat(all_probs, dim=0).numpy()
    y_val = torch.cat(all_labels, dim=0).numpy()
    # Fixed 0.5 cut-off here; per-genre thresholds are tuned afterwards.
    macro_f1 = f1_score(y_val, (probs_val >= 0.5).astype(int), average="macro")
    print(f"Validation Macro F1: {macro_f1:.4f}")


### Finding the best thresholds

In [None]:
# Candidate cut-offs 0.00, 0.01, ..., 1.00, searched independently for every genre.
thresholds = np.linspace(0, 1, 101)
best_thresholds = []

for genre_idx in range(probs_val.shape[1]):
    genre_probs = probs_val[:, genre_idx]
    genre_truth = y_val[:, genre_idx]

    # F1 of this genre at every candidate cut-off.
    scores = np.array([
        f1_score(genre_truth, (genre_probs >= cutoff).astype(int), zero_division=0)
        for cutoff in thresholds
    ])

    # argmax returns the first (lowest) cut-off that reaches the top score;
    # when no cut-off scores above zero the default 0.5 is kept.
    top = int(np.argmax(scores))
    best_thresholds.append(thresholds[top] if scores[top] > 0 else 0.5)

best_thresholds = np.array(best_thresholds)
print("Best thresholds per genre:", best_thresholds)

### Prediction

In [None]:
def predict_dataframe(df, model, tokenizer, batch_size=BATCH_SIZE, device=None, thresholds=None):
    """Predict the binary genre matrix for every row of ``df``.

    Args:
        df: DataFrame with the columns ``MovieDataset`` reads.
        model: Classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning raw logits.
        tokenizer: Tokenizer handed to ``MovieDataset``.
        batch_size: Number of rows scored per forward pass.
        device: Device to run on. ``None`` selects CUDA when available, else CPU.
        thresholds: Per-label probability cut-offs. ``None`` falls back to the
            notebook-level ``best_thresholds`` found by the threshold search.

    Returns:
        Integer NumPy array of shape ``(len(df), num_labels)`` holding 0/1 predictions.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if thresholds is None:
        thresholds = best_thresholds

    dataset = MovieDataset(df, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_preds = []
    thresholds_tensor = None  # built once, on the first batch, to match device and dtype

    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(logits)

            if thresholds_tensor is None:
                thresholds_tensor = torch.as_tensor(
                    thresholds, device=probs.device, dtype=probs.dtype
                )
            # A label is predicted when its probability reaches its own cut-off.
            preds = (probs >= thresholds_tensor).int()

            all_preds.append(preds.cpu())

    if not all_preds:
        # Empty input: torch.cat would raise on an empty list.
        return np.zeros((0, len(thresholds)), dtype=np.int32)

    all_preds = torch.cat(all_preds, dim=0).numpy()
    return all_preds  # shape: (len(df), 8)


## End of your code cells

### Evaluation scripts

In [None]:
def read_data(submission_file_path, gold_standard_file_path):
    """
    Load the submission and gold standard CSV files.

    The student ID is taken from the first run of 8 digits in the submission
    path ('Unknown' when there is none). The submission is read without a header
    row. The gold standard is also read without a header, then columns 1 and 2
    and its first (header) row are removed, leaving the ID and label columns.

    Returns:
        (submission_df, gold_standard_df, user_id)
    """
    # Try to find student ID from the filename (looks for 8 digit numbers)
    id_matches = re.findall(r'\d{8}', submission_file_path)
    print("Found your ID: ", id_matches)
    user_id = id_matches[0] if id_matches else 'Unknown'

    # Submission: raw comma-separated rows, quotes are not interpreted
    print(f"\nLoading submission file: {submission_file_path}")
    submission_df = pd.read_csv(
        submission_file_path,
        sep=',',
        header=None,
        quoting=csv.QUOTE_NONE,
        encoding='utf-8',
    )

    # Gold standard: keep only the ID and label columns, skip the header row
    print(f"Loading gold standard file: {gold_standard_file_path}")
    gold_raw = pd.read_csv(gold_standard_file_path, header=None)
    gold_standard_df = gold_raw.drop(columns=[1, 2]).iloc[1:]

    return submission_df, gold_standard_df, user_id


def match_and_prepare_data(submission_df, gold_standard_df, user_id):
    """
    Match submission rows with gold standard rows by ID.
    Prepare data for evaluation.

    Args:
        submission_df: Submission frame; first column is the row ID, the
            remaining columns are the predicted labels. It is not modified.
        gold_standard_df: Gold standard frame with the same layout.
        user_id: Student ID; accepted for interface compatibility, not used here.

    Returns:
        (gold_standard_labels, submission_labels, missed_rows): two parallel
        lists of integer label lists in gold standard order, plus the IDs of
        gold rows that have no submission row. A missing row is scored with the
        inverse of its gold labels (worst possible prediction).

    IDs are compared after whitespace stripping. Each submission row is used at
    most once, in file order, so duplicated IDs pair up in order of appearance.
    """
    gold_standard_labels = []
    submission_labels = []
    missed_rows = []

    print(f"\nMatching submission with gold standard...")
    print(f"Gold standard rows: {len(gold_standard_df)}")
    print(f"Submission rows: {len(submission_df)}")

    # Index the submission once by stripped ID (a single pass instead of rescanning
    # the whole submission for every gold row). str() lets non-string IDs match too.
    pending = {}
    for sub_row in submission_df.itertuples(index=False, name=None):
        pending.setdefault(str(sub_row[0]).strip(), []).append(sub_row[1:])

    # Match each gold standard row with submission
    for gold_row in gold_standard_df.itertuples(index=False, name=None):
        row_id = gold_row[0]

        # Extract gold standard labels
        row_labels = [int(value) for value in gold_row[1:]]
        gold_standard_labels.append(row_labels)

        candidates = pending.get(str(row_id).strip())
        if candidates:
            raw_labels = candidates.pop(0)  # consume the earliest unused submission row
            try:
                # Extract submission labels
                submission_row_labels = [int(value) for value in raw_labels]
            except (ValueError, TypeError, OverflowError):
                # Handle malformed labels (take first character if multi-digit)
                submission_row_labels = [int(str(value)[0]) for value in raw_labels]
            submission_labels.append(submission_row_labels)
        else:
            # If row is missing, add inverse labels (worst possible prediction)
            missed_rows.append(row_id)
            submission_labels.append([0 if label == 1 else 1 for label in row_labels])

    return gold_standard_labels, submission_labels, missed_rows


def evaluate_submission(gold_standard_labels, submission_labels):
    """
    Score the submission against the gold standard.

    Returns the F1 score computed with average='weighted', which accounts
    for class imbalance.
    """
    print(f"\nCalculating weighted F1 score...")
    return f1_score(gold_standard_labels, submission_labels, average='weighted')


def print_results(user_id, f1_weighted, missed_rows):
    """
    Print the evaluation report to screen.

    The report shows the student ID (with a warning when it is 'Unknown'),
    the weighted F1 score with four decimals, and either the first ten
    missing row IDs or a completeness confirmation.
    """
    banner = "="*70
    report = ["\n" + banner, "YOUR SUBMISSION EVALUATION REPORT", banner]

    # Alert if ID not found in filename
    if user_id == 'Unknown':
        report += [
            'WARNING: ID not found in filename!',
            '   Please ensure your filename contains your 8-digit student ID.',
            "",
        ]

    report += [
        f"Your ID: {user_id}",
        "",
        "EVALUATION RESULTS:",
        f"   Weighted F1 Score: {f1_weighted:.4f}",
        "",
    ]

    if not missed_rows:
        report.append("DATA COMPLETENESS: All expected rows found in your submission!")
    else:
        report.append(f"MISSING DATA ({len(missed_rows)} rows not found):")
        report.append("-" * 70)
        # Only the first 10 missing IDs are listed; the rest are summarized
        report.extend(
            f"    {position}. Row ID: {missing_id}"
            for position, missing_id in enumerate(missed_rows[:10], 1)
        )
        if len(missed_rows) > 10:
            report.append(f"    ... and {len(missed_rows) - 10} more missing rows")
        report += [
            "",
            "TIP: Make sure your submission includes all required rows.",
            "        Missing rows are penalized with worst possible predictions.",
        ]

    report += ["", banner, ""]
    print("\n".join(report))


def evaluate(submission_path, gold_standard_path):
    """
    Main function to run the submission evaluation script.

    Reads both files, matches rows by ID, computes the weighted F1 score and
    prints the report.

    Args:
        submission_path: Path to the predictions CSV (ID followed by labels).
        gold_standard_path: Path to the gold standard CSV.

    Returns:
        The weighted F1 score, so callers can store or compare it.

    Raises:
        SystemExit: with code 1 when either file does not exist or when any
            step of the evaluation fails (the traceback is printed first).
    """

    submission_file = submission_path
    gold_standard_file = gold_standard_path

    # Check if files exist
    if not os.path.exists(submission_file):
        print(f"Error: Your submission file '{submission_file}' not found!")
        print("Make sure the file path is correct and the file exists.")
        sys.exit(1)

    if not os.path.exists(gold_standard_file):
        print(f"Error: Gold standard file '{gold_standard_file}' not found!")
        print("Make sure you have the correct gold standard file.")
        sys.exit(1)

    try:
        # Step 1: Read data
        submission_df, gold_standard_df, user_id = read_data(submission_file, gold_standard_file)

        # Step 2: Match and prepare data
        gold_standard_labels, submission_labels, missed_rows = match_and_prepare_data(
            submission_df, gold_standard_df, user_id
        )

        # Step 3: Evaluate
        f1_weighted = evaluate_submission(gold_standard_labels, submission_labels)

        # Step 4: Print results
        print_results(user_id, f1_weighted, missed_rows)

        # Hand the score back instead of returning None, so the caller's variable is meaningful.
        return f1_weighted

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        print("Please check that your files are in the correct CSV format.")
        print("Each row should contain: ID, label1, label2, label3, ...")
        import traceback
        traceback.print_exc()
        sys.exit(1)

### Evaluate the model on the validation dataset

In [None]:
# Please run the "Evaluation scripts" cell above before running this cell, so that evaluate() is defined.
# The file at VALIDATION_SET_OUTPUT must already exist (it is written by the "Save predictions" cell below);
# evaluate() prints an error and exits when the file is missing.

# Please make sure that output format is like following (no header row, no title and plot columns).
# Note: read_data() parses the submission with sep=',', so the saved file must be comma-separated.
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

evaluation_results = evaluate(VALIDATION_SET_OUTPUT, VALIDATION_SET)

### Save predictions to formatted file.

In [None]:
# Predict the validation set and save the predictions in the submission format.

# Please make sure that output format is like following (no header row, no title and plot columns):
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

# NOTE: despite the `test_` prefix, these variables hold the VALIDATION set; the names are
# kept unchanged in case other cells refer to them.
test_df = pd.read_csv(VALIDATION_SET)
# Pass the notebook's device explicitly so prediction also works on CPU-only machines.
test_preds = predict_dataframe(test_df, model, tokenizer, device=device)

output_df = pd.DataFrame(test_preds, columns=GENRE_COLS)
# .values sidesteps index alignment: IDs are attached strictly by row position.
output_df.insert(0, 'ID', test_df['ID'].values)

# Sanity checks before writing: correct type, one row per validation movie, ID + 8 genre columns.
assert isinstance(output_df, pd.DataFrame)
assert len(output_df) == NUM_ROWS_VALIDATION, "Output length is not aligned with the validation dataset."
assert len(output_df.columns) == 9, "Please make sure to follow the format above and keep only IDs and 8 columns of prediction."

# Same path that the evaluation cell reads back (comma-separated, no header, no index).
output_df.to_csv(VALIDATION_SET_OUTPUT, index=False, header=False)