# Task 3: Enhanced Multi-Label Movie Genre Classifier
## NO FUNCTIONS - JUST PURE CODE

## Constants

In [34]:
#Please keep only necessary information in this cell.

#----------------------Please keep all following constants unchanged.----------------------------------------
NUM_ROWS_VALIDATION = 1031 # Number of rows in validation set
NUM_ROWS_TEST = 1053 # Number of rows in test set

#----------------------Please modify the following constants to fit your actual value.-----------------------
STUDENT_ID = '11445473'  # Replace with your actual 8-digits student ID
TRAINING_SET = './data/CW2_training_dataset.csv' # Replace with the actual path to your training dataset csv file
VALIDATION_SET = './data/CW2_validation_dataset.csv'  # Replace with the actual path to your validation dataset csv file
VALIDATION_SET_OUTPUT = f'./data/{STUDENT_ID}_CW2_task3_validation_results.csv'  # Replace with the actual path to your validation prediction csv file
TEST_SET_INPUT = './data/CW2_test_dataset.csv'  # Replace with the actual path to your test prediction csv file


#----------------------Your constants------------------------------------------------
# By adding more constants here, you can help improve the clarity and maintainability of your code and make the reviewing easier for TAs.
MODEL_NAME = 'roberta-base'
NUM_LABELS = 8
MAX_LENGTH = 256
CONCAT_LAST_N_LAYERS = 4
DROPOUT = 0.3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 10
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0
FOCAL_ALPHA = 0.25
FOCAL_GAMMA = 2.0

GENRE_COLS = ['comedy', 'cult', 'flashback', 'historical', 'revenge', 'romantic', 'scifi', 'violence']
RANDOM_SEED = 42

## Imports

In [19]:
#---------------------Required imports----------------------
import pandas as pd
import re
import sys
import os.path
import csv
from sklearn.metrics import f1_score
#----------------------Your imports-------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from tqdm.auto import tqdm
import warnings
import random

warnings.filterwarnings('ignore')

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## Data Loading

In [20]:
train_df = pd.read_csv(TRAINING_SET)
print(f"Training samples: {len(train_df)}")
print(train_df.head())

for genre in GENRE_COLS:
    count = train_df[genre].sum()
    percentage = (count / len(train_df)) * 100
    print(f"{genre:12s}: {count:5d} ({percentage:.1f}%)")

texts = (train_df['title'] + ' [SEP] ' + train_df['plot_synopsis']).values
labels = train_df[GENRE_COLS].values

X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.15, random_state=RANDOM_SEED)
print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}")

Training samples: 7127
                                     ID                        title  \
0  ee7722b2-bc23-400b-9461-4ff91f01f486                  Next of Kin   
1  3b111f7d-0c19-4cb3-84a1-d6dc687c9716              The Survivalist   
2  3116352f-4b50-43a2-b9be-458c4aa086e5                  Superman II   
3  bbb71d71-1503-4aa6-9129-918b9efc3c3f  The Hunchback of Notre Dame   
4  c12f67ca-5825-43d0-b9a7-61c348915715                         Taxi   

                                       plot_synopsis  comedy  cult  flashback  \
0  Truman Gates (Patrick Swayze), raised in Appal...       0     0          0   
1  The film takes place when oil production has c...       0     1          0   
2  Before the destruction of Krypton, the crimina...       0     0          0   
3  The gypsy Esmeralda captures the hearts of man...       0     0          0   
4  Taxi portrays director Jafar Panahi as he cour...       0     0          0   

   historical  revenge  romantic  scifi  violence  
0    

## Tokenization

In [21]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")

Tokenizer loaded: roberta-base


In [22]:
class MovieGenreDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.FloatTensor(label)
        }

train_dataset = MovieGenreDataset(X_train, y_train, tokenizer, MAX_LENGTH)
val_dataset = MovieGenreDataset(X_val, y_val, tokenizer, MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
print(f"Dataloaders created: {len(train_loader)} train batches, {len(val_loader)} val batches")

Dataloaders created: 379 train batches, 67 val batches


## Model

In [23]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        return (self.alpha * (1-pt)**self.gamma * BCE_loss).mean()

class MultiHeadAttentionPooling(nn.Module):
    def __init__(self, hidden_size, num_heads=8):
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.query = nn.Parameter(torch.randn(1, 1, hidden_size))
    def forward(self, hidden_states, attention_mask=None):
        query = self.query.expand(hidden_states.size(0), -1, -1)
        if attention_mask is not None:
            attention_mask = ~attention_mask.bool()
        attn_output, _ = self.attention(query, hidden_states, hidden_states, key_padding_mask=attention_mask)
        return attn_output.squeeze(1)

class EnhancedTransformerClassifier(nn.Module):
    def __init__(self, model_name, num_labels, concat_last_n_layers=4, dropout=0.3):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.concat_last_n_layers = concat_last_n_layers
        feature_size = self.hidden_size * concat_last_n_layers
        self.attention_pooling = MultiHeadAttentionPooling(self.hidden_size, num_heads=8)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(feature_size)
        self.classifier = nn.Sequential(
            nn.Linear(feature_size, feature_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(feature_size // 2, num_labels)
        )
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_n_layers = outputs.hidden_states[-self.concat_last_n_layers:]
        pooled_layers = [self.attention_pooling(layer, attention_mask) for layer in last_n_layers]
        features = torch.cat(pooled_layers, dim=-1)
        features = self.layer_norm(self.dropout(features))
        return self.classifier(features)

model = EnhancedTransformerClassifier(MODEL_NAME, NUM_LABELS, CONCAT_LAST_N_LAYERS, DROPOUT)
model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total params: {total_params:,}")
print(f"Trainable params: {trainable_params:,}")
print(f"Under 600M: {trainable_params < 600_000_000}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params: 131,747,336
Trainable params: 131,747,336
Under 600M: True


## Training Setup

In [24]:
criterion = FocalLoss(FOCAL_ALPHA, FOCAL_GAMMA)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_params = [
    {'params': [p for n, p in model.transformer.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY, 'lr': LEARNING_RATE * 0.1},
    {'params': [p for n, p in model.transformer.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': LEARNING_RATE * 0.1},
    {'params': [p for n, p in model.named_parameters() if 'transformer' not in n], 'weight_decay': WEIGHT_DECAY, 'lr': LEARNING_RATE}
]
optimizer = torch.optim.AdamW(optimizer_params)

num_training_steps = len(train_loader) * NUM_EPOCHS
num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

print(f"Optimizer and scheduler ready")
print(f"Training steps: {num_training_steps}, Warmup: {num_warmup_steps}")

Optimizer and scheduler ready
Training steps: 3790, Warmup: 379


## Training

In [25]:
best_f1 = 0
best_model_state = None
best_val_preds = None
best_val_labels = None
history = {'train_loss': [], 'val_loss': [], 'micro_f1': [], 'macro_f1': []}

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-" * 70)
    
    # TRAINING
    model.train()
    total_train_loss = 0
    train_preds_list = []
    train_labels_list = []
    
    for batch in tqdm(train_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        
        total_train_loss += loss.item()
        preds = torch.sigmoid(logits).detach().cpu().numpy()
        train_preds_list.append(preds)
        train_labels_list.append(labels.cpu().numpy())
    
    avg_train_loss = total_train_loss / len(train_loader)
    train_preds = np.vstack(train_preds_list)
    train_labels = np.vstack(train_labels_list)
    
    # VALIDATION
    model.eval()
    total_val_loss = 0
    val_preds_list = []
    val_labels_list = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_val_loss += loss.item()
            
            preds = torch.sigmoid(logits).cpu().numpy()
            val_preds_list.append(preds)
            val_labels_list.append(labels.cpu().numpy())
    
    avg_val_loss = total_val_loss / len(val_loader)
    val_preds = np.vstack(val_preds_list)
    val_labels = np.vstack(val_labels_list)
    
    # METRICS
    val_preds_binary = (val_preds > 0.5).astype(int)
    micro_f1 = f1_score(val_labels, val_preds_binary, average='micro')
    macro_f1 = f1_score(val_labels, val_preds_binary, average='macro')
    samples_f1 = f1_score(val_labels, val_preds_binary, average='samples')
    hamming = hamming_loss(val_labels, val_preds_binary)
    
    history['train_loss'].append(avg_train_loss)
    history['val_loss'].append(avg_val_loss)
    history['micro_f1'].append(micro_f1)
    history['macro_f1'].append(macro_f1)
    
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}")
    print(f"Micro F1: {micro_f1:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Samples F1: {samples_f1:.4f}")
    print(f"Hamming: {hamming:.4f}")
    
    per_class_f1 = f1_score(val_labels, val_preds_binary, average=None)
    for i, genre in enumerate(GENRE_COLS):
        print(f"  {genre:12s}: {per_class_f1[i]:.4f}")
    
    if micro_f1 > best_f1:
        best_f1 = micro_f1
        best_model_state = model.state_dict().copy()
        best_val_preds = val_preds.copy()
        best_val_labels = val_labels.copy()
        print(f"✓ Best model updated (F1: {best_f1:.4f})")

print(f"\nTraining done! Best F1: {best_f1:.4f}")

# Load best model state back into the model
model.load_state_dict(best_model_state)
print("Best model loaded")


Epoch 1/10
----------------------------------------------------------------------


Training:   0%|          | 0/379 [00:00<?, ?it/s]

Training: 100%|██████████| 379/379 [00:53<00:00,  7.10it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.44it/s]


Train Loss: 0.0327
Val Loss: 0.0282
Micro F1: 0.3170
Macro F1: 0.1316
Samples F1: 0.2622
Hamming: 0.1933
  comedy      : 0.0000
  cult        : 0.0137
  flashback   : 0.0000
  historical  : 0.0000
  revenge     : 0.0221
  romantic    : 0.3668
  scifi       : 0.0000
  violence    : 0.6504
✓ Best model updated (F1: 0.3170)

Epoch 2/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.06it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.37it/s]


Train Loss: 0.0271
Val Loss: 0.0254
Micro F1: 0.4553
Macro F1: 0.2729
Samples F1: 0.4070
Hamming: 0.1792
  comedy      : 0.0918
  cult        : 0.4201
  flashback   : 0.3259
  historical  : 0.0000
  revenge     : 0.0077
  romantic    : 0.5950
  scifi       : 0.0625
  violence    : 0.6800
✓ Best model updated (F1: 0.4553)

Epoch 3/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.04it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.48it/s]


Train Loss: 0.0254
Val Loss: 0.0247
Micro F1: 0.4644
Macro F1: 0.3342
Samples F1: 0.4044
Hamming: 0.1706
  comedy      : 0.2379
  cult        : 0.4388
  flashback   : 0.3441
  historical  : 0.2000
  revenge     : 0.0597
  romantic    : 0.5969
  scifi       : 0.1250
  violence    : 0.6716
✓ Best model updated (F1: 0.4644)

Epoch 4/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.04it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.47it/s]


Train Loss: 0.0243
Val Loss: 0.0245
Micro F1: 0.4854
Macro F1: 0.3636
Samples F1: 0.4245
Hamming: 0.1704
  comedy      : 0.3180
  cult        : 0.2939
  flashback   : 0.4686
  historical  : 0.1429
  revenge     : 0.2769
  romantic    : 0.5420
  scifi       : 0.1765
  violence    : 0.6899
✓ Best model updated (F1: 0.4854)

Epoch 5/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.07it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.40it/s]


Train Loss: 0.0235
Val Loss: 0.0245
Micro F1: 0.4993
Macro F1: 0.3807
Samples F1: 0.4304
Hamming: 0.1685
  comedy      : 0.2933
  cult        : 0.4852
  flashback   : 0.4498
  historical  : 0.1379
  revenge     : 0.2436
  romantic    : 0.4673
  scifi       : 0.2632
  violence    : 0.7052
✓ Best model updated (F1: 0.4993)

Epoch 6/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.03it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.36it/s]


Train Loss: 0.0228
Val Loss: 0.0243
Micro F1: 0.5335
Macro F1: 0.4513
Samples F1: 0.4783
Hamming: 0.1659
  comedy      : 0.3431
  cult        : 0.4698
  flashback   : 0.4593
  historical  : 0.4091
  revenge     : 0.3753
  romantic    : 0.5749
  scifi       : 0.2703
  violence    : 0.7089
✓ Best model updated (F1: 0.5335)

Epoch 7/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:54<00:00,  7.01it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.32it/s]


Train Loss: 0.0222
Val Loss: 0.0245
Micro F1: 0.5375
Macro F1: 0.4500
Samples F1: 0.4833
Hamming: 0.1645
  comedy      : 0.3237
  cult        : 0.5000
  flashback   : 0.4484
  historical  : 0.3529
  revenge     : 0.4103
  romantic    : 0.5954
  scifi       : 0.2703
  violence    : 0.6994
✓ Best model updated (F1: 0.5375)

Epoch 8/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.02it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.52it/s]


Train Loss: 0.0218
Val Loss: 0.0247
Micro F1: 0.5444
Macro F1: 0.4623
Samples F1: 0.4965
Hamming: 0.1674
  comedy      : 0.3869
  cult        : 0.5042
  flashback   : 0.4631
  historical  : 0.3429
  revenge     : 0.3938
  romantic    : 0.5912
  scifi       : 0.3077
  violence    : 0.7084
✓ Best model updated (F1: 0.5444)

Epoch 9/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:54<00:00,  7.01it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.42it/s]


Train Loss: 0.0216
Val Loss: 0.0249
Micro F1: 0.5398
Macro F1: 0.4668
Samples F1: 0.4860
Hamming: 0.1673
  comedy      : 0.3792
  cult        : 0.4968
  flashback   : 0.4524
  historical  : 0.3889
  revenge     : 0.3979
  romantic    : 0.5686
  scifi       : 0.3415
  violence    : 0.7094

Epoch 10/10
----------------------------------------------------------------------


Training: 100%|██████████| 379/379 [00:53<00:00,  7.02it/s]
Validation: 100%|██████████| 67/67 [00:04<00:00, 15.51it/s]

Train Loss: 0.0214
Val Loss: 0.0249
Micro F1: 0.5422
Macro F1: 0.4637
Samples F1: 0.4899
Hamming: 0.1659
  comedy      : 0.3731
  cult        : 0.5000
  flashback   : 0.4553
  historical  : 0.3889
  revenge     : 0.4010
  romantic    : 0.5837
  scifi       : 0.3000
  violence    : 0.7074

Training done! Best F1: 0.5444
Best model loaded





## Finding Best Thresholds

In [26]:
model.eval()

val_preds_list = []
val_labels_list = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc='Getting predictions'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        
        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        
        val_preds_list.append(probs)
        val_labels_list.append(labels.numpy())

probs_val = np.vstack(val_preds_list)
y_val = np.vstack(val_labels_list)

print(f"Probs shape: {probs_val.shape}")
print(f"Labels shape: {y_val.shape}")

Getting predictions: 100%|██████████| 67/67 [00:04<00:00, 15.45it/s]

Probs shape: (1070, 8)
Labels shape: (1070, 8)





In [27]:
# Find best thresholds - EXACTLY like your Task 1 code
thresholds = np.linspace(0, 1, 101)
best_thresholds = []

print("Finding optimal thresholds...\n")

for col in range(probs_val.shape[1]):
    best_f1 = 0
    best_thr = 0.5
    
    for thr in thresholds:
        preds = (probs_val[:, col] >= thr).astype(int)
        f1 = f1_score(y_val[:, col], preds, zero_division=0)
        
        if f1 > best_f1:
            best_f1 = f1
            best_thr = thr
    
    best_thresholds.append(best_thr)
    print(f"{GENRE_COLS[col]:12s}: threshold = {best_thr:.2f}, F1 = {best_f1:.4f}")

best_thresholds = np.array(best_thresholds)
print(f"\nBEST THRESHOLDS:")
print(best_thresholds)

Finding optimal thresholds...

comedy      : threshold = 0.41, F1 = 0.4684
cult        : threshold = 0.39, F1 = 0.5629
flashback   : threshold = 0.37, F1 = 0.5515
historical  : threshold = 0.42, F1 = 0.4528
revenge     : threshold = 0.41, F1 = 0.5158
romantic    : threshold = 0.43, F1 = 0.6383
scifi       : threshold = 0.41, F1 = 0.4615
violence    : threshold = 0.49, F1 = 0.7104

BEST THRESHOLDS:
[0.41 0.39 0.37 0.42 0.41 0.43 0.41 0.49]


In [28]:
# Evaluate with optimized thresholds
val_preds_optimized = np.zeros_like(probs_val)
for i, threshold in enumerate(best_thresholds):
    val_preds_optimized[:, i] = (probs_val[:, i] >= threshold).astype(int)

micro_f1_opt = f1_score(y_val, val_preds_optimized, average='micro')
macro_f1_opt = f1_score(y_val, val_preds_optimized, average='macro')
samples_f1_opt = f1_score(y_val, val_preds_optimized, average='samples')

val_preds_default = (probs_val > 0.5).astype(int)
micro_f1_default = f1_score(y_val, val_preds_default, average='micro')
macro_f1_default = f1_score(y_val, val_preds_default, average='macro')

print(f"\nDefault (0.5): Micro F1 = {micro_f1_default:.4f}, Macro F1 = {macro_f1_default:.4f}")
print(f"Optimized: Micro F1 = {micro_f1_opt:.4f}, Macro F1 = {macro_f1_opt:.4f}")
print(f"Improvement: {micro_f1_opt - micro_f1_default:+.4f}")


Default (0.5): Micro F1 = 0.5422, Macro F1 = 0.4637
Optimized: Micro F1 = 0.5877, Macro F1 = 0.5452
Improvement: +0.0454


## Prediction

In [29]:
def predict_dataframe(df, model, tokenizer, batch_size=BATCH_SIZE, device="cuda"):
    # Prepare texts
    texts = (df['title'] + ' [SEP] ' + df['plot_synopsis']).values
    dummy_labels = np.zeros((len(texts), NUM_LABELS))
    
    # Create dataset and loader
    dataset = MovieGenreDataset(texts, dummy_labels, tokenizer, MAX_LENGTH)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_preds = []
    
    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(logits)
            
            # Apply per-label thresholds
            thresholds_tensor = torch.tensor(best_thresholds, device=probs.device, dtype=probs.dtype)
            preds = (probs >= thresholds_tensor).int()
            
            all_preds.append(preds.cpu())
    
    all_preds = torch.cat(all_preds, dim=0).numpy()
    return all_preds  # shape: (len(df), 8)

## End of your code cells

### Evaluation scripts

In [30]:
def read_data(submission_file_path, gold_standard_file_path):
    """
    Read submission and gold standard files.
    Extract student ID from filename.
    """
    # Try to find student ID from the filename (looks for 8 digit numbers)
    id_regex = r'\d{8}'

    user_id = re.findall(id_regex, submission_file_path)
    print("Found your ID: ", user_id)
    if user_id:
        user_id = user_id[0]
    else:
        user_id = 'Unknown'

    # Load submission CSV
    print(f"\nLoading submission file: {submission_file_path}")
    submission_df = pd.read_csv(submission_file_path, sep=',', header=None,
                                quoting=csv.QUOTE_NONE, encoding='utf-8')

    # Load gold standard CSV
    print(f"Loading gold standard file: {gold_standard_file_path}")
    gold_standard_df = pd.read_csv(gold_standard_file_path, header=None)

    # Remove columns 1 and 2 (keep only ID and labels)
    gold_standard_df = gold_standard_df.drop([1, 2], axis=1)
    # Skip header row
    gold_standard_df = gold_standard_df.iloc[1:]

    return submission_df, gold_standard_df, user_id


def match_and_prepare_data(submission_df, gold_standard_df, user_id):
    """
    Match submission rows with gold standard rows by ID.
    Prepare data for evaluation.
    """
    gold_standard_labels = []
    submission_labels = []
    missed_rows = []
    submission_df_copy = submission_df.copy()

    print(f"\nMatching submission with gold standard...")
    print(f"Gold standard rows: {len(gold_standard_df)}")
    print(f"Submission rows: {len(submission_df_copy)}")

    # Match each gold standard row with submission
    for index, row in gold_standard_df.iterrows():
        row = row.reset_index(drop=True)
        row_found = False
        row_id = row[0]

        # Extract gold standard labels
        row_labels = [int(row[i]) for i in range(1, len(row))]
        gold_standard_labels.append(row_labels)

        # Find corresponding submission row
        for sub_index, submission_row in submission_df_copy.iterrows():
            if submission_row[0].strip() == row_id.strip():
                try:
                    # Extract submission labels
                    submission_row_labels = [int(submission_row[i]) for i in range(1, len(submission_row))]
                except:
                    # Handle malformed labels (take first character if multi-digit)
                    submission_row_labels = [int(str(submission_row[i])[0]) for i in range(1, len(submission_row))]

                submission_labels.append(submission_row_labels)
                row_found = True
                submission_df_copy.drop(sub_index, inplace=True)
                break

        if not row_found:
            # If row is missing, add inverse labels (worst possible prediction)
            missed_rows.append(row_id)
            submission_labels.append([0 if label == 1 else 1 for label in row_labels])

    return gold_standard_labels, submission_labels, missed_rows


def evaluate_submission(gold_standard_labels, submission_labels):
    """
    Calculate weighted F1 score.
    """
    print(f"\nCalculating weighted F1 score...")

    # Calculate weighted F1 score (accounts for class imbalance)
    f1_weighted = f1_score(gold_standard_labels, submission_labels, average='weighted')

    return f1_weighted


def print_results(user_id, f1_weighted, missed_rows):
    """
    Print evaluation results to screen.
    """
    print("\n" + "="*70)
    print("YOUR SUBMISSION EVALUATION REPORT")
    print("="*70)

    # Alert if ID not found in filename
    if user_id == 'Unknown':
        print('WARNING: ID not found in filename!')
        print('   Please ensure your filename contains your 8-digit student ID.')
        print()

    print(f"Your ID: {user_id}")
    print()

    # Display F1 score with visual indicator
    print("EVALUATION RESULTS:")
    print(f"   Weighted F1 Score: {f1_weighted:.4f}")
    print()

    # Report missing rows
    if missed_rows:
        print(f"MISSING DATA ({len(missed_rows)} rows not found):")
        print("-" * 70)
        for i, row in enumerate(missed_rows[:10], 1):  # Show first 10
            print(f"    {i}. Row ID: {row}")
        if len(missed_rows) > 10:
            print(f"    ... and {len(missed_rows) - 10} more missing rows")
        print()
        print("TIP: Make sure your submission includes all required rows.")
        print("        Missing rows are penalized with worst possible predictions.")
    else:
        print("DATA COMPLETENESS: All expected rows found in your submission!")

    print()
    print("="*70)
    print()


def evaluate(submission_path, gold_standard_path):
    """
    Main function to run the submission evaluation script.
    """

    submission_file = submission_path
    gold_standard_file = gold_standard_path

    # Check if files exist
    if not os.path.exists(submission_file):
        print(f"Error: Your submission file '{submission_file}' not found!")
        print("Make sure the file path is correct and the file exists.")
        sys.exit(1)

    if not os.path.exists(gold_standard_file):
        print(f"Error: Gold standard file '{gold_standard_file}' not found!")
        print("Make sure you have the correct gold standard file.")
        sys.exit(1)

    try:
        # Step 1: Read data
        submission_df, gold_standard_df, user_id = read_data(submission_file, gold_standard_file)

        # Step 2: Match and prepare data
        gold_standard_labels, submission_labels, missed_rows = match_and_prepare_data(
            submission_df, gold_standard_df, user_id
        )

        # Step 3: Evaluate
        f1_weighted = evaluate_submission(gold_standard_labels, submission_labels)

        # Step 4: Print results
        print_results(user_id, f1_weighted, missed_rows)

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        print("Please check that your files are in the correct CSV format.")
        print("Each row should contain: ID, label1, label2, label3, ...")
        import traceback
        traceback.print_exc()
        sys.exit(1)

### Evaluate the model on the validation dataset

In [31]:
# Please run the evaluation scripts cell above before running the mark_and_record

# Please make sure that output format is like following (no header row, no tilte and plot columns):
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

evaluation_results = evaluate(VALIDATION_SET_OUTPUT, VALIDATION_SET)

Error: Your submission file './data/your_student_id_CW2_task3_validation_results.csv' not found!
Make sure the file path is correct and the file exists.


SystemExit: 1

### Save predictions to formatted file.

In [35]:
# Now please modify the code to format your output csv file.

# Please make sure that output format is like following (no header row, no tilte and plot columns):
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

test_df = pd.read_csv(TEST_SET_INPUT)
test_preds = predict_dataframe(test_df, model, tokenizer)

output_df = pd.DataFrame(test_preds, columns=GENRE_COLS)
output_df.insert(0, 'ID', test_df['ID'])

# For example, if you have a DataFrame named 'output_df', you can save it
assert isinstance(output_df, pd.DataFrame)
assert len(output_df) == NUM_ROWS_TEST, "Output length is not aligned with the testdata.csv."
assert len(output_df.columns) == 9, "Please make sure to follow the format above and keep only IDs and 8 columns of prediction."
output_df.to_csv(f'./data/{STUDENT_ID}_CW2_task3_results.csv', index=False, header=False)