# Task 3: Enhanced Multi-Label Movie Genre Classifier
## Flat notebook script: top-level code only (no helper functions; the model components are classes)

## Constants

In [None]:
# Pretrained checkpoint name; used for both the tokenizer and the encoder backbone.
MODEL_NAME = 'roberta-base'

# Label columns, in the order the model emits them and the submission writes them.
GENRE_NAMES = ['comedy', 'cult', 'flashback', 'historical', 'revenge', 'romantic', 'scifi', 'violence']
# Derived from GENRE_NAMES so the classifier width can never drift from the label list.
NUM_LABELS = len(GENRE_NAMES)

# Tokenization / architecture
MAX_LENGTH = 256            # token budget per example (longer inputs are truncated)
CONCAT_LAST_N_LAYERS = 4    # number of final encoder layers whose pooled vectors are concatenated
DROPOUT = 0.3

# Optimisation
BATCH_SIZE = 16
LEARNING_RATE = 2e-5        # head learning rate; the backbone uses a tenth of this
NUM_EPOCHS = 5
WARMUP_RATIO = 0.1          # fraction of all optimizer steps spent in warmup
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0         # gradient-norm clipping ceiling

# Focal loss hyper-parameters
FOCAL_ALPHA = 0.25
FOCAL_GAMMA = 2.0

# File locations
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
MODEL_SAVE_PATH = 'best_model_task3.pt'

RANDOM_SEED = 42

## Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from tqdm.auto import tqdm
import warnings
import random

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed the Python, NumPy and PyTorch RNGs with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## Data Loading

In [None]:
# Read the labelled training table and show a quick preview.
train_df = pd.read_csv(TRAIN_PATH)
print(f"Training samples: {len(train_df)}")
print(train_df.head())

# Positive count and share per genre, to make the class imbalance visible.
for genre in GENRE_NAMES:
    count = train_df[genre].sum()
    percentage = (count / len(train_df)) * 100
    print(f"{genre:12s}: {count:5d} ({percentage:.1f}%)")

# Replace missing titles/plots with '' before concatenating: NaN + str is NaN,
# which would otherwise turn the whole example into the literal text 'nan'
# and silently drop the field that *was* present.
titles = train_df['title'].fillna('').astype(str)
plots = train_df['plot'].fillna('').astype(str)
# NOTE(review): ' [SEP] ' is kept verbatim because the test cell builds its text
# the same way; confirm whether the tokenizer's own separator token was intended.
texts = (titles + ' [SEP] ' + plots).values
labels = train_df[GENRE_NAMES].values

# Hold out 15% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.15, random_state=RANDOM_SEED)
print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}")

## Tokenization

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")

In [None]:
class MovieGenreDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of per-example label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return ``'input_ids'`` and ``'attention_mask'`` entries
            with a leading dimension of size 1.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length: int) -> None:
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for example ``idx``."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # Drop the size-1 leading dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # torch.tensor with an explicit dtype replaces the legacy
            # torch.FloatTensor constructor (which, for example, treats a bare
            # int as a size rather than a value). It always copies.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float32),
        }

# Wrap each split in a dataset that tokenizes lazily, then batch it.
train_dataset = MovieGenreDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_length=MAX_LENGTH)
val_dataset = MovieGenreDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_length=MAX_LENGTH)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(f"Dataloaders created: {len(train_loader)} train batches, {len(val_loader)} val batches")

## Model

In [None]:
class FocalLoss(nn.Module):
    """Focal-style loss on logits: BCE scaled by ``alpha * (1 - pt) ** gamma``, averaged.

    ``pt`` is recovered as ``exp(-BCE)`` per element. Note that ``alpha`` is
    applied as one uniform factor to every element (positives and negatives
    alike), not as a class-dependent weight.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss of raw ``inputs`` (logits) against ``targets``."""
        elementwise_bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        prob_of_target = torch.exp(-elementwise_bce)
        # Down-weights elements the model already gets right (pt close to 1).
        focusing = (1 - prob_of_target) ** self.gamma
        weighted = self.alpha * focusing * elementwise_bce
        return weighted.mean()

class MultiHeadAttentionPooling(nn.Module):
    """Collapse a sequence of hidden states into one vector per example.

    A single learned query attends over the sequence with multi-head attention;
    the one-step attention output is the pooled representation.
    """

    def __init__(self, hidden_size, num_heads=8):
        super().__init__()
        # Creation order (attention first, then query) is kept so that seeded
        # initialisation draws the same random numbers as before.
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)
        self.query = nn.Parameter(torch.randn(1, 1, hidden_size))

    def forward(self, hidden_states, attention_mask=None):
        """Return the pooled vectors, one row per example in ``hidden_states``."""
        batch_size = hidden_states.size(0)
        batched_query = self.query.expand(batch_size, -1, -1)
        # key_padding_mask marks positions to *ignore*, so the incoming mask is
        # inverted: entries that are zero in attention_mask become True here.
        padding_mask = None if attention_mask is None else ~attention_mask.bool()
        pooled, _ = self.attention(
            batched_query,
            hidden_states,
            hidden_states,
            key_padding_mask=padding_mask,
        )
        # The query has length 1, so drop that singleton sequence dimension.
        return pooled.squeeze(1)

class EnhancedTransformerClassifier(nn.Module):
    """Pretrained encoder plus an attention-pooled, multi-layer classification head.

    The last ``concat_last_n_layers`` hidden-state tensors of the encoder are
    each pooled by the same attention-pooling module, concatenated, passed
    through dropout and layer norm, and mapped to ``num_labels`` logits by a
    two-layer MLP.
    """

    def __init__(self, model_name, num_labels, concat_last_n_layers=4, dropout=0.3):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.concat_last_n_layers = concat_last_n_layers

        # One pooled vector per selected layer, laid side by side.
        feature_size = self.hidden_size * concat_last_n_layers
        bottleneck_size = feature_size // 2

        # Sub-modules are created in the original order and under the original
        # attribute names, so seeded init and state_dict keys are unchanged.
        self.attention_pooling = MultiHeadAttentionPooling(self.hidden_size, num_heads=8)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(feature_size)
        self.classifier = nn.Sequential(
            nn.Linear(feature_size, bottleneck_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(bottleneck_size, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no sigmoid applied) for a batch."""
        encoder_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        pooled_per_layer = []
        for layer_states in encoder_out.hidden_states[-self.concat_last_n_layers:]:
            pooled_per_layer.append(self.attention_pooling(layer_states, attention_mask))
        features = torch.cat(pooled_per_layer, dim=-1)
        # Dropout is applied before normalisation, as in the original ordering.
        features = self.dropout(features)
        features = self.layer_norm(features)
        return self.classifier(features)

# Instantiate the classifier and move it to the selected device.
model = EnhancedTransformerClassifier(
    model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    concat_last_n_layers=CONCAT_LAST_N_LAYERS,
    dropout=DROPOUT,
)
model.to(device)

# Gather (size, trainable?) once, then report total and trainable counts.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_counts)
trainable_params = sum(size for size, is_trainable in param_counts if is_trainable)
print(f"Total params: {total_params:,}")
print(f"Trainable params: {trainable_params:,}")
print(f"Under 600M: {trainable_params < 600_000_000}")

## Training Setup

In [None]:
criterion = FocalLoss(FOCAL_ALPHA, FOCAL_GAMMA)

# Backbone parameters whose names contain one of these fragments are exempt from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
backbone_lr = LEARNING_RATE * 0.1  # the pretrained encoder trains 10x slower than the head

backbone_decay, backbone_no_decay = [], []
for param_name, param in model.transformer.named_parameters():
    if any(nd in param_name for nd in no_decay):
        backbone_no_decay.append(param)
    else:
        backbone_decay.append(param)

# Everything outside the encoder (pooling, layer norm, classifier) forms the head group.
# NOTE(review): all head parameters, biases and layer-norm weights included, receive
# weight decay here - confirm that this is intended.
head_params = [param for param_name, param in model.named_parameters() if 'transformer' not in param_name]

optimizer_params = [
    {'params': backbone_decay, 'weight_decay': WEIGHT_DECAY, 'lr': backbone_lr},
    {'params': backbone_no_decay, 'weight_decay': 0.0, 'lr': backbone_lr},
    {'params': head_params, 'weight_decay': WEIGHT_DECAY, 'lr': LEARNING_RATE},
]
optimizer = torch.optim.AdamW(optimizer_params)

# The schedule is measured in optimizer steps: one step per training batch, for every epoch.
num_training_steps = len(train_loader) * NUM_EPOCHS
num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

print(f"Optimizer and scheduler ready")
print(f"Training steps: {num_training_steps}, Warmup: {num_warmup_steps}")

## Training

In [None]:
# Start below any reachable score so the first epoch always writes a checkpoint,
# even when its micro F1 is exactly 0 (no probability above 0.5 yet). With a
# starting value of 0 and a strict '>' test, such a run would never save a
# model and the later reload cell would fail or pick up a stale file.
best_f1 = -1.0
history = {'train_loss': [], 'val_loss': [], 'micro_f1': [], 'macro_f1': []}

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-" * 70)

    # TRAINING
    # Only the loss is tracked here; the per-step train predictions that used to
    # be copied to the CPU were never read, so that work has been removed.
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        # The schedule length is counted in batches, so it advances every step.
        scheduler.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # VALIDATION
    model.eval()
    total_val_loss = 0
    val_preds_list = []
    val_labels_list = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_val_loss += loss.item()

            preds = torch.sigmoid(logits).cpu().numpy()
            val_preds_list.append(preds)
            val_labels_list.append(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_preds = np.vstack(val_preds_list)
    val_labels = np.vstack(val_labels_list)

    # METRICS (fixed 0.5 cut-off; per-genre thresholds are tuned after training)
    val_preds_binary = (val_preds > 0.5).astype(int)
    micro_f1 = f1_score(val_labels, val_preds_binary, average='micro')
    macro_f1 = f1_score(val_labels, val_preds_binary, average='macro')
    samples_f1 = f1_score(val_labels, val_preds_binary, average='samples')
    hamming = hamming_loss(val_labels, val_preds_binary)

    history['train_loss'].append(avg_train_loss)
    history['val_loss'].append(avg_val_loss)
    history['micro_f1'].append(micro_f1)
    history['macro_f1'].append(macro_f1)

    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}")
    print(f"Micro F1: {micro_f1:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Samples F1: {samples_f1:.4f}")
    print(f"Hamming: {hamming:.4f}")

    per_class_f1 = f1_score(val_labels, val_preds_binary, average=None)
    for i, genre in enumerate(GENRE_NAMES):
        print(f"  {genre:12s}: {per_class_f1[i]:.4f}")

    # Keep only the checkpoint with the best validation micro F1.
    if micro_f1 > best_f1:
        best_f1 = micro_f1
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'micro_f1': micro_f1,
            'val_preds': val_preds,
            'val_labels': val_labels
        }, MODEL_SAVE_PATH)
        # The check mark was mis-encoded in the original message; restored here.
        print(f"✓ Saved best model (F1: {best_f1:.4f})")

print(f"\nTraining done! Best F1: {best_f1:.4f}")

## Finding Best Thresholds

In [None]:
# Reload the best checkpoint written by the training loop.
# - weights_only=False: the file also holds NumPy arrays ('val_preds',
#   'val_labels'), which the weights-only unpickler (the default from
#   PyTorch 2.6 on) refuses to load. The file is produced by this notebook,
#   so full unpickling is acceptable; do not point this at untrusted files.
# - map_location=device: lets a checkpoint saved on GPU be opened on a
#   CPU-only machine.
checkpoint = torch.load(MODEL_SAVE_PATH, map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

val_preds_list = []
val_labels_list = []

# Recompute validation probabilities with the restored weights.
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Getting predictions'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']  # stays on the CPU; only needed as a NumPy array

        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()

        val_preds_list.append(probs)
        val_labels_list.append(labels.numpy())

# Note: y_val is rebound here to the label matrix in loader order.
probs_val = np.vstack(val_preds_list)
y_val = np.vstack(val_labels_list)

print(f"Probs shape: {probs_val.shape}")
print(f"Labels shape: {y_val.shape}")

In [None]:
# Per-genre threshold search: for every label column, try 101 evenly spaced
# cut-offs in [0, 1] and keep the one with the highest validation F1.
# Ties keep the lowest threshold (strict '>'); a column whose F1 is never
# above 0 keeps the 0.5 default.
thresholds = np.linspace(0, 1, 101)
best_thresholds = []

print("Finding optimal thresholds...\n")

for col in range(probs_val.shape[1]):
    # Column-local names: reusing `best_f1` here would overwrite the best
    # validation score recorded by the training loop.
    col_best_f1 = 0
    col_best_thr = 0.5

    for thr in thresholds:
        preds = (probs_val[:, col] >= thr).astype(int)
        f1 = f1_score(y_val[:, col], preds, zero_division=0)

        if f1 > col_best_f1:
            col_best_f1 = f1
            col_best_thr = thr

    best_thresholds.append(col_best_thr)
    print(f"{GENRE_NAMES[col]:12s}: threshold = {col_best_thr:.2f}, F1 = {col_best_f1:.4f}")

best_thresholds = np.array(best_thresholds)
print(f"\nBEST THRESHOLDS:")
print(best_thresholds)

In [None]:
# Compare the tuned per-genre thresholds against the fixed 0.5 cut-off.
# Both are scored on the same validation data the thresholds were tuned on.

# Baseline: one global cut-off.
val_preds_default = (probs_val > 0.5).astype(int)

# Tuned: binarise each genre column with its own threshold.
val_preds_optimized = np.zeros_like(probs_val)
for genre_idx, genre_threshold in enumerate(best_thresholds):
    above_cutoff = probs_val[:, genre_idx] >= genre_threshold
    val_preds_optimized[:, genre_idx] = above_cutoff.astype(int)

micro_f1_default = f1_score(y_val, val_preds_default, average='micro')
macro_f1_default = f1_score(y_val, val_preds_default, average='macro')

micro_f1_opt = f1_score(y_val, val_preds_optimized, average='micro')
macro_f1_opt = f1_score(y_val, val_preds_optimized, average='macro')
samples_f1_opt = f1_score(y_val, val_preds_optimized, average='samples')

print(f"\nDefault (0.5): Micro F1 = {micro_f1_default:.4f}, Macro F1 = {macro_f1_default:.4f}")
print(f"Optimized: Micro F1 = {micro_f1_opt:.4f}, Macro F1 = {macro_f1_opt:.4f}")
print(f"Improvement: {micro_f1_opt - micro_f1_default:+.4f}")

# Store the thresholds next to the weights by rewriting the checkpoint file.
checkpoint['best_thresholds'] = best_thresholds
torch.save(checkpoint, MODEL_SAVE_PATH)
print(f"Thresholds saved")

## Prediction

In [None]:
# Load test data
test_df = pd.read_csv(TEST_PATH)
print(f"Test samples: {len(test_df)}")

# Prepare test texts.
# Missing titles/plots become '' first: NaN + str is NaN, which would turn the
# whole example into the literal text 'nan' and drop the field that was present.
test_titles = test_df['title'].fillna('').astype(str)
test_plots = test_df['plot'].fillna('').astype(str)
test_texts = (test_titles + ' [SEP] ' + test_plots).values
# The dataset class requires labels; the test set has none, so feed zeros.
test_dummy_labels = np.zeros((len(test_texts), NUM_LABELS))

# Create test dataset and loader (order must be preserved for the submission)
test_dataset = MovieGenreDataset(test_texts, test_dummy_labels, tokenizer, MAX_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Test loader ready: {len(test_loader)} batches")

In [None]:
# Get predictions
model.eval()
all_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        all_probs.append(probs)

test_probabilities = np.vstack(all_probs)
print(f"Probabilities shape: {test_probabilities.shape}")

# Apply the per-genre thresholds.
# The result is an *integer* 0/1 matrix: with the float dtype inherited from
# the probabilities, the later `{count:5d}` statistics line raises ValueError
# and the submission file is written with 0.0/1.0 instead of 0/1.
test_predictions = np.zeros_like(test_probabilities, dtype=int)
for i, threshold in enumerate(best_thresholds):
    test_predictions[:, i] = (test_probabilities[:, i] >= threshold).astype(int)

print(f"Predictions shape: {test_predictions.shape}")

In [None]:
# Create submission
# Cast to int so the label columns are written as 0/1 rather than 0.0/1.0
# (a no-op when the predictions are already integers).
submission_df = pd.DataFrame(test_predictions.astype(int), columns=GENRE_NAMES)
# Carry the row identifier through when the test file provides one.
if 'id' in test_df.columns:
    submission_df.insert(0, 'id', test_df['id'].values)

submission_df.to_csv('submission_task3.csv', index=False)
print("Submission saved: submission_task3.csv")
print("\nFirst 10 predictions:")
print(submission_df.head(10))

In [None]:
# Statistics
print("\nPrediction statistics:")
for i, genre in enumerate(GENRE_NAMES):
    # int(...) is required: summing a float prediction column yields a NumPy
    # float, and the ':5d' format below raises ValueError for floats.
    count = int(test_predictions[:, i].sum())
    percentage = (count / len(test_predictions)) * 100
    avg_prob = test_probabilities[:, i].mean()
    print(f"{genre:12s}: {count:5d} ({percentage:5.1f}%) | Avg prob: {avg_prob:.3f}")

# Number of predicted genres per movie, reported as whole numbers.
genres_per_movie = test_predictions.sum(axis=1).astype(int)
print(f"\nGenres per movie: Min={genres_per_movie.min()}, Max={genres_per_movie.max()}, Mean={genres_per_movie.mean():.2f}")