# Task B Training - Model Attribution (12-class)

This notebook trains an improved model for Task B using:
- TF-IDF features (character + word n-grams)
- Cosine similarity to class centroids
- AST-based structural features
- XGBoost with GPU acceleration

**Target:** Improve Macro F1 from ~20% to 40-55%

## 1. Setup (Colab)

In [None]:
# Check if running in Colab
# Environment bootstrap: inside Google Colab this cell mounts Drive, fetches the
# repository, installs packages and stages the parquet files; elsewhere it only
# prints a message.
import sys
import os
# Colab is detected by the `google.colab` module already being loaded in this
# interpreter session.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Clone repo (only if not already cloned)
    # The existence check keeps a re-run of this cell from cloning into an
    # already-populated directory.
    if not os.path.exists('/content/semeval26-task13'):
        !git clone https://github.com/kayceenguyenn/semeval26-task13.git /content/semeval26-task13
    
    # Always change to repo directory (use absolute path)
    # The relative paths used below and in later cells (data/, models/, src)
    # are resolved against this directory.
    %cd /content/semeval26-task13
    
    # Install only the packages we need (without strict versions to avoid Colab conflicts)
    !pip install -q xgboost loguru tqdm pydantic pydantic-settings
    
    # Copy data from Google Drive
    # stderr from cp is discarded; when cp fails for any reason the echo
    # fallback prints a generic message instead of the real error.
    !mkdir -p data
    !cp /content/drive/MyDrive/semeval-data/*.parquet data/ 2>/dev/null || echo "Data already copied or not found in Drive"
    
    print("Running in Google Colab")
    print(f"Working directory: {os.getcwd()}")
else:
    # No setup is performed outside Colab.
    print("Running locally")

In [None]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add src to path
# Uses ./src when it exists relative to the current working directory and
# falls back to ../src otherwise.
sys.path.insert(0, 'src' if Path('src').exists() else '../src')

# NOTE(review): these are presumably the project modules found through the
# path entry inserted above — confirm they live under src/.
from data_loader import TaskDataLoader
from features import (
    extract_features_from_dataframe,
    fit_tfidf_pipeline,
    save_fitted_state,
    load_fitted_state,
)
from models import get_model
from evaluate import evaluate, print_results

# Report the installed XGBoost version. This only verifies that the package
# imports; it does not probe for a GPU.
try:
    import xgboost as xgb
    print(f"XGBoost version: {xgb.__version__}")
except ImportError:
    # Only a message is printed; nothing is raised here, so execution of the
    # notebook continues past this cell.
    print("XGBoost not installed!")

# Set random seed
# This seeds NumPy's global random generator only.
np.random.seed(42)

## 2. Load Data

In [None]:
# Task B loader; the same object is reused by the optional test-set cell
loader = TaskDataLoader(task='B')

# Training split
print("Loading training data...")
train_df = loader.load_split('train')
train_size = len(train_df)
print(f"Train samples: {train_size:,}")

# Validation split
print("\nLoading validation data...")
val_df = loader.load_split('validation')
val_size = len(val_df)
print(f"Validation samples: {val_size:,}")

# Per-label row counts of the training split, ordered by label value
train_label_counts = train_df['label'].value_counts().sort_index()
print("\nLabel distribution (train):")
print(train_label_counts)

In [None]:
# Sample for faster iteration (optional)
SAMPLE_SIZE = None  # Set to e.g. 50000 for faster testing, None for full dataset


def sample_training_frame(df, sample_size, random_state=42):
    """Return at most `sample_size` randomly chosen rows of `df`.

    The request is capped at the number of rows available, because
    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    the frame holds (sampling is done without replacement).

    Args:
        df: Frame to draw rows from.
        sample_size: Requested number of rows.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs
            pick the same rows.

    Returns:
        A new frame with ``min(sample_size, len(df))`` rows.
    """
    rows_to_draw = min(sample_size, len(df))
    return df.sample(n=rows_to_draw, random_state=random_state)


# A falsy SAMPLE_SIZE (None or 0) leaves the training frame untouched.
if SAMPLE_SIZE:
    print(f"Sampling {SAMPLE_SIZE:,} training samples for faster iteration...")
    train_df = sample_training_frame(train_df, SAMPLE_SIZE)
    print(f"Sampled train size: {len(train_df):,}")

## 3. Fit TF-IDF Pipeline

This fits the TF-IDF vectorizers and computes class centroids on training data.

In [None]:
# Fit TF-IDF on training data
# Only the training split is passed in; the validation frame is not used here.
train_codes = train_df['code'].tolist()
train_labels = train_df['label'].values

# No return value is captured. NOTE(review): the fitted state presumably lives
# inside the `features` module, since save_fitted_state() is later called with
# only a path — confirm.
fit_tfidf_pipeline(train_codes, train_labels)

In [None]:
# Save fitted state for later use
# The path is relative to the current working directory and is printed again
# by the model-saving cell.
# NOTE(review): whether save_fitted_state() creates the `models/` directory
# when it is missing cannot be seen from this notebook — confirm.
FITTED_STATE_PATH = 'models/tfidf_state'
save_fitted_state(FITTED_STATE_PATH)

## 4. Extract Features

In [None]:
# Extract features from training data (with TF-IDF)
print("Extracting training features...")
X_train = extract_features_from_dataframe(train_df, include_tfidf=True)
y_train = train_df['label'].values

print(f"\nTraining features shape: {X_train.shape}")

# Preview the first few column names. The remainder is clamped at zero so a
# frame with fewer columns than the preview size never reports a negative
# "more" count.
preview_count = 10
feature_names = list(X_train.columns)
remaining = max(len(feature_names) - preview_count, 0)
print(f"Feature columns: {feature_names[:preview_count]}... (and {remaining} more)")

In [None]:
# Extract features from validation data
# Same call as for the training frame (include_tfidf=True); no fitting call is
# made on the validation split in this notebook.
print("Extracting validation features...")
X_val = extract_features_from_dataframe(val_df, include_tfidf=True)
y_val = val_df['label'].values

print(f"Validation features shape: {X_val.shape}")

## 5. Train XGBoost Model

In [None]:
# Hyperparameters handed to the project's model factory
xgb_params = {
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.1,
    'use_gpu': True,  # Use GPU if available
}
model = get_model('xgboost', **xgb_params)

# Fit on the training features and pass the validation split along as well.
# NOTE(review): the original comment says the validation set is used for early
# stopping — confirm in the model wrapper.
model.fit(X_train, y_train, X_val=X_val, y_val=y_val)

## 6. Evaluate

In [None]:
# Evaluate on validation set
# `results` is read again by the comparison summary cell, which indexes it
# with the 'macro_f1' key.
# NOTE(review): what detailed=True adds to the output is defined by the model
# wrapper and is not visible here — confirm.
results = model.evaluate(X_val, y_val, detailed=True)

In [None]:
# Show feature importance
# Passes the training frame's column names and asks for the top 25 entries.
# As the last expression of the cell, any returned value is displayed by the
# notebook.
model.get_feature_importance(list(X_train.columns), top_n=25)

## 7. Save Model

In [None]:
# Save trained model
# The path is relative to the current working directory.
MODEL_PATH = 'models/task_B_xgboost.pkl'
model.save(MODEL_PATH)

print(f"\nModel saved to: {MODEL_PATH}")
# This line only echoes the path set in the earlier TF-IDF cell; the state
# itself was written there, not here.
print(f"TF-IDF state saved to: {FITTED_STATE_PATH}")

In [None]:
# Copy to Google Drive (if in Colab)
if IN_COLAB:
    !cp -r models /content/drive/MyDrive/semeval-models/
    print("Models copied to Google Drive")

## 8. Compare with Baseline

In [None]:
# Baseline run for comparison: the same frames, but with TF-IDF features switched off
print("Training baseline Random Forest for comparison...")
print("(Using only basic + keyword + AST features)")

# Build the reduced feature matrices, training frame first and validation second
X_train_basic, X_val_basic = (
    extract_features_from_dataframe(frame, include_tfidf=False)
    for frame in (train_df, val_df)
)

# Random forest from the project's model factory with its default settings
baseline = get_model('random_forest')
baseline.fit(X_train_basic, y_train)

print("\nBaseline Results:")
baseline_results = baseline.evaluate(X_val_basic, y_val, detailed=True)

In [None]:
# Summary comparison
separator = "=" * 70
baseline_f1 = baseline_results['macro_f1']
xgboost_f1 = results['macro_f1']

print("\n" + separator)
print("COMPARISON SUMMARY")
print(separator)
print(f"\nBaseline (RF, {X_train_basic.shape[1]} features):  {baseline_f1:.4f} Macro F1")
print(f"XGBoost  ({X_train.shape[1]} features):  {xgboost_f1:.4f} Macro F1")

# Difference between the two Macro F1 scores, scaled by 100. The "+" format
# flag emits the correct sign, so a regression prints as "-1.23%" rather than
# the "+-1.23%" a hard-coded plus sign would produce.
f1_delta = (xgboost_f1 - baseline_f1) * 100
print(f"\nImprovement: {f1_delta:+.2f}%")
print(separator)

## 9. Make Predictions on Test Set (Optional)

In [None]:
# Load test data (if available)
# Only the load itself is guarded: a failure there is reported as "test data
# not available" and the cell ends quietly. Errors in feature extraction,
# prediction or writing the CSV are real bugs and are allowed to surface
# instead of being mislabelled as missing data.
SUBMISSION_PATH = 'results/predictions/task_B_submission.csv'

try:
    test_df = loader.load_split('test')
except Exception as e:
    # Broad on purpose: the exception type raised by the loader for a missing
    # split is not visible from this notebook.
    print(f"Test data not available: {e}")
else:
    print(f"Test samples: {len(test_df):,}")

    # Extract features
    X_test = extract_features_from_dataframe(test_df, include_tfidf=True)

    # Make predictions
    test_preds = model.predict(X_test)

    # Create submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'label': test_preds,
    })

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    Path(SUBMISSION_PATH).parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(SUBMISSION_PATH, index=False)
    print(f"\nSubmission saved to {SUBMISSION_PATH}")

## Next Steps

To further improve performance:

1. **Hyperparameter tuning**: Try different `n_estimators`, `max_depth`, `learning_rate`
2. **More TF-IDF features**: Increase `max_features` in vectorizers
3. **LightGBM**: Often faster, with comparable accuracy
4. **Ensemble**: Combine XGBoost with Random Forest
5. **Transformers**: Fine-tune CodeBERT for maximum performance