# 🤖 Baseline Model Training - SemEval 2026 Task 13

**Goal:** Train and compare baseline ML models

**Level:** ⭐⭐ Intermediate (1-2 hours)

**What you'll learn:**
- Feature extraction from code
- Train multiple ML models
- Compare model performance
- Understand baseline results

import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from src.features import extract_features_from_dataframe

# Seed both NumPy's and Python's global random number generators with the
# same fixed value so repeated runs of the notebook are reproducible.
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Status messages; the emoji were stored mis-decoded and are restored here.
print("✅ Libraries loaded!")
print(f"🔒 Random seed: {SEED}")

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from src.features import extract_features_from_dataframe

# Confirm that the imports in this cell completed (emoji restored from its
# mis-decoded form).
print("✅ Libraries loaded!")

## 2. Load Data

In [None]:
# Read the training and validation splits from their Parquet files
train_path = '../data/train_A.parquet'
val_path = '../data/validation_A.parquet'
train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

# Report how many rows each split contains
n_train, n_val = len(train_df), len(val_df)
print(f"Training: {n_train} samples")
print(f"Validation: {n_val} samples")

## 3. Extract Features

In [None]:
# Turn both splits into feature matrices (training split first, then validation)
print("Extracting features...")
X_train, X_val = (
    extract_features_from_dataframe(split) for split in (train_df, val_df)
)

# Targets come from the 'label' column of each split
y_train, y_val = train_df['label'].values, val_df['label'].values

# Summarise the feature matrix and preview the first ten column names
feature_names = list(X_train.columns)
print(f"Feature shape: {X_train.shape}")
print(f"Number of features: {X_train.shape[1]}")
print(f"\nFeature names: {feature_names[:10]}...")

## 4. Train Models

In [None]:
# Baseline classifiers to compare, keyed by the name shown in reports and plots
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}


def _macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true, y_pred, average='macro')


# Per-model scores, fitted estimator and validation predictions,
# collected here so that later cells can reuse them
results = {}
separator = '=' * 60

for model_name, estimator in models.items():
    print(f"\n{separator}")
    print(f"Training {model_name}...")
    print(separator)

    # Fit on the training split
    estimator.fit(X_train, y_train)

    # Predict on both splits so train and validation scores can be compared
    train_predictions = estimator.predict(X_train)
    val_predictions = estimator.predict(X_val)

    scores = {
        'train_f1': _macro_f1(y_train, train_predictions),
        'val_f1': _macro_f1(y_val, val_predictions),
    }
    results[model_name] = {
        **scores,
        'model': estimator,
        'predictions': val_predictions,
    }

    print(f"Train F1: {scores['train_f1']:.4f}")
    print(f"Val F1:   {scores['val_f1']:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, val_predictions, target_names=['Human', 'AI']))

## 5. Compare Models

In [None]:
# One row per trained model with its train and validation macro F1
comparison_df = pd.DataFrame(
    [
        {'Model': model_name, 'Train F1': entry['train_f1'], 'Val F1': entry['val_f1']}
        for model_name, entry in results.items()
    ],
    columns=['Model', 'Train F1', 'Val F1'],
)

print(comparison_df)

# Grouped bar chart: for each model, the train bar sits to the left of its
# tick position and the validation bar to the right
fig, ax = plt.subplots(figsize=(10, 6))
positions = np.arange(len(comparison_df))
bar_width = 0.35

bar_specs = (
    ('Train F1', -bar_width / 2, '#66bb6a'),
    ('Val F1', bar_width / 2, '#42a5f5'),
)
for column, shift, colour in bar_specs:
    ax.bar(positions + shift, comparison_df[column], bar_width, label=column, color=colour)

# Titles and axis labels
ax.set_title('Model Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Macro F1 Score', fontsize=12)

# One tick per model, labelled with the model name
ax.set_xticks(positions)
ax.set_xticklabels(comparison_df['Model'], rotation=15, ha='right')

ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Confusion Matrix (Best Model)

In [None]:
# Select the entry with the highest validation macro F1
best_model_name, best_entry = max(
    results.items(), key=lambda item: item[1]['val_f1']
)
best_predictions = best_entry['predictions']

print(f"Best Model: {best_model_name}")
print(f"Validation F1: {best_entry['val_f1']:.4f}")

# Confusion matrix of the best model's validation predictions
cm = confusion_matrix(y_val, best_predictions)
class_names = ['Human', 'AI']

# Annotated heatmap with the same class names on both axes
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.tight_layout()
plt.show()

## 7. Feature Importance (Random Forest)

In [None]:
# Pair every feature column with the importance reported by the fitted
# Random Forest, ordered from most to least important
rf_model = results['Random Forest']['model']
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
importance_ax = plt.gca()
importance_ax.barh(bar_positions, top_features['importance'], color='#66bb6a')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'])
importance_ax.set_xlabel('Importance', fontsize=12)
importance_ax.set_title('Top 15 Most Important Features', fontsize=14, fontweight='bold')
# Flip the y-axis so the highest-ranked feature is drawn at the top
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 10 Features:")
print(feature_importance.head(10))

## 8. Your Turn! 🎯

**Experiments to try:**
1. Tune hyperparameters (max_depth, n_estimators, etc.)
2. Try other models (SVM, XGBoost, LightGBM)
3. Feature selection (remove low-importance features)
4. Cross-validation for more robust evaluation

**Add your code below:**

In [None]:
# Your experiments here!


## 9. Key Takeaways

**Expected baseline performance (macro F1):**
- Logistic Regression: ~50-55% F1
- Random Forest: ~55-60% F1
- Gradient Boosting: ~58-62% F1

**To reach competitive performance (85-95% F1):**
- Add AST features (notebook 03)
- Use transformer models like CodeBERT (notebook 05)
- Ensemble multiple models

**Your observations:**
- 
- 
- 

---

## ✅ Next Steps

1. **Try hyperparameter tuning** - Can you beat the baseline?
2. **Move to notebook 03** - Add AST features for +10-15% F1
3. **Share your results** - Open a PR with your experiments

**Great work on training baseline models!** 🎉