# Static Yaw Challenge - Baseline Models

**Objective**: Build baseline classification models for yaw offset prediction

**Approach**:
1. Load and prepare segment-level features
2. Create a stratified train/validation split (the code below uses `train_test_split` with `stratify`; it does not yet match the test distribution)
3. Train baseline models (Random Forest, Logistic Regression)
4. Evaluate performance and establish benchmarks
5. Analyze feature importance
6. Generate predictions for test set

## 1. Setup and Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Custom modules
import sys
sys.path.append('../src')
from feature_engineering import create_segment_features, filter_production_mode
from validation import (create_distribution_matched_split, evaluate_classification,
                       calculate_class_weights, create_test_submission_template)

# Seed NumPy's global RNG; RANDOM_SEED is also passed explicitly to the
# train/validation split and to both estimators further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Configure plotting
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11

# Paths (relative to the directory the notebook is run from)
DATA_DIR = Path('../data/yaw_alignment_dataset')
RESULTS_DIR = Path('../results')
FIGURES_DIR = Path('../reports/figures')

# Create every output directory up front. Figures are written with savefig(),
# which does not create missing directories, so FIGURES_DIR must exist before
# the plotting cells run; parents=True also covers a missing '../reports'.
for output_dir in (RESULTS_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Environment ready!")

## 2. Load and Prepare Data

In [None]:
# Read the raw training table from DATA_DIR and report its size
print("Loading training data...")
train_path = DATA_DIR / 'train.parquet'
df_train = pd.read_parquet(train_path)
print(f"Training data shape: {df_train.shape}")

# Read the raw test table; it already carries a 'segment_id' column
print("\nLoading test data...")
test_path = DATA_DIR / 'test.parquet'
df_test = pd.read_parquet(test_path)
print(f"Test data shape: {df_test.shape}")
n_test_segments = df_test['segment_id'].nunique()
print(f"Unique segments: {n_test_segments}")

## 3. Feature Engineering - Segment Level

We'll focus on segment-level predictions (298 segments) rather than row-level.

In [None]:
# The training table has no segment ids, so fixed-size synthetic segments are
# cut from it. Only rows kept by filter_production_mode() are used.
print("Filtering training data to production mode (status 10)...")
df_train_prod = filter_production_mode(df_train)
print(f"Production mode data: {len(df_train_prod):,} rows ({len(df_train_prod)/len(df_train)*100:.1f}%)")

print("\nCreating synthetic 1-hour segments for training...")
SEGMENT_SIZE = 3600  # rows per segment (1 hour if the data is sampled at 1 Hz)

# Segments are cut separately for each yaw offset so that every segment has a
# single label. Trailing rows that do not fill a whole segment are discarded.
# NOTE(review): windows are consecutive *rows* of the filtered table; whether
# they are also contiguous in time depends on row order and on gaps left by
# the production-mode filter - confirm.
train_segments_list = []
for yaw in df_train_prod['yaw_offset'].unique():
    rows_for_yaw = df_train_prod.loc[df_train_prod['yaw_offset'] == yaw]
    full_segments = len(rows_for_yaw) // SEGMENT_SIZE
    window_starts = range(0, full_segments * SEGMENT_SIZE, SEGMENT_SIZE)
    train_segments_list.extend(
        # assign() returns a new frame, so the source table is left untouched
        rows_for_yaw.iloc[start:start + SEGMENT_SIZE].assign(
            segment_id=f'train_seg_{yaw}_{idx:05d}'
        )
        for idx, start in enumerate(window_starts)
    )

df_train_segments = pd.concat(train_segments_list, ignore_index=True)
print(f"Created {df_train_segments['segment_id'].nunique()} training segments")

# One label per segment, then the number of segments per label
print(f"Segment yaw distribution:")
yaw_per_segment = df_train_segments.groupby('segment_id')['yaw_offset'].first()
print(yaw_per_segment.value_counts().sort_index())

In [None]:
# Aggregate the synthetic training segments into one feature row per segment.
# NOTE(review): 'yaw_offset' is not listed in exclude_cols. If
# create_segment_features() aggregates every remaining column, target-derived
# features could leak into the model inputs - confirm against the helper.
print("Creating segment-level features for training data...")
train_feature_options = {
    'segment_col': 'segment_id',
    'exclude_cols': ['row_id', 'datetime'],
}
train_segment_features = create_segment_features(df_train_segments, **train_feature_options)

# Attach the label: the first yaw_offset value seen in each segment
yaw_by_segment = df_train_segments.groupby('segment_id')['yaw_offset'].first()
segment_targets = yaw_by_segment.reset_index()
train_segment_features = pd.merge(train_segment_features, segment_targets, on='segment_id')

print(f"\nSegment features shape: {train_segment_features.shape}")
# Two columns (segment_id and yaw_offset) are not features
print(f"Features created: {train_segment_features.shape[1] - 2}")

In [None]:
# Aggregate the test rows into one feature row per (pre-existing) segment_id
print("Creating segment-level features for test data...")
test_feature_options = {
    'segment_col': 'segment_id',
    'exclude_cols': ['row_id', 'segment_time'],
}
test_segment_features = create_segment_features(df_test, **test_feature_options)

print(f"Test segment features shape: {test_segment_features.shape}")
print(f"Number of test segments: {len(test_segment_features)}")

## 4. Prepare Features and Target

In [None]:
# Columns that identify a segment or carry the label are never model inputs
non_feature_cols = {'segment_id', 'yaw_offset'}
candidate_cols = [c for c in train_segment_features.columns
                  if c not in non_feature_cols]

# Train and test features are built with different exclude_cols, so the two
# tables are not guaranteed to share every column. Keep only features that
# exist in both; otherwise indexing the test table below raises KeyError.
# This also drops any feature derived from a train-only column.
test_columns = set(test_segment_features.columns)
feature_cols = [c for c in candidate_cols if c in test_columns]
dropped_cols = [c for c in candidate_cols if c not in test_columns]
if dropped_cols:
    print(f"Dropping {len(dropped_cols)} train-only features missing from test: "
          f"{dropped_cols[:10]}...")
if not feature_cols:
    raise ValueError("No feature columns are shared between the train and test feature tables")

print(f"Number of features: {len(feature_cols)}")
print(f"\nFeatures: {feature_cols[:10]}...")  # Show first 10

# Prepare training data
X_train_full = train_segment_features[feature_cols]
y_train_full = train_segment_features['yaw_offset'].values

# Prepare test data (same columns, same order as the training matrix)
X_test = test_segment_features[feature_cols]

print(f"\nTraining features shape: {X_train_full.shape}")
print(f"Test features shape: {X_test.shape}")

## 5. Train/Validation Split

In [None]:
# Hold out 20% of the training segments for validation, stratified on the label
from sklearn.model_selection import train_test_split

split_options = {
    'test_size': 0.2,
    'stratify': y_train_full,
    'random_state': RANDOM_SEED,
}
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, **split_options)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

# Report the class counts and percentages of each part of the split
for heading, labels in (
    ("\nTraining target distribution:", y_train),
    ("\nValidation target distribution:", y_val),
):
    print(heading)
    classes, class_counts = np.unique(labels, return_counts=True)
    for cls, cnt in zip(classes, class_counts):
        print(f"  {int(cls)}°: {cnt} ({cnt/len(labels)*100:.1f}%)")

## 6. Handle Class Imbalance

In [None]:
# Per-class weights computed by the project helper from the training labels
class_weights = calculate_class_weights(y_train)

# Re-key the mapping with plain ints before handing it to the sklearn estimators
class_weight_dict = dict(
    (int(label), weight) for label, weight in class_weights.items()
)

## 7. Baseline Model 1: Random Forest

In [None]:
print("Training Random Forest Classifier...")

# Baseline hyperparameters, kept in one place so they are easy to scan and tweak
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'class_weight': class_weight_dict,
    'random_state': RANDOM_SEED,
    'n_jobs': -1,
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training part of the split only; validation rows stay unseen
rf_model.fit(X_train, y_train)
print("Training complete!")

In [None]:
# Score the fitted Random Forest on the held-out validation segments
rule = "=" * 70
y_val_pred_rf = rf_model.predict(X_val)

print("\n" + rule)
print("RANDOM FOREST VALIDATION RESULTS")
print(rule)

# The returned metrics mapping is reused by the model-comparison cell
class_labels = ['0°', '4°', '6°']
rf_metrics = evaluate_classification(
    y_val,
    y_val_pred_rf,
    class_names=class_labels,
    print_results=True,
)

In [None]:
# Pair every feature name with the forest's importance score, largest first
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(feature_importance.head(20))

# Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'].values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'].values)
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Top 20 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # put the most important feature at the top
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Baseline Model 2: Logistic Regression

In [None]:
print("Training Logistic Regression...")

# Create pipeline with scaling (important for logistic regression). Keeping
# the scaler inside the pipeline means it is fitted on the training rows only
# and re-applied unchanged at predict time.
#
# multi_class is deliberately not passed: the parameter is deprecated in recent
# scikit-learn releases and scheduled for removal, and the warning is hidden
# here by the notebook-wide warnings filter. With the lbfgs solver and the
# three yaw classes used in this notebook, the default (since scikit-learn
# 0.22) already fits a multinomial model, so the fitted model is unchanged.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(
        max_iter=1000,
        class_weight=class_weight_dict,
        random_state=RANDOM_SEED,
        solver='lbfgs'
    ))
])

lr_pipeline.fit(X_train, y_train)
print("Training complete!")

In [None]:
# Score the fitted logistic-regression pipeline on the held-out validation segments
rule = "=" * 70
y_val_pred_lr = lr_pipeline.predict(X_val)

print("\n" + rule)
print("LOGISTIC REGRESSION VALIDATION RESULTS")
print(rule)

# The returned metrics mapping is reused by the model-comparison cell
class_labels = ['0°', '4°', '6°']
lr_metrics = evaluate_classification(
    y_val,
    y_val_pred_lr,
    class_names=class_labels,
    print_results=True,
)

## 9. Model Comparison

In [None]:
# Side-by-side table of validation metrics for the two baselines.
# Each table column is read from the same key in both metric mappings.
metric_keys = {
    'Accuracy': 'accuracy',
    'F1 (Macro)': 'f1_macro',
    'F1 (Weighted)': 'f1_weighted',
    'F1 (0°)': 'f1_0°',
    'F1 (4°)': 'f1_4°',
    'F1 (6°)': 'f1_6°',
}
model_labels = ['Random Forest', 'Logistic Regression']
comparison = pd.DataFrame({
    'Model': model_labels,
    **{column: [rf_metrics[key], lr_metrics[key]]
       for column, key in metric_keys.items()},
})

rule = "=" * 70
print("\n" + rule)
print("MODEL COMPARISON")
print(rule)
print(comparison.to_string(index=False))

# Two grouped bar charts: overall metrics (left) and per-class F1 (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
width = 0.35
metrics_to_plot = ['Accuracy', 'F1 (Macro)', 'F1 (Weighted)']
class_metrics = ['F1 (0°)', 'F1 (4°)', 'F1 (6°)']

# (table columns, tick labels, x-axis label, y-axis label, title) per panel
panels = (
    (metrics_to_plot, metrics_to_plot, 'Metric', 'Score', 'Overall Model Performance'),
    (class_metrics, ['0°', '4°', '6°'], 'Yaw Offset Class', 'F1 Score', 'Per-Class F1 Scores'),
)

for ax, (columns, tick_labels, x_label, y_label, title) in zip(axes, panels):
    positions = np.arange(len(columns))
    # Row 0 of the table is Random Forest, row 1 is Logistic Regression
    ax.bar(positions - width/2, comparison.iloc[0][columns], width, label='Random Forest', alpha=0.8)
    ax.bar(positions + width/2, comparison.iloc[1][columns], width, label='Logistic Regression', alpha=0.8)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1)  # scores are plotted on a fixed 0-1 axis in both panels

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Generate Test Predictions

Using the best-performing model (likely Random Forest).

In [None]:
# Select the model with the higher validation macro-F1 instead of assuming a
# winner. Random Forest is listed first, so it is kept when the scores tie.
candidates = [
    ("Random Forest", rf_model, rf_metrics['f1_macro']),
    ("Logistic Regression", lr_pipeline, lr_metrics['f1_macro']),
]
model_name, best_model, best_score = max(candidates, key=lambda entry: entry[2])

print(f"Selected {model_name} (validation macro-F1 = {best_score:.4f})")
print(f"Generating predictions with {model_name}...")

# Predict one yaw offset per test segment
segment_predictions = best_model.predict(X_test)

# Create segment-level submission; X_test was sliced from test_segment_features,
# so predictions and segment ids are in the same row order
segment_submission = pd.DataFrame({
    'segment_id': test_segment_features['segment_id'],
    'predicted_yaw': segment_predictions
})

print(f"\nSegment-level predictions:")
print(segment_submission.head(10))
print(f"\nPredicted distribution:")
print(segment_submission['predicted_yaw'].value_counts().sort_index())

In [None]:
# Map segment predictions to row-level predictions: every test row receives
# the prediction of the segment it belongs to.
# validate='many_to_one' makes pandas raise if a segment_id occurs more than
# once in segment_submission, which would otherwise silently duplicate rows.
row_predictions = df_test[['row_id', 'segment_id']].merge(
    segment_submission,
    on='segment_id',
    how='left',
    validate='many_to_one'
)

# A left merge leaves NaN wherever a row's segment has no prediction; fail
# loudly rather than write an incomplete submission file.
missing_mask = row_predictions['predicted_yaw'].isna()
if missing_mask.any():
    missing_segments = row_predictions.loc[missing_mask, 'segment_id'].unique()
    raise ValueError(
        f"{int(missing_mask.sum())} test rows have no prediction "
        f"(segments without a prediction: {list(missing_segments[:10])})"
    )

# Create submission file
submission = row_predictions[['row_id', 'predicted_yaw']].rename(
    columns={'predicted_yaw': 'yaw_offset'}
)

# Save submission
submission_path = RESULTS_DIR / 'baseline_submission.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"\nRow-level prediction distribution:")
print(submission['yaw_offset'].value_counts().sort_index())
print(f"\nFirst few rows:")
print(submission.head(10))

## 11. Save Models and Results

In [None]:
# Persist the fitted estimators and the result tables under RESULTS_DIR
import joblib

models_dir = RESULTS_DIR / 'models'
models_dir.mkdir(exist_ok=True)

# Fitted estimators, serialized with joblib
for estimator, filename in (
    (rf_model, 'random_forest_baseline.pkl'),
    (lr_pipeline, 'logistic_regression_baseline.pkl'),
):
    joblib.dump(estimator, models_dir / filename)

print("Models saved!")

# Result tables as CSV, each followed by its confirmation message
for table, filename, message in (
    (feature_importance, 'feature_importance_rf.csv', "Feature importance saved!"),
    (comparison, 'baseline_model_comparison.csv', "Metrics saved!"),
):
    table.to_csv(RESULTS_DIR / filename, index=False)
    print(message)

## 12. Summary and Next Steps

In [None]:
# Final recap of data sizes, validation scores, top features and predictions
print("="*70)
print("BASELINE MODELING SUMMARY")
print("="*70)

print("\n1. DATA PREPARATION:")
print(f"   - Training segments: {len(train_segment_features)}")
print(f"   - Test segments: {len(test_segment_features)}")
print(f"   - Features created: {len(feature_cols)}")

print("\n2. MODEL PERFORMANCE (Validation Set):")
print(f"   Random Forest:")
print(f"     - Accuracy: {rf_metrics['accuracy']:.4f}")
print(f"     - F1 (Macro): {rf_metrics['f1_macro']:.4f}")
print(f"   Logistic Regression:")
print(f"     - Accuracy: {lr_metrics['accuracy']:.4f}")
print(f"     - F1 (Macro): {lr_metrics['f1_macro']:.4f}")

print("\n3. TOP 5 MOST IMPORTANT FEATURES:")
# feature_importance was sorted without resetting its index, so the index is
# the feature's original column position, not its rank. Number the entries
# with enumerate() to print 1..5.
top_five = feature_importance.head(5)
for rank, row in enumerate(top_five.itertuples(index=False), start=1):
    print(f"   {rank}. {row.feature}: {row.importance:.4f}")

print("\n4. PREDICTIONS GENERATED:")
print(f"   - Segment-level: {len(segment_submission)} predictions")
print(f"   - Row-level: {len(submission)} predictions")
# Number of submitted rows predicted as each yaw offset
rows_per_offset = {
    offset: int((submission['yaw_offset'] == offset).sum())
    for offset in (0, 4, 6)
}
print(f"   - Distribution: 0°={rows_per_offset[0]}, "
      f"4°={rows_per_offset[4]}, "
      f"6°={rows_per_offset[6]}")

print("\n5. NEXT STEPS:")
print("   - Try gradient boosting models (XGBoost, LightGBM)")
print("   - Experiment with different feature engineering approaches")
print("   - Try two-stage modeling (row → segment)")
print("   - Implement ensemble methods")
print("   - Address mystery offset in private test (regression approach)")
print("   - Perform hyperparameter tuning")

print("\n" + "="*70)