# Baseline Models

In [None]:
import polars as pl
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

In [None]:
# Directory holding the parquet splits, resolved under the current user's home.
data_dir = Path.home() / "teams/b13-domain-2/ca_data/"

# Training features are kept as a polars DataFrame.
train_features_lazy = pl.scan_parquet(data_dir / "X_train.parquet")
X_train = train_features_lazy.collect(engine="streaming")

# Training labels are converted to NumPy and flattened to a 1-D array.
train_labels_lazy = pl.scan_parquet(data_dir / "y_train.parquet")
y_train = train_labels_lazy.collect(engine="streaming").to_numpy().ravel()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters for the logistic-regression baseline, gathered in one place.
logreg_params = {
    'class_weight': 'balanced',  # reweight classes to counter label imbalance
    'max_iter': 1000,
    'n_jobs': -1,
    'random_state': 42,          # fixed seed so reruns are comparable
    'verbose': 1,
}
logreg = LogisticRegression(**logreg_params)

# Kept as the cell's final expression so the notebook displays the fitted estimator.
logreg.fit(X_train, y_train)

In [None]:
# Test features are kept as a polars DataFrame.
test_features_lazy = pl.scan_parquet(data_dir / "X_test.parquet")
X_test = test_features_lazy.collect(engine="streaming")

# Test labels are converted to NumPy and flattened to a 1-D array.
test_labels_lazy = pl.scan_parquet(data_dir / "y_test.parquet")
y_test = test_labels_lazy.collect(engine="streaming").to_numpy().ravel()

In [None]:
# Evaluate the logistic-regression baseline on the test split.
# y_pred holds hard class labels; y_pred_proba is the probability column at
# index 1 of predict_proba (the second entry of logreg.classes_).
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

train_accuracy = logreg.score(X_train, y_train)
print(f"\nTrain accuracy: {train_accuracy:.4f}")
test_accuracy = logreg.score(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\nConfusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

# Ranking metrics are computed from the probabilities, not the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

pr_auc = average_precision_score(y_test, y_pred_proba)
print(f"\nPR-AUC: {pr_auc:.4f}")

In [None]:
from sklearn.metrics import precision_recall_curve
# pandas builds the coefficient table below. It is imported in this cell
# because the notebook's import cell does not provide `pd`, so the original
# code raised NameError here.
import pandas as pd

# Pair every input column with its fitted logistic-regression coefficient.
feature_names = X_train.columns
weights_df = pd.DataFrame({
    'feature': feature_names,
    'weight': logreg.coef_[0]
})

# Order features by coefficient magnitude, largest first, regardless of sign.
weights_df_sorted = weights_df.sort_values(by='weight', key=lambda x: x.abs(), ascending=False).reset_index(drop=True)


# Precision/recall at every candidate threshold. NOTE: y_pred_proba is
# whatever the most recently executed evaluation cell assigned; run this right
# after the logistic-regression evaluation to plot that model's curve.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(recalls, precisions)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)
plt.show()

# Report the operating point closest to each target recall. The final
# precision/recall pair has no associated threshold, so it is excluded from
# the search to keep `idx` valid for `thresholds`.
for target_recall in [0.1, 0.2, 0.3]:
    idx = np.argmin(np.abs(recalls[:-1] - target_recall))
    print(f"Recall {target_recall:.0%}: Precision = {precisions[idx]:.3f}, Threshold = {thresholds[idx]:.4f}")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters for the random-forest baseline, gathered in one place.
rfc_params = {
    'n_estimators': 200,                     # number of trees in the forest
    'max_depth': 15,                         # cap tree depth to limit overfitting
    'max_features': 'sqrt',                  # sqrt(n_features) candidates per split
    'class_weight': 'balanced_subsample',    # counter class imbalance
    'random_state': 42,                      # fixed seed so reruns are comparable
    'n_jobs': -1,                            # use every available CPU core
    'verbose': 1,                            # print progress while training
}
rfc = RandomForestClassifier(**rfc_params)

# Kept as the cell's final expression so the notebook displays the fitted estimator.
rfc.fit(X_train, y_train)

In [None]:
# Evaluate the random-forest baseline on the test split.
# y_pred holds hard class labels; y_pred_proba is the probability column at
# index 1 of predict_proba (the second entry of rfc.classes_).
y_pred = rfc.predict(X_test)
y_pred_proba = rfc.predict_proba(X_test)[:, 1]

# Each score is computed immediately before it is printed so the estimator's
# verbose progress output stays interleaved exactly as before.
train_accuracy = rfc.score(X_train, y_train)
print(f"\nTrain accuracy: {train_accuracy:.4f}")
test_accuracy = rfc.score(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\nConfusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

# Ranking metrics are computed from the probabilities, not the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

pr_auc = average_precision_score(y_test, y_pred_proba)
print(f"\nPR-AUC: {pr_auc:.4f}")

## SVM

In [None]:
from sklearn.svm import LinearSVC

# Hyper-parameters for the linear SVM baseline, gathered in one place.
svm_params = {
    'dual': False,        # solve the primal rather than the dual problem
    'max_iter': 1000,
    'random_state': 42,   # fixed seed so reruns are comparable
    'verbose': 1,
}
svm = LinearSVC(**svm_params)

# Kept as the cell's final expression so the notebook displays the fitted estimator.
svm.fit(X_train, y_train)

In [None]:
# Evaluate the linear SVM baseline on the test split.
y_pred = svm.predict(X_test)

# LinearSVC has no predict_proba; the signed decision-function values serve
# as the ranking score for ROC-AUC and PR-AUC.
y_scores = svm.decision_function(X_test)

# Train accuracy must pair the training features with the training labels.
# The original scored X_test against y_train, which mismatches the two splits.
print(f"\nTrain accuracy: {svm.score(X_train, y_train):.4f}")
print(f"Test accuracy:  {svm.score(X_test, y_test):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_scores):.4f}")
print(f"\nPR-AUC:        {average_precision_score(y_test, y_scores):.4f}")

## Validation

In [None]:
# Validation features are kept as a polars DataFrame.
validation_features_lazy = pl.scan_parquet(data_dir / "X_validation.parquet")
X_validation = validation_features_lazy.collect(engine="streaming")

# Validation labels are converted to NumPy and flattened to a 1-D array.
validation_labels_lazy = pl.scan_parquet(data_dir / "y_validation.parquet")
y_validation = validation_labels_lazy.collect(engine="streaming").to_numpy().ravel()

In [None]:
# Validate Logistic Regression
#
# Every metric below is computed on the validation split using predictions
# made in this cell. The original reported against y_test with the stale
# y_pred / y_pred_proba left behind by earlier model cells, so the printed
# numbers did not describe logistic regression on validation data at all.

y_val_pred = logreg.predict(X_validation)
# Probability column at index 1 of predict_proba (second entry of logreg.classes_).
y_val_pred_proba = logreg.predict_proba(X_validation)[:, 1]

print(f"\nTrain accuracy: {logreg.score(X_train, y_train):.4f}")
print(f"Validation accuracy: {logreg.score(X_validation, y_validation):.4f}")

print("\nClassification Report:")
print(classification_report(y_validation, y_val_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_validation, y_val_pred))

# Ranking metrics are computed from the probabilities, not the hard labels.
print(f"\nROC-AUC Score: {roc_auc_score(y_validation, y_val_pred_proba):.4f}")

print(f"\nPR-AUC: {average_precision_score(y_validation, y_val_pred_proba):.4f}")