In [1]:
# RANDOM FOREST CLASSIFIER

import pandas as pd
# Load the dataset from the rolling-averages CSV; the file's first column
# (position 0) is used as the DataFrame index.
full = pd.read_csv(
    filepath_or_buffer="../data/nba_games_rolling_averages.csv",
    index_col=0,
)

In [2]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
import joblib

In [9]:
# === 2) Model, CV splitter and sequential feature selector ===
# (original header note: swap Ridge -> RandomForest)
# The forest evaluated *inside* the selector is kept small and single-threaded;
# parallelism is requested only at the selector level to avoid nesting it.
rf = RandomForestClassifier(
    n_jobs=1,              # forest itself stays serial (no parallelism inside RF)
    random_state=42,
    n_estimators=80,       # smaller ensemble
    max_depth=10,          # shallower trees
    min_samples_leaf=2,    # a little extra regularization
    max_features="sqrt",
)

# Two time-ordered folds (original note: was 3, lowered for a big speedup).
time_ordered_cv = TimeSeriesSplit(n_splits=2)

sfs = SequentialFeatureSelector(
    estimator=rf,
    cv=time_ordered_cv,
    direction="forward",
    n_features_to_select=20,   # original note: was 30, lowered for a big speedup
    n_jobs=-1,                 # parallelize at the selector level instead
    # scoring is left at its default; the original note suggests
    # scoring="roc_auc" or "accuracy" as explicit options
)

In [12]:
# Assign `rf` and `sfs` (replacing any earlier bindings of those names in the
# kernel) using a 50-tree, depth-8 forest and a 12-feature forward selection.
forest_settings = dict(
    n_estimators=50,
    max_depth=8,
    max_features="sqrt",
    min_samples_leaf=2,
    random_state=42,
    n_jobs=1,   # forest runs serially; the selector below requests all cores
)
rf = RandomForestClassifier(**forest_settings)

sfs = SequentialFeatureSelector(
    rf,
    cv=TimeSeriesSplit(n_splits=2),
    direction="forward",
    n_features_to_select=12,
    n_jobs=-1,
)

In [13]:
# === 3) Pick the training columns (same removal pattern as train_rr_2) ===
# Every object-dtype column is excluded, along with a fixed list of
# columns named explicitly below (the label column "target" among them).
is_object_dtype = full.dtypes == "object"
object_columns = full.columns[is_object_dtype].tolist()
explicitly_excluded = ["season", "date", "won", "target", "team", "team_opp"]
removed_columns = object_columns + explicitly_excluded

# Keep the remaining column labels as a pandas Index, in their original order.
keep_mask = ~full.columns.isin(removed_columns)
selected_columns = full.columns[keep_mask]

In [14]:
# === 4) Fit ONLY the feature selector on the current data ===
# Features are the columns chosen in `selected_columns`; the label is "target".
selector_features = full[selected_columns]
selector_labels = full["target"]
sfs.fit(selector_features, selector_labels)

KeyboardInterrupt: 