In [None]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score, roc_auc_score

# ───────────────────────────────
# 1. Load the processed dataset
# ───────────────────────────────
# The first CSV column becomes the index and pandas is asked to parse it
# as dates.
# NOTE(review): the time-series CV used later splits by row position, so
# the rows are presumably already in chronological order — confirm.
df = pd.read_csv(
    r'C:\Users\ghkjs\GitHub\regime-shift-classification\data\processed\inflation_regime_dataset.csv',
    parse_dates=True,
    index_col=0,
)

# Target is the regime label; every remaining column is used as a feature.
y = df['Inflation_Regime']
X = df.drop('Inflation_Regime', axis=1)

# ───────────────────────────────
# 2. Cross-validation setup
#    (time-ordered, 5 folds)
# ───────────────────────────────
# Each fold trains on an expanding prefix of the rows and validates on the
# block that follows it. test_size is left at its default, so the fold
# boundaries are chosen automatically; raise n_splits for more folds.
tscv = TimeSeriesSplit(n_splits=5)

# Model-selection metric: F1, which balances false positives against
# false negatives.
scorer = make_scorer(f1_score)

# ╭─────────────────────────────╮
# │ 3-A. Grid search: Logistic  │
# ╰─────────────────────────────╯
# Pipeline: median-impute missing values, standardise the features, then
# fit a logistic regression. Keeping the preprocessing inside the pipeline
# means it is re-fitted on the training part of every CV fold.
lr_pipe = Pipeline([
    ('imputer',   SimpleImputer(strategy='median')),
    ('scaler',    StandardScaler()),
    # random_state seeds the data shuffling performed by the 'saga' solver
    # (searched below) so repeated runs pick the same hyper-parameters.
    ('clf',       LogisticRegression(max_iter=2000, random_state=42))
])

# 7 C values x 1 penalty x 2 solvers x 2 class weightings = 28 candidates.
lr_param_grid = {
    'clf__C':        [0.01, 0.1, 1, 5, 10, 50, 100],
    'clf__penalty':  ['l2'],          # l1 requires saga solver
    'clf__solver':   ['lbfgs', 'saga'],
    'clf__class_weight': [None, 'balanced']
}

# Exhaustive search scored with `scorer` on the time-ordered folds from
# `tscv`, using all available cores.
lr_grid = GridSearchCV(
    estimator=lr_pipe,
    param_grid=lr_param_grid,
    cv=tscv,
    scoring=scorer,
    n_jobs=-1,
    verbose=2
)
lr_grid.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("┏━━ Best Logistic Regression ━━┓")
print("Best params :", lr_grid.best_params_)
print("Best f1     :", lr_grid.best_score_)
print("┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛\n")

# ╭─────────────────────────────╮
# │ 3-B. Grid search:  KNN      │
# ╰─────────────────────────────╯
# Pipeline: median-impute missing values, standardise the features (KNN is
# distance-based, so feature scale matters), then classify by neighbours.
knn_pipe = Pipeline([
    ('imputer',   SimpleImputer(strategy='median')),
    ('scaler',    StandardScaler()),
    ('clf',       KNeighborsClassifier())
])

# 'minkowski' is deliberately not listed: with the estimator's default p=2
# it is the same distance as 'euclidean', so including both only adds
# duplicate candidates (and tied scores) to the search.
knn_param_grid = {
    'clf__n_neighbors': [3, 5, 7, 9, 11, 15],
    'clf__weights':     ['uniform', 'distance'],
    'clf__metric':      ['euclidean', 'manhattan']
}

# Exhaustive search scored with `scorer` on the time-ordered folds from
# `tscv`, using all available cores.
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_param_grid,
    cv=tscv,
    scoring=scorer,
    n_jobs=-1,
    verbose=2
)
knn_grid.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("┏━━ Best KNN ━━━━━━━━━━━━━━━━━━┓")
print("Best params :", knn_grid.best_params_)
print("Best f1     :", knn_grid.best_score_)
print("┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛")


Fitting 5 folds for each of 28 candidates, totalling 140 fits
