# Final Model Selection - Logistic Regression

## Setup

In [2]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, roc_auc_score, make_scorer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
#load the final dataset used for model selection
df = pd.read_csv(filepath_or_buffer="../data/df_final.csv")

## Randomized Search Cross-Validation

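I will now use `RandomizedSearchCV` to find the best hyperparameters for my Logistic Regression model, selecting the final configuration by recall. I will use `StratifiedKFold` for cross-validation and `SMOTE` for oversampling to account for the class imbalance in the dataset. `SMOTE` runs inside an `imblearn` pipeline, so only the training portion of each fold is oversampled.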

In [3]:
#separate the target column from the feature matrix
y = df["is_child"]
X = df.drop(columns="is_child")

In [4]:
#make pipeline
#the imblearn Pipeline applies SMOTE only while fitting, so validation folds
#and the held-out test set are scored on their original, un-resampled rows
pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", LogisticRegression(random_state=42, max_iter=2000, penalty="l2", solver="saga"))
])

#set hyperparameters
#search space is 10 * 3 * 5 = 150 combinations; n_iter below samples 25 of them
params = {
    "clf__C": np.logspace(-2, 0, 10),  #10 log-spaced values from 0.01 to 1
    "smote__k_neighbors": [3, 5, 7],
    "smote__sampling_strategy": [0.6, 0.7, 0.8, 0.9, "auto"],
}

#instantiate folds for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#set scoring metrics
#recall and f1 are computed from the predicted class labels
#ROC AUC has to rank continuous scores, so it uses the built-in "roc_auc" scorer;
#make_scorer(roc_auc_score) would pass it the hard 0/1 output of predict() instead
scoring = {
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
    "roc_auc": "roc_auc",
}

#instantiate cross-validation
#refit="recall": the candidate with the best mean cross-validated recall is
#refit on the full training data and exposed through best_params_ / best_score_ / predict
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=25,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    random_state=42,
    refit="recall",
    verbose=2
)

In [5]:
#hold out 20% of the rows as a test set, keeping the class ratio of y in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
#run the randomized search on the training split only
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END clf__C=0.016681005372000592, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time=15.7min
[CV] END clf__C=0.0774263682681127, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time=16.3min
[CV] END clf__C=0.0774263682681127, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time=16.3min
[CV] END clf__C=0.0774263682681127, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time=16.5min
[CV] END clf__C=0.016681005372000592, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time=16.6min
[CV] END clf__C=0.0774263682681127, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time=16.7min
[CV] END clf__C=0.0774263682681127, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time=16.8min
[CV] END clf__C=0.016681005372000592, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time=16.9min
[CV] END clf__C=0.016681005372000592, smote__k_neighbors=3, smote__sampling_

In [8]:
#report the winning configuration and its mean cross-validated recall
cv_best_params = random_search.best_params_
cv_best_recall = random_search.best_score_
print("Best parameters:", cv_best_params)
print("Best recall score:", cv_best_recall)

#predict the held-out test set with the refit best estimator
y_pred = random_search.predict(X_test)

Best parameters: {'smote__sampling_strategy': 'auto', 'smote__k_neighbors': 3, 'clf__C': np.float64(0.016681005372000592)}
Best recall score: 0.7722966064560106
