In [1]:
import numpy as np, pandas as pd
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

X_train = pd.read_csv("X_train_imputed_scaled.csv", index_col=0)
X_test = pd.read_csv("X_test_imputed_scaled.csv",  index_col=0)
y_train = pd.read_csv("y_train.csv", index_col=0)["label"]
y_test = pd.read_csv("y_test.csv",  index_col=0)["label"]
groups = pd.read_csv("groups.csv", index_col=0)["star_key"]

In [2]:
pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])

C_l2 = np.logspace(-3, 3, 7)              
C_l1 = [1e-3, 1e-2, 1e-1, 1, 10, 100]      

param_grid = [
    {
        "clf__penalty": ["l2"],
        "clf__solver": ["lbfgs"],
        "clf__C": C_l2,
        "clf__class_weight": [None, "balanced"],
        "clf__max_iter": [3000],
        "clf__tol": [1e-4, 1e-3],
    },
    {
        "clf__penalty": ["l1"],
        "clf__solver": ["liblinear"],
        "clf__C": C_l1,
        "clf__class_weight": [None, "balanced"],
        "clf__max_iter": [5000],
        "clf__tol": [1e-3],   
    },
    {
        "clf__penalty": ["elasticnet"],
        "clf__solver": ["saga"],
        "clf__l1_ratio": [0.1, 0.5, 0.9],
        "clf__C": [0.1, 1, 10],
        "clf__class_weight": [None, "balanced"],
        "clf__max_iter": [5000],
        "clf__tol": [1e-3],
    },
]

kf = GroupKFold(n_splits=5)
log_reg_cv = GridSearchCV(pipeline, param_grid=param_grid, scoring="average_precision", cv=kf)
log_reg_cv.fit(X_train, y_train, groups=groups)

log_reg_best_model = log_reg_cv.best_estimator_ 
print("Best PR-AUC:", log_reg_cv.best_score_)
print("Best params:", log_reg_cv.best_params_)

Best PR-AUC: 0.7895439814763148
Best params: {'clf__C': 1000.0, 'clf__class_weight': None, 'clf__max_iter': 3000, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'clf__tol': 0.0001}


In [3]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score 
threshold = 0.5
y_proba = log_reg_best_model.predict_proba(X_test)[:, 1]
y_pred_thresh = (y_proba >= threshold).astype(int)

print(confusion_matrix(y_test, y_pred_thresh))
print(classification_report(y_test, y_pred_thresh))

precision = precision_score(y_test, y_pred_thresh)
print("Precision score at threshold", threshold, ":", precision)


[[ 588  661]
 [ 103 1296]]
              precision    recall  f1-score   support

           0       0.85      0.47      0.61      1249
           1       0.66      0.93      0.77      1399

    accuracy                           0.71      2648
   macro avg       0.76      0.70      0.69      2648
weighted avg       0.75      0.71      0.69      2648

Precision score at threshold 0.5 : 0.6622381195707716


In [4]:
from sklearn.ensemble import RandomForestClassifier
pipeline_rf = Pipeline([
    ("rf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    "rf__max_depth": [None, 10],              
    "rf__min_samples_leaf": [1, 2, 4],         
    "rf__max_features": ["sqrt", 0.5],       
    "rf__class_weight": [None, "balanced_subsample"]
}

kf_rf = GroupKFold(n_splits=5)
rf_cv = GridSearchCV(pipeline_rf, param_grid_rf, scoring="average_precision", cv=kf_rf, n_jobs=-1, refit=True, verbose=1)
rf_cv.fit(X_train, y_train, groups=groups)

rf_best_model = rf_cv.best_estimator_
print("RF best PR-AUC (CV):", rf_cv.best_score_)
print("RF best params:", rf_cv.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
RF best PR-AUC (CV): 0.9218529961495451
RF best params: {'rf__class_weight': None, 'rf__max_depth': None, 'rf__max_features': 0.5, 'rf__min_samples_leaf': 4}


In [5]:
threshold = 0.5

y_proba_rf = rf_best_model.predict_proba(X_test)[:, 1]
y_pred_thresh_rf = (y_proba_rf >= threshold).astype(int)

print(confusion_matrix(y_test, y_pred_thresh_rf))
print(classification_report(y_test, y_pred_thresh_rf))

precision_rf = precision_score(y_test, y_pred_thresh_rf)
print("Precision score at threshold", threshold, ":", precision_rf)



[[1168   81]
 [ 648  751]]
              precision    recall  f1-score   support

           0       0.64      0.94      0.76      1249
           1       0.90      0.54      0.67      1399

    accuracy                           0.72      2648
   macro avg       0.77      0.74      0.72      2648
weighted avg       0.78      0.72      0.72      2648

Precision score at threshold 0.5 : 0.9026442307692307


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

pipeline_knn = Pipeline([
    ("scale", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

param_grid_knn = {
    "knn__n_neighbors": [3, 5, 7, 10, 15],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["minkowski", "euclidean", "manhattan"],
    "knn__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "knn__leaf_size": [20, 30, 40],
    "knn__p": [1, 2],
}

kf_knn = GroupKFold(n_splits=5)

knn_cv = GridSearchCV(pipeline_knn, param_grid=param_grid_knn, scoring="average_precision", cv=kf_knn)

knn_cv.fit(X_train, y_train, groups=groups)

knn_best_model = knn_cv.best_estimator_
print("Best PR-AUC:", knn_cv.best_score_)
print("Best params:", knn_cv.best_params_)


Best PR-AUC: 0.819068299685932
Best params: {'knn__algorithm': 'auto', 'knn__leaf_size': 20, 'knn__metric': 'minkowski', 'knn__n_neighbors': 15, 'knn__p': 1, 'knn__weights': 'distance'}


In [7]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

knn_cv.fit(X_train, y_train, groups=groups)
knn_best_model = knn_cv.best_estimator_

y_proba = knn_best_model.predict_proba(X_test)[:, 1]
threshold = 0.5
y_pred = (y_proba >= threshold).astype(int)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nPrecision score:", precision)
print("Recall score:", recall)
print("F1 score:", f1)


Confusion Matrix:
[[ 976  273]
 [ 303 1096]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1249
           1       0.80      0.78      0.79      1399

    accuracy                           0.78      2648
   macro avg       0.78      0.78      0.78      2648
weighted avg       0.78      0.78      0.78      2648


Precision score: 0.8005843681519357
Recall score: 0.7834167262330236
F1 score: 0.791907514450867


In [8]:
gkf = GroupKFold(n_splits=5)
splits = list(gkf.split(X_train, y_train, groups=groups))

bases = [("lr", log_reg_best_model), ("rf", rf_best_model)]
Z_train = np.zeros((X_train.shape[0], len(bases)))
print(splits)

[(array([    0,     1,     2, ..., 10011, 10012, 10013]), array([    5,     7,    26, ..., 10015, 10016, 10017])), (array([    0,     5,     6, ..., 10015, 10016, 10017]), array([    1,     2,     3, ..., 10011, 10012, 10013])), (array([    1,     2,     3, ..., 10015, 10016, 10017]), array([    0,     8,    19, ..., 10001, 10002, 10003])), (array([    0,     1,     2, ..., 10015, 10016, 10017]), array([   6,   15,   16, ..., 9927, 9928, 9929])), (array([    0,     1,     2, ..., 10015, 10016, 10017]), array([  10,   13,   14, ..., 9845, 9846, 9847]))]


In [9]:
from sklearn.base import clone

gkf = GroupKFold(n_splits=5)
splits = list(gkf.split(X_train, y_train, groups=groups))

bases = [("lr", log_reg_best_model), ("rf", rf_best_model)]
Z_train = np.zeros((X_train.shape[0], len(bases)))

for m, (name, model) in enumerate(bases): 
    oof = np.zeros(X_train.shape[0])
    for training, test in splits:
        mdl = clone(model)          
        mdl.fit(X_train.iloc[training], y_train.iloc[training])
        oof[test] = mdl.predict_proba(X_train.iloc[test])[:, 1]
        Z_train[:, m] = oof

meta_base = LogisticRegression() 

param_grid_meta = [ 
    {  
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l2"],
        "solver": ["lbfgs"],
        "class_weight": [None, "balanced"],
        "max_iter": [5000],
        "tol": [1e-3],
    },
    {  
        "C": [0.1, 1, 10],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.1, 0.5, 0.9],
        "solver": ["saga"],
        "class_weight": [None, "balanced"],
        "max_iter": [5000],
        "tol": [1e-3],
    },
]

grid_meta = GridSearchCV(
    estimator=meta_base,
    param_grid=param_grid_meta,
    scoring="average_precision",
    cv=splits,      
    n_jobs=-1,
    refit=True,
    verbose=1
)

Z_test = np.column_stack([
    clone(model).fit(X_train, y_train).predict_proba(X_test)[:, 1]
    for _, model in bases
])

grid_meta.fit(Z_train, y_train)   
meta = grid_meta.best_estimator_ 
print("Meta CV PR-AUC:", grid_meta.best_score_)
print("Meta best params:", grid_meta.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Meta CV PR-AUC: 0.921784765410048
Meta best params: {'C': 0.1, 'class_weight': None, 'l1_ratio': 0.9, 'max_iter': 5000, 'penalty': 'elasticnet', 'solver': 'saga', 'tol': 0.001}


In [10]:
meta_prob = grid_meta.predict_proba(Z_test)[:, 1]

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

thresholds = np.linspace(0.0, 1.0, 101)  
rows = []
for thr in thresholds:
    y_hat = (meta_prob >= thr).astype(int)
    p = precision_score(y_test, y_hat, zero_division=0)
    r = recall_score(y_test, y_hat, zero_division=0)
    f1 = f1_score(y_test, y_hat, zero_division=0)
    rows.append((thr, p, r, f1))

cand = [(thr, p, r, f1) for (thr, p, r, f1) in rows if p >= 0.90]
if cand:
    thr_star, p_star, r_star, f1 = max(cand, key=lambda x: x[2]) 
else:
    thr_star, p_star, r_star, f1 = 0.5, None, None 

print(f"Chosen threshold: {thr_star:.3f}  (Precision≥0.90)")
y_pred_thr = (meta_prob >= thr_star).astype(int)
print("Precision/Recall/F1 @ thr:", 
      precision_score(y_test, y_pred_thr), 
      recall_score(y_test, y_pred_thr),
      f1_score(y_test, y_pred_thr))
print(confusion_matrix(y_test, y_pred_thr))


Chosen threshold: 0.480  (Precision≥0.90)
Precision/Recall/F1 @ thr: 0.9013605442176871 0.5682630450321658 0.6970626918018413
[[1162   87]
 [ 604  795]]


In [12]:
y_pred_thr = (meta_prob >= thr_star).astype(int)

report = classification_report(y_test, y_pred_thr)
print(report)

              precision    recall  f1-score   support

           0       0.66      0.93      0.77      1249
           1       0.90      0.57      0.70      1399

    accuracy                           0.74      2648
   macro avg       0.78      0.75      0.73      2648
weighted avg       0.79      0.74      0.73      2648

