In [1]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

from sklearn.model_selection import (
    StratifiedKFold, cross_validate, GridSearchCV
)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, make_scorer, confusion_matrix
)

from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from collections import Counter


In [2]:
# Number of folds used by both the inner and outer StratifiedKFold splitters.
N_SPLITS_CV = 5
# Worker count passed as n_jobs to GridSearchCV and cross_validate.
# NOTE(review): n_jobs only controls parallelism; presumably 1 was chosen for
# determinism - confirm what "leakage" refers to in the note below.
N_JOBS = 1  # prioritizing no leakage

def set_random_states(random_state):
    """Seed the global NumPy and stdlib RNGs and export two environment flags.

    Parameters
    ----------
    random_state : int
        Seed applied to ``np.random`` and ``random``; its string form is also
        written to the ``PYTHONHASHSEED`` environment variable.

    Returns
    -------
    The ``random_state`` value that was passed in, unchanged, so the call can
    be used directly in an assignment.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes - confirm.
    """
    seed = random_state

    # NumPy first, then the stdlib generator, each interleaved with its env flag.
    np.random.seed(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})
    random.seed(seed)

    # Written unconditionally; nothing else in this function reads it.
    os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
    return seed

# Seed the global RNGs once and keep the seed so the splitters and estimators below can reuse it.
random_state = set_random_states(1618)

In [3]:
# Load the ingredient table, zero-fill missing values, then shuffle all rows reproducibly.
overall_df = (
    pd.read_csv("./data/allIngredients.csv")
    .fillna(0)
    .sample(frac=1, random_state=random_state)
)

# Rows labelled "banana" are set aside; every other row is kept for modelling.
labels = overall_df["label"]
bread_cake_df = overall_df[labels.ne("banana")]
banana_df = overall_df[labels.eq("banana")]

In [4]:
# Separate the target column from the remaining feature columns.
y = bread_cake_df["label"]
X = bread_cake_df.drop(columns="label")

In [5]:
# Hold out 20% of the rows as a test set, stratified on the label and seeded for reproducibility.
split_options = {"test_size": 0.2, "random_state": random_state, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# get models - broad coverage across linear, margin, instance-based, and nonlinear tree ensembles
def build_models(random_state):
    """Create a fresh, unfitted estimator for every candidate model.

    Parameters
    ----------
    random_state : int
        Seed forwarded to every estimator that accepts ``random_state``.

    Returns
    -------
    dict
        Maps a display name (e.g. ``"LogisticRegression"``) to an estimator
        instance. Insertion order is the order in which callers iterate.
    """
    seeded = {"random_state": random_state}
    balanced = {"class_weight": "balanced", **seeded}
    # high numbers of iterations (max_iter) avoids convergence warnings in high-dimensional spaces
    patient = {"max_iter": 20000, **balanced}

    models = {}

    # linear / margin models (the first three are the "strong baseline" group)
    models["LogisticRegression"] = LogisticRegression(**patient)
    models["RidgeClassifier"] = RidgeClassifier(**balanced)
    models["LinearSVC"] = LinearSVC(**patient)
    # probability set to false saves on compute
    models["SVC-RBF"] = SVC(probability=False, **balanced)
    # hinge loss is good for binary classification problems - fast on large data
    models["SGD-Hinge"] = SGDClassifier(loss="hinge", **patient)

    # deterministic - sensitive to scaling (good to test with and without standard scaler)
    models["KNN"] = KNeighborsClassifier()

    # tree-based models: don't need scaling and capture nonlinear splits
    models["DecisionTree"] = DecisionTreeClassifier(**balanced)
    models["RandomForest"] = RandomForestClassifier(**balanced)
    models["ExtraTrees"] = ExtraTreesClassifier(**balanced)
    models["GradientBoosting"] = GradientBoostingClassifier(**seeded)
    models["HistGB"] = HistGradientBoostingClassifier(**seeded)
    models["AdaBoost"] = AdaBoostClassifier(**seeded)

    # deterministic - interpretable
    models["GaussianNaiveBayes"] = GaussianNB()
    return models

# get parameter grid to test
def small_param_grid(name): # high-leverage parameter sweeps (runtime will be affected if we test everything)
    """Return the hyper-parameter grid to search for the model called ``name``.

    Parameters
    ----------
    name : str
        Model display name, matching a key produced by ``build_models``.

    Returns
    -------
    dict or None
        A grid whose keys carry the ``clf__`` prefix (the classifier step of
        the pipeline), or ``None`` when the model has no grid and should be
        evaluated with its default parameters.
    """
    grids = {
        # lowered c values due to convergence issues
        "LogisticRegression": {"clf__C": [0.1, 0.5, 1.0]},
        "LinearSVC":          {"clf__C": [0.1, 0.5, 1.0]},
        "SVC-RBF":            {"clf__C": [0.1, 0.5, 1.0], "clf__gamma": ["scale", "auto"]},
        # Only parameters that change the fitted model are swept:
        #  - "algorithm" and "leaf_size" select/tune the neighbour-search structure
        #    (speed and memory), not which neighbours are returned;
        #  - "minkowski" with p=1 / p=2 is the same distance as "manhattan" /
        #    "euclidean", and p is ignored by those two named metrics.
        # Sweeping them multiplied the search from 36 distinct models to 2160 fits per fold.
        "KNN":                {"clf__n_neighbors": [1, 3, 5, 7, 9, 11, 15, 21, 31],
                               "clf__weights": ["uniform", "distance"],
                               "clf__metric": ["manhattan", "euclidean"]},
        "RandomForest":       {"clf__n_estimators": [300, 600], "clf__max_depth": [None, 10, 20]},
        "ExtraTrees":         {"clf__n_estimators": [300, 600], "clf__max_depth": [None, 10, 20]},
        "HistGB":             {"clf__max_depth": [None, 6, 10]},
    }
    return grids.get(name, None)



def build_scorers():
    """Build the scoring dictionary shared by GridSearchCV and cross_validate.

    Returns
    -------
    dict
        ``"accuracy"`` maps to the built-in accuracy scorer name; ``"precision"``,
        ``"recall"`` and ``"f1"`` map to scorers computed in binary mode with
        ``'bread'`` as the positive label and 0 reported on zero division.
    """
    binary_bread = {"average": "binary", "pos_label": 'bread', "zero_division": 0}

    scorers = {"accuracy": "accuracy"}  # normal accuracy
    for key, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scorers[key] = make_scorer(metric_fn, **binary_bread)
    return scorers

In [None]:

# use stratify k-fold as balanced class representation wanted in train and test sets
# inner_cv drives the hyper-parameter search; outer_cv scores the (possibly tuned) model
inner_cv = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=random_state)
outer_cv = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=random_state)
scorers = build_scorers()

results = []
# runs process for each model
for name, clf in build_models(random_state).items():
    # build pipeline: L1-normalise each sample, then classify
    pipe = Pipeline([
        ("l1norm", Normalizer(norm="l1")),
        ("clf", clf)
    ])
    # build parameter grid (None when this model has nothing to tune)
    grid = small_param_grid(name)

    # grid search to find best model parameters; models without a grid use the bare pipeline
    model_for_cv = GridSearchCV(
        pipe, grid, scoring=scorers,
        cv=inner_cv, n_jobs=N_JOBS, refit="accuracy"
    ) if grid else pipe

    # cross validate on the training split only; the per-fold fitted estimators
    # are never inspected afterwards, so they are not retained
    cv_out = cross_validate(
        estimator=model_for_cv,
        X=X_train, y=y_train,
        cv=outer_cv, scoring=scorers,
        return_train_score=False, n_jobs=N_JOBS)
    cv_scores = {k.replace('test_', ''): v for k, v in cv_out.items() if k.startswith("test_")}

    # refit on full training set (re-running the grid search when there is one)
    model_for_cv.fit(X_train, y_train)

    # predictions on holdout
    y_pred = model_for_cv.predict(X_test)

    if isinstance(model_for_cv, GridSearchCV):
        chosen = model_for_cv.best_estimator_
        tuned  = model_for_cv.best_params_
    else:
        chosen = model_for_cv
        tuned  = {}

    # classifier-only params actually used (non-JSON values are stringified)
    clf_params = chosen.named_steps["clf"].get_params()
    tuned_json = json.dumps(tuned, default=str)
    clf_json   = json.dumps(clf_params, default=str)

    # ROC AUC: use probabilities when available, otherwise the decision margin.
    # Column 1 of predict_proba and the sign of decision_function both refer to
    # classes_[1], the class roc_auc_score treats as positive for binary targets.
    y_score = None
    if hasattr(model_for_cv, "predict_proba"):
        y_score = model_for_cv.predict_proba(X_test)[:, 1]
    elif hasattr(model_for_cv, "decision_function"):
        y_score = model_for_cv.decision_function(X_test)
    holdout_roc_auc = float("nan") if y_score is None else float(roc_auc_score(y_test, y_score))

    row = {
        "model": name,
        "tuned_parameters": tuned_json,      # only those searched by GridSearchCV
        "clf_parameters":   clf_json,
        # cv means/stds
        **{f"cv_{metric}_mean": float(np.mean(values)) for metric, values in cv_scores.items()},
        **{f"cv_{metric}_std":  float(np.std(values))  for metric, values in cv_scores.items()},

        # holdout
        "holdout_accuracy": accuracy_score(y_test, y_pred),
        "holdout_precision": precision_score(y_test, y_pred, pos_label='bread', zero_division=0),
        "holdout_recall":    recall_score(y_test, y_pred, pos_label='bread', zero_division=0),
        "holdout_f1":        f1_score(y_test, y_pred, pos_label='bread', zero_division=0),
        "holdout_roc_auc":   holdout_roc_auc,  # NaN when the model exposes no score function
    }

    results.append(row)

# one row per model
results_df = pd.DataFrame(results)

In [None]:
# helpers
def best_by_feature_cv(results, metric = 'accuracy', top_n = 3):
    """Return the ``top_n`` rows with the highest mean cross-validation score.

    Parameters
    ----------
    results : pandas.DataFrame
        Table containing a ``cv_<metric>_mean`` column.
    metric : str
        Metric name inserted into the column name.
    top_n : int
        Number of rows to keep.

    Rows whose score is missing are dropped before ranking.
    """
    score_col = "cv_" + metric + "_mean"
    scored = results.dropna(subset=[score_col])
    ranked = scored.sort_values(by=[score_col], ascending=False)
    return ranked.head(top_n)

def best_by_feature_holdout(results, metric = "accuracy", top_n = 1):
    """Return the ``top_n`` rows with the highest holdout score.

    Parameters
    ----------
    results : pandas.DataFrame
        Table containing a ``holdout_<metric>`` column.
    metric : str
        Metric name inserted into the column name.
    top_n : int
        Number of rows to keep.

    Rows whose score is missing are dropped before ranking.
    """
    score_col = "holdout_" + metric
    scored = results.dropna(subset=[score_col])
    ranked = scored.sort_values(by=[score_col], ascending=False)
    return ranked.head(top_n)

def overall_best(results, by = "accuracy", top_n = 10):
    """Return the ``top_n`` rows ranked by the ``holdout_<by>`` column, best first.

    Rows with a missing value in that column are excluded from the ranking.
    """
    holdout_col = f"holdout_{by}"
    scored = results.dropna(subset=[holdout_col])
    ranked = scored.sort_values(holdout_col, ascending=False)
    return ranked.head(top_n)

In [None]:
# inspect the winners: rank every evaluated model by one shared metric
ranking_metric = "accuracy"

overall_top = overall_best(results_df, by=ranking_metric, top_n=10)                      # ten best on the holdout set
holdout_winners = best_by_feature_holdout(results_df, metric=ranking_metric, top_n=1)   # single best on the holdout set
cv_top = best_by_feature_cv(results_df, metric=ranking_metric, top_n=3)                 # three best by mean CV score

In [None]:
# Display the top three models ranked by mean cross-validated accuracy.
cv_top

In [None]:
# Display the single model with the highest holdout accuracy.
holdout_winners

In [None]:
# Display the ten models with the highest holdout accuracy.
overall_top

In [None]:
# Rebuild the top-ranked model exactly as it was evaluated: every candidate was
# scored inside a pipeline that L1-normalises each sample before the classifier,
# so the final model needs the same preprocessing step. Fitting the bare
# classifier on un-normalised features would train a different model from the
# one whose scores were used to select it.
# NOTE(review): overall_top is ranked on holdout accuracy, so the holdout metrics
# of the selected model are optimistically biased - consider selecting from cv_top.
best_row = overall_top.iloc[0]
new_model = Pipeline([
    ("l1norm", Normalizer(norm="l1")),
    ("clf", build_models(random_state)[best_row["model"]]),
])
# Restore the classifier parameters recorded during the search (stored as JSON).
new_model.named_steps["clf"].set_params(**json.loads(best_row["clf_parameters"]))
new_model

In [None]:
# Train the selected model on the training split and report its holdout metrics.
new_model.fit(X_train, y_train)
y_pred = new_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# The remaining metrics treat 'bread' as the positive class.
for metric_title, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_title}: {metric_fn(y_test, y_pred, pos_label='bread')}")

In [None]:
# Apply the fitted model to the set-aside "banana" rows and tally the predicted labels.
X_banana = banana_df.drop(columns="label")
banana_predicted = new_model.predict(X_banana)
banana_label_counts = Counter(banana_predicted)
print(banana_label_counts)