In [6]:
import os
import warnings

import numpy as np
import openml
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector

warnings.filterwarnings("ignore")


In [7]:
# OpenML dataset IDs to benchmark. get_data() binarizes the target against a
# per-dataset positive label ('1' unless listed in its lookup table), so add an
# entry there when introducing a dataset whose positive class is named differently.
dataset_ids = [1489, 1464, 1462, 37]

def get_data(id):
    """Download an OpenML dataset and return its features with a 0/1 target.

    Parameters
    ----------
    id : int
        OpenML dataset ID. The parameter name is kept for backward
        compatibility; it only shadows the builtin inside this function.

    Returns
    -------
    X : pandas.DataFrame
        All columns except the target column.
    y : pandas.Series
        Integer target: 1 where the original label equals the dataset's
        positive label, 0 everywhere else.

    Raises
    ------
    ValueError
        If no row carries the expected positive label, which would otherwise
        silently yield an all-zero target.
    """
    # Positive class label per dataset; anything not listed uses '1'.
    positive_labels = {37: 'tested_positive'}

    dataset = openml.datasets.get_dataset(id)
    df = dataset.get_data(dataset_format="dataframe")[0]

    # Prefer the target declared by OpenML; fall back to the last column
    # (the original behaviour) when it is missing or not a single column name.
    target_name = dataset.default_target_attribute
    if target_name not in df.columns:
        target_name = df.columns[-1]
    X, y = df.drop(columns=target_name), df[target_name]

    # Compare on the string form so the result is a plain integer Series
    # regardless of whether the labels arrive as category, object or integers.
    positive_label = positive_labels.get(id, '1')
    y = (y.astype(str) == positive_label).astype(int)
    if not y.any():
        raise ValueError(
            f"Dataset {id}: positive label {positive_label!r} not found in "
            f"target column {target_name!r}"
        )
    return X, y


In [8]:
# Numeric features: fill missing values with the column mean, then standardize
# (distance-based models such as KNN are sensitive to feature scale).
numpipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was not
# present in the fitting fold as all zeros instead of raising during
# cross-validation.
catpipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route columns by dtype. Columns matched by neither selector are dropped by
# ColumnTransformer's default remainder, so the selectors are kept broad:
# every numeric width/signedness on one side, and both `object` and pandas
# `category` on the other (OpenML dataframes presumably deliver nominal
# features as `category` -- confirm against the datasets in use).
column_transformer = ColumnTransformer(
    transformers=[
        ('num', numpipe, make_column_selector(dtype_include=np.number)),
        ('cat', catpipe, make_column_selector(dtype_include=['object', 'category']))
    ]
)


In [9]:
# End-to-end estimator: dtype-aware preprocessing feeding a KNN classifier.
pipeline = Pipeline([
    ('preprocessing', column_transformer),
    ('model', KNeighborsClassifier()),
])

# Hyperparameter candidates for the KNN step. The `model__` prefix addresses
# the 'model' step of the pipeline above.
knn_parameters = dict(
    model__n_neighbors=np.arange(2, 30),
    model__weights=['uniform', 'distance'],
    model__p=[1, 2],
    model__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

In [10]:
# Settings shared by both search strategies, kept in one place so the two
# searches run with the same budget, folds, seed and metric.
search_settings = dict(
    n_iter=50,
    cv=5,
    random_state=2137,
    scoring='roc_auc',
    verbose=True,
)

# Random sampling of the candidate values.
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=knn_parameters,
    **search_settings,
)

# Bayesian optimization over the same candidate values.
bayesian_search = BayesSearchCV(
    pipeline,
    search_spaces=knn_parameters,
    **search_settings,
)


In [11]:
# Create a directory for results if it doesn't exist
results_dir = "./KNNResults"
os.makedirs(results_dir, exist_ok=True)

# Run both search strategies on every dataset and store the full CV tables.
# `dataset_id` (rather than `id`) keeps the builtin `id` usable in later cells.
for dataset_id in dataset_ids:
    # Load dataset
    X, y = get_data(dataset_id)
    # Only the training part is searched; the held-out part is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2137)

    # Random Search
    model_r = randomized_search.fit(X_train, y_train)
    pd.DataFrame(model_r.cv_results_).to_excel(f"{results_dir}/df_{dataset_id}_random.xlsx", index=False)

    # Bayesian Search
    model_b = bayesian_search.fit(X_train, y_train)
    pd.DataFrame(model_b.cv_results_).to_excel(f"{results_dir}/df_{dataset_id}_bayes.xlsx", index=False)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5

In [12]:
# Baseline: mean cross-validated ROC AUC of the untuned pipeline per dataset.
# NOTE(review): this scores the full dataset with a shuffled KFold, whereas the
# searches use cv=5 on the 80% training split only -- the numbers are therefore
# not directly comparable with the search results; confirm this is intended.
baseline_scores = {}

# `dataset_id` (rather than `id`) avoids shadowing the builtin at notebook scope.
for dataset_id in dataset_ids:
    X, y = get_data(dataset_id)

    roc_auc_scores = cross_val_score(
        pipeline,
        X,
        y,
        cv=KFold(n_splits=5, shuffle=True, random_state=2137),
        scoring='roc_auc',
        verbose=True
    )

    mean_auc = roc_auc_scores.mean()
    baseline_scores[dataset_id] = mean_auc
    print(f'Mean ROC AUC score for dataset {dataset_id}: ', mean_auc)

# Save baseline scores to Excel
# Materialize the items view as a list of (id, score) rows for the constructor.
pd.DataFrame(list(baseline_scores.items()), columns=['Dataset ID', 'Mean ROC AUC']).to_excel('./KNNResults/baseline_scores.xlsx', index=False)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Mean ROC AUC score for dataset 1489:  0.9265834341508368
Mean ROC AUC score for dataset 1464:  0.5765701635969946
Mean ROC AUC score for dataset 1462:  0.9986562049062048
Mean ROC AUC score for dataset 37:  0.7692422964370038


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
