# Business Problem

Build a machine learning model that predicts whether a person has diabetes, given their characteristics.

The dataset is part of a larger dataset held by the National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK) in the USA. The data were collected for diabetes research on Pima Indian women aged 21 and over living in Phoenix, Arizona, the 5th largest city in the USA. It consists of 768 observations and 8 numerical independent variables. The target variable is "Outcome": 1 indicates a positive diabetes test result, 0 indicates a negative one.

**Variables**
* Pregnancies: Number of pregnancies
* Glucose: Glucose.
* BloodPressure: Blood pressure.
* SkinThickness: Skin Thickness
* Insulin: Insulin.
* BMI: Body mass index.
* DiabetesPedigreeFunction: A function that calculates our probability of having diabetes based on our ancestry.
* Age: Age (years)
* Outcome: Information whether the person has diabetes or not. Have the disease (1) or not (0)

# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate, validation_curve
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from pandas_profiling import ProfileReport
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline
import optuna
from optuna import Trial, visualization

In [None]:
# Pandas display settings for the whole notebook: show every column,
# format floats with three decimals, and allow output up to 500 characters wide.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

# Exploratory Data Analysis

In [None]:
df = pd.read_csv('/kaggle/input/diabetes-dataset/diabetes.csv')
df.head()

In [None]:
def check_df(dataframe, head=10):
    """Print a quick overview of a DataFrame.

    Sections printed, in order: first rows, column index, transposed
    descriptive statistics, null counts per column, dtypes, shape and
    ``DataFrame.info()``.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of leading rows to print in the "Head" section.

    Returns:
        None. Everything is written to standard output.
    """
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Variables #####################")
    print(dataframe.columns)
    print("##################### Descriptive Stats #####################")
    print(dataframe.describe().T)
    print("##################### Null Values #####################")
    print(dataframe.isnull().sum())
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Info #####################")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    dataframe.info()
check_df(df)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of a DataFrame into categorical, numeric and high-cardinality groups.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: Non-object columns with fewer unique values than this are
            treated as categorical.
        car_th: Object columns with more unique values than this are treated
            as "categorical but cardinal".

    Returns:
        A tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
        ``cat_cols`` holds the object columns followed by the low-cardinality
        non-object columns, minus anything in ``cat_but_car``.
    """
    # First pass: separate object-dtype columns from everything else.
    object_cols, other_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            other_cols.append(name)

    # Second pass: apply the two cardinality thresholds.
    num_but_cat = [name for name in other_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in other_cols if name not in num_but_cat]

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
df.isnull().sum()

## Categorical variable analysis

In [None]:
def cat_summary(dataframe, col_name, plot = False):
    """Print the value counts and percentage share of a categorical column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to summarise.
        plot: When True, also draw a seaborn count plot of the column.

    Returns:
        None. The summary is printed to standard output.
    """
    # Count once and reuse for both the absolute and the relative column.
    counts = dataframe[col_name].value_counts()
    print(pd.DataFrame({col_name: counts,
                        "Ratio": 100 * counts / len(dataframe)}))
    print("##########################################")
    if plot:
        sns.countplot(x=col_name, data=dataframe)
        # Flush the figure now; otherwise successive calls in a loop keep
        # drawing onto the same axes.
        plt.show()

In [None]:
for cat_col in cat_cols:
    cat_summary(df, cat_col, True)

When the dataset is imbalanced (i.e. the class ratio of the target is skewed), we should generate artificial data in order to get reliable results. We can use the SMOTETomek technique, which combines a synthetic oversampling step (SMOTE) followed by an undersampling step (Tomek links). Step 1: synthetically oversample the minority class. Step 2: undersample by cleaning the noise generated by SMOTE. In our case we don't need it, since the difference between the ratios of classes 0 and 1 is not large.

In [None]:
## Numeric variable
def num_summary(dataframe, numerical_col):
    """Print descriptive statistics of a numeric column at a fixed set of percentiles.

    Args:
        dataframe: The pandas DataFrame holding the column.
        numerical_col: Name of the column to describe.

    Returns:
        None. The statistics are printed, followed by a short separator line.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentiles)
    print(stats.T)
    print(9 * "#")

In [None]:
for num_col in num_cols:
    num_summary(df, num_col)

In [None]:
# Histogram of every feature, laid out on a 4 x 2 grid (filled row by row).
hist_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                 "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
grid_cols = 2

fig = make_subplots(rows=4, cols=grid_cols,
                    subplot_titles=[f"{feature} - Dist" for feature in hist_features])

for position, feature in enumerate(hist_features):
    # Dark outline around each bar so adjacent bins stay distinguishable.
    outline = dict(line=dict(color='rgba(58, 71, 80, 1.0)', width=3))
    fig.add_trace(go.Histogram(x=df[feature], marker=outline),
                  row=position // grid_cols + 1, col=position % grid_cols + 1)

# Last expression of the cell, so the notebook renders the figure.
fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Histogram Plots", showlegend=False)

**None** of the features in our dataset seem to be **normally distributed**.

In [None]:
def target_analyser(dataframe, target, num_cols, cat_cols):
    """Print how the target relates to each numeric and categorical column.

    For every numeric column the mean per target class is printed. For every
    categorical column the number of distinct values is printed, followed by
    a table with the count, the share of rows and the mean target per value.

    Args:
        dataframe: The pandas DataFrame to analyse.
        target: Name of the target column.
        num_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns.

    Returns:
        None. All tables are printed to standard output.
    """
    banner = "#" * 9

    print(banner, "target_numeric_analysis", banner)
    for feature in num_cols:
        mean_by_class = dataframe.groupby(target)[feature].mean()
        table = pd.DataFrame({f"{feature}_TARGET_MEAN": mean_by_class})
        print(table, end="\n\n\n")

    print(banner, "target_categoric_analysis", banner)
    for feature in cat_cols:
        counts = dataframe[feature].value_counts()
        print(feature, ":", len(counts))
        table = pd.DataFrame({
            "COUNT": counts,
            "RATIO": counts / len(dataframe),
            "TARGET_MEAN": dataframe.groupby(feature)[target].mean(),
        })
        print(table, end="\n\n\n")

In [None]:
target_analyser(df, "Outcome", num_cols, cat_cols)

It is expected that as Glucose, Insulin, Age, etc. increase, the **Outcome** is more likely to be 1.

## OUTLIERS

In [None]:
def outlier_thresholds(dataframe, variable, q1 = 0.10, q2= 0.90):
    """Compute lower and upper outlier limits for a column.

    The limits lie 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q2`` quantile.

    Args:
        dataframe: The pandas DataFrame holding the column.
        variable: Name of the column.
        q1: Lower quantile (defaults to the 10th percentile).
        q2: Upper quantile (defaults to the 90th percentile).

    Returns:
        A tuple ``(low_limit, up_limit)``; the values are not rounded.
    """
    lower_q, upper_q = (dataframe[variable].quantile(q) for q in (q1, q2))
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

def check_outlier(dataframe, col_name):
    """Report whether a column contains any value outside its outlier limits.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to check.

    Returns:
        True if at least one value is below the lower limit or above the
        upper limit returned by ``outlier_thresholds``, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Test the boolean mask itself. Calling .any(axis=None) on the filtered
    # rows asks "is any cell truthy?", which is False when the outlier rows
    # contain only zeros.
    is_outlier = (values > up_limit) | (values < low_limit)
    return bool(is_outlier.any())
    
def grab_outliers(dataframe, col_name, index=False):
    """Print the rows whose value in a column lies outside the outlier limits.

    When more than 10 rows qualify only the first five are printed,
    otherwise all of them are.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to inspect.
        index: When True, return the index of the outlier rows.

    Returns:
        The index of the outlier rows if ``index`` is True, otherwise None.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outlier_rows = dataframe[(values < low) | (values > up)]

    print(outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows)

    if index:
        return outlier_rows.index

def remove_outlier(dataframe, col_name):
    """Return the rows whose value in a column lies within the outlier limits.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column used for filtering.

    Returns:
        A filtered DataFrame; the input is not modified.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Keep the mask in negated form: rows that fail both comparisons
    # (for example missing values) must stay in the result.
    is_outlier = (values < low_limit) | (values > up_limit)
    return dataframe[~is_outlier]

def replace_with_thresholds(dataframe, variable):
    """Cap a column in place at its lower and upper outlier limits.

    Args:
        dataframe: The pandas DataFrame to modify in place.
        variable: Name of the column to cap.

    Returns:
        None.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    below = dataframe[variable] < low_limit
    dataframe.loc[below, variable] = low_limit
    # Evaluated after the lower cap has been applied, as in the sequential original.
    above = dataframe[variable] > up_limit
    dataframe.loc[above, variable] = up_limit

In [None]:
check_outlier(df, "Pregnancies")

In [None]:
check_outlier(df, "SkinThickness")

When we examine the other variables' quantiles, we see that none of our variables seems to have outliers.

**Multivariate analysis**

In [None]:
# LOF: fit a Local Outlier Factor model on the numeric columns only.
df_num = df[num_cols]
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)
# The labels returned by fit_predict are discarded; only the per-row
# scores stored on the fitted model are used in the cells below.
clf.fit_predict(df_num)
df_scores = clf.negative_outlier_factor_

In [None]:
pd.DataFrame(np.sort(df_scores)).plot(stacked = True, xlim =[0,50], style=".-")

In [None]:
# Threshold = the 11th smallest LOF score (index 10 of the ascending sort);
# the following cells select rows whose score is strictly below it.
th = np.sort(df_scores)[10]

In [None]:
df[df_scores < th].shape

In [None]:
df[df_scores < th]

In [None]:
# Remove the rows whose LOF score is below the threshold. The index is not
# reset, so the remaining rows keep their original labels.
df = df.drop(axis=0, labels=df[df_scores < th].index)

## CORRELATION

In [None]:
# Pairwise correlation of the numeric columns, computed once and reused
# for the heatmap (a bare expression in the middle of a cell is discarded).
corr_matrix = df[num_cols].corr()

f, ax = plt.subplots(figsize=[18, 13])
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax, cmap="magma")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

If there is multicollinearity between variables, we can analyze it with the VIF (Variance Inflation Factor) method. It measures how strongly each independent variable is correlated with the other independent variables. In order to avoid inaccurate parameter estimates, a multicollinearity analysis should be performed and the related variables should be removed from the dataset if needed. We set our maximum threshold at 10 [VIF](https://quantifyinghealth.com/vif-threshold/).

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Build the VIF input: keep only float64/int64 columns and drop the target.
X=df.select_dtypes(include = ['float64', 'int64'])
X=X.drop('Outcome', axis = 1)
VIF = X  # alias of X (same object, not a copy)
vif_data = pd.DataFrame()
vif_data["Feature"] = VIF.columns
# Calculate the VIF of each feature against all the others.
# NOTE(review): no intercept column is added to the matrix before calling
# variance_inflation_factor - confirm against the statsmodels documentation
# whether a constant is expected, since it affects the magnitude of the values.
vif_data["VIF"] = [variance_inflation_factor(VIF.values, i) for i in range(len(VIF.columns))]
vif_data=vif_data.sort_values(by='VIF',ascending=False)
# Last expression of the cell: display the table with a red gradient.
vif_data.style.background_gradient(cmap = 'Reds', axis = 0)

We can see that BMI, Glucose, BloodPressure and Age have high VIF values. As presumed, the high correlation coefficients of BMI result in a high VIF. We will remove the variable BMI and recheck whether the VIF of the other variables drops.

In [None]:
# Remove BMI and recalculate the VIF of the remaining features.
VIF_filter=X.drop(columns=['BMI'], axis = 1)
vif_filtered_data = pd.DataFrame() 
vif_filtered_data["Feature"] = VIF_filter.columns 
vif_filtered_data["VIF"] = [variance_inflation_factor(VIF_filter.values, i) for i in range(len(VIF_filter.columns))]
# Optional filter (disabled): keep only the features whose VIF exceeds 5.
# vif_filtered_data=vif_filtered_data[vif_filtered_data["VIF"] > 5]
vif_filtered_data=vif_filtered_data.sort_values(by='VIF',ascending=False)
# Last expression of the cell: display the table with a red gradient.
vif_filtered_data.style.background_gradient(cmap = 'Reds', axis = 0)

In [None]:
df = df.drop('BMI', axis = 1)

Under normal circumstances, we want the VIF value to be below 10. For now, we stop the filtering process at this stage, as we have few variables in the data set.

## Feature Extraction

In [None]:
# Glucose_Reference_Range: bucket Glucose into four labelled bands stored in
# the new column "Glucose_RR". The bands are half-open: [.., 70), [70, 100),
# [100, 125) and [125, ..). Rows matching none of the conditions are left unset.
# NOTE(review): a Glucose value of 0 falls into "Hypoglycemia" - confirm
# whether zeros are real measurements or placeholders for missing values.
df.loc[(df["Glucose"] < 70), "Glucose_RR"] = "Hypoglycemia"
df.loc[(df["Glucose"] >= 70) & (df["Glucose"] < 100), "Glucose_RR"] = "Normal"
df.loc[(df["Glucose"] >= 100) & (df["Glucose"] < 125), "Glucose_RR"] = "Prediabetes"
df.loc[(df["Glucose"] >= 125), "Glucose_RR"] = "Diabetes"

## ENCODING

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
cat_cols = [col for col in cat_cols if col not in ["Outcome"]]
cat_cols


def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """One-hot encode the given columns with ``pd.get_dummies``.

    Args:
        dataframe: The pandas DataFrame to encode.
        categorical_cols: Names of the columns to expand into dummy columns.
        drop_first: When True, the first level of every encoded column is
            dropped.

    Returns:
        A new DataFrame in which the listed columns are replaced by their
        dummy columns.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


df = one_hot_encoder(df, cat_cols, drop_first=True)

df.head()
df.shape

## Modelling

In [None]:
# Separate the target vector from the feature matrix.
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
# Hold out 20% of the rows as a test set. stratify=y keeps the class
# proportions equal between the train set and the test set.
X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.2, random_state=1,shuffle=True, stratify=y)

In [None]:
from collections import Counter

# Class balance of the target: absolute count and percentage per class.
counter = Counter(y)
n_samples = len(y)
for label in counter:
    count = counter[label]
    print('Class=%s, Count=%d, Percentage=%.2f%%' % (label, count, count / n_samples * 100))

In [None]:
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline logistic regression on the untuned, unscaled training split.
logic_model = LogisticRegression(verbose=False, random_state=1).fit(X_train, y_train)
y_pred = logic_model.predict(X_test)
# Probability of the positive class: ROC AUC ranks samples by score, so it
# should not be computed from hard 0/1 predictions.
y_score = logic_model.predict_proba(X_test)[:, 1]

# The metric functions take (y_true, y_pred) in that order. Swapping the two
# silently exchanges recall and precision.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 2)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 2)}")
print(f"F1: {round(f1_score(y_test, y_pred), 2)}")
print(f"Auc: {round(roc_auc_score(y_test, y_score), 2)}")

In [None]:
# Baseline k-nearest-neighbours classifier with default settings.
K_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred = K_model.predict(X_test)
# Probability of the positive class: ROC AUC ranks samples by score, so it
# should not be computed from hard 0/1 predictions.
y_score = K_model.predict_proba(X_test)[:, 1]

# The metric functions take (y_true, y_pred) in that order. Swapping the two
# silently exchanges recall and precision.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 2)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 2)}")
print(f"F1: {round(f1_score(y_test, y_pred), 2)}")
print(f"Auc: {round(roc_auc_score(y_test, y_score), 2)}")

In [None]:
#Feature type selection
class feat_sel(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns of one kind of dtype.

    Args:
        dtype: ``'numeric'`` keeps every column whose dtype is not ``object``;
            ``'category'`` keeps every column whose dtype is ``object``.
    """

    def __init__(self, dtype='numeric'):
        self.dtype = dtype

    def fit( self, X, y=None ):
        """Nothing is learned; returns ``self``."""
        return self

    def _selected_columns(self, X):
        """Return the names of the columns of ``X`` matching ``self.dtype``."""
        if self.dtype == 'numeric':
            return X.columns[X.dtypes != object].tolist()
        if self.dtype == 'category':
            return X.columns[X.dtypes == object].tolist()
        # Fail loudly instead of silently returning None from transform().
        raise ValueError(
            f"dtype must be 'numeric' or 'category', got {self.dtype!r}")

    def transform(self, X, y=None):
        """Return the sub-frame of ``X`` holding the selected columns.

        Raises:
            ValueError: If ``self.dtype`` is not a supported value.
        """
        selected = self._selected_columns(X)
        # Remember the selection so get_feature_names() reports what was
        # actually transformed rather than reading a global variable.
        self.columns = selected
        return X[selected]

    def get_feature_names(self):
        """Return the column names selected by the last ``transform`` call.

        Raises:
            AttributeError: If ``transform`` has not been called yet.
        """
        try:
            return list(self.columns)
        except AttributeError:
            raise AttributeError(
                "feat_sel has no selected columns yet; call transform() first"
            ) from None

In [None]:
#Scale
class df_scaler(BaseEstimator, TransformerMixin):
    """Apply a scaler to a DataFrame while keeping its index and column names.

    Args:
        method: The scaler instance to wrap; it must provide ``fit`` and
            ``transform``.
    """

    # NOTE(review): the default scaler instance is created once, when the
    # class is defined, and is therefore shared by every df_scaler built
    # without an explicit ``method``. The default is kept for compatibility.
    def __init__(self, method=StandardScaler()):
        super().__init__()
        self.method = method        

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return ``self``."""
        self.method.fit(X)
        # Return this transformer, not the wrapped scaler, so that
        # fit(...).transform(...) and fit_transform go through
        # df_scaler.transform and yield a DataFrame.
        return self

    def transform(self, X, y=None):
        """Scale ``X`` and return the result as a DataFrame shaped like ``X``."""
        Xscl = self.method.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        """Return the column names seen by the last ``transform`` call."""
        return list(self.columns)

In [None]:
class FeatureUnion_df(TransformerMixin, BaseEstimator):
    """FeatureUnion wrapper that returns a DataFrame with named columns.

    Args:
        transformer_list: List of ``(name, transformer)`` pairs. Each
            transformer (or, for a pipeline, its last step) must provide
            ``get_feature_names()``.
        n_jobs: Forwarded to the inner ``FeatureUnion``.
        transformer_weights: Forwarded to the inner ``FeatureUnion``.
        verbose: Forwarded to the inner ``FeatureUnion``.
    """

    def __init__(self, transformer_list, n_jobs=None, transformer_weights=None, verbose=False):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose

    def fit(self, X, y=None):
        """Build the inner FeatureUnion from the current parameters and fit it."""
        # Created here rather than in __init__ so that parameters changed
        # after construction (set_params, clone) are honoured, and so that
        # n_jobs / transformer_weights / verbose are no longer ignored.
        self.feat_un = FeatureUnion(self.transformer_list,
                                    n_jobs=self.n_jobs,
                                    transformer_weights=self.transformer_weights,
                                    verbose=self.verbose)
        self.feat_un.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Transform ``X`` and return a DataFrame indexed like ``X``."""
        X_tr = self.feat_un.transform(X)
        columns = []

        # Read the names from the fitted transformers held by the union.
        for _, transformer in self.feat_un.transformer_list:
            steps = getattr(transformer, "steps", None)
            # For a pipeline the last step knows the output column names.
            named = steps[-1][1] if steps else transformer
            columns += list(named.get_feature_names())

        self.columns = columns
        return pd.DataFrame(X_tr, index=X.index, columns=columns)

    def get_params(self, deep=True):
        """Return the constructor parameters, plus nested ones when ``deep``.

        Only names accepted by ``__init__`` are returned for ``deep=False``;
        delegating to the inner FeatureUnion instead would make ``clone()``
        pass any FeatureUnion-only parameter into this constructor.
        """
        params = super().get_params(deep=False)
        if deep:
            # Expose the sub-transformers under "<name>" and their parameters
            # under "<name>__<param>".
            for name, transformer in self.transformer_list:
                params[name] = transformer
                if hasattr(transformer, "get_params"):
                    for key, value in transformer.get_params(deep=True).items():
                        params[f"{name}__{key}"] = value
        return params

    def get_feature_names(self):
        """Return the column names produced by the last ``transform`` call."""
        return self.columns

In [None]:
# #Model Selection
# class Model_selection(BaseEstimator):
#     def __init__(self, estimator = CatBoostClassifier()):
#         self.estimator = estimator
#     def fit(self, X, y=None, **kwargs):
#         self.estimator.fit(X, y)
#         return self
#     def predict(self, X, y=None):
#         return self.estimator.predict(X)
#     def predict_proba(self, X):
#         return self.estimator.predict_proba(X)
#     def score(self, X, y):
#         return self.estimator.score(X, y)

## Hyperparameter tuning with Optuna

In [None]:
# Standard Scaler
def objective_logistic(trial):
    """Optuna objective: cross-validated weighted F1 of a scaled logistic regression.

    Args:
        trial: The Optuna trial used to sample ``penalty``, ``C`` and ``solver``.

    Returns:
        The mean ``f1_weighted`` score over a shuffled 5-fold stratified split
        of the module-level ``X`` and ``y``.
    """
    # Numeric branch: keep the non-object columns, then standardise them.
    num_branch = Pipeline([
        ('fs', feat_sel(dtype='numeric')),
        ('scl', df_scaler(method=StandardScaler())),
    ])
    # Categorical branch: object columns are selected and passed through.
    cat_branch = Pipeline(steps=[('fs', feat_sel(dtype='category'))])
    preprocessing = FeatureUnion_df(transformer_list=[('cat_pipe', cat_branch),
                                                      ('num_pipe', num_branch)])

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    # Search space, sampled in this fixed order: penalty, C, solver.
    penalty = trial.suggest_categorical('penalty', ['l2'])
    c_value = trial.suggest_float("C", 0.01, 100, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', "liblinear", "newton-cg"])
    estimator = LogisticRegression(penalty=penalty, C=c_value, solver=solver)

    model = Pipeline([('prep', preprocessing), ('est', estimator)])
    fold_scores = cross_val_score(model, X, y, cv=folds, scoring="f1_weighted")
    return fold_scores.mean()

In [None]:
%%time
# Leaderboard accumulators: one model name and one best CV score are
# appended per study (this cell and the later study cells).
models=[]
scores=[]
logistic_study = optuna.create_study(direction='maximize')
# Verbosity is lowered only after the study has been created.
optuna.logging.set_verbosity(optuna.logging.WARNING)
logistic_study.optimize(objective_logistic, n_trials=300)
model='Logistic_Reg'
score=logistic_study.best_trial.value
models.append(model)
scores.append(score)

In [None]:
print('Number of finished trials:', len(logistic_study.trials))
print('Best trial: score {}, params {}'.format(logistic_study.best_trial.value, logistic_study.best_trial.params))

In [None]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances

In [None]:
plot_optimization_history(logistic_study)

In [None]:
plot_parallel_coordinate(logistic_study)

In [None]:
plot_contour(logistic_study)

In [None]:
plot_param_importances(logistic_study)

In [None]:
plot_edf(logistic_study)

In [None]:
# Re-evaluate the best logistic-regression parameters with plain 5-fold CV.
# NOTE(review): the estimator is evaluated on the raw X, without the
# selection/scaling pipeline used inside the tuning objective, and with cv=5
# instead of the shuffled splitter defined there - confirm this is intended.
logistic_final = LogisticRegression(**logistic_study.best_trial.params, random_state=1)

cv_results = cross_validate(logistic_final,
                            X, y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

# Mean of each metric over the folds.
print(f"test_accuracy: {cv_results['test_accuracy'].mean()}")
print(f"test_f1: {cv_results['test_f1'].mean()}")
print(f"roc_auc: {cv_results['test_roc_auc'].mean()}")

In [None]:
def val_curve_params(model, X, y, param_name, param_range, scoring="roc_auc", cv=10):
    """Plot the mean training and validation score of a model over a parameter range.

    Args:
        model: The estimator passed to ``validation_curve``.
        X: Feature matrix.
        y: Target vector.
        param_name: Name of the estimator parameter to vary.
        param_range: Values of the parameter to evaluate.
        scoring: Scoring name forwarded to ``validation_curve``.
        cv: Cross-validation setting forwarded to ``validation_curve``.

    Returns:
        None. The figure is shown with a blocking ``plt.show``.
    """
    train_score, test_score = validation_curve(
        model, X=X, y=y, param_name=param_name, param_range=param_range, scoring=scoring, cv=cv)

    # One line per curve: average the fold scores for every parameter value.
    curves = (
        ("Training Score", 'b', train_score),
        ("Validation Score", 'g', test_score),
    )
    for label, color, fold_scores in curves:
        plt.plot(param_range, np.mean(fold_scores, axis=1), label=label, color=color)

    plt.title(f"Validation Curve for {type(model).__name__}")
    plt.xlabel(f"Number of {param_name}")
    plt.ylabel(f"{scoring}")
    plt.tight_layout()
    plt.legend(loc='best')
    plt.show(block=True)

In [None]:
# [parameter name, values to try] pairs for the validation curves.
logistic_val_params = [["C", range(1, 100)]]

for param_name, param_range in logistic_val_params:
    val_curve_params(logistic_final, X, y, param_name, param_range)

In [None]:
# Logistic regression without scaling.
# This redefines objective_logistic; the earlier, scaled version is replaced.
def objective_logistic(trial):
    """Optuna objective: cross-validated weighted F1 of an unscaled logistic regression.

    Args:
        trial: The Optuna trial used to sample ``penalty``, ``C`` and ``solver``.

    Returns:
        The mean ``f1_weighted`` score over a shuffled 5-fold stratified split
        of the module-level ``X`` and ``y``.
    """
    # Numeric branch: only selects the non-object columns (no scaling step).
    num_branch = Pipeline([('fs', feat_sel(dtype='numeric'))])
    # Categorical branch: object columns are selected and passed through.
    cat_branch = Pipeline(steps=[('fs', feat_sel(dtype='category'))])
    preprocessing = FeatureUnion_df(transformer_list=[('cat_pipe', cat_branch),
                                                      ('num_pipe', num_branch)])

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    # Search space, sampled in this fixed order: penalty, C, solver.
    penalty = trial.suggest_categorical('penalty', ['l2'])
    c_value = trial.suggest_float("C", 0.01, 100, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', "liblinear", "newton-cg"])
    estimator = LogisticRegression(penalty=penalty, C=c_value, solver=solver)

    model = Pipeline([('prep', preprocessing), ('est', estimator)])
    fold_scores = cross_val_score(model, X, y, cv=folds, scoring="f1_weighted")
    return fold_scores.mean()

In [None]:
# Tune the logistic regression again; "NS" is the no-scaling run.
# NOTE(review): this relies on objective_logistic being the most recently
# executed definition - confirm the cell defining the unscaled variant was
# run before this one.
logistic_study_NS = optuna.create_study(direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
logistic_study_NS.optimize(objective_logistic, n_trials=300)
# Record the best score on the shared leaderboard lists.
model='Logistic_Reg_NS'
score=logistic_study_NS.best_trial.value
models.append(model)
scores.append(score)

In [None]:
print('Number of finished trials:', len(logistic_study_NS.trials))
print('Best trial: score {}, params {}'.format(logistic_study_NS.best_trial.value, logistic_study_NS.best_trial.params))

## KNN

In [None]:
# Standard Scaler
def objective_KNN(trial):
    """Optuna objective: cross-validated weighted F1 of a scaled KNN classifier.

    Args:
        trial: The Optuna trial used to sample ``n_neighbors``.

    Returns:
        The mean ``f1_weighted`` score over a shuffled 5-fold stratified split
        of the module-level ``X`` and ``y``.
    """
    numeric_pipe = Pipeline([('fs', feat_sel(dtype='numeric')),  # Select only the numeric features
                             ('scl', df_scaler(method=StandardScaler())) # Scale data
                             ])

    categorical_pipeline = Pipeline( steps = [( 'fs', feat_sel(dtype='category')), # Select only the categorical features
                                             ])

    processing_pipe = FeatureUnion_df(transformer_list=[('cat_pipe', categorical_pipeline),
                                                        ('num_pipe', numeric_pipe)])

    cv_outer=StratifiedKFold(n_splits=5, random_state=1,shuffle=True)
    param = {
        # step is passed by keyword: recent Optuna releases deprecate
        # passing it positionally.
        'n_neighbors': trial.suggest_int('n_neighbors', 2, 50, step=1),
    }
    pipe = Pipeline([
        ('prep',processing_pipe),
        ('est', KNeighborsClassifier(**param))])
    return cross_val_score(pipe, X, y,cv=cv_outer,scoring="f1_weighted").mean()

In [None]:
# Tune the number of neighbours of the KNN pipeline over 150 trials.
KNN_study = optuna.create_study(direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
KNN_study.optimize(objective_KNN, n_trials=150)
# Record the best score on the shared leaderboard lists.
model='KNN'
score=KNN_study.best_trial.value
models.append(model)
scores.append(score)

In [None]:
print('Number of finished trials:', len(KNN_study.trials))
print('Best trial: score {}, params {}'.format(KNN_study.best_trial.value, KNN_study.best_trial.params))

In [None]:
# Re-evaluate the best KNN parameters with plain 5-fold CV.
# NOTE(review): the classifier is evaluated on the raw X, without the
# selection/scaling pipeline used inside objective_KNN - confirm this is
# intended, since the tuned n_neighbors was chosen on scaled features.
KNN_final = KNeighborsClassifier(**KNN_study.best_trial.params)

cv_results = cross_validate(KNN_final,
                            X, y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

# Mean of each metric over the folds.
print(f"test_accuracy: {cv_results['test_accuracy'].mean()}")
print(f"test_f1: {cv_results['test_f1'].mean()}")
print(f"roc_auc: {cv_results['test_roc_auc'].mean()}")

In [None]:
best_model = pd.DataFrame(list(zip(models, scores)), columns =['Model', 'Score']).sort_values(ascending=False,by='Score')
best_model

In [None]:
# Final model: the best parameters of the no-scaling study, fitted on all rows.
# NOTE(review): y_pred is predicted on the same X the model was fitted on, so
# the confusion matrix and report computed from it are in-sample figures.
logistic_final = LogisticRegression(**logistic_study_NS.best_trial.params, random_state=1).fit(X,y)
y_pred = logistic_final.predict(X)

In [None]:
def plot_confusion_matrix(y, y_pred):
    """Draw the confusion matrix as a heatmap titled with the accuracy.

    Args:
        y: True labels.
        y_pred: Predicted labels.

    Returns:
        None. The figure is shown with ``plt.show``.
    """
    matrix = confusion_matrix(y, y_pred)
    accuracy = round(accuracy_score(y, y_pred), 2)

    sns.heatmap(matrix, annot=True, fmt=".0f")
    # Columns of the matrix are predictions, rows are the true labels.
    plt.xlabel('y_pred')
    plt.ylabel('y')
    plt.title('Accuracy Score: {0}'.format(accuracy), size=10)
    plt.show()
plot_confusion_matrix(y, y_pred)

print(classification_report(y, y_pred))