In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (classification_report, matthews_corrcoef,
                             precision_score, recall_score, f1_score,
                            confusion_matrix, ConfusionMatrixDisplay)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split

import seaborn as sns

In [None]:
# Load the dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="parkinsons_updrs.data")

In [None]:
# Features: every column except the two UPDRS score columns.
X = data.drop(["total_UPDRS", "motor_UPDRS"], axis=1)
# Target: binary label, 1 when total_UPDRS is above 40 and 0 otherwise.
y = (data["total_UPDRS"] > 40).astype("int64")

In [None]:
# Show how many rows carry each distinct label in y (class balance check).
y.value_counts()

In [None]:
# Hold out 361 rows as the independent validation set (X_IND / y_IND);
# the remaining rows form the training set. The split is seeded, so it is
# the same on every run.
X_train, X_IND, y_train, y_IND = train_test_split(
    X,
    y,
    test_size=361,
    random_state=361,
)

In [None]:
# Put the training features and the training labels side by side
# (column-wise) in a single frame.
data_train = pd.concat([X_train, y_train], axis="columns")

In [None]:
# Remove the "subject#" and "test_time" columns from the training features.
# X_train is reassigned from itself, so without errors="ignore" a second run
# of this cell would raise a KeyError (the columns are already gone).
X_train = X_train.drop(columns=["subject#", "test_time"], errors="ignore")

In [None]:
def get_model_statistics_simple(model, X, y, normalize = True, plot=False, *args, **kwargs):
    """Estimate classification statistics from a single 50/50 hold-out split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. It is fitted on the training half of the split.
    X, y :
        Features and labels; split in half with ``train_test_split``.
    normalize : bool, default True
        When True, a ``StandardScaler`` is fitted on the training half only
        and then applied to both halves.
    plot : bool, default False
        When True, a confusion matrix is displayed, but only if precision,
        recall or MCC is greater than 0.5.
    *args
        Accepted for backward compatibility; not used.
    **kwargs
        ``title`` : title of the confusion-matrix plot (default ``""``).
        ``random_state`` : forwarded to ``train_test_split`` so the split can
        be reproduced (default ``None``, i.e. unseeded as before).

    Returns
    -------
    tuple
        ``(precision, recall, f1, mcc)`` computed on the test half.
        Precision is ``np.nan`` when no sample is predicted as class 1.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=kwargs.get("random_state")
    )

    if normalize:
        # The scaler only ever sees the training half, so no test-set
        # statistics leak into the fitted model.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # Fit on the training half, predict on the held-out half.
    fitted_model = model.fit(X_train, y_train)
    preds = fitted_model.predict(X_test)

    # Precision is undefined when nothing is predicted as class 1; report NaN
    # in that case. The comparison must happen inside np.any so that the
    # predictions themselves are tested.
    if np.any(np.asarray(preds) == 1):
        prec = precision_score(y_test, preds)
    else:
        prec = np.nan

    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    # Only draw the confusion matrix for models that clear at least one of
    # the 0.5 thresholds (a NaN precision never satisfies the comparison).
    if plot and (prec > 0.5 or recall > 0.5 or mcc > 0.5):
        cm = ConfusionMatrixDisplay(
            confusion_matrix(y_test, preds),
        )
        cm.plot()
        cm.ax_.set_title(kwargs.get("title", ""))
        plt.show()

    return (
        prec,
        recall,
        f1,
        mcc
       )

In [None]:
def get_model_statistics_cv(model, X, y, normalize = True, n_splits=16):
    """Estimate classification statistics with K-fold cross-validation.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. It is re-fitted on the training part of every fold.
    X, y :
        Features and labels. Both are indexed positionally with ``.iloc``,
        so they must be pandas objects.
    normalize : bool, default True
        When True, a ``StandardScaler`` is fitted on each fold's training
        part only and then applied to both parts of that fold.
    n_splits : int, default 16
        Number of folds passed to ``KFold``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, mcc)``, each averaged over the folds.
        Folds where a score is undefined (recorded as NaN through
        ``zero_division=np.nan``) are left out of that score's average; the
        average itself is NaN only when the score is undefined in every fold.
    """
    kf = KFold(n_splits=n_splits)

    # Per-fold scores.
    prec, recall, f1, mcc = [], [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if normalize:
            # Fit the scaler on the training part only, then apply it to
            # both parts of the fold.
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        temp_model = model.fit(X_train, y_train)
        preds = temp_model.predict(X_test)

        prec.append( precision_score(y_test, preds, zero_division=np.nan) )
        recall.append( recall_score(y_test, preds, zero_division=np.nan) )
        f1.append( f1_score(y_test, preds, zero_division=np.nan) )
        mcc.append( matthews_corrcoef(y_test, preds) )

    # nanmean, not mean: a single fold with an undefined score must not turn
    # the whole cross-validated estimate into NaN.
    return (
        np.nanmean(prec),
        np.nanmean(recall),
        np.nanmean(f1),
        np.nanmean(mcc)
       )

In [None]:
# One 20-bin histogram per column of X_train, each on its own figure,
# titled with the column name.
for col in X_train:
    f, ax = plt.subplots()
    X_train[col].plot(kind="hist", ax=ax, title=col, bins=20)