In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (classification_report, matthews_corrcoef,
                             precision_score, recall_score, f1_score,
                            confusion_matrix, ConfusionMatrixDisplay)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA

import seaborn as sns

In [2]:
# Load the raw table from a CSV-formatted file in the working directory.
# NOTE(review): presumably the UCI Parkinsons Telemonitoring data set — confirm provenance.
data = pd.read_csv("parkinsons_updrs.data")

In [3]:
# Features: every column except the two UPDRS scores.
X = data.drop(columns=["total_UPDRS", "motor_UPDRS"])
# Binary target: 1 where total_UPDRS is strictly above 40, 0 otherwise.
y = (data["total_UPDRS"] > 40).astype("int64")

In [4]:
# Class balance of the binarized target (1 = total_UPDRS above 40).
y.value_counts()

0    4869
1    1006
Name: total_UPDRS, dtype: int64

In [5]:
# Hold out 361 rows as the independent validation set; everything else is the training set.
X_train, X_IND, y_train, y_IND = train_test_split(
    X, y, test_size=361, random_state=361
)

In [6]:
# Class balance within the training portion of the split.
y_train.value_counts()

0    4569
1     945
Name: total_UPDRS, dtype: int64

In [None]:
# Training features and binarized target side by side in a single frame (used for plotting).
data_train = pd.concat([X_train, y_train], axis="columns")

In [None]:
# Remove the subject identifier and the test time from the training features.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
X_train = X_train.drop(columns=["subject#", "test_time"], errors="ignore")

In [None]:
def get_model_statistics_simple(model, X, y, normalize = True, plot=False, *args,
                                test_size=0.5, random_state=None, **kwargs):
    """Score ``model`` on a single random hold-out split.

    The data are split with ``train_test_split``; the model is fitted on the
    training part and precision, recall, F1 and the Matthews correlation
    coefficient are computed on the held-out part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        ``model.fit`` is called on the object that is passed in.
    X, y : features and labels
        Passed straight to ``train_test_split``.
    normalize : bool, default True
        Standardize the features with a ``StandardScaler`` fitted on the
        training part only, then apply it to both parts.
    plot : bool, default False
        Show the confusion matrix, but only when at least one of precision,
        recall or MCC exceeds 0.1.
    *args : ignored; accepted for backward compatibility.
    test_size : float or int, default 0.5
        Forwarded to ``train_test_split`` (0.5 was previously hard-coded).
    random_state : default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different split on every call); pass an int for a
        reproducible split.
    **kwargs : only ``title`` is read (title of the confusion-matrix plot).

    Returns
    -------
    tuple
        ``(precision, recall, f1, mcc)``. Precision is ``np.nan`` when no
        sample is predicted as class 1.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if normalize:
        # Fit on the training part only so that no test statistics leak in.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # fit model and predict the held-out part
    temp_model = model.fit(X_train, y_train)
    preds = temp_model.predict(X_test)

    # Precision is undefined without a positive prediction. The check is
    # "any prediction equal to 1" (the original `np.any(preds)==1` tested
    # "any non-zero prediction", which differs for labels other than 0/1).
    if np.any(preds == 1):
        prec = precision_score(y_test, preds)
    else:
        prec = np.nan

    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    # Comparisons with NaN are False, so an undefined precision never
    # triggers the plot on its own.
    if plot and (prec > 0.1 or recall > 0.1 or mcc > 0.1):
        cm = ConfusionMatrixDisplay(confusion_matrix(y_test, preds))
        cm.plot()
        cm.ax_.set_title(kwargs.get("title", ""))
        plt.show()

    #return statistics estimations
    return (prec, recall, f1, mcc)

In [None]:
def get_model_statistics_cv(model, X, y, normalize = True, n_splits=16):
    """Estimate precision, recall, F1 and MCC of ``model`` by K-fold cross-validation.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        ``model.fit`` is called on the object that is passed in, once per fold.
    X, y : pandas objects
        Rows are selected positionally with ``.iloc``.
    normalize : bool, default True
        Standardize the features with a ``StandardScaler`` fitted on the
        training folds only, then apply it to both parts.
    n_splits : int, default 16
        Number of folds given to ``KFold`` (16 was previously hard-coded).

    Returns
    -------
    tuple
        ``(precision, recall, f1, mcc)``, each averaged over the folds.
        Folds in which no sample is predicted as class 1 are left out of the
        precision average only; if that happens in every fold, precision is
        ``np.nan``.
    """
    kf = KFold(n_splits=n_splits)

    #statistic arrays, one entry per fold
    prec = []
    recall = []
    f1 = []
    mcc = []

    for train_index, test_index in kf.split(X):
        #Create train/test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if normalize:
            # Fit on the training folds only so that no test statistics leak in.
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        #fit model and predict the held-out fold
        temp_model = model.fit(X_train, y_train)
        preds = temp_model.predict(X_test)

        # Precision is undefined without a positive prediction. The check is
        # "any prediction equal to 1" (the original `np.any(preds)==1` tested
        # "any non-zero prediction", which differs for labels other than 0/1).
        if np.any(preds == 1):
            prec.append(precision_score(y_test, preds))
        recall.append(recall_score(y_test, preds))
        f1.append(f1_score(y_test, preds))
        mcc.append(matthews_corrcoef(y_test, preds))

    #return statistics estimations
    return (
        # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
        np.mean(prec) if prec else np.nan,
        np.mean(recall),
        np.mean(f1),
        np.mean(mcc)
       )

In [None]:
# Score LDA with each solver on its own random 50/50 hold-out split of the
# unscaled training data, printing (precision, recall, f1, mcc).
solvers = ["svd", "lsqr", "eigen"]
for slv in solvers:
    model = LinearDiscriminantAnalysis(solver=slv)
    # Title every confusion matrix with its solver; untitled, the three plots
    # cannot be told apart.
    print(get_model_statistics_simple(model, X_train, y_train, normalize=False, plot=True,
                                      title=f"LDA (solver={slv})"))

In [None]:
# One figure per column of X_train: the column's values against the binarized
# target, drawn as small star markers without a connecting line.
for col in X_train.columns:
    data_train.plot.line(
        x=col,
        y="total_UPDRS",
        linestyle="",
        marker="*",
        markersize=1,
    )

In [None]:
# One figure per column of X_train: histogram of the column, coloured by the
# binarized target.
for col in X_train.columns:
    f = plt.figure()
    ax = plt.axes()
    sns.histplot(data=data_train, x=col, hue="total_UPDRS", ax=ax)

In [None]:
def plot_df(df, column, ax, title="", x_label="", y_label="", legend=""):
    """Draw ``df[column]`` against ``df.index`` on ``ax`` as a starred line.

    ``title``, ``x_label`` and ``y_label`` decorate the axes, and a grid is
    switched on. ``legend`` is used as the line label; a legend box is drawn
    only when it differs from the empty string. Returns ``None``.
    """
    # the curve itself
    xs = df.index
    ys = df[column]
    ax.plot(xs, ys, "-*", label=legend)

    # decorations
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)

    if legend == "":
        return
    ax.legend()

In [None]:
# Candidate values explored by the decision-tree sweeps.
criteria = ["gini", "entropy", "log_loss"]
max_depth = range(3, 31)           # 3 .. 30 inclusive
min_sample_split = range(2, 31)    # 2 .. 30 inclusive
min_samples_leaf = range(1, 51)    # 1 .. 50 inclusive

# Column labels of the metric tables, in the order the scoring helpers return them.
columns = [
    "Precision",
    "Recall",
    "F1 Score",
    "MCC",
]

# Criterion: Gini
## 1. Depth Testing

In [None]:
# Cross-validated metrics of a Gini tree for every candidate max_depth;
# one row per depth, one column per metric.
stats_depth = pd.DataFrame(columns=columns)
for d in max_depth:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(criterion=criteria[0], max_depth=d),
        X=X_train,
        y=y_train,
    )
    stats_depth.loc[d] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_depth, metric, panel, x_label="Max Depth", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_depth (a max_depth value) where it peaks.
stats_depth.idxmax()

## 2. Sample Split

In [None]:
# Cross-validated metrics of a depth-9 Gini tree for every candidate
# min_samples_split; one row per value, one column per metric.
stats_split = pd.DataFrame(columns=columns)
for s in min_sample_split:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(
            criterion=criteria[0],
            max_depth=9,
            min_samples_split=s,
        ),
        X=X_train,
        y=y_train,
    )
    stats_split.loc[s] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_split, metric, panel, x_label="min_samples_split", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_split (a min_samples_split value) where it peaks.
stats_split.idxmax()

## 3. Samples Leaf

In [None]:
# Cross-validated metrics of a Gini tree (max_depth=9, min_samples_split=3)
# for every candidate min_samples_leaf; one row per value, one column per metric.
stats_leaf = pd.DataFrame(columns=columns)
for s in min_samples_leaf:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(
            criterion=criteria[0],
            max_depth=9,
            min_samples_split=3,
            min_samples_leaf=s,
        ),
        X=X_train,
        y=y_train,
    )
    stats_leaf.loc[s] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_leaf, metric, panel, x_label="min_samples_leaf", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_leaf (a min_samples_leaf value) where it peaks.
stats_leaf.idxmax()

## Best Gini

In [None]:
# Gini tree with fixed hyper-parameters, scored by cross-validation.
# NOTE(review): the values are presumably taken from the sweeps above — confirm.
dt = DecisionTreeClassifier(
    criterion="gini",  # scikit-learn's default criterion, spelled out
    max_depth=9,
    min_samples_split=3,
    min_samples_leaf=20,
)
get_model_statistics_cv(dt, X_train, y_train)

# Criterion: Entropy
## 1. Depth

In [None]:
# Cross-validated metrics of an entropy tree for every candidate max_depth;
# one row per depth, one column per metric.
stats_depth = pd.DataFrame(columns=columns)
for d in max_depth:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(criterion=criteria[1], max_depth=d),
        X=X_train,
        y=y_train,
    )
    stats_depth.loc[d] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_depth, metric, panel, x_label="Max Depth", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_depth (a max_depth value) where it peaks.
stats_depth.idxmax()

## 2. Split

In [None]:
# Cross-validated metrics of a depth-7 entropy tree for every candidate
# min_samples_split; one row per value, one column per metric.
stats_split = pd.DataFrame(columns=columns)
for s in min_sample_split:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(
            criterion=criteria[1],
            max_depth=7,
            min_samples_split=s,
        ),
        X=X_train,
        y=y_train,
    )
    stats_split.loc[s] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_split, metric, panel, x_label="min_samples_split", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_split (a min_samples_split value) where it peaks.
stats_split.idxmax()

## 3.  Leaf

In [None]:
# Cross-validated metrics of an entropy tree (max_depth=7, min_samples_split=20)
# for every candidate min_samples_leaf; one row per value, one column per metric.
stats_leaf = pd.DataFrame(columns=columns)
for s in min_samples_leaf:
    temp = get_model_statistics_cv(
        model=DecisionTreeClassifier(
            criterion=criteria[1],
            max_depth=7,
            min_samples_split=20,
            min_samples_leaf=s,
        ),
        X=X_train,
        y=y_train,
    )
    stats_leaf.loc[s] = dict(zip(columns, temp))

# One panel per metric, filled row by row on a 2x2 grid.
f, ax = plt.subplots(2, 2)
for panel, metric, y_label in zip(ax.flat, columns,
                                  ["Precision Score", "Recall Score", "F1 Score", "MCC"]):
    plot_df(stats_leaf, metric, panel, x_label="min_samples_leaf", y_label=y_label)
f.tight_layout()

In [None]:
# For each metric, the row label of stats_leaf (a min_samples_leaf value) where it peaks.
stats_leaf.idxmax()

In [None]:
# Cross-validated metrics of an entropy tree (max_depth=7, min_samples_split=20,
# min_samples_leaf=2) for max_leaf_nodes = 2 .. 49; one row per value.
# The table keeps the name stats_leaf because the following cell reads it.
stats_leaf = pd.DataFrame(columns=columns)
for s in range(2,50):
    temp = get_model_statistics_cv(
        DecisionTreeClassifier(criterion=criteria[1], max_depth=7,
                               min_samples_split=20, min_samples_leaf=2,
                              max_leaf_nodes=s), 
        X_train, 
        y_train,
    )
    stats_leaf.loc[s] = dict(zip(columns, temp))

# The swept parameter is max_leaf_nodes, so the x axis is labelled accordingly
# (it was previously mislabelled "min_samples_leaf").
f, ax = plt.subplots(2,2)
plot_df(stats_leaf, "Precision", ax[0,0], x_label = "max_leaf_nodes", y_label="Precision Score")
plot_df(stats_leaf, "Recall", ax[0,1], x_label = "max_leaf_nodes", y_label="Recall Score")
plot_df(stats_leaf, "F1 Score", ax[1,0], x_label = "max_leaf_nodes", y_label="F1 Score")
plot_df(stats_leaf, "MCC", ax[1,1], x_label = "max_leaf_nodes", y_label="MCC")
f.tight_layout()

In [None]:
# For each metric, the row label of stats_leaf where it peaks; after the sweep
# above the row labels are max_leaf_nodes values.
stats_leaf.idxmax()

In [None]:
# Cross-validated metrics of an entropy tree (max_depth=7, min_samples_split=20,
# min_samples_leaf=2, max_leaf_nodes=28) for 100 evenly spaced
# min_impurity_decrease values in [0, 0.05]; one row per value.
# The table keeps the name stats_leaf because the following cell reads it.
stats_leaf = pd.DataFrame(columns=columns)
for s in np.linspace(0,0.05,100):
    temp = get_model_statistics_cv(
        DecisionTreeClassifier(criterion=criteria[1], max_depth=7,
                               min_samples_split=20, min_samples_leaf=2,
                              max_leaf_nodes=28, min_impurity_decrease=s), 
        X_train, 
        y_train,
    )
    stats_leaf.loc[s] = dict(zip(columns, temp))

# The swept parameter is min_impurity_decrease, so the x axis is labelled
# accordingly (it was previously mislabelled "min_samples_leaf").
f, ax = plt.subplots(2,2)
plot_df(stats_leaf, "Precision", ax[0,0], x_label = "min_impurity_decrease", y_label="Precision Score")
plot_df(stats_leaf, "Recall", ax[0,1], x_label = "min_impurity_decrease", y_label="Recall Score")
plot_df(stats_leaf, "F1 Score", ax[1,0], x_label = "min_impurity_decrease", y_label="F1 Score")
plot_df(stats_leaf, "MCC", ax[1,1], x_label = "min_impurity_decrease", y_label="MCC")
f.tight_layout()

In [None]:
# For each metric, the row label of stats_leaf where it peaks; after the sweep
# above the row labels are min_impurity_decrease values.
stats_leaf.idxmax()

## Best Entropy

In [None]:
# Entropy tree with the hyper-parameter values held fixed in the sweeps above,
# scored by cross-validation.
dt = DecisionTreeClassifier(
    criterion=criteria[1],
    max_depth=7, min_samples_split=20,
    min_samples_leaf=2, max_leaf_nodes=28,
)
get_model_statistics_cv(dt, X_train, y_train)

## Logistic Regression


In [64]:
# Cross-validated metrics of a logistic regression on standardized features.
# The fourth value is now unpacked into `mcc`; it used to be bound to `ncc`
# while `mcc` was printed, which raises a NameError on a fresh kernel or
# shows a stale value otherwise.
(pred, recall,
 f1, mcc) = get_model_statistics_cv(LogisticRegression(random_state=361), X_train, y_train, normalize = True)

# `pred` holds the mean precision (name kept for backward compatibility).
print("Precision:", pred, "\nRecall:", recall, "\nF1:", f1, "\nMCC", mcc)

Precision: 0.21875 
Recall: 0.004364296556358917 
F1: 0.008547730854026252 
MCC 0.014537910213583069
