In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.metrics import (classification_report, matthews_corrcoef,
                            confusion_matrix, ConfusionMatrixDisplay,
                            mean_squared_error, explained_variance_score)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from itertools import combinations
from time import time

In [None]:
# Load the dataset from a CSV-formatted file located in the working directory.
# NOTE(review): presumably the UCI Parkinsons Telemonitoring data — confirm provenance.
data = pd.read_csv("parkinsons_updrs.data")

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Show the column labels of the raw frame (displayed as the cell output).
data.columns

In [None]:
# Regression target is motor_UPDRS; both UPDRS columns are removed from the
# feature matrix so neither score is available as a predictor.
y = data["motor_UPDRS"]
X = data.drop(
    columns=["total_UPDRS", "motor_UPDRS"],
)

In [None]:
# Hold out an independent set (X_IND / y_IND). An integer test_size is an
# absolute row count, so 361 rows are set aside; random_state pins the split.
# NOTE(review): the split is row-wise, so rows of one subject can presumably
# land on both sides — confirm this is intended.
X_train, X_IND, y_train, y_IND = train_test_split(X, y, test_size=361, random_state=361)

In [None]:
# Re-attach the target to the training features so both can be explored together.
data_train = pd.concat([X_train, y_train], axis="columns")

In [None]:
# motor_UPDRS against test_time for subject 20, drawn as unconnected markers.
is_subject_20 = data_train["subject#"] == 20
data_train.loc[is_subject_20].plot(
    x="test_time",
    y="motor_UPDRS",
    ls="",
    marker="o",
)

In [None]:
# Show the column labels of the training frame (features plus motor_UPDRS).
data_train.columns

In [None]:
# Distinct values present in the `sex` column of the training frame.
data_train.sex.unique()

In [None]:
# Distribution of motor_UPDRS for each value of `sex`.
# pandas' built-in grouped boxplot is used because seaborn (`sns`) is not
# imported by this notebook, which made the cell fail with NameError.
data_train.boxplot(column="motor_UPDRS", by="sex")

In [None]:
# Remove the subject identifier and test_time columns from the predictors.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
X_train = X_train.drop(columns=["subject#", "test_time"], errors="ignore")

In [None]:
# Univariate screening: for every predictor, fit a one-feature linear
# regression under 16-fold cross-validation and store the fold-averaged
# Pearson correlation, RMSE, explained variance (RVE) and maximum absolute error.
stats = pd.DataFrame(columns=["Pearson", "RMSE", "RVE", "MaxError"])
folder = KFold(n_splits=16)

for xi in X_train.columns:
    # Single feature as a 2-D column, the layout fit/predict are given here.
    column = X_train[xi].to_numpy().reshape(-1, 1)
    per_fold = {name: [] for name in stats.columns}

    for train_index, test_index in folder.split(X_train):
        yy_train = y_train.iloc[train_index]
        yy_test = y_train.iloc[test_index]

        lm = LinearRegression().fit(column[train_index], yy_train)
        preds = lm.predict(column[test_index])

        per_fold["Pearson"].append(np.corrcoef(yy_test, preds)[0, 1])
        per_fold["RMSE"].append(np.sqrt(mean_squared_error(yy_test, preds)))
        per_fold["RVE"].append(explained_variance_score(yy_test, preds))
        per_fold["MaxError"].append(np.abs(yy_test - preds).max())

    # One row per feature: the mean of each metric across the folds.
    stats.loc[xi] = {name: np.mean(values) for name, values in per_fold.items()}

In [None]:
# Rank the single-feature models by their fold-averaged Pearson correlation, best first.
stats.sort_values(by="Pearson", ascending=False)

In [None]:
def conf_int(arr, n, conf):
    """Normal-approximation confidence interval for the mean of ``arr``.

    The interval is ``mean(arr) ± z * s / sqrt(n)`` where ``s`` is the sample
    standard deviation (``ddof=1``) and ``z`` the two-sided normal critical
    value for the requested confidence level.

    Parameters
    ----------
    arr : array-like of numbers
        Observed values.
    n : int
        Sample size used in the standard error (normally ``len(arr)``).
    conf : int or float
        Confidence level in percent. 99, 95, 90, 85 and 80 use the tabulated
        critical values; any other level strictly between 0 and 100 is
        computed from the standard normal distribution.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds of the interval.

    Raises
    ------
    KeyError
        If ``conf`` is not a number strictly between 0 and 100.
    """
    from statistics import NormalDist  # stdlib; only used for non-tabulated levels

    z_table = {
        99: 2.576,
        95: 1.960,
        90: 1.645,
        85: 1.440,
        80: 1.282,
    }
    z = z_table.get(conf)
    if z is None:
        is_number = isinstance(conf, (int, float, np.integer, np.floating))
        if not (is_number and 0 < conf < 100):
            raise KeyError(f"unsupported confidence level: {conf!r}")
        # Two-sided interval: the upper tail holds (100 - conf) / 2 percent.
        z = NormalDist().inv_cdf(0.5 + float(conf) / 200.0)

    mean = np.mean(arr)
    # The standard error of a mean estimated from data uses the sample
    # standard deviation; with a single observation fall back to ddof=0
    # so the result is a zero-width interval rather than NaN.
    ddof = 1 if np.size(arr) > 1 else 0
    std = np.std(arr, ddof=ddof)
    half_width = z * (std / np.sqrt(n))

    return mean - half_width, mean + half_width

In [None]:
def bootstrap(model, X, y, test_size, n, conf=95, random_state=None):
    """Confidence intervals for hold-out metrics over repeated random splits.

    On each of ``n`` rounds the data is split with ``train_test_split``, a
    fresh ``model()`` is fitted on the training part and scored on the test
    part. Rows are split without replacement, so despite the name this is
    repeated random sub-sampling rather than resampling with replacement.

    Parameters
    ----------
    model : callable
        Zero-argument factory (e.g. an estimator class) returning an object
        with ``fit(X, y)`` and ``predict(X)``.
    X, y
        Features and target, passed to ``train_test_split``.
    test_size
        Forwarded to ``train_test_split``.
    n : int
        Number of rounds; also the sample size handed to ``conf_int``.
    conf : int or float, default 95
        Confidence level in percent, forwarded to ``conf_int``.
    random_state : int, numpy.random.RandomState or None, default None
        ``None`` keeps the previous behaviour (splits draw from NumPy's global
        random state). Otherwise one generator is shared by all rounds, so the
        whole run is reproducible while every round still gets its own split.

    Returns
    -------
    pandas.Series
        Indexed by "Pearson", "RVE", "RMSE" and "MaxErr"; each value is the
        ``(lower, upper)`` tuple returned by ``conf_int``.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    scores = {"Pearson": [], "RVE": [], "RMSE": [], "MaxErr": []}
    for _ in range(n):
        (X_train, X_test,
         y_train, y_test) = train_test_split(X, y, test_size=test_size,
                                             random_state=rng)
        fitted = model().fit(X_train, y_train)
        preds = fitted.predict(X_test)

        scores["Pearson"].append(np.corrcoef(preds, y_test)[0, 1])
        scores["RVE"].append(explained_variance_score(y_test, preds))
        scores["RMSE"].append(np.sqrt(mean_squared_error(y_test, preds)))
        scores["MaxErr"].append(np.abs(y_test - preds).max())

    return pd.Series(
        data=[conf_int(values, n, conf) for values in scores.values()],
        index=list(scores),
    )

In [None]:
# Interval estimates for a four-feature linear model over 1000 random
# splits, each holding out 20% of the training rows.
bootstrap(
    model=LinearRegression,
    X=X_train[["age", "HNR", "PPE", "Shimmer:APQ11"]],
    y=y_train,
    test_size=0.2,
    n=1000,
)