In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, explained_variance_score

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split

import seaborn as sns

In [None]:
# Read the raw dataset from the working directory into a DataFrame.
data_path = "parkinsons_updrs.data"
data = pd.read_csv(data_path)

In [None]:
# Column names of the raw dataset.
data.columns

In [None]:
# Target is motor_UPDRS; both UPDRS scores are removed from the feature matrix.
y = data["motor_UPDRS"]
X = data.drop(["total_UPDRS", "motor_UPDRS"], axis="columns")

In [None]:
# Hold out 361 rows (integer test_size) as the independent validation set;
# everything else is the training set.
# NOTE(review): the split is row-wise, so rows of one subject# can land on
# both sides — confirm that is acceptable for this study.
X_train, X_IND, y_train, y_IND = train_test_split(
    X,
    y,
    test_size=361,
    random_state=361,
)

In [None]:
# Put training features and target side by side in one frame for exploration.
train_parts = [X_train, y_train]
data_train = pd.concat(train_parts, axis="columns")

In [None]:
# Preview the first rows of the combined training frame (features + target).
data_train.head()

In [None]:
# motor_UPDRS over test_time for subject 20, drawn as markers only (no line).
is_subject_20 = data_train["subject#"] == 20
data_train[is_subject_20].plot(x="test_time", y="motor_UPDRS", ls="", marker="o")

In [None]:
# Column names of the combined training frame.
data_train.columns

In [None]:
# Distinct values present in the "sex" column of the training frame.
data_train["sex"].unique()

In [None]:
# Distribution of motor_UPDRS for each value of "sex".
sns.boxplot(x="sex", y="motor_UPDRS", data=data_train)

In [None]:
# Remove the identifier and time columns so they are not used as predictors.
# errors="ignore" keeps the cell re-runnable: X_train is reassigned from itself,
# so on a second execution the columns are already gone and a plain drop
# would raise KeyError.
# NOTE(review): X_IND still carries both columns — drop them there too before
# scoring a model that was fitted on this X_train.
X_train = X_train.drop(columns=["subject#", "test_time"], errors="ignore")

In [None]:
def get_model_statistics_simple(model, X, y, test_size=0.5, random_state=21):
    """Fit ``model`` on a single hold-out split of (X, y) and report its errors.

    Parameters
    ----------
    model : object
        Estimator with ``fit(X, y)`` (whose return value is used for
        prediction) and ``predict(X)``. ``fit`` is called on the object
        passed in; no copy is made here.
    X, y :
        Features and target, forwarded to ``train_test_split``.
    test_size, random_state :
        Forwarded to ``train_test_split``. The defaults (0.5 and 21) are the
        values that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    tuple
        ``(train_rmse, test_rmse, pearse, rve, max_err)`` where ``pearse`` is
        the Pearson correlation between test targets and predictions, ``rve``
        the explained variance score and ``max_err`` the largest absolute
        test error. Note that ``get_model_statistics_cv`` returns its
        statistics in a different order.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    fitted_model = model.fit(X_fit, y_fit)

    # Error on the data the model was fitted on.
    fit_preds = fitted_model.predict(X_fit)
    train_rmse = np.sqrt(mean_squared_error(y_fit, fit_preds))

    # Error and agreement statistics on the held-out half.
    holdout_preds = fitted_model.predict(X_holdout)
    test_rmse = np.sqrt(mean_squared_error(y_holdout, holdout_preds))
    pearse = np.corrcoef(y_holdout, holdout_preds)[0, 1]
    rve = explained_variance_score(y_holdout, holdout_preds)
    max_err = np.abs(y_holdout - holdout_preds).max()

    return (train_rmse, test_rmse, pearse, rve, max_err)

In [None]:
def get_model_statistics_cv(model, X, y, n_splits=16):
    """Score ``model`` with K-fold cross-validation and aggregate the folds.

    Parameters
    ----------
    model : object
        Estimator with ``fit(X, y)`` (whose return value is used for
        prediction) and ``predict(X)``. The same object is refitted on every
        fold.
    X, y :
        Features and target. Both are sliced with ``.iloc``, so they must be
        pandas objects (or offer the same positional indexer).
    n_splits : int, default 16
        Number of folds handed to ``KFold``. The default is the value that
        used to be hard-coded. ``KFold`` receives no other argument, so its
        remaining options stay at the library defaults.

    Returns
    -------
    tuple
        ``(mean Pearson, mean test RMSE, mean explained variance,
        max absolute error over all folds, mean train RMSE)``.
        Note that this order differs from ``get_model_statistics_simple``.
    """
    kf = KFold(n_splits=n_splits)

    # One entry per fold.
    pearse = []
    test_rmse = []
    train_rmse = []
    rve = []
    max_err = []

    for train_index, test_index in kf.split(X):
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        fold_model = model.fit(X_fold_train, y_fold_train)

        train_preds = fold_model.predict(X_fold_train)
        preds = fold_model.predict(X_fold_test)

        train_rmse.append(np.sqrt(mean_squared_error(y_fold_train, train_preds)))
        pearse.append(np.corrcoef(y_fold_test, preds)[0, 1])
        test_rmse.append(np.sqrt(mean_squared_error(y_fold_test, preds)))
        rve.append(explained_variance_score(y_fold_test, preds))
        max_err.append(np.abs(y_fold_test - preds).max())

    # Averages across folds, except the worst-case error, which is the
    # maximum over all folds.
    return (
        np.mean(pearse),
        np.mean(test_rmse),
        np.mean(rve),
        np.max(max_err),
        np.mean(train_rmse),
    )

In [None]:
#get_model_statistics(Ridge(), X_train, y_train)

# Ridge Model Evaluation

In [None]:
# Grid of 100 evenly spaced regularisation strengths from 0.01 to 5 (inclusive).
alphas = np.linspace(start=0.01, stop=5, num=100)
print(alphas)

In [None]:
# Hold-out statistics of Ridge for every alpha in the grid, indexed by alpha.
# The rows are collected first and the frame is built once: enlarging a
# DataFrame one .loc row at a time re-allocates on every step and starts from
# untyped (object) columns.
ridge_records = []
for alpha in alphas:
    (train_rmse, test_rmse,
     pearse, rve, max_err) = get_model_statistics_simple(Ridge(alpha=alpha), X_train, y_train)

    ridge_records.append({
        "Pearson": pearse,
        "Test RMSE": test_rmse,
        "RVE": rve,
        "Max_err": max_err,
        "Train RMSE": train_rmse,
    })

results = pd.DataFrame(
    ridge_records,
    index=alphas,
    columns=["Pearson", "Test RMSE", "RVE", "Max_err", "Train RMSE"],
)

In [None]:
def plot_df(df, column, ax, title="", x_label="", y_label="", legend=""):
    """Draw ``df[column]`` against ``df.index`` on ``ax`` as a starred line.

    The function is meant to be called several times on the same axes, so
    text is only written when the caller supplies it: an omitted (empty)
    ``title``, ``x_label`` or ``y_label`` leaves whatever an earlier call set
    untouched instead of overwriting it with an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of the x values (its index) and the y values (``df[column]``).
    column : str
        Column to plot.
    ax :
        Axes-like object providing ``plot``, ``set_title``, ``set_xlabel``,
        ``set_ylabel`` and ``legend``.
    title, x_label, y_label : str, optional
        Axes texts; applied only when non-empty.
    legend : str, optional
        Label of the plotted line. The legend is (re)drawn only when a label
        is given, which avoids requesting a legend for an unlabelled line.
    """
    if title:
        ax.set_title(title)
    if y_label:
        ax.set_ylabel(y_label)
    if x_label:
        ax.set_xlabel(x_label)

    ax.plot(df.index, df[column], "-*", label=legend)
    if legend:
        ax.legend()

# Train and test RMSE of the Ridge sweep, overlaid on one axes titled "RMSE".
f, ax = plt.subplots()
plot_df(results, "Train RMSE", ax, legend="Train RMSE")
plot_df(results, "Test RMSE", ax, title="RMSE", legend="Test RMSE")


In [None]:
# Left panel: largest absolute test error per alpha.
# Right panel: Pearson correlation per alpha.
f, ax = plt.subplots(ncols=2, figsize=(12, 4))
max_err_ax, pearson_ax = ax

max_err_ax.plot(results.index, results["Max_err"], "-*")
max_err_ax.set_title("Max Error in the Model")

pearson_ax.plot(results.index, results["Pearson"], "-*")
pearson_ax.set_title("Pearson Correlation Coef.")

In [None]:
# Ridge coefficients for every alpha, fitted on the whole training set:
# one row per alpha, one column per feature of X_train.
# The rows are gathered first and the frame is built once instead of being
# enlarged with .loc inside the loop.
coef_rows = []
for alpha in alphas:
    model = Ridge(alpha=alpha).fit(X_train, y_train)
    coef_rows.append(model.coef_)
coefs = pd.DataFrame(coef_rows, index=alphas, columns=X_train.columns)

coef_ax = coefs.plot(figsize=(12, 10))
coef_ax.set(xlabel="alpha", ylabel="coefficient", title="Ridge coefficients vs. alpha");

It seems that the alpha-vs-error plot does not evolve towards a "sweet spot".
Instead of a U-like shape, the error evolves in a logarithmic manner.



# Lasso Model Evaluation

In [None]:
# Hold-out evaluation of Lasso over the alpha grid (single split, not K-fold).
#
# The split is taken from the training data only. Splitting the full X / y
# would pull the independent validation rows (X_IND) into model selection and
# bring back the "subject#" / "test_time" columns removed earlier.
# Dedicated names are used so the notebook-level X_train / y_train are not
# overwritten for the cells that follow.
# NOTE(review): test_size=0.8 fits on only 20% of the rows, while the Ridge
# helper uses 0.5 — confirm this is intended.
coefs = []
rmse_train = []
rmse_test = []
alphas = np.linspace(0.01, 5, 100)
X_lasso_train, X_lasso_test, y_lasso_train, y_lasso_test = train_test_split(
    X_train, y_train, test_size=0.8, random_state=21
)

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=100000).fit(X_lasso_train, y_lasso_train)
    preds_tr = lasso.predict(X_lasso_train)
    preds_te = lasso.predict(X_lasso_test)
    # np.sqrt(MSE) matches the helpers above and avoids the `squared=False`
    # argument, which newer scikit-learn releases no longer accept.
    rmse_train.append(np.sqrt(mean_squared_error(y_lasso_train, preds_tr)))
    rmse_test.append(np.sqrt(mean_squared_error(y_lasso_test, preds_te)))
    coefs.append(lasso.coef_)
# Shape (len(alphas), n_features): one row of coefficients per alpha.
coefs = np.array(coefs)

plt.plot(alphas, rmse_train, label="rmse Train")
plt.plot(alphas, rmse_test, label="rmse Test")
plt.xlabel("Alpha Value", fontsize=12)
plt.ylabel("RMSE", fontsize=12)
plt.legend()
plt.savefig("lasso_rmse.pdf", dpi=25, format="pdf")
plt.show()


In [None]:
# Coefficient value against alpha (log-scaled x axis), one curve for each
# column index in range(M), labelled "Var 1", "Var 2", ...
N, M = X_train.shape
lasso_fig, lasso_ax = plt.subplots(figsize=(12, 8))
for i in range(M):
    curve_label = "Var %d" % (i + 1)
    lasso_ax.plot(alphas, coefs[:, i], label=curve_label)
lasso_ax.set_xscale("log")
lasso_ax.legend()
lasso_ax.grid()
plt.show()

In [None]:
# Spacing between consecutive alpha values in the grid.
np.diff(alphas)

In [None]:
# Baseline: plain linear regression scored with the cross-validation helper.
# The helper returns (mean Pearson, mean test RMSE, mean RVE,
# max absolute error, mean train RMSE).
model = LinearRegression()
get_model_statistics_cv(model, X_train, y_train)

In [None]:
# Ridge with alpha=0, i.e. a zero penalty strength, as a reference point next
# to the LinearRegression run in the previous cell.
# NOTE(review): scikit-learn advises against alpha=0 with Ridge for numerical
# reasons — confirm the two runs agree before relying on this one.
get_model_statistics_cv(Ridge(alpha=0), X_train, y_train)