## Machine Learning Project on UCI Parkinsons Telemonitoring Data Set

Importación de librerías

In [None]:
#!pip install qgrid

In [31]:
from __future__ import division

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.matlib as matlib

from numpy import random
from numpy import round
import math

import pandas as pd
import qgrid

import scipy as sc
from scipy.spatial import distance
from scipy import stats

#algunas advertencias que queremos evitar
import warnings
warnings.filterwarnings("always")

Cargamos la base de datos

In [2]:
#Cargamos la bd que está en un archivo .data y ahora la podemos manejar de forma matricial
db = np.loadtxt('DB/parkinsons_updrs.data', delimiter=',', skiprows=1)  # Assuming ',' delimiter

#X: Toma todas las filas (muestras) y las columnas 6-21 (características)
X = db[:,6:22]
#Y: Toma todas las filas y la columna 4, corresponde a la salida de la regresión
Y = db[:,4]
#G: Toma todas las filas y la columna 0, corresponde a la asociación en grupos del dataset
G = db[:,0]

In [3]:
print(X.shape)
print(Y.shape)
print(G.shape)

(5875, 16)
(5875,)
(5875,)


Medidas de error

In [35]:
#Mean Absolute Percentage Error
def MAPE(Y, Y_est):
    N = np.size(Y)
    mape = (1/N)*np.sum(abs((Y_est.reshape(N,1) - Y.reshape(N,1))/Y.reshape(N,1)))
    return mape

#Root Mean Square Error
def RMSE(Y, Y_est):
    N = np.size(Y)
    rmse = math.sqrt((1/N)*np.sum((Y_est.reshape(N,1) - Y.reshape(N,1))**2))
    return rmse

#### Regresión Lineal Múltiple

In [13]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

# modelo
model = Pipeline([('poly', PolynomialFeatures()), ('linear', LinearRegression(fit_intercept=False))])
#print(model.get_params().keys())

parameters = {'poly__degree': [1,2,3]}

# métricas de error
# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)

scores =  {'mae':mae,'r2':r2}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)



In [14]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'std_train_mae', 'mean_train_r2', 'std_train_r2',
            'mean_test_mae', 'std_test_mae', 'mean_test_r2', 'std_test_r2']]
outcomes

Unnamed: 0,params,mean_train_mae,std_train_mae,mean_train_r2,std_train_r2,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2
0,{'poly__degree': 1},6.351467,0.137507,0.137742,0.036423,7.111647,0.257081,-0.1902,0.159749
1,{'poly__degree': 2},5.461242,0.140235,0.316153,0.035992,8.235385,1.381189,-2.500878,4.340854
2,{'poly__degree': 3},4.150685,0.15017,0.560703,0.031053,42.00983,51.07546,-16383.064456,38457.80504


#### Ventana de Parzen

In [24]:
def kernel_gaussiano(x):
    return (np.exp((-0.5)*x**2))

def Nadaraya_Watson(X_train, Y_train, X_val, h): # h equivale al ancho de la ventana o kernel
    Nval = len(X_val)
    Ntrain = len(X_train)
    Y_val = np.zeros(Nval)
    
    for i in range(Nval):
        muestra = X_val[i,:]
        numerador = 0
        denominador = 0
        for j in range(Ntrain):
            distancia = distance.euclidean(muestra, X_train[j])/h
            peso = kernel_gaussiano(distancia)
            numerador += peso * Y_train[j]
            denominador += peso
        Y_val[i] = numerador/denominador
    #Se retorna un vector que contiene las predicciones para cada una de las muestras en X_val, en el mismo orden.      
    return Y_val

In [32]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, r2_score

def executePARZEN_NW(h):
    
    iterations = 2
    errorMAE = np.zeros(iterations)
    errorMAPE = np.zeros(iterations)
    errorR2 = np.zeros(iterations)
    
    random.seed(19680801)
    gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
    j=0
    for train_idx, test_idx in gss.split(X, Y, G):
        Xtrain = X[train_idx,:]
        Ytrain = Y[train_idx]
        Xtest = X[test_idx,:]
        Ytest = Y[test_idx]

        #Normalizamos los datos
        media = np.mean(Xtrain)
        desvia = np.std(Xtrain)
        Xtrain = sc.stats.stats.zscore(Xtrain)
        Xtest = (Xtest - np.matlib.repmat(media, Xtest.shape[0], 1))/np.matlib.repmat(desvia, Xtest.shape[0], 1)

        Yest = Nadaraya_Watson(Xtrain, Ytrain, Xtest, h)

        #Evaluamos las predicciones del modelo con los datos de test
        errorMAPE[j] = MAPE(Ytest, Yest)
        errorMAE[j] = mean_absolute_error(Ytest, Yest)
        errorR2[j] = r2_score(Ytest, Yest)
        j += 1

    mape = np.mean(errorMAPE)
    std_mape = np.std(errorMAPE)
    mae = np.mean(errorMAE)
    std_mae = np.std(errorMAE)
    r2 = np.mean(errorR2)
    std_r2 = np.std(errorR2)
    
    return(mape, std_mape, mae, std_mae, r2, std_r2)


In [None]:
executePARZEN_NW(0.5)

In [None]:
df_types = pd.DataFrame({
    'Ancho del kernel' : pd.Series(['0.1', '0.5', '1', '2', '5'])})
df_types["MAPE"] = ""
df_types["std_MAPE"] = ""
df_types["MAE"] = ""
df_types["std_MAE"] = ""
df_types["R2"] = ""
df_types["std_R2"] = ""
df_types.set_index(['Ancho del kernel'], inplace=True)

valoresH = np.array([0.1,0.5,1,2,5])
for i in range(np.size(valoresH)):
    mape, std_mape, mae, std_mae, r2, std_r2  = executePARZEN_NW(valoresH[i])
    df_types["MAPE"][i] = round(mape,4)
    df_types["std_MAPE"][i] = round(std_mape,4)
    df_types["MAE"][i] = round(mae,4)
    df_types["std_MAE"][i] = round(std_mae,4)
    df_types["R2"][i] = round(r2,4)
    df_types["std_R2"][i] = round(std_r2,4)
    
#df_types.sort_index(inplace=True)
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

In [None]:
qgrid_widget.get_changed_df()

#### Redes Neuronales Artificales

In [37]:
# MAE, RMSE, MAPE

from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

model = MLPRegressor(activation='tanh')

parameters = {'hidden_layer_sizes': [(8),(16),(32),(8,8),(16,16),(32,32)],
             'max_iter': [100,500]}
#[500,1000,1500]

# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
rmse = make_scorer(RMSE)
mape = make_scorer(MAPE)

scores =  {'mae':mae, 'rmse':rmse, 'mape':mape}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)









In [38]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'std_train_mae', 'mean_train_rmse', 'std_train_rmse', 'mean_train_mape', 'std_train_mape',
            'mean_test_mae', 'std_test_mae', 'mean_test_rmse', 'std_test_rmse', 'mean_test_mape', 'std_test_mape']]
outcomes

Unnamed: 0,params,mean_train_mae,std_train_mae,mean_train_rmse,std_train_rmse,mean_train_mape,std_train_mape,mean_test_mae,std_test_mae,mean_test_rmse,std_test_rmse,mean_test_mape,std_test_mape
0,"{'hidden_layer_sizes': 8, 'max_iter': 100}",6.716662,0.241907,8.297941,0.227638,0.352653,0.017591,7.648551,1.04919,9.285081,1.362603,0.389551,0.055015
1,"{'hidden_layer_sizes': 8, 'max_iter': 500}",5.554457,0.199074,6.853846,0.208516,0.341781,0.025969,7.489612,0.513028,9.09415,0.56503,0.442633,0.094408
2,"{'hidden_layer_sizes': 16, 'max_iter': 100}",6.138732,0.087171,7.366249,0.130848,0.371345,0.019243,7.044561,0.340851,8.528543,0.599519,0.410478,0.076985
3,"{'hidden_layer_sizes': 16, 'max_iter': 500}",5.399281,0.171298,6.69526,0.184113,0.333283,0.024461,7.600595,0.581662,9.182833,0.621956,0.44542,0.096894
4,"{'hidden_layer_sizes': 32, 'max_iter': 100}",5.890899,0.134254,7.127833,0.162373,0.361492,0.020549,7.180905,0.316841,8.671141,0.434664,0.421849,0.082245
5,"{'hidden_layer_sizes': 32, 'max_iter': 500}",5.129302,0.195422,6.423025,0.215926,0.317628,0.026564,7.628261,0.526831,9.218773,0.560384,0.448776,0.100888
6,"{'hidden_layer_sizes': (8, 8), 'max_iter': 100}",6.943613,0.354662,8.578912,0.374229,0.360133,0.020742,7.499929,0.82837,9.0384,0.958172,0.384151,0.057937
7,"{'hidden_layer_sizes': (8, 8), 'max_iter': 500}",5.01786,0.225684,6.316063,0.241053,0.305847,0.027138,7.772965,0.561015,9.410614,0.540157,0.459323,0.101112
8,"{'hidden_layer_sizes': (16, 16), 'max_iter': 100}",5.999495,0.147167,7.296051,0.198969,0.358875,0.020963,6.973722,0.33029,8.456558,0.520348,0.410438,0.077857
9,"{'hidden_layer_sizes': (16, 16), 'max_iter': 500}",4.560961,0.239221,5.861313,0.288979,0.279495,0.027946,7.993504,0.489512,9.671294,0.533522,0.468771,0.103836


#### Random Forest

In [40]:
# MAE, R2, MAPE
from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

model = RandomForestRegressor()

parameters = {'n_estimators': [5,10,20,50],
             'max_features': [5,10,16]}

# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)

scores =  {'mae':mae,'r2':r2}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)




In [41]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'std_train_mae', 'mean_train_r2', 'std_train_r2',
            'mean_test_mae', 'std_test_mae', 'mean_test_r2', 'std_test_r2']]
outcomes

Unnamed: 0,params,mean_train_mae,std_train_mae,mean_train_r2,std_train_r2,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2
0,"{'max_features': 5, 'n_estimators': 5}",2.154013,0.1042,0.842394,0.013321,7.741684,0.342507,-0.442071,0.182841
1,"{'max_features': 5, 'n_estimators': 10}",2.004675,0.079608,0.880579,0.00997,7.573355,0.36675,-0.363678,0.167094
2,"{'max_features': 5, 'n_estimators': 20}",1.893591,0.065349,0.902065,0.008073,7.408518,0.366373,-0.299132,0.160944
3,"{'max_features': 5, 'n_estimators': 50}",1.820054,0.069992,0.914683,0.007084,7.329166,0.329492,-0.266925,0.145353
4,"{'max_features': 10, 'n_estimators': 5}",2.108366,0.087829,0.844848,0.012959,7.796806,0.372851,-0.46147,0.16348
5,"{'max_features': 10, 'n_estimators': 10}",1.944756,0.085535,0.886159,0.011095,7.645188,0.40613,-0.387185,0.17314
6,"{'max_features': 10, 'n_estimators': 20}",1.841006,0.081576,0.906005,0.008264,7.499807,0.361325,-0.32872,0.141652
7,"{'max_features': 10, 'n_estimators': 50}",1.779992,0.07402,0.916483,0.00731,7.429142,0.373306,-0.301597,0.144956
8,"{'max_features': 16, 'n_estimators': 5}",2.066239,0.087739,0.849869,0.012567,7.896331,0.375462,-0.488681,0.155882
9,"{'max_features': 16, 'n_estimators': 10}",1.940315,0.07772,0.885392,0.008373,7.649168,0.389107,-0.3916,0.17233
