## Machine Learning Project on UCI Parkinsons Telemonitoring Data Set

Importación de librerías

In [None]:
#!pip install qgrid

In [1]:
from __future__ import division

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.matlib as matlib

from numpy import random
from numpy import round
import math

import pandas as pd
import qgrid

import scipy as sc
from scipy.spatial import distance
from scipy import stats

#algunas advertencias que queremos evitar
import warnings
warnings.filterwarnings("always")

Cargamos la base de datos

In [2]:
#Cargamos la bd que está en un archivo .data y ahora la podemos manejar de forma matricial
db = np.loadtxt('DB/parkinsons_updrs.data', delimiter=',', skiprows=1)  # Assuming ',' delimiter

#X: Toma todas las filas (muestras) y las columnas 6-21 (características)
X = db[:,6:22]
#Y: Toma todas las filas y la columna 4, corresponde a la salida de la regresión
Y = db[:,4]
#G: Toma todas las filas y la columna 0, corresponde a la asociación en grupos del dataset
G = db[:,0]

In [3]:
print(X.shape)
print(Y.shape)
print(G.shape)

(5875, 16)
(5875,)
(5875,)


Medidas de error

In [4]:
#Mean Absolute Percentage Error
def MAPE(Y, Y_est):
    N = np.size(Y)
    mape = (1/N)*np.sum(abs((Y_est.reshape(N,1) - Y.reshape(N,1))/Y.reshape(N,1)))
    return mape

#### Regresión Lineal Múltiple

In [6]:
# MAE, R2, RMSE

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, mean_squared_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

# modelo
model = Pipeline([('poly', PolynomialFeatures()), ('linear', LinearRegression(fit_intercept=False))])
#print(model.get_params().keys())

parameters = {'poly__degree': [1,2,3,4]}

# métricas de error
# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)
rmse = make_scorer(mean_squared_error, squared=False)

scores = {'mae':mae,'r2':r2,'rmse':rmse}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)

In [7]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_r2', 'mean_train_rmse',
            'mean_test_mae', 'std_test_mae', 'mean_test_r2', 'std_test_r2', 'mean_test_rmse', 'std_test_rmse']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_r2,mean_train_rmse,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2,mean_test_rmse,std_test_rmse
0,{'poly__degree': 1},6.351467,0.137742,7.524888,7.113906,0.258448,-0.1907575,0.1597205,8.57752,0.4485203
1,{'poly__degree': 2},5.461242,0.316153,6.700819,8.240104,1.386231,-2.519604,4.355412,12.99009,7.007837
2,{'poly__degree': 3},4.150653,0.560697,5.368819,42.478125,51.503028,-16759.49,38899.56,551.6074,842.1256
3,{'poly__degree': 4},0.000259,1.0,0.000496,96705.65473,112886.283714,-77638700000.0,86425670000.0,1706706.0,1352049.0


#### Ventana de Parzen

In [5]:
def kernel_gaussiano(x):
    return (np.exp((-0.5)*x**2))

def Nadaraya_Watson(X_train, Y_train, X_val, h): # h equivale al ancho de la ventana o kernel
    Nval = len(X_val)
    Ntrain = len(X_train)
    Y_val = np.zeros(Nval)
    
    for i in range(Nval):
        muestra = X_val[i,:]
        numerador = 0
        denominador = 0
        for j in range(Ntrain):
            distancia = distance.euclidean(muestra, X_train[j])/h
            peso = kernel_gaussiano(distancia)
            numerador += peso * Y_train[j]
            denominador += peso
        Y_val[i] = numerador/denominador
    #Se retorna un vector que contiene las predicciones para cada una de las muestras en X_val, en el mismo orden.      
    return Y_val

In [11]:
# MAE, R2

from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, r2_score

def executePARZEN_NW(h):
    
    iterations = 5
    errorMAE = np.zeros(iterations)
    errorR2 = np.zeros(iterations)
    
    random.seed(19680801)
    gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
    j=0
    for train_idx, test_idx in gss.split(X, Y, G):
        Xtrain = X[train_idx,:]
        Ytrain = Y[train_idx]
        Xtest = X[test_idx,:]
        Ytest = Y[test_idx]

        #Normalizamos los datos
        media = np.mean(Xtrain)
        desvia = np.std(Xtrain)
        Xtrain = sc.stats.stats.zscore(Xtrain)
        Xtest = (Xtest - np.matlib.repmat(media, Xtest.shape[0], 1))/np.matlib.repmat(desvia, Xtest.shape[0], 1)

        Yest = Nadaraya_Watson(Xtrain, Ytrain, Xtest, h)

        #Evaluamos las predicciones del modelo con los datos de test
        errorMAE[j] = mean_absolute_error(Ytest, Yest)
        errorR2[j] = r2_score(Ytest, Yest)
        j += 1

    mae = np.mean(errorMAE)
    std_mae = np.std(errorMAE)
    r2 = np.mean(errorR2)
    std_r2 = np.std(errorR2)
    
    return(round(mae,4), round(std_mae,4), round(r2,4), round(std_r2,4))


In [13]:
df_types = pd.DataFrame({
    'Ancho del kernel' : pd.Series(['0.5', '1', '2', '5','10'])})
df_types["MAE"] = ""
df_types["std_MAE"] = ""
df_types["R2"] = ""
df_types["std_R2"] = ""
df_types.set_index(['Ancho del kernel'], inplace=True)

valoresH = np.array([0.5,1,2,5,10])
for i in range(np.size(valoresH)):
    mae, std_mae, r2, std_r2  = executePARZEN_NW(valoresH[i])
    df_types["MAE"][i] = mae
    df_types["std_MAE"][i] = std_mae
    df_types["R2"][i] = r2
    df_types["std_R2"][i] = std_r2
    
#df_types.sort_index(inplace=True)
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [14]:
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,MAE,std_MAE,R2,std_R2
Ancho del kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.5,10.3501,1.8655,-1.5605,0.9141
1.0,7.5742,1.0014,-0.3956,0.4086
2.0,7.0959,0.8243,-0.1634,0.2377
5.0,7.018,0.7643,-0.1185,0.1969
10.0,6.9963,0.7396,-0.1074,0.1825


#### Redes Neuronales Artificales

In [None]:
# MAE, R2, RMSE

from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, mean_squared_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

model = MLPRegressor(activation='tanh')

parameters = {'hidden_layer_sizes': [(8),(16),(32),(8,8),(16,16),(32,32),(8,8,8),(16,16,16),(32,32,32)],
             'max_iter': [500,1000,1500]}

# métricas de error
# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)
rmse = make_scorer(mean_squared_error, squared=False)

scores = {'mae':mae,'r2':r2,'rmse':rmse}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)

In [12]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_r2', 'mean_train_rmse',
            'mean_test_mae', 'std_test_mae', 'mean_test_r2', 'std_test_r2', 'mean_test_rmse', 'std_test_rmse']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_r2,mean_train_rmse,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2,mean_test_rmse,std_test_rmse
0,"{'hidden_layer_sizes': 8, 'max_iter': 500}",5.562746,0.279462,6.875116,7.54749,0.465845,-0.358031,0.233509,9.136517,0.529503
1,"{'hidden_layer_sizes': 8, 'max_iter': 1000}",5.406609,0.314536,6.705013,7.631354,0.584386,-0.389432,0.267782,9.233823,0.663682
2,"{'hidden_layer_sizes': 8, 'max_iter': 1500}",5.374483,0.318367,6.68524,7.670587,0.564504,-0.389749,0.238831,9.247882,0.600639
3,"{'hidden_layer_sizes': 16, 'max_iter': 500}",5.382395,0.318178,6.684793,7.587403,0.556264,-0.369906,0.240422,9.178715,0.594459
4,"{'hidden_layer_sizes': 16, 'max_iter': 1000}",5.141643,0.366153,6.447493,7.737584,0.532623,-0.41783,0.217151,9.353877,0.584137
5,"{'hidden_layer_sizes': 16, 'max_iter': 1500}",5.022856,0.389383,6.331027,7.852918,0.522782,-0.462418,0.198323,9.509979,0.574289
6,"{'hidden_layer_sizes': 32, 'max_iter': 500}",5.136924,0.371123,6.423922,7.637718,0.519401,-0.381516,0.215259,9.230038,0.557211
7,"{'hidden_layer_sizes': 32, 'max_iter': 1000}",4.770705,0.442188,6.047095,7.847101,0.458356,-0.46027,0.201002,9.498697,0.525808
8,"{'hidden_layer_sizes': 32, 'max_iter': 1500}",4.628095,0.470133,5.891623,7.995462,0.495639,-0.525111,0.231852,9.698625,0.572901
9,"{'hidden_layer_sizes': (8, 8), 'max_iter': 500}",5.020207,0.389342,6.329784,7.705481,0.456041,-0.42792,0.21826,9.381973,0.484795


#### Random Forest

In [None]:
# MAE, R2
from sklearn.model_selection import GroupShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

model = RandomForestRegressor()

parameters = {'n_estimators': [5,10,20,50,100],
             'max_features': [5,10,16]}

# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)

scores = {'mae':mae,'r2':r2}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X_norm, Y)


In [9]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_r2',
            'mean_test_mae', 'std_test_mae', 'mean_test_r2', 'std_test_r2']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_r2,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2
0,"{'max_features': 5, 'n_estimators': 5}",2.154013,0.842394,7.736161,0.342465,-0.440328,0.18188
1,"{'max_features': 5, 'n_estimators': 10}",2.004675,0.880579,7.567269,0.36771,-0.361952,0.166379
2,"{'max_features': 5, 'n_estimators': 20}",1.893591,0.902065,7.403156,0.367118,-0.297528,0.16027
3,"{'max_features': 5, 'n_estimators': 50}",1.820054,0.914683,7.324262,0.330162,-0.265431,0.144835
4,"{'max_features': 5, 'n_estimators': 100}",1.802447,0.918256,7.320253,0.334875,-0.261077,0.145253
5,"{'max_features': 10, 'n_estimators': 5}",2.075158,0.847546,7.809514,0.303777,-0.464814,0.17034
6,"{'max_features': 10, 'n_estimators': 10}",1.94788,0.885185,7.631766,0.31351,-0.388329,0.167694
7,"{'max_features': 10, 'n_estimators': 20}",1.840819,0.905582,7.465276,0.329436,-0.324875,0.14955
8,"{'max_features': 10, 'n_estimators': 50}",1.783124,0.916239,7.410669,0.360195,-0.300313,0.1433
9,"{'max_features': 10, 'n_estimators': 100}",1.756294,0.920376,7.4131,0.351831,-0.297298,0.146518


#### Máquinas de Vectores de Soporte