## Machine Learning Project on UCI Parkinsons Telemonitoring Data Set

Importación de librerías:

In [None]:
#!pip install qgrid

In [1]:
from __future__ import division

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.matlib as matlib

from numpy import random
import math

import pandas as pd
import qgrid

import scipy as sc
from scipy.spatial import distance
from scipy import stats

# Warning filter configuration. The "always" action makes every warning be
# printed each time it is triggered; it does not silence anything.
# NOTE(review): the original comment described these as warnings "we want to
# avoid" -- if suppression was intended the action would be "ignore"; confirm.
import warnings
warnings.filterwarnings("always")

Cargamos la base de datos:

In [2]:
# Read the comma-separated .data file (first row skipped) into a numeric matrix
db = np.loadtxt('DB/parkinsons_updrs.data', skiprows=1, delimiter=',')

# G: column 0 of every row, the group each sample belongs to
# Y: column 4 of every row, the regression target
# X: columns 6-21 of every row, the input features
G, Y, X = db[:, 0], db[:, 4], db[:, 6:22]

In [3]:
# Show the dimensions of the feature matrix, the target and the group labels
for array in (X, Y, G):
    print(array.shape)

(5875, 16)
(5875,)
(5875,)


Medidas de error para evaluar los métodos de regresión:

In [4]:
# Mean squared error
def MSE(Y_est,Y):
    """Return the mean of the squared differences between Y_est and Y.

    Both arguments are NumPy arrays holding the same number of elements;
    each one is reshaped to a column vector before the comparison.
    """
    N = np.size(Y)
    column = (N, 1)
    residuals = Y_est.reshape(column) - Y.reshape(column)
    return (1/N)*np.sum(residuals**2)

# Mean absolute error
def MAE(Y_est,Y):
    """Return the mean of the absolute differences between Y_est and Y.

    Both arguments are NumPy arrays holding the same number of elements;
    each one is reshaped to a column vector before the comparison.
    """
    N = np.size(Y)
    column = (N, 1)
    deviations = np.absolute(Y_est.reshape(column) - Y.reshape(column))
    return (1/N)*np.sum(deviations)

# Mean absolute percentage error
def MAPE(Y_est,Y):
    """Return the mean of |Y_est - Y| / |Y|, expressed as a fraction (not x100).

    Both arguments are NumPy arrays holding the same number of elements;
    each one is reshaped to a column vector before the comparison.
    """
    N = np.size(Y)
    column = (N, 1)
    relative = (Y_est.reshape(column) - Y.reshape(column))/Y.reshape(column)
    return (1/N)*np.sum(np.absolute(relative))

# Root mean squared error
def RMSE(Y_est,Y):
    """Return the square root of the mean squared difference between Y_est and Y.

    Both arguments are NumPy arrays holding the same number of elements;
    each one is reshaped to a column vector before the comparison.
    The result is a plain Python float (math.sqrt).
    """
    N = np.size(Y)
    column = (N, 1)
    residuals = Y_est.reshape(column) - Y.reshape(column)
    mean_squared = (1/N)*np.sum(residuals**2)
    return math.sqrt(mean_squared)

# Coefficient of determination
def R2(Y_est,Y):
    """Return 1 - SSE/SST for the predictions Y_est against the targets Y.

    SST is the sum of squared deviations of Y from its own mean and SSE is
    the sum of squared differences between Y_est and Y. Both arguments are
    NumPy arrays holding the same number of elements.
    """
    N = np.size(Y)
    target = Y.reshape(N,1)
    mean_target = np.sum(target)/N
    total_ss = np.sum((mean_target - target)**2)
    residual_ss = np.sum((Y_est.reshape(N,1) - target)**2)
    return 1 - (residual_ss/total_ss)


#### Regresión Polinomial Múltiple

In [41]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

def PolynomialRegression(degree):
    """Evaluate a polynomial regression of the given degree with grouped splits.

    Runs 10 GroupShuffleSplit iterations (70% of the groups for training) over
    the notebook-level arrays X, Y and G. In each iteration the features are
    standardised with statistics fitted on the training part, expanded with
    PolynomialFeatures(degree) and fitted with a LinearRegression that has no
    separate intercept.

    Returns a 4-tuple with the averages over the iterations of:
    (training MAE, training R2, validation MAE, validation R2).
    """
    iterations = 10
    # Seed NumPy's global generator before creating the splitter
    random.seed(19680801)

    train_mae = np.zeros(iterations)
    train_r2 = np.zeros(iterations)
    val_mae = np.zeros(iterations)
    val_r2 = np.zeros(iterations)

    splitter = GroupShuffleSplit(n_splits=iterations, train_size=.7)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, Y, G)):
        Xtrain, Ytrain = X[train_idx,:], Y[train_idx]
        Xtest, Ytest = X[test_idx,:], Y[test_idx]

        # Standardise both partitions with the training statistics only
        scaler = preprocessing.StandardScaler().fit(Xtrain)
        Xtrain = scaler.transform(Xtrain)
        Xtest = scaler.transform(Xtest)

        # Polynomial expansion followed by a linear fit without intercept
        steps = [('poly', PolynomialFeatures(degree=degree)),
                 ('linear', LinearRegression(fit_intercept=False))]
        model = Pipeline(steps)
        model.fit(Xtrain, Ytrain)

        # Predictions on the data seen during training and on the held-out data
        YestTrain = model.predict(Xtrain)
        YestVal = model.predict(Xtest)

        # Score both sets of predictions for this iteration
        train_mae[fold] = MAE(YestTrain, Ytrain)
        train_r2[fold] = R2(YestTrain, Ytrain)
        val_mae[fold] = MAE(YestVal, Ytest)
        val_r2[fold] = R2(YestVal, Ytest)

    return(np.mean(train_mae), np.mean(train_r2), np.mean(val_mae), np.mean(val_r2))


In [42]:
# Degree-2 model; the displayed tuple is (mean train MAE, mean train R2,
# mean validation MAE, mean validation R2) over the splits.
PolynomialRegression(2)

(5.461242028272216, 0.3161529298022627, 8.240103665000163, -2.519604188875836)

#### Redes Neuronales Artificiales

In [6]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing

#from sklearn.metrics import mean_absolute_error
#from sklearn.metrics import r2_score

def ANN(layers, neurons, epochs):
    """Evaluate an MLP regressor with grouped shuffle splits.

    Parameters
    ----------
    layers : int
        Number of hidden layers.
    neurons : int
        Number of neurons in every hidden layer.
    epochs : int
        Passed to MLPRegressor as max_iter.

    Returns
    -------
    tuple
        (mean MAE, std MAE, mean MSE, std MSE, mean RMSE, std RMSE,
        mean R2, std R2), computed on the held-out part of each of the
        10 GroupShuffleSplit iterations (70% of the groups for training)
        over the notebook-level arrays X, Y and G.
    """
    iterations = 10
    # Seed NumPy's global generator before creating the splitter and the model
    random.seed(19680801)
    errorMAE = np.zeros(iterations)
    errorMSE = np.zeros(iterations)
    errorRMSE = np.zeros(iterations)
    errorR2 = np.zeros(iterations)

    gss = GroupShuffleSplit(n_splits=iterations, train_size=0.7)
    for j, (train_idx, test_idx) in enumerate(gss.split(X, Y, G)):
        Xtrain = X[train_idx,:]
        Ytrain = Y[train_idx]
        Xtest = X[test_idx,:]
        Ytest = Y[test_idx]

        # Standardise with ONE scaler fitted on the training data, so train and
        # test go through exactly the same transform (including zero-variance
        # features, which a manual division by the std would turn into inf/nan).
        scaler = preprocessing.StandardScaler().fit(Xtrain)
        Xtrain = scaler.transform(Xtrain)
        Xtest = scaler.transform(Xtest)

        # `layers` hidden layers, each with `neurons` units
        numberOfNeurons = [neurons] * layers

        mlp = MLPRegressor(hidden_layer_sizes=numberOfNeurons, activation='tanh', max_iter=epochs).fit(Xtrain, Ytrain)

        # Predict the held-out samples
        Yest = mlp.predict(Xtest)

        # Score the predictions for this iteration
        errorMAE[j] = MAE(Yest,Ytest)
        errorMSE[j] = MSE(Yest,Ytest)
        errorRMSE[j] = RMSE(Yest,Ytest)
        errorR2[j] = R2(Yest,Ytest)

    return(np.mean(errorMAE), np.std(errorMAE), np.mean(errorMSE), np.std(errorMSE), np.mean(errorRMSE), np.std(errorRMSE), np.mean(errorR2), np.std(errorR2))


In [None]:
# Quick run: 3 hidden layers of 10 neurons each, with max_iter set to 1
ANN(3, 10, 1)

Resultados ANN

In [None]:
import pandas as pd
import qgrid
# Build the ANN results table: one row per (hidden layers, neurons per layer)
# combination. Rows are collected as records and turned into a DataFrame once;
# assigning through chained indexing (df["col"][i] = value) is unreliable and
# does not update the frame under pandas Copy-on-Write.
epochs = 1
maxLayers = 3
neurons = [20,28,36]

# Column order matches the tuple returned by ANN. The "IC ..." columns hold
# the standard deviation of each metric across the splits.
metric_columns = ["MAE", "IC MAE", "MSE", "IC MSE", "RMSE", "IC RMSE", "R2", "IC R2"]

records = []
for layers in range(1, maxLayers + 1):
    for n_neurons in neurons:
        scores = ANN(layers, n_neurons, epochs)
        record = {'N. de capas ocultas': layers, 'Neuronas por capa': n_neurons}
        record.update({name: round(value, 3) for name, value in zip(metric_columns, scores)})
        records.append(record)

df_types = pd.DataFrame(
    records,
    columns=['N. de capas ocultas', 'Neuronas por capa'] + metric_columns)

qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

In [None]:
# Train and report every (hidden layers, neurons per layer) combination
epochs = 1500
maxLayers = 3
neurons = [10,20,32]
for layers in range(1, maxLayers + 1):
    for n in neurons:
        scores = ANN(layers, n, epochs)
        print('layers: ' + str(layers) + ', neurons: ' + str(n))
        # ANN returns (mean, std) pairs in the order MAE, MSE, RMSE, R2
        for label, center, spread in zip(('MAE', 'MSE', 'RMSE', 'R2'), scores[0::2], scores[1::2]):
            print(label + ': ' + str(round(center,3)) + ' +- ' + str(round(spread,3)))
        print('\n')
        

#### Random Forest

In [10]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor

def RF(trees, features):
    """Evaluate a random forest regressor with grouped shuffle splits.

    Parameters
    ----------
    trees : int
        Passed to RandomForestRegressor as n_estimators.
    features : int
        Passed to RandomForestRegressor as max_features.

    Returns
    -------
    tuple
        (mean MAE, std MAE, mean R2, std R2), computed on the held-out part
        of each of the 10 GroupShuffleSplit iterations (70% of the groups for
        training) over the notebook-level arrays X, Y and G.
    """
    iterations = 10
    # Seed NumPy's global generator before creating the splitter and the model
    random.seed(19680801)
    errorMAE = np.zeros(iterations)
    errorR2 = np.zeros(iterations)

    gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
    for j, (train_idx, test_idx) in enumerate(gss.split(X, Y, G)):
        Xtrain = X[train_idx,:]
        Ytrain = Y[train_idx]
        Xtest = X[test_idx,:]
        Ytest = Y[test_idx]

        # Standardise per feature (axis=0) using training statistics only.
        # Without axis=0 the mean/std collapse to a single scalar over the
        # whole matrix, so the test set would be scaled differently from the
        # per-column z-scored training set and the trees' split thresholds
        # would no longer apply to it.
        media = np.mean(Xtrain, axis=0)
        desvia = np.std(Xtrain, axis=0)
        # Leave constant features at 0 instead of dividing by zero
        desvia[desvia == 0] = 1.0
        Xtrain = (Xtrain - media)/desvia
        Xtest = (Xtest - media)/desvia

        # Build and train the model
        model = RandomForestRegressor(n_estimators=trees, max_features=features).fit(Xtrain, Ytrain)

        # Predict the held-out samples
        Yest = model.predict(Xtest)

        # Score the predictions for this iteration
        errorMAE[j] = MAE(Yest,Ytest)
        errorR2[j] = R2(Yest,Ytest)

    return(np.mean(errorMAE), np.std(errorMAE), np.mean(errorR2), np.std(errorR2))
         

In [None]:
# Quick run: 25 trees, max_features=5
RF(25, 5)

Resultados RF

In [None]:
import pandas as pd
import qgrid
# Build the random forest results table: one row per (number of trees,
# max_features) combination. Rows are collected as records and turned into a
# DataFrame once; assigning through chained indexing (df["col"][i] = value) is
# unreliable and does not update the frame under pandas Copy-on-Write.
randn = np.random.randn  # not used in this cell; kept in case other cells rely on it

trees = np.array([5,10,20,50])
maxFeatures = np.array([5,10,15])

# Column order matches the tuple returned by RF. The "IC ..." columns hold
# the standard deviation of each metric across the splits.
metric_columns = ["MAE", "IC MAE", "R2", "IC R2"]

records = []
for est in trees:
    for n_features in maxFeatures:
        scores = RF(est, n_features)
        record = {'Numero de arboles': int(est), 'Variables analizadas por nodo': int(n_features)}
        record.update({name: round(value, 3) for name, value in zip(metric_columns, scores)})
        records.append(record)

df_types = pd.DataFrame(
    records,
    columns=['Numero de arboles', 'Variables analizadas por nodo'] + metric_columns)

qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget

In [None]:
# Train and report every (number of trees, max_features) combination
trees = [5,10,20,50,100]
maxFeatures = [5,10,15]
for est in trees:
    for features in maxFeatures:
        # RF returns (mean MAE, std MAE, mean R2, std R2)
        scores = RF(est, features)
        print('trees: ' + str(est) + ', features: ' + str(features))
        for label, center, spread in zip(('MAE', 'R2'), scores[0::2], scores[1::2]):
            print(label + ': ' + str(round(center,3)) + ' +- ' + str(round(spread,3)))
        print('\n')
        