## Machine Learning Project on UCI Parkinsons Telemonitoring Data Set

Importación de librerías:

In [2]:
#!pip install qgrid

In [1]:
from __future__ import division

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.matlib as matlib

from numpy import random
import math

import pandas as pd
import qgrid

import scipy as sc
from scipy.spatial import distance
from scipy import stats

#algunas advertencias que queremos evitar
import warnings
warnings.filterwarnings("always")

Cargamos la base de datos:

In [2]:
#Cargamos la bd que está en un archivo .data y ahora la podemos manejar de forma matricial
db = np.loadtxt('DB/parkinsons_updrs.data', delimiter=',', skiprows=1)  # Assuming ',' delimiter

#X: Toma todas las filas (muestras) y las columnas 6-21 (características)
X = db[:,6:22]
#Y: Toma todas las filas y la columna 4, corresponde a la salida de la regresión
Y = db[:,4]
#G: Toma todas las filas y la columna 0, corresponde a la asociación en grupos del dataset
G = db[:,0]

In [3]:
print(X.shape)
print(Y.shape)
print(G.shape)

(5875, 16)
(5875,)
(5875,)


Medidas de error para evaluar los métodos de regresión:

In [5]:
##Mean Absolute Percentage Error
def MAPE(Y, Y_est):
    N = np.size(Y)
    mape = np.sum(abs((Y_est.reshape(N,1) - Y.reshape(N,1))/Y.reshape(N,1)))/N
    return mape

#### RF - Shuffle-Split test

In [7]:
# MAE, MAPE
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
bootstrap = ShuffleSplit(n_splits=iterations, train_size=.7)
index = bootstrap.split(X, Y)

model = RandomForestRegressor()

parameters = {'n_estimators': [5,10,20,50,100],
             'max_features': [5,10,16]}

# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
mape = make_scorer(MAPE)

scores = {'mae':mae,'mape':mape}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False, verbose=12)
grid_obj = grid_obj.fit(X_norm, Y)

Fitting 10 folds for each of 15 candidates, totalling 150 fits
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.322, test=5.928), mape=(train=0.138, test=0.364), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.312, test=5.878), mape=(train=0.140, test=0.360), total=   0.1s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.281, test=5.711), mape=(train=0.139, test=0.340), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.316, test=5.953), mape=(train=0.139, test=0.363), total=   0.1s

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s



[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.331, test=5.707), mape=(train=0.139, test=0.348), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.6s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.333, test=5.680), mape=(train=0.143, test=0.350), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.328, test=5.680), mape=(train=0.142, test=0.347), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.8s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.368, test=5.721), mape=(train=0.143, test=0.351), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.372, test=5.669), mape=(train=0.141, test=0.346), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.0s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.371, test=5.690), mape=(train=0.142, test=0.349), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s remaining:    0.0s


[CV]  max_features=5, n_estimators=10, mae=(train=2.166, test=5.604), mape=(train=0.135, test=0.351), total=   0.3s
[CV] max_features=5, n_estimators=10 .................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.5s remaining:    0.0s


[CV]  max_features=5, n_estimators=10, mae=(train=2.158, test=5.568), mape=(train=0.133, test=0.344), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.182, test=5.602), mape=(train=0.133, test=0.333), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.153, test=5.522), mape=(train=0.129, test=0.343), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.194, test=5.497), mape=(train=0.136, test=0.340), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.182, test=5.586), mape=(train=0.133, test=0.342), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.158, test=5.4

[CV]  max_features=10, n_estimators=5, mae=(train=2.267, test=5.739), mape=(train=0.135, test=0.346), total=   0.2s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.297, test=5.610), mape=(train=0.141, test=0.346), total=   0.1s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.279, test=5.614), mape=(train=0.138, test=0.337), total=   0.1s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.342, test=5.506), mape=(train=0.139, test=0.340), total=   0.1s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.278, test=5.478), mape=(train=0.137, test=0.333), total=   0.2s
[CV] max_features=10, n_estimators=10 ................................
[CV]  max_features=10, n_estimators=10, mae=(train=2.115, test=5.

[CV]  max_features=10, n_estimators=100, mae=(train=1.932, test=5.093), mape=(train=0.121, test=0.318), total=   2.9s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.251, test=5.518), mape=(train=0.133, test=0.340), total=   0.2s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.238, test=5.709), mape=(train=0.134, test=0.344), total=   0.2s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.238, test=5.524), mape=(train=0.132, test=0.327), total=   0.2s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.218, test=5.584), mape=(train=0.132, test=0.348), total=   0.2s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.321, test=5

[CV]  max_features=16, n_estimators=100, mae=(train=1.881, test=5.242), mape=(train=0.116, test=0.330), total=   4.4s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.915, test=5.148), mape=(train=0.118, test=0.320), total=   4.2s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.892, test=5.244), mape=(train=0.117, test=0.322), total=   4.3s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.920, test=5.113), mape=(train=0.119, test=0.316), total=   4.6s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.916, test=5.073), mape=(train=0.119, test=0.318), total=   4.2s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.9

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.9min finished


In [8]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_mape',
            'mean_test_mae', 'std_test_mae', 'mean_test_mape', 'std_test_mape']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_mape,mean_test_mae,std_test_mae,mean_test_mape,std_test_mape
0,"{'max_features': 5, 'n_estimators': 5}",2.333279,0.14057,5.761719,0.105619,0.352008,0.007519
1,"{'max_features': 5, 'n_estimators': 10}",2.173265,0.133136,5.523599,0.076072,0.340602,0.005796
2,"{'max_features': 5, 'n_estimators': 20}",2.062211,0.127018,5.387389,0.077702,0.333209,0.004616
3,"{'max_features': 5, 'n_estimators': 50}",1.984287,0.122601,5.326619,0.071194,0.331037,0.005063
4,"{'max_features': 5, 'n_estimators': 100}",1.964212,0.121823,5.306001,0.076891,0.329752,0.005041
5,"{'max_features': 10, 'n_estimators': 5}",2.283015,0.136856,5.640646,0.091426,0.344532,0.007241
6,"{'max_features': 10, 'n_estimators': 10}",2.125071,0.129639,5.387692,0.061613,0.332078,0.004856
7,"{'max_features': 10, 'n_estimators': 20}",2.014704,0.123401,5.274788,0.068001,0.326523,0.005452
8,"{'max_features': 10, 'n_estimators': 50}",1.946679,0.120374,5.198586,0.095896,0.323105,0.006222
9,"{'max_features': 10, 'n_estimators': 100}",1.924429,0.119544,5.191212,0.059584,0.322963,0.004967


#### SVR - Shuffle split test

In [9]:
# MAE, MAPE, RMSE

from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
bootstrap = ShuffleSplit(n_splits=iterations, train_size=.7)
index = bootstrap.split(X, Y)

model = SVR()

parameters = {'kernel': ['linear', 'rbf'],
              'C': [0.01, 1, 10],
              'gamma': [0.01, 0.1, 1]}

#print(model.get_params().keys())

# métricas de error
# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
mape = make_scorer(MAPE)
rmse = make_scorer(mean_squared_error, squared=False)

scores = {'mae':mae,'mape':mape,'rmse':rmse}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False, verbose=12)
grid_obj = grid_obj.fit(X_norm, Y)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.531, test=6.555), mape=(train=0.388, test=0.397), rmse=(train=7.799, test=7.839), total=   0.7s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.536, test=6.571), mape=(train=0.389, test=0.391), rmse=(train=7.824, test=7.855), total=   0.7s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.519, test=6.627), mape=(train=0.388, test=0.386), rmse=(train=7.806, test=7.870), total=   0.7s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.7s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.528, test=6.600), mape=(train=0.388, test=0.392), rmse=(train=7.815, test=7.866), total=   0.9s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.9s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.533, test=6.584), mape=(train=0.387, test=0.392), rmse=(train=7.802, test=7.846), total=   0.8s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.507, test=6.636), mape=(train=0.390, test=0.401), rmse=(train=7.784, test=7.885), total=   0.8s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.9s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.548, test=6.529), mape=(train=0.391, test=0.387), rmse=(train=7.830, test=7.810), total=   0.6s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.8s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.598, test=6.429), mape=(train=0.391, test=0.387), rmse=(train=7.895, test=7.610), total=   0.7s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.7s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.559, test=6.531), mape=(train=0.390, test=0.389), rmse=(train=7.857, test=7.758), total=   0.6s
[CV] C=0.01, gamma=0.01, kernel=linear ...............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.6s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=linear, mae=(train=6.615, test=6.384), mape=(train=0.396, test=0.385), rmse=(train=7.878, test=7.630), total=   0.7s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.5s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.907, test=6.904), mape=(train=0.428, test=0.438), rmse=(train=8.064, test=8.105), total=   1.1s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   11.2s remaining:    0.0s


[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.887, test=6.953), mape=(train=0.425, test=0.428), rmse=(train=8.082, test=8.107), total=   1.5s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.858, test=7.028), mape=(train=0.421, test=0.422), rmse=(train=8.059, test=8.215), total=   1.2s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.890, test=6.941), mape=(train=0.422, test=0.424), rmse=(train=8.077, test=8.141), total=   1.2s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.884, test=6.967), mape=(train=0.420, test=0.428), rmse=(train=8.084, test=8.141), total=   1.2s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, mae=(train=6.879, test=6.981), mape=(train=0.425, test=0.439), rmse=(train=8.051, test=8.1

[CV]  C=0.01, gamma=1, kernel=linear, mae=(train=6.615, test=6.384), mape=(train=0.396, test=0.385), rmse=(train=7.878, test=7.630), total=   0.8s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, mae=(train=6.891, test=6.878), mape=(train=0.429, test=0.438), rmse=(train=8.050, test=8.084), total=   1.3s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, mae=(train=6.860, test=6.936), mape=(train=0.425, test=0.428), rmse=(train=8.057, test=8.093), total=   1.3s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, mae=(train=6.832, test=7.010), mape=(train=0.420, test=0.422), rmse=(train=8.039, test=8.193), total=   1.4s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, mae=(train=6.873, test=6.916), mape=(train=0.424, test=0.425), rmse=(train=8.058, test=8.115), total= 

[CV]  C=1, gamma=0.1, kernel=linear, mae=(train=6.488, test=6.301), mape=(train=0.375, test=0.370), rmse=(train=7.916, test=7.612), total=   0.7s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV]  C=1, gamma=0.1, kernel=linear, mae=(train=6.449, test=6.410), mape=(train=0.377, test=0.375), rmse=(train=7.868, test=7.754), total=   0.8s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV]  C=1, gamma=0.1, kernel=linear, mae=(train=6.483, test=6.308), mape=(train=0.378, test=0.369), rmse=(train=7.910, test=7.706), total=   1.2s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV]  C=1, gamma=0.1, kernel=rbf, mae=(train=5.619, test=5.777), mape=(train=0.339, test=0.354), rmse=(train=7.013, test=7.161), total=   1.2s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV]  C=1, gamma=0.1, kernel=rbf, mae=(train=5.638, test=5.753), mape=(train=0.340, test=0.346), rmse=(train=7.048, test=7.161), total=

[CV]  C=10, gamma=0.01, kernel=linear, mae=(train=6.440, test=6.426), mape=(train=0.376, test=0.372), rmse=(train=7.884, test=7.828), total=   2.2s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=0.01, kernel=linear, mae=(train=6.486, test=6.299), mape=(train=0.375, test=0.369), rmse=(train=7.940, test=7.638), total=   1.8s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=0.01, kernel=linear, mae=(train=6.447, test=6.405), mape=(train=0.376, test=0.375), rmse=(train=7.889, test=7.757), total=   2.2s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=0.01, kernel=linear, mae=(train=6.479, test=6.316), mape=(train=0.377, test=0.369), rmse=(train=7.929, test=7.714), total=   1.9s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV]  C=10, gamma=0.01, kernel=rbf, mae=(train=6.006, test=6.115), mape=(train=0.355, test=0.368), rmse=(train=7.404, test=7

[CV]  C=10, gamma=1, kernel=linear, mae=(train=6.418, test=6.468), mape=(train=0.371, test=0.375), rmse=(train=7.837, test=7.900), total=   2.5s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV]  C=10, gamma=1, kernel=linear, mae=(train=6.398, test=6.521), mape=(train=0.375, test=0.385), rmse=(train=7.813, test=7.885), total=   2.5s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV]  C=10, gamma=1, kernel=linear, mae=(train=6.440, test=6.426), mape=(train=0.376, test=0.372), rmse=(train=7.884, test=7.828), total=   2.1s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV]  C=10, gamma=1, kernel=linear, mae=(train=6.486, test=6.299), mape=(train=0.375, test=0.369), rmse=(train=7.940, test=7.638), total=   1.7s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV]  C=10, gamma=1, kernel=linear, mae=(train=6.447, test=6.405), mape=(train=0.376, test=0.375), rmse=(train=7.889, test=7.757), total

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  5.6min finished


In [10]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_mape', 'mean_train_rmse',
            'mean_test_mae', 'std_test_mae', 'mean_test_mape', 'std_test_mape', 'mean_test_rmse', 'std_test_rmse']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_mape,mean_train_rmse,mean_test_mae,std_test_mae,mean_test_mape,std_test_mape,mean_test_rmse,std_test_rmse
0,"{'C': 0.01, 'gamma': 0.01, 'kernel': 'linear'}",6.547438,0.389751,7.829037,6.544609,0.07761,0.390615,0.005033,7.796943,0.094769
1,"{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}",6.90678,0.424767,8.096045,6.909495,0.078715,0.426658,0.00652,8.08156,0.095335
2,"{'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'}",6.547438,0.389751,7.829037,6.544609,0.07761,0.390615,0.005033,7.796943,0.094769
3,"{'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}",6.839634,0.421851,8.014643,6.844186,0.078025,0.423924,0.006311,8.001129,0.094375
4,"{'C': 0.01, 'gamma': 1, 'kernel': 'linear'}",6.547438,0.389751,7.829037,6.544609,0.07761,0.390615,0.005033,7.796943,0.094769
5,"{'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}",6.882301,0.424856,8.07352,6.890739,0.078897,0.427039,0.006111,8.064694,0.093254
6,"{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}",6.43336,0.374701,7.844092,6.439827,0.080731,0.376038,0.004897,7.829911,0.107594
7,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",6.361805,0.379664,7.689593,6.366547,0.080833,0.381053,0.005063,7.66407,0.097268
8,"{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}",6.43336,0.374701,7.844092,6.439827,0.080731,0.376038,0.004897,7.829911,0.107594
9,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}",5.641367,0.339341,7.057021,5.732334,0.096693,0.345206,0.00564,7.099435,0.120882


#### RF - KFold - test

In [9]:
# MAE, MAPE
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
crossval = KFold(n_splits=iterations)
index = crossval.split(X, Y)

model = RandomForestRegressor()

parameters = {'n_estimators': [5,10,20,50,100],
             'max_features': [5,10,16]}

# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
mape = make_scorer(MAPE)

scores = {'mae':mae,'mape':mape}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False, verbose=12)
grid_obj = grid_obj.fit(X_norm, Y)

Fitting 10 folds for each of 15 candidates, totalling 150 fits
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.281, test=9.174), mape=(train=0.139, test=0.448), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.265, test=7.343), mape=(train=0.139, test=0.322), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.347, test=5.735), mape=(train=0.140, test=0.371), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.182, test=8.245), mape=(train=0.125, test=0.721), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.148, test=9.172), mape=(train=0.120, test=0.942), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.328, test=7.340), mape=(train=0.140, test=0.493), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.363, test=6.597), mape=(train=0.141, test=0.339), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.8s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.294, test=6.512), mape=(train=0.136, test=0.412), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................
[CV]  max_features=5, n_estimators=5, mae=(train=2.203, test=9.128), mape=(train=0.136, test=0.300), total=   0.1s
[CV] max_features=5, n_estimators=5 ..................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.1s remaining:    0.0s


[CV]  max_features=5, n_estimators=5, mae=(train=2.249, test=8.075), mape=(train=0.136, test=0.298), total=   0.1s
[CV] max_features=5, n_estimators=10 .................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.3s remaining:    0.0s


[CV]  max_features=5, n_estimators=10, mae=(train=2.071, test=8.565), mape=(train=0.127, test=0.412), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.5s remaining:    0.0s


[CV]  max_features=5, n_estimators=10, mae=(train=2.098, test=7.136), mape=(train=0.130, test=0.310), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.149, test=5.724), mape=(train=0.131, test=0.373), total=   0.3s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.038, test=8.304), mape=(train=0.121, test=0.737), total=   0.3s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.030, test=8.439), mape=(train=0.113, test=0.880), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.164, test=7.269), mape=(train=0.131, test=0.490), total=   0.2s
[CV] max_features=5, n_estimators=10 .................................
[CV]  max_features=5, n_estimators=10, mae=(train=2.123, test=6.0

[CV]  max_features=10, n_estimators=5, mae=(train=2.243, test=7.076), mape=(train=0.134, test=0.459), total=   0.3s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.262, test=6.388), mape=(train=0.139, test=0.318), total=   0.2s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.218, test=6.299), mape=(train=0.135, test=0.406), total=   0.2s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.215, test=8.742), mape=(train=0.138, test=0.285), total=   0.3s
[CV] max_features=10, n_estimators=5 .................................
[CV]  max_features=10, n_estimators=5, mae=(train=2.186, test=8.375), mape=(train=0.135, test=0.313), total=   0.3s
[CV] max_features=10, n_estimators=10 ................................
[CV]  max_features=10, n_estimators=10, mae=(train=2.020, test=8.

[CV]  max_features=10, n_estimators=100, mae=(train=1.820, test=7.810), mape=(train=0.116, test=0.289), total=   4.3s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.145, test=8.923), mape=(train=0.130, test=0.432), total=   0.3s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.206, test=7.331), mape=(train=0.139, test=0.329), total=   0.3s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.253, test=5.787), mape=(train=0.137, test=0.373), total=   0.3s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.095, test=8.061), mape=(train=0.122, test=0.707), total=   0.4s
[CV] max_features=16, n_estimators=5 .................................
[CV]  max_features=16, n_estimators=5, mae=(train=2.147, test=8

[CV]  max_features=16, n_estimators=100, mae=(train=1.790, test=7.950), mape=(train=0.107, test=0.713), total=   6.2s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.776, test=8.429), mape=(train=0.100, test=0.895), total=   6.0s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.886, test=6.885), mape=(train=0.118, test=0.448), total=   6.5s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.881, test=6.229), mape=(train=0.117, test=0.325), total=   5.9s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.874, test=6.192), mape=(train=0.114, test=0.392), total=   6.5s
[CV] max_features=16, n_estimators=100 ...............................
[CV]  max_features=16, n_estimators=100, mae=(train=1.8

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  4.1min finished


In [12]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_mape',
            'mean_test_mae', 'std_test_mae', 'mean_test_mape', 'std_test_mape']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_mape,mean_test_mae,std_test_mae,mean_test_mape,std_test_mape
0,"{'max_features': 5, 'n_estimators': 5}",2.266022,0.135196,7.732258,1.164133,0.464764,0.199353
1,"{'max_features': 5, 'n_estimators': 10}",2.091629,0.127205,7.428535,1.074363,0.448466,0.192557
2,"{'max_features': 5, 'n_estimators': 20}",1.983832,0.122131,7.336715,1.166284,0.443202,0.189839
3,"{'max_features': 5, 'n_estimators': 50}",1.907968,0.117822,7.326582,1.178809,0.443909,0.194557
4,"{'max_features': 5, 'n_estimators': 100}",1.888864,0.117213,7.297624,1.17522,0.442001,0.19354
5,"{'max_features': 10, 'n_estimators': 5}",2.203865,0.132563,7.592081,1.089945,0.456322,0.196915
6,"{'max_features': 10, 'n_estimators': 10}",2.040243,0.124083,7.565194,1.068258,0.453076,0.191465
7,"{'max_features': 10, 'n_estimators': 20}",1.932545,0.118959,7.465034,1.202005,0.450056,0.192397
8,"{'max_features': 10, 'n_estimators': 50}",1.871662,0.115854,7.36552,1.152578,0.444179,0.193357
9,"{'max_features': 10, 'n_estimators': 100}",1.848217,0.11473,7.358063,1.16789,0.444437,0.191966


#### Radius Neighbours Regressor

In [20]:
# MAE, MAPE, RMSE

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error

# Normalizamos los datos
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

random.seed(19680801)

iterations = 10
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7)
index = gss.split(X, Y, G)

# modelo
model = RadiusNeighborsRegressor(weights='distance', p=2)

parameters = {'radius': [15,20]}

# métricas de error
# greater_is_better=True by default
mae = make_scorer(mean_absolute_error)
mape = make_scorer(MAPE)

scores = {'mae':mae,'mape':mape}

# Run the grid search
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index, return_train_score=True, refit=False, verbose=12)
grid_obj = grid_obj.fit(X_norm, Y)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  radius=15, mae=(train=0.000, test=7.231), mape=(train=0.000, test=0.561), total=   0.6s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=7.380), mape=(train=0.000, test=0.442), total=   0.7s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.3s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=6.006), mape=(train=0.000, test=0.320), total=   0.6s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.9s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=6.515), mape=(train=0.000, test=0.346), total=   0.6s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   19.5s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=7.974), mape=(train=0.000, test=0.326), total=   0.7s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   24.9s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=7.473), mape=(train=0.000, test=0.472), total=   0.6s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   29.9s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=6.916), mape=(train=0.000, test=0.583), total=   0.8s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   35.3s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=6.524), mape=(train=0.000, test=0.414), total=   0.7s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   39.8s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=7.256), mape=(train=0.000, test=0.356), total=   0.6s
[CV] radius=15 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   44.7s remaining:    0.0s


[CV]  radius=15, mae=(train=0.000, test=6.464), mape=(train=0.000, test=0.450), total=   0.7s
[CV] radius=20 .......................................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   49.9s remaining:    0.0s


[CV]  radius=20, mae=(train=0.000, test=7.231), mape=(train=0.000, test=0.561), total=   0.7s
[CV] radius=20 .......................................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   55.8s remaining:    0.0s


[CV]  radius=20, mae=(train=0.000, test=7.380), mape=(train=0.000, test=0.442), total=   0.6s
[CV] radius=20 .......................................................
[CV]  radius=20, mae=(train=0.000, test=5.994), mape=(train=0.000, test=0.320), total=   0.5s
[CV] radius=20 .......................................................
[CV]  radius=20, mae=(train=0.000, test=6.513), mape=(train=0.000, test=0.346), total=   0.6s
[CV] radius=20 .......................................................
[CV]  radius=20, mae=(train=0.000, test=7.965), mape=(train=0.000, test=0.325), total=   0.6s
[CV] radius=20 .......................................................
[CV]  radius=20, mae=(train=0.000, test=7.467), mape=(train=0.000, test=0.472), total=   0.6s
[CV] radius=20 .......................................................
[CV]  radius=20, mae=(train=0.000, test=6.914), mape=(train=0.000, test=0.583), total=   0.5s
[CV] radius=20 .......................................................
[CV]  radi

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.6min finished


In [21]:
outcomes = pd.DataFrame(grid_obj.cv_results_)
outcomes = outcomes[['params', 'mean_train_mae', 'mean_train_mape',
            'mean_test_mae', 'std_test_mae', 'mean_test_mape', 'std_test_mape']]
outcomes

Unnamed: 0,params,mean_train_mae,mean_train_mape,mean_test_mae,std_test_mae,mean_test_mape,std_test_mape
0,{'radius': 15},0.0,0.0,6.973949,0.563214,0.427041,0.088541
1,{'radius': 20},0.0,0.0,6.971159,0.564041,0.426926,0.088577
