## Machine Learning Project on UCI Parkinsons Telemonitoring Data Set

Importación de librerías

In [None]:
#!pip install qgrid

In [1]:
from __future__ import division

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.matlib as matlib

from numpy import random
import math

import pandas as pd
import qgrid

import scipy as sc
from scipy.spatial import distance
from scipy import stats

# Warning configuration: the "always" action makes Python print every warning
# each time it is triggered, instead of only the first occurrence per location.
# NOTE(review): the previous comment described these as warnings "we want to
# avoid"; the action that silences warnings is "ignore", not "always" --
# confirm which behaviour is intended.
import warnings
warnings.filterwarnings("always")

Cargamos la base de datos

In [2]:
# Read the comma-separated .data file into a 2-D float matrix, skipping the
# header row, so it can be sliced column-wise below.
db = np.loadtxt('DB/parkinsons_updrs.data', skiprows=1, delimiter=',')

# G: column 0 for every row -- the group each sample belongs to.
G = db[:, 0]
# Y: column 4 for every row -- the regression target.
# (presumably motor_UPDRS in the UCI column order -- TODO confirm)
Y = db[:, 4]
# X: columns 6 through 21 (inclusive) for every row -- the input features.
X = db[:, 6:22]

In [3]:
# Sanity check: one shape per line for features, target and group labels.
print(X.shape, Y.shape, G.shape, sep='\n')

(5875, 16)
(5875,)
(5875,)


#### Regresión Lineal Múltiple con características polinómicas (grados 1 a 3)

In [71]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

# Grid search over the polynomial degree of a linear regression, evaluated with
# group-aware shuffle splits so that rows sharing a value in G never appear on
# both the train and the test side of a split.

# Single seed used both for the global NumPy RNG and for the splitter.
SEED = 19680801
random.seed(SEED)  # `random` is numpy.random here; kept for code relying on the global RNG

# Globally standardized copy of X. Kept only for backward compatibility with
# code that may reference `scaler` / `X_norm`. It is deliberately NOT fed to the
# cross-validation below: its mean/std are computed on every row, including the
# rows that end up in test folds, which leaks test information into training.
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

iterations = 10
# Passing random_state makes the splits reproducible on their own, independent
# of whatever else has consumed the global RNG before this cell runs.
gss = GroupShuffleSplit(n_splits=iterations, train_size=.7, random_state=SEED)
# gss.split() returns a one-shot generator; materialize it so `index` can be
# reused (re-running the fit, inspecting the folds) without silently being empty.
index = list(gss.split(X, Y, G))

# Model: scaling is the first pipeline step, so its statistics are re-estimated
# on the training part of every split only. PolynomialFeatures emits a constant
# bias column by default, hence fit_intercept=False in the regressor.
model = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression(fit_intercept=False)),
])

# Polynomial degrees to compare.
parameters = {'poly__degree': [1, 2, 3]}

# Error metrics. Both scorers use make_scorer's defaults, so the reported MAE
# columns are plain positive errors (lower is better); the rank_test_mae column
# therefore ranks the largest error first and should not be used for selection.
mae = make_scorer(mean_absolute_error)
r2 = make_scorer(r2_score)
scores = {'mae': mae, 'r2': r2}

# Run the grid search on the raw features; refit=False because this cell only
# reports per-degree scores and does not pick a final estimator.
grid_obj = GridSearchCV(model, parameters, scoring=scores, cv=index,
                        return_train_score=True, refit=False)
grid_obj = grid_obj.fit(X, Y)



In [72]:
# Keep only the hyper-parameters plus the mean/std of each metric on the train
# and test folds, in that order, and display the resulting table.
summary_columns = [
    'params',
    'mean_train_mae', 'std_train_mae',
    'mean_train_r2', 'std_train_r2',
    'mean_test_mae', 'std_test_mae',
    'mean_test_r2', 'std_test_r2',
]
outcomes = pd.DataFrame(grid_obj.cv_results_)[summary_columns]
outcomes

Unnamed: 0,params,mean_train_mae,std_train_mae,mean_train_r2,std_train_r2,mean_test_mae,std_test_mae,mean_test_r2,std_test_r2
0,{'poly__degree': 1},6.351467,0.137507,0.137742,0.036423,7.111647,0.257081,-0.1902,0.159749
1,{'poly__degree': 2},5.461242,0.140235,0.316153,0.035992,8.235385,1.381189,-2.500878,4.340854
2,{'poly__degree': 3},4.150686,0.150169,0.560703,0.031053,42.009883,51.07543,-16383.067737,38457.803648
