#### Carga de datos

In [3]:
import numpy as np

# Read the training array from disk
datos_train = np.load('data/train.npy')

# Column split: everything except the last column holds the attributes (X_train),
# the last column is the target (y_train)
X_train, y_train = datos_train[:, :-1], datos_train[:, -1]

#### Exploración de los datos

In [4]:
# Data exploration: dimensions of the attribute matrix and of the target vector
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)

# Is the target balanced? Collect each distinct label together with its count
val, ocur = np.unique(y_train, return_counts=True)
print('valores de "y": ', val)
print('frecuenca de "y": ', ocur)

# Is the data standardized? Eyeball the first sample only
# (a single row is a hint, not a proof, of standardization)
print('Valores: ', X_train[0, :])

X_train shape:  (152, 10)
y_train shape:  (152,)
valores de "y":  [0. 1.]
frecuenca de "y":  [75 77]
Valores:  [ 0.08643772 -0.11146045 -1.8330433  -0.94005044 -0.04319178  0.32683947
  0.24654552  1.24681431 -1.10444142  0.50027139]


#### Definición del diccionario de métricas

In [5]:
import sklearn.metrics as metrics
# Scoring map passed as `scoring=` to cross_validate below:
# key = label used in the result dict ('test_<key>'), value = scikit-learn scorer name
metricas = dict(
    accuracy='accuracy',
    recall='recall',
    precision='precision',
    fscore='f1',
)

#### Entrenamiento del algoritmo de clasificación SVM

In [6]:
# Baseline classifier: SVC constructed with no arguments, i.e. with the
# library's default hyperparameters
from sklearn.svm import SVC
model_def = SVC()

In [7]:
# Cross-validate the default model: 5 shuffled folds with a fixed seed,
# scoring every metric listed in `metricas`
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from pprint import pprint

particion = KFold(n_splits=5, shuffle=True, random_state=42)
res = cross_validate(
    model_def,
    X_train,
    y_train,
    cv=particion,
    scoring=metricas,
)
pprint(res)

{'fit_time': array([1.49771309e+00, 9.86099243e-04, 9.72986221e-04, 8.95977020e-04,
       9.39846039e-04]),
 'score_time': array([0.00283813, 0.00285482, 0.00265479, 0.00285912, 0.00243998]),
 'test_accuracy': array([0.90322581, 0.90322581, 0.9       , 0.86666667, 0.9       ]),
 'test_fscore': array([0.90909091, 0.90322581, 0.89655172, 0.85714286, 0.88888889]),
 'test_precision': array([1.        , 0.93333333, 0.92857143, 0.85714286, 0.92307692]),
 'test_recall': array([0.83333333, 0.875     , 0.86666667, 0.85714286, 0.85714286])}


In [8]:
# Hyperparameter tuning: score a non-default SVC under the same
# 5-fold shuffled split (same seed) used for the default model
alg = SVC(C=10, gamma='auto', kernel='poly', random_state=42)
particion = KFold(n_splits=5, shuffle=True, random_state=42)
res = cross_validate(alg, X_train, y_train, cv=particion, scoring=metricas)

# Report accuracy as mean +- standard deviation across folds, rounded to 4 decimals
acc = res['test_accuracy']
print('Results: ', np.round(np.mean(acc), 4), '+-', np.round(np.std(acc), 4))

Results:  0.8746 +- 0.0497


In [9]:
# Train the final model on the whole training set.
# NOTE(review): this configuration (C=0.25, gamma=0.1, sigmoid kernel) is not the
# one cross-validated in the tuning cell (C=10, gamma='auto', poly kernel) and has
# no recorded CV score in the visible cells -- confirm it is the intended choice.
final_model = SVC(C=0.25, gamma=0.1, kernel='sigmoid', random_state=42).fit(X_train, y_train)

#### Guardado del modelo definitivo

In [11]:
# Persist the fitted final model to disk as a pickle file
import pickle

ruta_modelo = 'data/model_ejemplo.pickle'
with open(ruta_modelo, 'wb') as salida:
    pickle.dump(final_model, salida)