In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
import matplotlib.gridspec as gridspec

from sklearn import neighbors, datasets
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn import model_selection
from sklearn.preprocessing import (
    KBinsDiscretizer,
    LabelEncoder,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)
    

# Datos

In [3]:
data = pd.read_csv('tp-2020-2c-train-cols2.csv')
targets = pd.read_csv('tp-2020-2c-train-cols1.csv')

In [4]:
df = (pd.merge(left=data, right=targets, how="left", on="id_usuario", validate="one_to_one"))

In [5]:
df.head()

Unnamed: 0,tipo_de_sala,nombre,id_usuario,genero,edad,amigos,parientes,id_ticket,precio_ticket,fila,nombre_sede,volveria
0,4d,Señor Camilo Pedro,117,hombre,73.5,0,0,59258;,1,,fiumark_quilmes,0
1,4d,Señora Raquel Angelica,658,mujer,35.0,1,1,586:6;,2,,fiumark_quilmes,0
2,normal,Señor Antonio Federico,794,hombre,,0,0,"RE""39822",3,,fiumark_chacarita,0
3,4d,Señor Osvaldo Aureliano,455,hombre,,0,0,"C17""4:39",1,,fiumark_palermo,0
4,4d,Señorita Rita Eudosia,173,mujer,4.0,1,1,569964,2,,fiumark_palermo,1


# Preprocesamiento

In [6]:
encoding_params = {'onehot_drop':'first'
                  }
scaling_params = {'standard_withmean':True}
selection_params = {'vt_threshold':0,
                    'rfe_estimator':'estimator'}

columnas_a_encodear = ['tipo_de_sala', 'genero', 'nombre_sede']

In [7]:
#encoders
def OrdinalEncoderWrapper(encoding_params):
    return OrdinalEncoder()

def OneHotWrapper(encoding_params):
    return OneHotEncoder(drop=encoding_params['onehot_drop'])

def LabelEncoderWrapper(encoding_params):
    return LabelEncoder()

encoders = {'ordinal': OrdinalEncoderWrapper(encoding_params),
           'label': LabelEncoderWrapper(encoding_params),
            'onehot': OneHotWrapper(encoding_params)
           }

#scalers
def StandardScalerWrapper(scaling_params):
    return StandardScaler(with_mean=scaling_params['standard_withmean'])

def MinMaxScalerWrapper(scaling_params):
    return MinMaxScaler()

def RobustScalerWrapper(scaling_params):
    return RobustScaler()

def PowerTransformerWrapper(scaling_params):
    return PowerTransformer()

def NormalizerWrapper(scaling_params):
    return Normalizer()

scalers = {'standard': StandardScalerWrapper(scaling_params),
          'minmax': MinMaxScalerWrapper(scaling_params),
          'robust': RobustScalerWrapper(scaling_params),
           'power' : PowerTransformerWrapper(scaling_params),
           'normalizer' : NormalizerWrapper(scaling_params)
          }

#Selectors


def VarianceThresholdWrapper(selection_params):
    return VarianceThreshold()

def RFEWrapper(selection_params):
    return RFE(selection_params['rfe_estimator'])

def FeatureHasherWrapper(selection_params):
    return FeatureHasher()


selectors = {'var_thres': VarianceThresholdWrapper(selection_params),
            'rfe': RFEWrapper(selection_params),
            'feature_hasher': FeatureHasherWrapper(selection_params)}

In [8]:
#Funciones auxiliares
#robado de la practica
def droppear_nulos_por_columna(data):
    NULL_REMOVE_PCT = 0.30
    cols = data.isna().mean()
    cols = cols[cols < NULL_REMOVE_PCT]
    return data[cols.index]

def droppear_filas_sin_sede(data):
    _data = data.drop(data.loc[data['nombre_sede'].isna()].index, inplace=False)
    _data.reset_index(drop=True)
    return _data
    

def prepro_1(X, y, encoder, scaler, selector):
    _X = X.copy(deep=True)
    _y = y.copy(deep=True)
    
    #tratamiento de nulos
    _X = droppear_nulos_por_columna(_X)
    #_X = droppear_filas_sin_sede(_X) rompe el dataframe, no se por que
    _X.drop(['id_ticket','nombre','id_usuario'], axis=1, inplace=True)
    _X['edad'] = SimpleImputer(strategy='median').fit_transform(_X[['edad']])
    
    
    #encoding
    _encoder = encoders[encoder]
    if(encoder == 'onehot'):
        for col in columnas_a_encodear:
            
            encoded = _encoder.fit(_X[[col]].astype(str))
            categories = list(encoded.categories_)
            encoded = encoded.transform(_X[[col]].astype(str)).todense().astype(int)
            encoded = pd.DataFrame(encoded)
            categories = np.delete(categories, 0)
            encoded.columns = categories
            
            _X = pd.concat([_X, encoded], axis=1)
            _X.drop(labels=col, axis=1, inplace=True)
    
    #seleccion
    _selector = selectors[selector]
    #_X = _selector.fit_transform(_X,_y)
    
    #train-test split
    X_train, X_test, y_train, y_test = model_selection.train_test_split(_X, _y, test_size=0.3, random_state=7)
    
    #escalado
    if(scaler != 'none'):
        _scaler = scalers[scaler]
        X_train = _scaler.fit_transform(X_train, y_train)
        X_test = _scaler.transform(X_test)
    
    y_train = y_train['volveria'].to_numpy(copy=True)
    y_test = y_test['volveria'].to_numpy(copy=True)
    
    return X_train, X_test, y_train, y_test


In [110]:
X_train_a, X_test_a, y_train_a, y_test_a = prepro_1(data, targets, 'onehot', 'standard', 'var_thres')
X_train_b, X_test_b, y_train_b, y_test_b = prepro_1(data, targets, 'onehot', 'minmax', 'var_thres')
X_train_c, X_test_c, y_train_c, y_test_c = prepro_1(data, targets, 'onehot', 'robust', 'var_thres')
X_train_d, X_test_d, y_train_d, y_test_d = prepro_1(data, targets, 'onehot', 'power', 'var_thres')
X_train_e, X_test_e, y_train_e, y_test_e = prepro_1(data, targets, 'onehot', 'normalizer', 'var_thres')

datos = [
    [X_train_a, X_test_a, y_train_a, y_test_a, 'onehot', 'standard', 'var_thres'],
    [X_train_b, X_test_b, y_train_b, y_test_b, 'onehot', 'minmax', 'var_thres'],
    [X_train_c, X_test_c, y_train_c, y_test_c, 'onehot', 'robust', 'var_thres'],
    [X_train_d, X_test_d, y_train_d, y_test_d, 'onehot', 'power', 'var_thres'],
    [X_train_e, X_test_e, y_train_e, y_test_e, 'onehot', 'normalizer', 'var_thres']
]

---

# Entrenando modelos

### 3.1 KNN

Entrenamiento con valores por defecto.

In [10]:
#Datos escalados con StandardScaler
clf_knn = neighbors.KNeighborsClassifier()
clf_knn.fit(X_train_a, y_train_a)
pred = clf_knn.predict(X_test_a)
score = (y_test_a == pred)
score.mean()

0.7634854771784232

In [11]:
#Datos escalados con MinMaxScaler
clf_knn.fit(X_train_b, y_train_b)
pred = clf_knn.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.7634854771784232

In [12]:
#Datos escalados con RobustScaler
clf_knn.fit(X_train_c, y_train_c)
pred = clf_knn.predict(X_test_c)
score = (y_test_c == pred)
score.mean()

0.7800829875518672

In [13]:
#Datos escalados con PowerTransformer
clf_knn.fit(X_train_d, y_train_d)
pred = clf_knn.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.7676348547717843

In [14]:
#Datos escalados con Normalizer
clf_knn.fit(X_train_e, y_train_e)
pred = clf_knn.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.7676348547717843

Busqueda de hiperparámetros mediante GridSearch

In [117]:
knn_grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights' : ['distance', 'uniform'],
    'metric' : ['euclidean', 'manhattan']
}

gs = model_selection.GridSearchCV(
    clf_knn,
    knn_grid_params,
    verbose = 3,
    cv = 3,
    n_jobs = -1
)

In [119]:
metricas = []
for conjunto in datos:
    
    gs_results = gs.fit(conjunto[0], conjunto[2])
    row = []
    row.append(conjunto[4])
    row.append(conjunto[5])
    row.append(conjunto[6])
    for key in gs_results.best_params_:
        row.append(gs_results.best_params_[key])
    row.append(gs_results.best_score_)
    metricas.append(row)
    
dd = pd.DataFrame(metricas, columns=['encoding','scaling','selection','metric', 'n_neighbors', 'weights', 'score'])
dd

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits


Unnamed: 0,encoding,scaling,selection,metric,n_neighbors,weights,score
0,onehot,standard,var_thres,euclidean,5,uniform,0.826807
1,onehot,minmax,var_thres,manhattan,11,uniform,0.82857
2,onehot,robust,var_thres,manhattan,19,uniform,0.825053
3,onehot,power,var_thres,euclidean,19,distance,0.812499
4,onehot,normalizer,var_thres,manhattan,5,uniform,0.810764


### 3.2 Naive Bayes

#### 3.2.1 Gaussiano

In [24]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train_a, y_train_a)
pred = clf_nb_gauss.predict(X_test_a)
score = (y_test_a == pred)
score.mean()

0.6348547717842323

In [25]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train_b, y_train_b)
pred = clf_nb_gauss.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.6390041493775933

In [26]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train_c, y_train_c)
pred = clf_nb_gauss.predict(X_test_c)
score = (y_test_c == pred)
score.mean()

0.6390041493775933

In [27]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train_d, y_train_d)
pred = clf_nb_gauss.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.6141078838174274

In [28]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train_e, y_train_e)
pred = clf_nb_gauss.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6597510373443983

#### 3.2.2 Multinomial

In [37]:
#StandardScaler, RobustScaler y PowerTransformer arrojan valores negativos, no podemos usar el resultado en MultinomialNB

In [33]:
#Datos escalados con MinMaxScaler
nb_mult_alpha = 1.0
clf_nb_mult = naive_bayes.MultinomialNB()
clf_nb_mult.fit(X_train_b, y_train_b)
pred = clf_nb_mult.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.7883817427385892

In [39]:
#Datos escalados con Normalizer
nb_mult_alpha = 1.0
clf_nb_mult = naive_bayes.MultinomialNB()
clf_nb_mult.fit(X_train_e, y_train_e)
pred = clf_nb_mult.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6182572614107884

#### 3.2.3 Complemento

In [None]:
#No se vio en clase

#### 3.2.4 Bernoulli

In [23]:
#No se vio en clase

### 3.3 SVM

#### 3.3.1 Lineal

In [49]:
#Datos escalados con StandardScaler

clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train_a, y_train_a)
pred = clf_linear_svc.predict(X_test_a)
score = (y_test_a == pred)
score.mean()



0.7883817427385892

In [53]:
#Datos escalados con MinMaxScaler

clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train_b, y_train_b)
pred = clf_linear_svc.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.7883817427385892

In [52]:
#Datos escalados con RobustScaler

clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train_c, y_train_c)
pred = clf_linear_svc.predict(X_test_c)
score = (y_test_c == pred)
score.mean()



0.7925311203319502

In [120]:
#Datos escalados con PowerTransformer

clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train_d, y_train_d)
pred = clf_linear_svc.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.8008298755186722

In [50]:
#Datos escalados con Normalizer

clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train_e, y_train_e)
pred = clf_linear_svc.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6846473029045643

Busqueda de hiperparametros mediante GridSearch

In [121]:
svc_grid_params = {
    'C' : [0.01, 0.1, 1, 10],
    'penalty' : ['l1', 'l2']
}

gs = model_selection.GridSearchCV(
    clf_linear_svc,
    svc_grid_params,
    verbose = 3,
    cv = 5,
    n_jobs = -1
)

In [122]:
metricas = []
for conjunto in datos:
    
    gs_results = gs.fit(conjunto[0], conjunto[2])
    row = []
    row.append(conjunto[4])
    row.append(conjunto[5])
    row.append(conjunto[6])
    for key in gs_results.best_params_:
        row.append(gs_results.best_params_[key])
    row.append(gs_results.best_score_)
    metricas.append(row)
    
dd = pd.DataFrame(metricas, columns=['encoding','scaling','selection','C', 'penalty', 'score'])
dd

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan 0.80714286]
        nan 0.80892857]


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan 0.80714286]
        nan 0.79821429]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan 0.76428571]


Unnamed: 0,encoding,scaling,selection,C,penalty,score
0,onehot,standard,var_thres,0.1,l2,0.808929
1,onehot,minmax,var_thres,0.01,l2,0.810714
2,onehot,robust,var_thres,0.01,l2,0.819643
3,onehot,power,var_thres,0.01,l2,0.801786
4,onehot,normalizer,var_thres,10.0,l2,0.764286


---

#### 3.3.2 Polinomial

Entrenamiento con valores por defecto

In [43]:
#onehot
#Datos escalados con StandardScaler
clf_svc = svm.SVC(kernel='poly')
clf_svc.fit(X_train_a, y_train_a)
pred = clf_svc.predict(X_test_a)
score = (y_test_a == pred)
score.mean()

0.7925311203319502

In [44]:
#onehot
#Datos escalados con MinMaxScaler
clf_svc = svm.SVC(kernel='poly')
clf_svc.fit(X_train_b, y_train_b)
pred = clf_svc.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.8008298755186722

In [45]:
#onehot
#Datos escalados con RobustScaler
clf_svc = svm.SVC(kernel='poly')
clf_svc.fit(X_train_c, y_train_c)
pred = clf_svc.predict(X_test_c)
score = (y_test_c == pred)
score.mean()

0.7925311203319502

In [46]:
#onehot
#Datos escalados con PowerTransformer
clf_svc = svm.SVC(kernel='poly')
clf_svc.fit(X_train_d, y_train_d)
pred = clf_svc.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.8257261410788381

In [47]:
#onehot
#Datos escalados con Normalizer
clf_svc = svm.SVC(kernel='poly')
clf_svc.fit(X_train_e, y_train_e)
pred = clf_svc.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6804979253112033

Busqueda de hiperparametros mediante GridSearch

In [62]:
svc_grid_params = {
    'C' : [0.01, 0.1, 1, 10],
    'degree': [2,3,4],
    'coef0': [0.0, 0.5, 1.0],
    'gamma': [1.0, 1.5, 2.0]
}

gs = model_selection.GridSearchCV(
    clf_svc,
    svc_grid_params,
    verbose = 3,
    cv = 5,
    n_jobs = -1
)

In [63]:
gs_results = gs.fit(X_train_a, y_train_a)
display(gs_results.best_params_)
display(gs_results.best_score_)
#scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
#display(scores)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


{'C': 0.01, 'coef0': 0.5, 'degree': 2, 'gamma': 1.5}

0.8321428571428573

In [None]:
gs_results = gs.fit(X_train_b, y_train_b)
display(gs_results.best_params_)
display(gs_results.best_score_)
#scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
#display(scores)

In [None]:
gs_results = gs.fit(X_train_c, y_train_c)
display(gs_results.best_params_)
display(gs_results.best_score_)
#scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
#display(scores)

In [None]:
gs_results = gs.fit(X_train_d, y_train_d)
display(gs_results.best_params_)
display(gs_results.best_score_)
#scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
#display(scores)

In [None]:
gs_results = gs.fit(X_train_e, y_train_e)
display(gs_results.best_params_)
display(gs_results.best_score_)
#scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
#display(scores)

---

#### 3.3.3 Radial

Entrenamiento con valores por defecto

In [65]:
#onehot
#Datos escalados con StandardScaler
clf_svc = svm.SVC(kernel='rbf')
clf_svc.fit(X_train_a, y_train_a)
pred = clf_svc.predict(X_test_a)
score = (y_test_a == pred)
score.mean()

0.8049792531120332

In [66]:
#onehot
#Datos escalados con MinMaxScaler
clf_svc = svm.SVC(kernel='rbf')
clf_svc.fit(X_train_b, y_train_b)
pred = clf_svc.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.8008298755186722

In [67]:
#onehot
#Datos escalados con RobustScaler
clf_svc = svm.SVC(kernel='rbf')
clf_svc.fit(X_train_c, y_train_c)
pred = clf_svc.predict(X_test_c)
score = (y_test_c == pred)
score.mean()

0.8091286307053942

In [123]:
#onehot
#Datos escalados con PowerTransformer
clf_svc = svm.SVC(kernel='rbf')
clf_svc.fit(X_train_d, y_train_d)
pred = clf_svc.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.8091286307053942

In [69]:
#onehot
#Datos escalados con Normalizer
clf_svc = svm.SVC(kernel='rbf')
clf_svc.fit(X_train_e, y_train_e)
pred = clf_svc.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6721991701244814

Busqueda de hiperparametros mediante GridSearch

In [124]:
svc_grid_params = {
    'C' : [0.01, 0.1, 1, 10, 50],
    'gamma': [0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
}

gs = model_selection.GridSearchCV(
    clf_svc,
    svc_grid_params,
    verbose = 3,
    cv = 5,
    n_jobs = -1
)

In [125]:
metricas = []
for conjunto in datos:
    
    gs_results = gs.fit(conjunto[0], conjunto[2])
    row = []
    row.append(conjunto[4])
    row.append(conjunto[5])
    row.append(conjunto[6])
    for key in gs_results.best_params_:
        row.append(gs_results.best_params_[key])
    row.append(gs_results.best_score_)
    metricas.append(row)
    
dd = pd.DataFrame(metricas, columns=['encoding','scaling','selection','C', 'gamma', 'score'])
dd

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Unnamed: 0,encoding,scaling,selection,C,gamma,score
0,onehot,standard,var_thres,10,0.05,0.8375
1,onehot,minmax,var_thres,10,0.5,0.830357
2,onehot,robust,var_thres,10,0.1,0.8375
3,onehot,power,var_thres,1,0.1,0.841071
4,onehot,normalizer,var_thres,50,1.0,0.8


---

#### 3.3.4 Sigmoide

Entrenamiento con valores por defecto

In [86]:
#onehot
#standardscaler
clf_svc = svm.SVC(kernel='sigmoid')
clf_svc.fit(X_train_a, y_train_a)
pred = clf_svc.predict(X_test_a)
score = (y_test_a == pred)
score.mean()

0.7261410788381742

In [87]:
#onehot
#minmaxscaler
clf_svc = svm.SVC(kernel='sigmoid')
clf_svc.fit(X_train_b, y_train_b)
pred = clf_svc.predict(X_test_b)
score = (y_test_b == pred)
score.mean()

0.6846473029045643

In [88]:
#onehot
#robust
clf_svc = svm.SVC(kernel='sigmoid')
clf_svc.fit(X_train_c, y_train_c)
pred = clf_svc.predict(X_test_c)
score = (y_test_c == pred)
score.mean()

0.6431535269709544

In [126]:
#onehot
#powertransformer
clf_svc = svm.SVC(kernel='sigmoid')
clf_svc.fit(X_train_d, y_train_d)
pred = clf_svc.predict(X_test_d)
score = (y_test_d == pred)
score.mean()

0.7095435684647303

In [90]:
#onehot
#normalizer
clf_svc = svm.SVC(kernel='sigmoid')
clf_svc.fit(X_train_e, y_train_e)
pred = clf_svc.predict(X_test_e)
score = (y_test_e == pred)
score.mean()

0.6556016597510373

Busqueda de hiperparametros mediante GridSearch

In [131]:
svc_grid_params = {
    'C' : [0.01, 0.1, 1, 10],
    'gamma': [0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
    'coef0': [0.0, 0.5, 1.0]
}

gs = model_selection.GridSearchCV(
    clf_svc,
    svc_grid_params,
    verbose = 1,
    cv = 5,
    n_jobs = -1
)

In [133]:
metricas = []
for conjunto in datos:
    
    gs_results = gs.fit(conjunto[0], conjunto[2])
    row = []
    row.append(conjunto[4])
    row.append(conjunto[5])
    row.append(conjunto[6])
    for key in gs_results.best_params_:
        row.append(gs_results.best_params_[key])
    row.append(gs_results.best_score_)
    metricas.append(row)
    
dd = pd.DataFrame(metricas, columns=['encoding','scaling','selection','C', 'coef0', 'gamma', 'score'])
dd

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits


Unnamed: 0,encoding,scaling,selection,C,coef0,gamma,score
0,onehot,standard,var_thres,0.1,0.5,0.1,0.792857
1,onehot,minmax,var_thres,0.1,0.0,0.5,0.791071
2,onehot,robust,var_thres,1.0,0.5,0.05,0.776786
3,onehot,power,var_thres,0.1,0.0,0.05,0.796429
4,onehot,normalizer,var_thres,10.0,0.0,0.5,0.7125


---

### 3.4 Regresor Lineal (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
clf_logreg = LogisticRegression()
clf_logreg.fit(X_train, y_train)
pred = clf_logreg.predict(X_test)
score = (y_test == pred)
score.mean()

In [None]:
scores = model_selection.cross_val_score(clf_logreg, df_X, df_y, cv=5)
scores

### 3.5 Decision Tree

In [None]:
clf_tree = tree.DecisionTreeClassifier(random_state=0, max_depth=2)
clf_tree.fit(X_train, y_train)
pred = clf_tree.predict(X_test)
score = (y_test == pred)
score.mean()

In [None]:
scores = model_selection.cross_val_score(clf_tree, df_X, df_y, cv=5)
scores

### 3.6 Redes Neuronales

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

### 3.7 Ensambles

In [None]:
from sklearn import ensemble

#### 3.7.1 RandomForest

In [None]:
random_forest = ensemble.RandomForestClassifier()
random_forest.fit(X_train, y_train)
pred = random_forest.predict(X_test)
score = (y_test == pred)
score.mean()

#### 3.7.2 Gradient Boost