# Ensambles

## Preparación de ambiente

### Carga de módulos

In [1]:
# Data Wrangling
import os 
import librosa # Librería para manejo de audio
import numpy as np
import pandas as pd
from scipy.io import wavfile # Manejo de de audio

# Data visualization
import cufflinks as cf
import IPython.display as ipd # Widget para Jupyter
import matplotlib.pyplot as plt

# Modeling
from xgboost.sklearn import XGBClassifier # pip install xgboost
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier 

# Model performance
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Enviroment setup
cf.set_config_file(theme='solar', offline=True)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

### Funciones relevantes

In [2]:
# Función que concentra cross validation para clasificación
def classification_metrics(X, y, estimator):
    ls_scores_roc = cross_val_score(estimator=estimator, X=X, y=y, scoring="accuracy", n_jobs=-1, cv=4)
    print(f"Accuracy media: {np.mean(ls_scores_roc):,.2f}, desviación estándar: {np.std(ls_scores_roc)}")

In [3]:
# Función que reproduce sonido en Jupyter
def wavPlayer(filepath):
    rate, data = wavfile.read(filepath)
    display(pd.Series(data).iplot())
    return ipd.Audio(filepath, autoplay=True)

In [4]:
# Función para visualizar el mapa de calor de las frecuencias de sonido
def plot_heatmap(data):
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(14, 5))
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz') 
    plt.colorbar()

In [5]:
# Función que encapsula la lectura del alrchivo, el modelo y las variables para redecir el número
def predict_number(filepath, estimator, features):
    # Apertura del archivo
    x, sr = librosa.load(filepath, sr=None)
    # Generación de tabla para validación
    X_val = pd.DataFrame(np.reshape(abs(librosa.stft(x).mean(axis = 1).T), (1025,1))).T
    X_val.columns = X_train.columns
    real = int(filepath.split('/')[-1][0])
    # Predicción
    pred = int(estimator.predict(X_val)[0])
    print(f"The real is: {real}")
    print(f"The predicted is: {estimator.predict(X_val)[0]}")
    return wavPlayer(filepath=filepath)

## Carga de datos

### Lectura de archivos

In [6]:
# https://github.com/Jakobovski/free-spoken-digit-dataset

In [7]:
# Carga de archivos de sonido
file = os.listdir('./recordings/')
data=[]
for i in file:
    x, sr = librosa.load('./recordings/'+i, sr=None)
    data.append(x)

### Transformación

In [9]:
# Aplicación de Transformada de Fourier a frecuencias de sonido
data_tf=[]
for i in range(len(data)):
    data_tf.append(abs(librosa.stft(data[i]).mean(axis = 1).T))
data_tf= np.array(data_tf)


n_fft=2048 is too large for input signal of length=1843


n_fft=2048 is too large for input signal of length=1778


n_fft=2048 is too large for input signal of length=1987


n_fft=2048 is too large for input signal of length=1976


n_fft=2048 is too large for input signal of length=1868


n_fft=2048 is too large for input signal of length=1884


n_fft=2048 is too large for input signal of length=1749


n_fft=2048 is too large for input signal of length=1953


n_fft=2048 is too large for input signal of length=1819


n_fft=2048 is too large for input signal of length=2016


n_fft=2048 is too large for input signal of length=1446


n_fft=2048 is too large for input signal of length=1838


n_fft=2048 is too large for input signal of length=1410


n_fft=2048 is too large for input signal of length=1813


n_fft=2048 is too large for input signal of length=1560


n_fft=2048 is too large for input signal of length=1873


n_fft=2048 is too large for input signal of length=1892


n_fft=2048 is

In [11]:
data_tf[0].shape

(1025,)

In [13]:
df = pd.DataFrame(data_tf)
df["target"] = [i[0] for i in file]

In [14]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1016,1017,1018,1019,1020,1021,1022,1023,1024,target
0,6.0496,3.2506,0.2206,0.0549,0.1934,0.1839,0.0803,0.1041,0.1408,0.0767,...,0.0559,0.0447,0.0207,0.0658,0.1069,0.0442,0.0103,0.0778,0.1410,5
1,0.0098,0.0045,0.0009,0.0150,0.0284,0.0140,0.0009,0.0216,0.0416,0.0214,...,0.0039,0.0020,0.0001,0.0031,0.0062,0.0031,0.0002,0.0030,0.0061,3
2,0.0104,0.0057,0.0007,0.0029,0.0056,0.0030,0.0008,0.0075,0.0122,0.0060,...,0.0125,0.0064,0.0002,0.0057,0.0115,0.0059,0.0003,0.0050,0.0094,1
3,0.0245,0.0128,0.0013,0.0007,0.0023,0.0019,0.0013,0.0011,0.0023,0.0016,...,0.0009,0.0008,0.0008,0.0009,0.0014,0.0015,0.0012,0.0030,0.0050,2
4,0.0064,0.0031,0.0007,0.0024,0.0061,0.0041,0.0010,0.0039,0.0044,0.0016,...,0.0040,0.0026,0.0004,0.0002,0.0006,0.0005,0.0006,0.0013,0.0023,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,6.0726,3.2722,0.2715,0.1703,0.2874,0.1715,0.0634,0.1406,0.1484,0.0292,...,0.1156,0.0784,0.0174,0.0402,0.0942,0.0317,0.0112,0.0385,0.0210,5
2996,0.0256,0.0132,0.0005,0.0077,0.0154,0.0077,0.0003,0.0073,0.0139,0.0067,...,0.0017,0.0007,0.0001,0.0006,0.0015,0.0009,0.0001,0.0030,0.0062,3
2997,0.0056,0.0030,0.0004,0.0022,0.0033,0.0020,0.0011,0.0044,0.0101,0.0056,...,0.0227,0.0112,0.0003,0.0098,0.0191,0.0094,0.0002,0.0099,0.0197,1
2998,0.0219,0.0117,0.0008,0.0004,0.0009,0.0007,0.0004,0.0008,0.0008,0.0002,...,0.0020,0.0017,0.0003,0.0027,0.0045,0.0022,0.0005,0.0010,0.0017,2


### EDA

In [21]:
# Visualización de frecuencias de sonido
wavPlayer("./recordings/3_george_4.wav")

None

In [38]:
# Visualización de frecuencias de sonido
file=np.random.choice(os.listdir('./recordings/'),1)[0]
wavPlayer(os.path.join("./recordings/",file))

None

In [39]:
data_tf.shape

(3000, 1025)

In [40]:
data_tf

array([[6.0496254e+00, 3.2506449e+00, 2.2056498e-01, ..., 1.0281985e-02,
        7.7806108e-02, 1.4099786e-01],
       [9.7945621e-03, 4.4603227e-03, 8.5042720e-04, ..., 2.3927305e-04,
        2.9632535e-03, 6.1178626e-03],
       [1.0351014e-02, 5.7409001e-03, 7.4218219e-04, ..., 2.5643336e-04,
        5.0466871e-03, 9.3986392e-03],
       ...,
       [5.5758017e-03, 2.9636016e-03, 3.7174209e-04, ..., 2.2764613e-04,
        9.9054147e-03, 1.9710919e-02],
       [2.1931127e-02, 1.1744886e-02, 8.4645703e-04, ..., 5.2618631e-04,
        1.0265303e-03, 1.6972022e-03],
       [6.6575684e-02, 3.2002680e-02, 1.5585378e-03, ..., 2.6732354e-04,
        5.0351484e-04, 1.2593111e-05]], dtype=float32)

In [44]:
# Preparación de X y y
X = df[[x for x in df.columns if x != "target"]]
y = df["target"]

In [45]:
y.value_counts()

3    300
6    300
9    300
0    300
4    300
5    300
8    300
2    300
1    300
7    300
Name: target, dtype: int64

### Separación de sets

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [48]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
327,0.0048,0.0025,0.0004,0.0115,0.0236,0.0110,0.0004,0.0384,0.0740,0.0386,...,0.0042,0.0091,0.0053,0.0004,0.0105,0.0202,0.0101,0.0003,0.0035,0.0073
1261,0.0233,0.0123,0.0009,0.0007,0.0007,0.0002,0.0004,0.0004,0.0006,0.0006,...,0.0011,0.0016,0.0007,0.0001,0.0004,0.0011,0.0010,0.0005,0.0006,0.0007
2042,0.0077,0.0042,0.0008,0.0190,0.0366,0.0185,0.0023,0.0374,0.0757,0.0407,...,0.0173,0.0341,0.0167,0.0003,0.0064,0.0134,0.0071,0.0001,0.0157,0.0313
2384,5.9753,3.2100,0.1971,0.0315,0.1736,0.1835,0.0675,0.0833,0.1485,0.1065,...,0.0408,0.0705,0.0540,0.0145,0.0036,0.0334,0.0341,0.0238,0.0349,0.0387
1643,6.6568,3.5181,0.1452,0.0603,0.0663,0.1311,0.0367,0.0208,0.0412,0.0729,...,0.0294,0.0587,0.0471,0.0121,0.0087,0.0381,0.0375,0.0033,0.0258,0.0403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287,0.0256,0.0140,0.0027,0.0031,0.0028,0.0020,0.0019,0.0023,0.0035,0.0032,...,0.0030,0.0052,0.0025,0.0004,0.0009,0.0012,0.0020,0.0016,0.0065,0.0118
1994,0.0181,0.0123,0.0064,0.0111,0.0174,0.0125,0.0050,0.0047,0.0105,0.0103,...,0.0166,0.0292,0.0149,0.0011,0.0510,0.1024,0.0504,0.0012,0.0092,0.0171
1804,0.0146,0.0099,0.0015,0.0190,0.0325,0.0187,0.0030,0.0177,0.0308,0.0202,...,0.0117,0.0201,0.0108,0.0012,0.0069,0.0102,0.0055,0.0012,0.0025,0.0069
2471,0.0041,0.0020,0.0005,0.0054,0.0099,0.0055,0.0003,0.0068,0.0130,0.0069,...,0.0016,0.0030,0.0017,0.0004,0.0009,0.0017,0.0009,0.0003,0.0040,0.0075


## Modelado

### Bosque Aleatorio

#### Modelado

In [49]:
RandomForestClassifier?

In [50]:
bos = RandomForestClassifier(n_estimators=100, max_depth=5)

In [51]:
bos.fit(X_train, y_train)

RandomForestClassifier(max_depth=5)

#### Cross-validation

In [52]:
classification_metrics(X=X_train, y=y_train, estimator=bos)

Accuracy media: 0.66, desviación estándar: 0.0060784990178291785


In [58]:
classification_metrics(X=X_test, y=y_test, estimator=bos)

Accuracy media: 0.59, desviación estándar: 0.031144201871544423


#### Hyperparametrización

In [53]:
param_dict = {"n_estimators": [x for x in range(100, 1500, 100)], # Número de árboles a construir
              "max_features": ["auto", "sqrt", "log2"], # Número máximo de variables a considerar
              "criterion": ["gini", "entropy"], # Criterio de selección de corte
              "class_weight": ["balanced", None], # Balanceo o no de la target
              "min_samples_split": [x for x in range(2, 50, 2)], # Número mínimo de muestras que debe tener una hoja para cortar
              "min_samples_leaf": [x/100 for x in range(5, 55, 5)]} # Número mínimo que debe tener una hoja

In [54]:
# Búsqueda aleatorizada
search = RandomizedSearchCV(param_distributions=param_dict, cv=4, n_jobs=-1, scoring="accuracy", estimator=bos, verbose=5,n_iter=10)

In [55]:
# WARNING! Este código demora mucho
search.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(max_depth=5),
                   n_jobs=-1,
                   param_distributions={'class_weight': ['balanced', None],
                                        'criterion': ['gini', 'entropy'],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [0.05, 0.1, 0.15,
                                                             0.2, 0.25, 0.3,
                                                             0.35, 0.4, 0.45,
                                                             0.5],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14, 16, 18,
                                                              20, 22, 24, 26,
                                                              28, 30, 32, 3

In [56]:
search.best_estimator_

RandomForestClassifier(max_depth=5, max_features='log2', min_samples_leaf=0.05,
                       min_samples_split=40, n_estimators=800)

In [57]:
search.best_score_

0.5213293995689083

### AdaBoost

#### Modelado

In [59]:
AdaBoostClassifier?

In [60]:
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.05)

#### Cross-validation

In [61]:
# Cross validation
classification_metrics(X=X_train, y=y_train, estimator=ada)

Accuracy media: 0.33, desviación estándar: 0.020940280881049544


#### Hyperparametrización

In [62]:
# Hiperparametrización
search_grid={'n_estimators':[50,100,200],
             'learning_rate':[.001,0.01,.1]}

In [63]:
search = RandomizedSearchCV(param_distributions=search_grid, cv=4, n_jobs=-1, scoring="accuracy", estimator=ada, verbose=5,n_iter=9)

In [64]:
# WARNING! Este código demora mucho
search.fit(X_train, y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


RandomizedSearchCV(cv=4,
                   estimator=AdaBoostClassifier(learning_rate=0.05,
                                                n_estimators=100),
                   n_iter=9, n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1],
                                        'n_estimators': [50, 100, 200]},
                   scoring='accuracy', verbose=5)

In [65]:
search.best_estimator_

AdaBoostClassifier(learning_rate=0.1, n_estimators=200)

In [66]:
search.best_score_

0.3959691029879332

### Gradient Boosting

#### Modelado

In [68]:
GradientBoostingClassifier?

In [69]:
gb = GradientBoostingClassifier(learning_rate=0.05, min_samples_leaf=0.05)

#### Cross-validation

In [70]:
classification_metrics(X=X_train, y=y_train, estimator=gb)

[CV 4/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=16, n_estimators=500;, score=0.335 total time=   9.2s
[CV 2/4] END learning_rate=0.01, n_estimators=50;, score=0.265 total time=  13.2s
[CV 3/4] END learning_rate=0.01, n_estimators=100;, score=0.295 total time=  24.3s
[CV 1/4] END learning_rate=0.1, n_estimators=100;, score=0.407 total time=  22.3s




[CV 3/4] END class_weight=None, criterion=gini, max_features=log2, min_samples_leaf=0.05, min_samples_split=40, n_estimators=800;, score=0.509 total time=   5.2s
[CV 2/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.45, min_samples_split=6, n_estimators=100;, score=0.103 total time=   0.2s
[CV 4/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.45, min_samples_split=6, n_estimators=100;, score=0.105 total time=   0.2s
[CV 2/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.5, min_samples_split=36, n_estimators=600;, score=0.103 total time=   0.9s
[CV 2/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.5, min_samples_split=24, n_estimators=1000;, score=0.101 total time=   1.4s
[CV 1/4] END class_weight=balanced, criterion=gini, max_features=auto, min_samples_leaf=0.2, min_samples_split=16, n_estimators=900;, score=0.316 total time=   5.7s
[CV 1/4] END



[CV 1/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=16, n_estimators=500;, score=0.332 total time=   9.3s
[CV 1/4] END learning_rate=0.01, n_estimators=50;, score=0.258 total time=  13.4s
[CV 4/4] END learning_rate=0.01, n_estimators=100;, score=0.267 total time=  25.1s
[CV 4/4] END learning_rate=0.1, n_estimators=100;, score=0.363 total time=  22.7s
[CV 2/4] END class_weight=balanced, criterion=entropy, max_features=log2, min_samples_leaf=0.3, min_samples_split=4, n_estimators=300;, score=0.284 total time=   1.0s
[CV 4/4] END class_weight=None, criterion=gini, max_features=sqrt, min_samples_leaf=0.4, min_samples_split=14, n_estimators=600;, score=0.105 total time=   1.1s
[CV 4/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=48, n_estimators=800;, score=0.359 total time=  13.2s
[CV 4/4] END learning_rate=0.001, n_estimators=100;, score=0.194 total time=  25.2s
[CV 1/4



[CV 2/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=18, n_estimators=400;, score=0.325 total time=   8.0s
[CV 4/4] END class_weight=balanced, criterion=gini, max_features=auto, min_samples_leaf=0.2, min_samples_split=16, n_estimators=900;, score=0.340 total time=   5.7s
[CV 4/4] END learning_rate=0.001, n_estimators=50;, score=0.162 total time=  12.8s
[CV 1/4] END learning_rate=0.01, n_estimators=100;, score=0.293 total time=  24.9s
[CV 2/4] END learning_rate=0.1, n_estimators=100;, score=0.375 total time=  24.9s
[CV 4/4] END class_weight=None, criterion=gini, max_features=log2, min_samples_leaf=0.05, min_samples_split=40, n_estimators=800;, score=0.525 total time=   5.6s
[CV 3/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.5, min_samples_split=36, n_estimators=600;, score=0.105 total time=   0.9s
[CV 3/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.5, 



[CV 2/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=16, n_estimators=500;, score=0.329 total time=   9.1s
[CV 4/4] END learning_rate=0.01, n_estimators=50;, score=0.253 total time=  13.9s
[CV 4/4] END learning_rate=0.01, n_estimators=200;, score=0.286 total time=  49.4s




[CV 1/4] END class_weight=balanced, criterion=entropy, max_features=log2, min_samples_leaf=0.3, min_samples_split=4, n_estimators=300;, score=0.268 total time=   1.0s
[CV 3/4] END class_weight=None, criterion=gini, max_features=sqrt, min_samples_leaf=0.4, min_samples_split=14, n_estimators=600;, score=0.105 total time=   1.1s
[CV 3/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=48, n_estimators=800;, score=0.342 total time=  12.8s
[CV 2/4] END learning_rate=0.001, n_estimators=100;, score=0.172 total time=  26.6s
[CV 3/4] END learning_rate=0.1, n_estimators=50;, score=0.315 total time=  12.2s
[CV 1/4] END learning_rate=0.1, n_estimators=200;, score=0.425 total time=  41.3s




[CV 4/4] END class_weight=balanced, criterion=entropy, max_features=log2, min_samples_leaf=0.3, min_samples_split=4, n_estimators=300;, score=0.281 total time=   0.9s
[CV 1/4] END class_weight=None, criterion=gini, max_features=sqrt, min_samples_leaf=0.4, min_samples_split=14, n_estimators=600;, score=0.105 total time=   0.9s
[CV 1/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=48, n_estimators=800;, score=0.343 total time=  13.4s
[CV 3/4] END learning_rate=0.001, n_estimators=100;, score=0.151 total time=  27.1s
[CV 4/4] END learning_rate=0.1, n_estimators=50;, score=0.308 total time=  12.1s
[CV 3/4] END learning_rate=0.1, n_estimators=200;, score=0.345 total time=  41.8s




[CV 2/4] END class_weight=None, criterion=gini, max_features=log2, min_samples_leaf=0.05, min_samples_split=40, n_estimators=800;, score=0.522 total time=   5.0s
[CV 1/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.45, min_samples_split=6, n_estimators=100;, score=0.105 total time=   0.2s
[CV 3/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.45, min_samples_split=6, n_estimators=100;, score=0.105 total time=   0.2s
[CV 1/4] END class_weight=None, criterion=entropy, max_features=auto, min_samples_leaf=0.5, min_samples_split=36, n_estimators=600;, score=0.105 total time=   0.8s
[CV 1/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.5, min_samples_split=24, n_estimators=1000;, score=0.098 total time=   1.7s
[CV 2/4] END class_weight=balanced, criterion=gini, max_features=auto, min_samples_leaf=0.2, min_samples_split=16, n_estimators=900;, score=0.309 total time=   6.1s
[CV 3/4] END



[CV 3/4] END class_weight=balanced, criterion=entropy, max_features=log2, min_samples_leaf=0.3, min_samples_split=4, n_estimators=300;, score=0.260 total time=   0.9s
[CV 2/4] END class_weight=None, criterion=gini, max_features=sqrt, min_samples_leaf=0.4, min_samples_split=14, n_estimators=600;, score=0.103 total time=   0.9s
[CV 2/4] END class_weight=balanced, criterion=entropy, max_features=auto, min_samples_leaf=0.15, min_samples_split=48, n_estimators=800;, score=0.337 total time=  12.7s
[CV 1/4] END learning_rate=0.001, n_estimators=100;, score=0.190 total time=  25.5s
[CV 2/4] END learning_rate=0.1, n_estimators=50;, score=0.346 total time=  14.8s
[CV 4/4] END learning_rate=0.1, n_estimators=200;, score=0.377 total time=  41.4s




Accuracy media: 0.78, desviación estándar: 0.011424865371894032


### Voting

#### Modelado

In [35]:
VotingClassifier?

[0;31mInit signature:[0m
[0mVotingClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mestimators[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvoting[0m[0;34m=[0m[0;34m'hard'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflatten_transform[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Soft Voting/Majority Rule classifier for unfitted estimators.

Read more in the :ref:`User Guide <voting_classifier>`.

.. versionadded:: 0.17

Parameters
----------
estimators : list of (str, estimator) tuples
    Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
    of those original estimators that

In [71]:
naiveb = GaussianNB()
logreg = LogisticRegression()
dctree = DecisionTreeClassifier()

#### Soft-voting

##### Cross-validation

In [72]:
vc = VotingClassifier(estimators=[("logreg", logreg), ("arbol", dctree), ("bayes", naiveb)], voting='soft')

In [73]:
classification_metrics(X=X_train, y=y_train, estimator=vc)

Accuracy media: 0.63, desviación estándar: 0.022272990291241426


#### Hard-voting

##### Cross-validation

In [74]:
vc = VotingClassifier(estimators=[("logreg", logreg), ("arbol", dctree), ("bayes", naiveb)], voting='hard')

In [75]:
classification_metrics(X=X_train, y=y_train, estimator=vc)

Accuracy media: 0.66, desviación estándar: 0.01731615962970638


### XGBoost

#### Modelado

In [76]:
xgb = XGBClassifier(learning_rate=0.1, n_estimators=190, max_depth=5, min_child_weight=2, objective="binary:logistic", subsample=0.9, colsample_bytree=0.8, seed=23333)

In [77]:
xgb.fit(X_train, y_train.astype(int))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=2, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=190, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

#### Cross-validation

In [78]:
classification_metrics(estimator=xgb, X = X_train, y=y_train.astype(int))

Accuracy media: 0.87, desviación estándar: 0.01689577131173663


In [79]:
xgb.score(X_test, y_test.astype(int))

0.8906666666666667

### Análisis de resultados

In [46]:
# Predicción del número de acuerdo con sus atributos
df["pred"] = xgb.predict(X)

In [47]:
# Análisis de predicción por clases
pd.DataFrame(data=confusion_matrix(y_true=df["target"], y_pred=df["pred"], labels=xgb.classes_), index=xgb.classes_, columns=xgb.classes_).iplot(kind="heatmap", colorscale="Blues")

TypeError: '<' not supported between instances of 'int' and 'str'

In [None]:
# Accuracy por clase
for num in sorted(df["target"].unique()):
    print(num, accuracy_score(y_true=df.loc[df["target"] == num, "target"], y_pred=df.loc[df["target"] == num, "pred"]))

In [None]:
# Accuracy global
accuracy_score(y_true=df["target"], y_pred=df["pred"])