In [2]:
import numpy as np
from sklearn.model_selection import KFold

In [12]:
# Toy dataset: four samples with a single feature each, laid out as a (4, 1) column.
X = np.array([2, 3, 7, 9])[:, np.newaxis]
# One target value per sample, aligned with the rows of X.
y = 10 * np.array([1, 2, 3, 4])

In [19]:
# Two folds taken in the original row order (no shuffling).
kf = KFold(shuffle=False, n_splits=2)

In [14]:
# Returns the number of splitting iterations (folds) the cross-validator produces
kf.get_n_splits(X)

2

In [15]:
# Show the cross-validator's configuration
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)


In [25]:
# kf.split yields one (train indices, test indices) pair per fold
for train_index, test_index in kf.split(X):
    print('Train:', train_index, 'Test:', test_index)

    # Select the rows of each partition with the fold's index arrays
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    print(f'X_train {X_train}')
    print(f'y_train  {y_train}')
    print(f'X_test  {X_test}')
    print(f'y_test  {y_test}\n')

Train: [2 3] Test: [0 1]
X_train [[7]
 [9]]
y_train  [30 40]
X_test  [[2]
 [3]]
y_test  [10 20]

Train: [0 1] Test: [2 3]
X_train [[2]
 [3]]
y_train  [10 20]
X_test  [[7]
 [9]]
y_test  [30 40]



### Embaralhar

In [26]:
# Shuffle the rows before splitting them into two folds.
# The seed is fixed so the shuffled folds are identical on every run; without it
# each execution of the notebook may produce a different partition.
kf = KFold(n_splits=2, shuffle=True, random_state=10)

In [27]:
# Walk through the folds and show which rows land in each partition
for train_index, test_index in kf.split(X):
    print('Train:', train_index, 'Test:', test_index)

    # Training partition of this fold
    X_train = X[train_index]
    y_train = y[train_index]
    print(f'X_train {X_train}')
    print(f'y_train  {y_train}')

    # Held-out partition of this fold
    X_test = X[test_index]
    y_test = y[test_index]
    print(f'X_test  {X_test}')
    print(f'y_test  {y_test}\n')

Train: [0 1] Test: [2 3]
X_train [[2]
 [3]]
y_train  [10 20]
X_test  [[7]
 [9]]
y_test  [30 40]

Train: [2 3] Test: [0 1]
X_train [[7]
 [9]]
y_train  [30 40]
X_test  [[2]
 [3]]
y_test  [10 20]



### Aplicando stacking

In [28]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.pipeline import make_pipeline

In [30]:
# Unshuffled 2-fold splitter used to build the out-of-fold predictions below.
kf = KFold(shuffle=False, n_splits=2)

In [31]:
# Zero-filled matrix with one row per sample of X and four columns
# (the stacking loop writes one column per base model).
second_level = np.zeros((len(X), 4))

In [32]:
# Inspect the matrix right after allocation
second_level

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [34]:
# Build the second-level (stacking) features: for every fold, fit four base
# models on the training part and store their predicted probabilities for the
# held-out part in the matching rows of `second_level`.
#
# NOTE(review): assuming X / y are still the toy arrays defined at the top of
# the notebook (y = [10, 20, 30, 40]) and kf is the unshuffled 2-fold splitter,
# the held-out labels never occur in the training part, so the accuracy and
# log-loss values printed here are not meaningful -- use a real binary dataset.
for tr, ts in kf.split(X, y):
    # Training and validation slices of the current fold
    Xtr, Xval = X[tr], X[ts]
    ytr, yval = y[tr], y[ts]

    # Random forest; n_jobs=6 asks for 6 parallel workers
    rf = RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=10)
    rf.fit(Xtr, ytr)
    # Column 1 of predict_proba is the probability of rf.classes_[1]
    prf = rf.predict_proba(Xval)[:, 1]
    # Threshold at 0.5, then map the 0/1 column index back to the real class
    # label so it is comparable with yval (identical result for 0/1 labels)
    prf_ = rf.classes_[(prf > 0.5).astype(int)]

    print(f'RF Acurácia: {accuracy_score(yval, prf_)} - Log Loss: {log_loss(yval, prf)}')

    # Extra-trees with the same settings as the random forest
    et = ExtraTreesClassifier(n_estimators=100, n_jobs=6, random_state=10)
    et.fit(Xtr, ytr)
    pet = et.predict_proba(Xval)[:, 1]
    pet_ = et.classes_[(pet > 0.5).astype(int)]

    print(f'ET Acurácia: {accuracy_score(yval, pet_)} - Log Loss: {log_loss(yval, pet)}')

    # Logistic regression on standardized features
    lr1 = make_pipeline(StandardScaler(), LogisticRegression())
    lr1.fit(Xtr, ytr)
    plr1 = lr1.predict_proba(Xval)[:, 1]
    # Pipeline.classes_ exposes the classes of the final estimator
    plr1_1 = lr1.classes_[(plr1 > 0.5).astype(int)]

    print(f'LR StdScaler Acurácia: {accuracy_score(yval, plr1_1)} - Log Loss: {log_loss(yval, plr1)}')

    # Logistic regression on min-max scaled features
    lr2 = make_pipeline(MinMaxScaler(), LogisticRegression())
    lr2.fit(Xtr, ytr)
    plr2 = lr2.predict_proba(Xval)[:, 1]
    plr2_ = lr2.classes_[(plr2 > 0.5).astype(int)]

    print(f'LR MinMax Acurácia: {accuracy_score(yval, plr2_)} - Log Loss: {log_loss(yval, plr2)} \n')

    # Out-of-fold prediction matrix: one column per base model, filled only
    # for the rows that were held out in this fold
    second_level[ts, 0] = prf
    second_level[ts, 1] = pet
    second_level[ts, 2] = plr1
    second_level[ts, 3] = plr2
    

RF Acurácia: 0.0 - Log Loss: 0.7901823324741968
ET Acurácia: 0.0 - Log Loss: 17.269388197455342
LR StdScaler Acurácia: 0.0 - Log Loss: 1.7125608280756106
LR MinMax Acurácia: 0.0 - Log Loss: 0.8149497647348093 

RF Acurácia: 0.0 - Log Loss: 0.8507766006709531
ET Acurácia: 0.0 - Log Loss: 17.26978799617044
LR StdScaler Acurácia: 0.0 - Log Loss: 3.037970289283166
LR MinMax Acurácia: 0.0 - Log Loss: 1.090904081081186 



In [35]:
# Out-of-fold predictions: one row per sample, columns in the order they were
# written by the stacking loop (RF, ET, LR + StandardScaler, LR + MinMaxScaler)
second_level

array([[0.29      , 0.        , 0.01714104, 0.20850832],
       [0.29      , 0.        , 0.03311291, 0.24756958],
       [0.76      , 1.        , 0.99770216, 0.88089291],
       [0.76      , 1.        , 0.99984514, 0.94736022]])