In [68]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from scipy import stats
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from mlens.ensemble import SuperLearner
import pickle

In [69]:
train_data_values = pd.read_csv('./data/train_values.csv', index_col=0)
train_data_labels = pd.read_csv('./data/train_labels.csv', index_col=0)

train_data = pd.concat([train_data_values, train_data_labels], axis=1)

In [70]:
train_data.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [71]:
train_data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [72]:
numeric_features = [
  'count_floors_pre_eq', 'age', 
  'area_percentage', 'height_percentage',
  'count_families'
]

categorical_features = [
  'land_surface_condition', 'foundation_type',
  'roof_type', 'ground_floor_type',
  'other_floor_type', 'position',
  'plan_configuration', 'legal_ownership_status'
]

other_features = list(np.setdiff1d(
  train_data.columns,
  categorical_features + ['damage_grade'],
  assume_unique=True
))

Let's see how imbalanced the data really is:

In [73]:
train_data[train_data['damage_grade'] == 1].shape

(25124, 39)

In [74]:
train_data[train_data['damage_grade'] == 2].shape

(148259, 39)

In [75]:
train_data[train_data['damage_grade'] == 3].shape

(87218, 39)

Let's see if we can improve the imbalance by removing outliers. 

In [76]:
train_data = train_data[(np.abs(stats.zscore(train_data[numeric_features])) < 3).all(axis=1)]

In [77]:
train_data[train_data['damage_grade'] == 1].shape

(22900, 39)

In [78]:
train_data[train_data['damage_grade'] == 2].shape

(142343, 39)

In [79]:
train_data[train_data['damage_grade'] == 3].shape

(84588, 39)

It has helped a little bit, but it also helps indirectly with balancing the data by other techniques later.

In [80]:
X_train, X_test, y_train, y_test = train_test_split(train_data_values,
                                                    train_data_labels['damage_grade'],
                                                    test_size=0.25,
                                                    random_state=999)

In [81]:
X_train.head(10)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
985502,6,673,2937,2,5,13,6,t,i,x,...,0,0,0,0,0,0,0,0,0,0
140510,3,1387,2791,5,100,15,10,t,r,q,...,0,1,0,0,0,0,0,0,0,0
701507,11,765,3900,2,20,11,7,t,r,n,...,0,0,0,0,0,0,0,0,0,0
438410,17,1313,2490,3,0,8,6,t,r,q,...,0,0,0,0,0,0,0,0,0,0
800610,21,1219,2948,2,0,6,5,t,r,q,...,0,0,0,0,0,0,0,0,0,0
152270,22,763,4002,1,10,12,3,t,r,n,...,0,0,0,0,0,0,0,0,0,0
511298,16,1399,4589,1,30,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,0
42065,10,337,296,2,5,5,5,t,r,q,...,0,0,0,0,0,0,0,0,0,0
669566,13,1154,11118,2,20,7,4,n,r,n,...,0,0,0,0,0,0,0,0,0,0
760886,7,773,471,2,0,10,5,n,r,q,...,0,0,0,0,0,0,0,0,0,0


In [82]:
preprocessor = ColumnTransformer(
  transformers=[
      ('ohe', OneHotEncoder(drop="first"), categorical_features)
  ],
  remainder='passthrough'
)

In [83]:
X_train = pd.DataFrame(
  preprocessor.fit_transform(X_train),
  index=X_train.index,
  columns=(
    list(preprocessor.named_transformers_['ohe'].get_feature_names(categorical_features)) +
    other_features
  )
)

X_test = pd.DataFrame(preprocessor.transform(X_test),
                      index=X_test.index,
                      columns=X_train.columns)

In [84]:
X_train.head(10)

Unnamed: 0_level_0,land_surface_condition_o,land_surface_condition_t,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_q,roof_type_x,ground_floor_type_m,ground_floor_type_v,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
985502,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140510,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701507,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438410,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
800610,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152270,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
511298,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42065,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669566,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
760886,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
sm = SMOTE(random_state=27)

In [86]:
X_train, y_train = sm.fit_sample(X_train, y_train)

In [20]:
parameters = { 
  'min_samples_split': [5, 10],
  'n_estimators': [100, 150]
}

In [None]:
rfc = GridSearchCV(
  RandomForestClassifier(), 
  parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(X_train, y_train)

In [22]:
dummy = DummyClassifier(strategy='uniform').fit(X_train, y_train)

In [23]:
rfc_pred = rfc.predict(X_test)

In [24]:
dummy_pred = dummy.predict(X_test)

In [25]:
f1_score(y_test, rfc_pred, average='micro')

0.7154456570121718

In [26]:
f1_score(y_test, dummy_pred, average='micro')

0.3321975103989194

In [27]:
rfc.best_params_

{'min_samples_split': 5, 'n_estimators': 150}

In [28]:
rfc.best_estimator_.feature_importances_

array([4.12697544e-03, 1.34810288e-02, 1.48282469e-02, 5.58917837e-02,
       7.12132740e-03, 1.57045446e-02, 1.81129986e-02, 2.44310584e-02,
       4.94440893e-04, 3.40336518e-02, 9.13055031e-03, 7.27331681e-04,
       2.86526569e-02, 5.89886443e-03, 1.01790457e-02, 9.01005288e-04,
       1.68189435e-02, 1.62162891e-02, 2.45230148e-04, 4.56663325e-03,
       1.14111400e-05, 3.00304655e-05, 5.99546937e-05, 1.35121410e-04,
       2.91398000e-03, 2.84988168e-04, 2.28845934e-03, 9.48629233e-04,
       4.89757230e-03, 1.38871781e-03, 1.19122582e-01, 8.80213876e-02,
       8.42721671e-02, 3.61010965e-02, 7.27864144e-02, 6.36890294e-02,
       4.75152927e-02, 9.86192223e-03, 5.67590125e-02, 7.08064322e-03,
       3.58430938e-03, 8.63231908e-03, 2.14022479e-02, 1.86385007e-02,
       9.54495662e-03, 8.64149719e-03, 5.23642474e-03, 3.25112912e-03,
       2.05913881e-02, 8.80682209e-03, 5.30510040e-03, 4.05387948e-03,
       1.06703636e-03, 1.66313150e-04, 6.80921112e-05, 2.04605176e-04,
      

In [29]:
X_test.columns

Index(['land_surface_condition_o', 'land_surface_condition_t',
       'foundation_type_i', 'foundation_type_r', 'foundation_type_u',
       'foundation_type_w', 'roof_type_q', 'roof_type_x',
       'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x',
       'ground_floor_type_z', 'other_floor_type_q', 'other_floor_type_s',
       'other_floor_type_x', 'position_o', 'position_s', 'position_t',
       'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
       'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o',
       'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',
       'legal_ownership_status_r', 'legal_ownership_status_v',
       'legal_ownership_status_w', 'geo_level_1_id', 'geo_level_2_id',
       'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
    

In [30]:
feature_importantance_series = pd.Series(rfc.best_estimator_.feature_importances_)

In [31]:
feature_importantance_series.index = X_test.columns

In [32]:
feature_importantance_series.sort_values(ascending=False)

geo_level_1_id                            0.119123
geo_level_2_id                            0.088021
geo_level_3_id                            0.084272
age                                       0.072786
area_percentage                           0.063689
has_superstructure_mud_mortar_stone       0.056759
foundation_type_r                         0.055892
height_percentage                         0.047515
count_floors_pre_eq                       0.036101
ground_floor_type_v                       0.034034
other_floor_type_q                        0.028653
roof_type_x                               0.024431
has_superstructure_cement_mortar_brick    0.021402
count_families                            0.020591
has_superstructure_timber                 0.018639
roof_type_q                               0.018113
position_s                                0.016819
position_t                                0.016216
foundation_type_w                         0.015705
foundation_type_i              

In [95]:
xgb_parameters = {
  'max_depth': [6, 10],
  'colsample_bytree': [0.8]
}

In [None]:
xgb_model = GridSearchCV(
  XGBClassifier(), 
  xgb_parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(X_train, y_train)

In [97]:
xgb_model.best_params_

{'colsample_bytree': 0.8, 'max_depth': 10}

In [98]:
xgb_model.best_estimator_.feature_importances_

array([0.0051761 , 0.00898156, 0.0269817 , 0.35768974, 0.01765117,
       0.01960185, 0.01617406, 0.01229804, 0.00468889, 0.04411929,
       0.00520313, 0.00319394, 0.04038239, 0.00439255, 0.00741334,
       0.00229994, 0.02620419, 0.01077505, 0.00310347, 0.00574895,
       0.        , 0.00519479, 0.00327846, 0.00330452, 0.00405099,
       0.00249583, 0.00676117, 0.00264473, 0.00439563, 0.0039854 ,
       0.04047594, 0.00817027, 0.00366049, 0.02061346, 0.01205816,
       0.00760393, 0.0078385 , 0.01316659, 0.06037288, 0.02069468,
       0.00621894, 0.01279929, 0.02414555, 0.01093778, 0.00836611,
       0.00564653, 0.01229437, 0.00542065, 0.01404187, 0.01610699,
       0.00522421, 0.0042778 , 0.00442137, 0.00244279, 0.00073108,
       0.00164371, 0.00313185, 0.        , 0.00530979, 0.00399351],
      dtype=float32)

In [99]:
feature_importantance_series = pd.Series(xgb_model.best_estimator_.feature_importances_)
feature_importantance_series.index = X_test.columns
feature_importantance_series.sort_values(ascending=False)

foundation_type_r                         0.357690
has_superstructure_mud_mortar_stone       0.060373
ground_floor_type_v                       0.044119
geo_level_1_id                            0.040476
other_floor_type_q                        0.040382
foundation_type_i                         0.026982
position_s                                0.026204
has_superstructure_cement_mortar_brick    0.024146
has_superstructure_stone_flag             0.020695
count_floors_pre_eq                       0.020613
foundation_type_w                         0.019602
foundation_type_u                         0.017651
roof_type_q                               0.016174
has_secondary_use                         0.016107
count_families                            0.014042
has_superstructure_adobe_mud              0.013167
has_superstructure_mud_mortar_brick       0.012799
roof_type_x                               0.012298
has_superstructure_rc_engineered          0.012294
age                            

In [100]:
xgb_pred = xgb_model.predict(X_test.as_matrix())

  """Entry point for launching an IPython kernel.


In [101]:
f1_score(y_test, xgb_pred, average='micro')

0.722153151908643

In [89]:
light_parameters = {
  'num_leaves': [50, 80, 150],
  'min_data_in_leaf': [50, 100, 300],
  'max_depth': [5, 10]
}

In [None]:
light_model = GridSearchCV(
  LGBMClassifier(), 
  light_parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(X_train, y_train)

In [91]:
feature_importantance_series = pd.Series(light_model.best_estimator_.feature_importances_)
feature_importantance_series.index = X_test.columns
feature_importantance_series.sort_values(ascending=False)

geo_level_2_id                            7895
geo_level_1_id                            6902
geo_level_3_id                            5436
area_percentage                           3016
age                                       2881
height_percentage                         1955
count_floors_pre_eq                       1250
count_families                            1105
has_superstructure_mud_mortar_stone        977
has_superstructure_timber                  928
roof_type_q                                923
position_s                                 636
land_surface_condition_t                   634
has_superstructure_cement_mortar_brick     623
other_floor_type_q                         586
foundation_type_r                          541
ground_floor_type_v                        519
has_secondary_use                          517
ground_floor_type_x                        504
other_floor_type_x                         481
has_superstructure_bamboo                  442
has_superstru

In [92]:
light_model.best_params_

{'max_depth': 10, 'min_data_in_leaf': 50, 'num_leaves': 150}

In [93]:
light_pred = light_model.predict(X_test)

In [94]:
f1_score(y_test, light_pred, average='micro')

0.7130358705161854

In [26]:
ada_parameters = {
  'n_estimators': [50, 100, 200],
  'learning_rate': [0.1, 1, 2]
}

In [None]:
ada_model = GridSearchCV(
  AdaBoostClassifier(), 
  ada_parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(X_train, y_train)

In [29]:
feature_importantance_series = pd.Series(ada_model.best_estimator_.feature_importances_)
feature_importantance_series.index = X_test.columns
feature_importantance_series.sort_values(ascending=False)

geo_level_1_id                            0.245
age                                       0.075
area_percentage                           0.075
roof_type_q                               0.070
count_floors_pre_eq                       0.070
count_families                            0.060
other_floor_type_q                        0.050
land_surface_condition_t                  0.050
position_t                                0.030
position_s                                0.030
foundation_type_r                         0.025
has_secondary_use                         0.025
height_percentage                         0.020
geo_level_2_id                            0.020
ground_floor_type_v                       0.015
other_floor_type_x                        0.015
has_superstructure_mud_mortar_stone       0.015
has_superstructure_timber                 0.015
has_superstructure_bamboo                 0.015
has_superstructure_mud_mortar_brick       0.010
has_superstructure_cement_mortar_brick  

In [30]:
ada_model.best_params_

{'learning_rate': 1, 'n_estimators': 200}

In [31]:
ada_pred = ada_model.predict(X_test)
f1_score(y_test, ada_pred, average='micro')

0.6477414007459594

Ada did the worst among the first layer models.

## Gather all the models for stacking

In [102]:
rf_model = RandomForestClassifier(min_samples_split = 5, n_estimators = 150).fit(X_train, y_train)
xgb_model = XGBClassifier(colsample_bytree = 0.8, max_depth = 10).fit(X_train, y_train)
light_model = LGBMClassifier(min_data_in_leaf = 50, num_leaves = 150, max_depth = 10).fit(X_train, y_train)

In [103]:
stacking_train_df = pd.DataFrame({
 'rf_pred': rf_model.predict(X_train),
  'xgb_pred': xgb_model.predict(X_train),
  'light_pred': light_model.predict(X_train)
})

In [104]:
stacking_test_df = pd.DataFrame({
 'rf_pred': rf_model.predict(X_test),
  'xgb_pred': xgb_model.predict(X_test.as_matrix()),
  'light_pred': light_model.predict(X_test)
})

  This is separate from the ipykernel package so we can avoid doing imports until


In [105]:
stacking_train_df

Unnamed: 0,rf_pred,xgb_pred,light_pred
0,1,1,1
1,2,2,2
2,3,3,2
3,3,3,3
4,3,3,3
...,...,...,...
333886,3,3,3
333887,3,3,3
333888,3,3,3
333889,3,3,3


In [106]:
cat_parameters = {
  'depth': [6, 7, 8, 9],
  'l2_leaf_reg': [10, 20],
  'one_hot_max_size': [10]
}

In [None]:
cat_model = GridSearchCV(
  CatBoostClassifier(), 
  cat_parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(stacking_train_df, y_train)

In [108]:
cat_model.best_params_

{'depth': 6, 'l2_leaf_reg': 10, 'one_hot_max_size': 10}

In [109]:
cat_pred = cat_model.predict(stacking_test_df)
f1_score(y_test, cat_pred, average='micro')

0.7166275268223051

## Let's put all these together

In [123]:
from mlens.metrics import make_scorer
accuracy_scorer = make_scorer(f1_score, average='micro')

In [124]:
ensemble = SuperLearner(scorer=accuracy_scorer, random_state=123, verbose=2, folds=5)

In [125]:
ensemble.add([
  RandomForestClassifier(min_samples_split=5, n_estimators=150, random_state=123), 
  XGBClassifier(colsample_bytree=0.8, max_depth=10, seed=123),
  LGBMClassifier(min_data_in_leaf=50, num_leaves=150, max_depth=10, random_state=123)
])

SuperLearner(array_check=None, backend=None, folds=5,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3582, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=5, raise_on_ex...e=micro))],
   n_jobs=-1, name='group-7', raise_on_exception=True, transformers=[])],
   verbose=1)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=123, sample_size=20,
       scorer=make_scorer(f1_score, average=micro), shuffle=False,
       verbose=2)

In [126]:
ensemble.add_meta(CatBoostClassifier(
  random_seed=123,
  depth=6, 
  l2_leaf_reg=10, 
  one_hot_max_size=10
))

SuperLearner(array_check=None, backend=None, folds=5,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=3582, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=5, raise_on_ex...e=micro))],
   n_jobs=-1, name='group-8', raise_on_exception=True, transformers=[])],
   verbose=1)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=123, sample_size=20,
       scorer=make_scorer(f1_score, average=micro), shuffle=False,
       verbose=2)

In [127]:
sub_preprocessor = ColumnTransformer(
  transformers=[('ohe', OneHotEncoder(drop="first"), categorical_features)],
  remainder='passthrough'
)

In [128]:
X = sub_preprocessor.fit_transform(train_data_values)

In [129]:
sm = SMOTE(random_state=27)

X_train, y_train = sm.fit_sample(X, train_data_labels['damage_grade'])

In [None]:
ensemble.fit(X_train, y_train)

In [131]:
test_data_values = pd.read_csv('./data/test_values.csv', index_col=0)

In [132]:
X_test = sub_preprocessor.transform(test_data_values)

In [133]:
preds = ensemble.predict(X_test)


Predicting 2 layers




Processing layer-1             done | 00:00:11
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:12


In [134]:
sub = pd.DataFrame({
  'building_id': test_data_values.index,
  'damage_grade': map(int, preds)
})

In [135]:
sub.to_csv('final-submission', index=False)

In [137]:
pickle.dump(ensemble, open('ensemble_model.pickle', 'wb'))