In [279]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [245]:
train_data_values = pd.read_csv('./data/train_values.csv', index_col=0)
train_data_labels = pd.read_csv('./data/train_labels.csv', index_col=0)

train_data = pd.concat([train_data_values, train_data_labels], axis=1)

In [246]:
train_data.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [247]:
train_data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [248]:
numeric_features = [
  'count_floors_pre_eq', 'age', 
  'area_percentage', 'height_percentage',
  'count_families'
]

categorical_features = [
  'land_surface_condition', 'foundation_type',
  'roof_type', 'ground_floor_type',
  'other_floor_type', 'position',
  'plan_configuration', 'legal_ownership_status'
]

other_features = list(np.setdiff1d(
  train_data.columns,
  numeric_features + categorical_features + ['damage_grade'],
  assume_unique=True
))

Let's see how imbalanced the data really is:

In [249]:
train_data[train_data['damage_grade'] == 1].shape

(25124, 39)

In [250]:
train_data[train_data['damage_grade'] == 2].shape

(148259, 39)

In [251]:
train_data[train_data['damage_grade'] == 3].shape

(87218, 39)

Let's see if we can improve the imbalance by removing outliers. 

In [252]:
train_data = train_data[(np.abs(stats.zscore(train_data[numeric_features])) < 3).all(axis=1)]

In [253]:
train_data[train_data['damage_grade'] == 1].shape

(22900, 39)

In [254]:
train_data[train_data['damage_grade'] == 2].shape

(142343, 39)

In [255]:
train_data[train_data['damage_grade'] == 3].shape

(84588, 39)

It has helped a little bit, but it also helps indirectly with balancing the data by other techniques later.

In [261]:
X_train, X_test, y_train, y_test = train_test_split(train_data_values,
                                                    train_data_labels['damage_grade'],
                                                    test_size=0.25,
                                                    random_state=999)

In [262]:
X_train.head(10)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
985502,6,673,2937,2,5,13,6,t,i,x,...,0,0,0,0,0,0,0,0,0,0
140510,3,1387,2791,5,100,15,10,t,r,q,...,0,1,0,0,0,0,0,0,0,0
701507,11,765,3900,2,20,11,7,t,r,n,...,0,0,0,0,0,0,0,0,0,0
438410,17,1313,2490,3,0,8,6,t,r,q,...,0,0,0,0,0,0,0,0,0,0
800610,21,1219,2948,2,0,6,5,t,r,q,...,0,0,0,0,0,0,0,0,0,0
152270,22,763,4002,1,10,12,3,t,r,n,...,0,0,0,0,0,0,0,0,0,0
511298,16,1399,4589,1,30,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,0
42065,10,337,296,2,5,5,5,t,r,q,...,0,0,0,0,0,0,0,0,0,0
669566,13,1154,11118,2,20,7,4,n,r,n,...,0,0,0,0,0,0,0,0,0,0
760886,7,773,471,2,0,10,5,n,r,q,...,0,0,0,0,0,0,0,0,0,0


In [263]:
preprocessor = ColumnTransformer(
  transformers=[
      ('scale', StandardScaler(), numeric_features),
      ('ohe', OneHotEncoder(drop="first"), categorical_features),
  ],
  remainder='passthrough'
)

In [264]:
X_train = pd.DataFrame(
  preprocessor.fit_transform(X_train),
  index=X_train.index,
  columns=(
    numeric_features +
    list(preprocessor.named_transformers_['ohe'].get_feature_names(categorical_features)) +
    other_features
  )
)

X_test = pd.DataFrame(preprocessor.transform(X_test),
                      index=X_test.index,
                      columns=X_train.columns)

In [266]:
X_train.head(10)

Unnamed: 0_level_0,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,land_surface_condition_o,land_surface_condition_t,foundation_type_i,foundation_type_r,foundation_type_u,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
985502,-0.178352,-0.292638,1.132216,0.292746,0.038073,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140510,3.939973,0.997278,1.587064,2.370974,0.038073,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701507,-0.178352,-0.088967,0.677369,0.812303,0.038073,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438410,1.194423,-0.360529,-0.004903,0.292746,2.440849,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
800610,-0.178352,-0.360529,-0.459751,-0.226811,0.038073,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152270,-1.551127,-0.224748,0.904792,-1.265924,0.038073,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
511298,-1.551127,0.046813,-0.459751,-1.265924,0.038073,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42065,-0.178352,-0.292638,-0.687175,-0.226811,0.038073,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669566,-0.178352,-0.088967,-0.232327,-0.746368,0.038073,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
760886,-0.178352,-0.360529,0.449945,-0.226811,0.038073,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [267]:
sm = SMOTE(random_state=27)

In [268]:
X_train, y_train = sm.fit_sample(X_train, y_train)

In [269]:
parameters = { 
  'min_samples_split': [5, 10],
  'n_estimators': [100, 150]
}

In [270]:
rfc = GridSearchCV(
  RandomForestClassifier(), 
  parameters, 
  cv=5,
  scoring=make_scorer(f1_score, average='micro'),
  verbose=2
).fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] min_samples_split=5, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............ min_samples_split=5, n_estimators=100, total= 1.1min
[CV] min_samples_split=5, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] ............ min_samples_split=5, n_estimators=100, total= 1.1min
[CV] min_samples_split=5, n_estimators=100 ...........................
[CV] ............ min_samples_split=5, n_estimators=100, total= 1.0min
[CV] min_samples_split=5, n_estimators=100 ...........................
[CV] ............ min_samples_split=5, n_estimators=100, total= 1.2min
[CV] min_samples_split=5, n_estimators=100 ...........................
[CV] ............ min_samples_split=5, n_estimators=100, total= 1.2min
[CV] min_samples_split=5, n_estimators=150 ...........................
[CV] ............ min_samples_split=5, n_estimators=150, total= 1.8min
[CV] min_samples_split=5, n_estimators=150 ...........................
[CV] ............ min_samples_split=5, n_estimators=150, total= 1.7min
[CV] min_samples_split=5, n_estimators=150 ...........................
[CV] ............ min_samples_split=5, n_estimators=150, total= 1.9min
[CV] min_samples_split=5, n_estimators=150 ...........................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 29.2min finished


In [271]:
dummy = DummyClassifier(strategy='uniform').fit(X_train, y_train)

In [272]:
rfc_pred = rfc.predict(X_test)

In [273]:
dummy_pred = dummy.predict(X_test)

In [274]:
f1_score(y_test, rfc_pred, average='micro')

0.7165968289051587

In [275]:
f1_score(y_test, dummy_pred, average='micro')

0.33502171877638104

In [276]:
rfc.best_params_

{'min_samples_split': 5, 'n_estimators': 150}

In [282]:
rfc.best_estimator_.feature_importances_

array([2.92296582e-02, 8.83960942e-02, 6.72254860e-02, 4.80869200e-02,
       1.78072941e-02, 4.13191497e-03, 1.37437973e-02, 1.39436211e-02,
       5.91142376e-02, 6.46220929e-03, 1.21474070e-02, 1.75105778e-02,
       2.00185622e-02, 4.65657470e-04, 2.70254468e-02, 9.27486134e-03,
       7.33965965e-04, 2.90039048e-02, 8.26810526e-03, 8.84462960e-03,
       9.35835562e-04, 1.39244364e-02, 1.18996413e-02, 2.46487910e-04,
       4.54938022e-03, 1.06675987e-05, 3.16206403e-05, 4.89488916e-05,
       1.33869047e-04, 2.91408864e-03, 2.97573616e-04, 2.17590532e-03,
       9.60927838e-04, 4.94622138e-03, 1.39280077e-03, 1.25490348e-01,
       9.14946560e-02, 8.74917413e-02, 9.91631329e-03, 5.65425092e-02,
       7.31118530e-03, 3.54663519e-03, 8.61321098e-03, 2.17385952e-02,
       1.88280721e-02, 9.91578188e-03, 6.15442519e-03, 3.40755108e-03,
       3.36516901e-03, 8.45894381e-03, 5.32858701e-03, 3.76157741e-03,
       1.23051857e-03, 1.67669891e-04, 7.47521381e-05, 1.90433752e-04,
      

In [283]:
X_test.columns

Index(['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'count_families', 'land_surface_condition_o',
       'land_surface_condition_t', 'foundation_type_i', 'foundation_type_r',
       'foundation_type_u', 'foundation_type_w', 'roof_type_q', 'roof_type_x',
       'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x',
       'ground_floor_type_z', 'other_floor_type_q', 'other_floor_type_s',
       'other_floor_type_x', 'position_o', 'position_s', 'position_t',
       'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
       'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o',
       'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',
       'legal_ownership_status_r', 'legal_ownership_status_v',
       'legal_ownership_status_w', 'geo_level_1_id', 'geo_level_2_id',
       'geo_level_3_id', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure

In [285]:
feature_importantance_series = pd.Series(rfc.best_estimator_.feature_importances_)

In [287]:
feature_importantance_series.index = X_test.columns

In [291]:
feature_importantance_series.sort_values(ascending=False)

geo_level_1_id                            0.125490
geo_level_2_id                            0.091495
age                                       0.088396
geo_level_3_id                            0.087492
area_percentage                           0.067225
foundation_type_r                         0.059114
has_superstructure_mud_mortar_stone       0.056543
height_percentage                         0.048087
count_floors_pre_eq                       0.029230
other_floor_type_q                        0.029004
ground_floor_type_v                       0.027025
has_superstructure_cement_mortar_brick    0.021739
roof_type_x                               0.020019
has_superstructure_timber                 0.018828
count_families                            0.017807
roof_type_q                               0.017511
foundation_type_i                         0.013944
position_s                                0.013924
land_surface_condition_t                  0.013744
foundation_type_w              