In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate,KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [13]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

Elegimos nuestros features, y abrimos los archivos de set train y test values.

In [14]:
features = ['age',
            'geo_level_1_id',
            'geo_level_2_id',
            'geo_level_3_id',
            'count_families',
            'base_condition',
            'volume_percentage',
            'area_percentage',
            'height_percentage', 
            'geolevel_grouped_age_mean',
            'geolevel_grouped_height_mean',
            'geolevel_grouped_area_mean',
            'construction_type_grouped_age_mean',
            'construction_type_grouped_height_mean',
            'construction_type_grouped_area_mean',
            'count_floors_pre_eq',
            'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel',
            'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school',
            'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office',
            'has_secondary_use_use_police', 'has_secondary_use_other', 
            'has_superstructure_rc_engineered', 'has_superstructure_rc_non_engineered', 'has_superstructure_cement_mortar_brick',
            'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_mud_mortar_stone',
            'has_superstructure_stone_flag', 'has_superstructure_timber', 'has_superstructure_adobe_mud', 
            'has_superstructure_bamboo', 'has_superstructure_other',
            'land_surface_condition_n', 'land_surface_condition_o', 'land_surface_condition_t',
            'foundation_type_h', 'foundation_type_i', 'foundation_type_r', 'foundation_type_u', 'foundation_type_w',
            'roof_type_n', 'roof_type_q', 'roof_type_x',
            'ground_floor_type_f', 'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x', 'ground_floor_type_z',
            'other_floor_type_j', 'other_floor_type_q', 'other_floor_type_s', 'other_floor_type_x',
            'position_j', 'position_o', 'position_s', 'position_t',
            'plan_configuration_a', 'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f', 'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o', 'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',]

In [15]:
set_train = pd.read_csv('data_set.csv')
test_values = pd.read_csv('data_test.csv')

In [16]:
train, test = train_test_split(set_train, test_size = 0.20, shuffle = False)

In [17]:
train_values_subset = train.iloc[:, 0:-1]
train_labels = train.loc[:, 'damage_grade']

In [18]:
test_values_subset = test.iloc[:, 0:-1]
test_labels = test.loc[:, 'damage_grade']

Buscamos hiperparametros ideales con Random Search

In [19]:
n_jobs=[-1]
n_estimators=np.arange(300, 500,100)
colsample_bytree=[0.8, 1.0]
max_depth=np.arange(10, 12, 1)
learning_rate = [0.1]
param_grid={'n_jobs':n_jobs,
           'n_estimators':n_estimators,
           'max_depth':max_depth,
            'learning_rate': learning_rate
           }

In [20]:
rf=XGBClassifier(eval_metric='mlogloss')
kf=KFold(n_splits=2,shuffle=True)

In [21]:
rs=RandomizedSearchCV(rf,param_distributions=param_grid,cv=kf,scoring='f1_micro')

In [22]:
start_time = timer(None)
rs.fit(train_values_subset, train_labels.values.ravel())
timer(start_time)


 Time taken: 0 hours 36 minutes and 17.93 seconds.


In [23]:
rs.best_params_

{'n_jobs': -1, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.1}

Los mejores hiperparametros para nuestro modelo son los siguientes:
n_jobs=-1
n_estimators=300
max_depth=10
learning_rate=0.1

In [24]:
train_values_subset = pd.get_dummies(train_values_subset)

In [25]:
preds = rs.predict(test_values_subset)
f1_score(test_labels, preds, average='micro')

0.7500047965311487

In [26]:
test_values_subset = test_values[features]
test_values_subset

Unnamed: 0,age,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_families,base_condition,volume_percentage,area_percentage,height_percentage,geolevel_grouped_age_mean,...,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u
0,20,17,596,11307,1,1.673320,42,7,6,26.304348,...,0,0,0,0,0,0,0,0,0,1
1,25,6,141,11987,1,1.673320,65,13,5,34.166667,...,0,0,0,0,0,0,0,0,0,1
2,5,22,19,10044,1,1.673320,20,4,5,15.000000,...,0,0,0,0,0,0,0,0,0,1
3,0,26,39,633,2,1.673320,57,19,3,12.681332,...,0,0,0,0,0,0,0,0,0,1
4,15,17,289,7970,1,1.673320,56,8,7,45.000000,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,70,4,605,3623,1,1.673320,120,20,6,41.521739,...,0,0,0,0,0,0,0,0,0,1
86864,25,10,1407,11907,1,0.979221,42,6,7,25.400000,...,0,0,0,0,0,0,0,0,0,1
86865,50,22,1136,7712,1,1.673320,9,3,3,32.666667,...,0,0,0,0,0,0,0,0,0,1
86866,5,6,1041,912,1,1.673320,45,9,5,18.265306,...,0,0,0,0,0,0,0,0,0,1


In [28]:
predictions = rs.predict(test_values_subset)

In [29]:
c = []
for i in range(len(predictions)):
    c.append(int(predictions[i]))

Creamos el archivo para submittear

In [30]:
submission_format = pd.read_csv('submission_format.csv',index_col='building_id')
my_submission = pd.DataFrame(data=c, columns=submission_format.columns,
                            index=submission_format.index)
my_submission.to_csv('submision.csv')

Este modelo obtuvo un resultado final de 0.7479 en Driven Data