In [5]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [7]:
# Authenticate the Colab user first; the application-default credentials
# fetched two lines below are the ones this step makes available.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the Colab credentials to PyDrive instead of running its own OAuth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cells below use.
drive = GoogleDrive(gauth)

In [3]:
# Download the Drive file with this ID and save it locally as train_values.csv.
# Named `file_id` rather than `id` so the built-in id() is not shadowed in the kernel.
file_id = '1qoDO4rTseQvNtJmzhN_V760AO3X9MPiJ'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train_values.csv')

In [4]:
# Download the Drive file with this ID and save it locally as train_labels.csv.
# Named `file_id` rather than `id` so the built-in id() is not shadowed in the kernel.
file_id = '1lSIh7K94GMnkm5O0bD5faCnYtnKGDunD'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train_labels.csv')

In [19]:
# Download the Drive file with this ID and save it locally as test_values.csv.
# Named `file_id` rather than `id` so the built-in id() is not shadowed in the kernel.
file_id = '1sMSmYeYzFAr7c6PZiPRf-laFhV3DeYa9'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test_values.csv')

In [24]:
# Download the Drive file with this ID and save it locally as submission_format.csv.
# Named `file_id` rather than `id` so the built-in id() is not shadowed in the kernel.
file_id = '1-dHK1rnU6dupSS5TdaLsWuljGSpNRcEy'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('submission_format.csv')

In [8]:
# Load both downloaded training CSVs into DataFrames (features first, labels second).
train_values, train_labels = (
    pd.read_csv(csv_name) for csv_name in ('train_values.csv', 'train_labels.csv')
)

In [9]:
# Join features with labels (merge uses the columns both frames share),
# drop the identifier column, and keep only rows with at most 6 floors.
set_train = (
    train_values
    .merge(train_labels)
    .drop(columns=["building_id"])
    .loc[lambda merged: merged['count_floors_pre_eq'] <= 6]
)
set_train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3


In [10]:
# Feature columns fed to the model; the same list is reused to slice the test set.
columnas_seleccionadas = [
    'age',
    'geo_level_1_id',
    'area_percentage',
    'count_floors_pre_eq',
    'geo_level_2_id',
    'height_percentage',
]
train_values_subset = set_train[columnas_seleccionadas]
# Pick the target by name rather than by position (`iloc[:, -1]`), so a change in
# column order cannot silently turn a feature column into the label.
# NOTE: this rebinds `train_labels` from the labels DataFrame to a Series, so the
# merge cell above cannot be re-run without reloading train_labels.csv first.
train_labels = set_train['damage_grade']

In [11]:
# One-hot encode any object/category columns among the selected features
# (numeric columns pass through get_dummies unchanged).
train_values_subset = pd.get_dummies(data=train_values_subset)

In [12]:
# Hyper-parameter grid for the search. The `randomforestclassifier__` prefix
# routes each value to the forest step of the pipeline.
param_grid = dict(
    randomforestclassifier__n_estimators=[15, 20],
    randomforestclassifier__max_depth=[30, 35],
    randomforestclassifier__max_features=[1, 4],
)

In [13]:
# Two-step pipeline: standardise the features, then fit a random forest.
# make_pipeline names the steps after their lowercased class names, which is
# what the `randomforestclassifier__*` keys in param_grid refer to.
pipe = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=1),  # fixed seed for repeatable fits
)

In [14]:
# Exhaustive search over param_grid with 5-fold cross-validation.
rf_model = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [15]:
# Run the grid search; the labels are passed as a flat 1-D array.
rf_model.fit(X=train_values_subset, y=np.ravel(train_labels.values))

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                   

In [16]:
# Show the parameter combination that scored best in the cross-validated search.
rf_model.best_params_

{'randomforestclassifier__max_depth': 30,
 'randomforestclassifier__max_features': 4,
 'randomforestclassifier__n_estimators': 20}

In [17]:
# Micro-averaged F1 on the same rows the model was fitted on. This is an
# in-sample figure and therefore optimistic; rf_model.best_score_ holds the
# cross-validated score of the selected parameters.
preds = rf_model.predict(X=train_values_subset)
f1_score(y_true=train_labels, y_pred=preds, average='micro')

0.8790221062327295

In [20]:
# Load the test features, using building_id as the row index rather than a column.
test_values = pd.read_csv(filepath_or_buffer='test_values.csv', index_col='building_id')

In [21]:
# Apply the same column selection and encoding to the test set as to the training set.
test_values_subset = pd.get_dummies(test_values[columnas_seleccionadas])

In [22]:
# Predict a label for every test row with the refitted best pipeline.
predictions = rf_model.predict(X=test_values_subset)

In [25]:
# Load the submission template; its index and columns define the output layout.
submission_format = pd.read_csv(
    filepath_or_buffer='submission_format.csv',
    index_col='building_id',
)

In [26]:
# Put the predictions into the template's layout.
# NOTE(review): assumes submission_format rows are in the same order as
# test_values rows, since predictions are assigned positionally — confirm.
my_submission = pd.DataFrame(
    predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [27]:
# Preview the first rows of the submission frame before writing it out.
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,3
890251,3
745817,1
421793,2


In [28]:
# Write the submission to disk; the index (building_id) becomes the first CSV column.
my_submission.to_csv(path_or_buf='submision.csv')

In [29]:
# Shell out to `head` to inspect the header and first rows of the written file.
!head submision.csv

building_id,damage_grade
300051,3
99355,3
890251,3
745817,1
421793,2
871976,2
691228,1
896100,3
343471,2


In [None]:
# Gini importance.
# GridSearchCV does not expose `feature_importances_`; the attribute lives on the
# fitted forest, which is the 'randomforestclassifier' step of the refitted
# best pipeline (`best_estimator_`).
rf_model.best_estimator_.named_steps['randomforestclassifier'].feature_importances_

In [None]:
# Bar chart of the forest's Gini importances, one bar per feature column.
# The importances are read from the forest step of the refitted best pipeline,
# because GridSearchCV itself has no `feature_importances_` attribute.
rf_importances = (
    rf_model.best_estimator_
    .named_steps['randomforestclassifier']
    .feature_importances_
)

fig, ax = plt.subplots()
ax.bar(test_values_subset.columns, rf_importances)
ax.set_xlabel('Features')
ax.set_ylabel('Importancia')
ax.set_title('Importancia Features con RF')
# Tilt the feature names so the longer ones do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()