In [23]:
import numpy as np
import pandas as pd

In [24]:
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

In [25]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [26]:
labels = pd.read_csv('../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [27]:
values = pd.read_csv('../csv/train_values.csv')
values.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500
geo_level_1_id,6,8,21,22,11,8,9,20,0,26
geo_level_2_id,487,900,363,418,131,558,475,323,757,886
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1
age,30,10,10,10,30,10,25,0,15,0
area_percentage,6,8,5,6,8,9,3,8,8,13
height_percentage,5,7,5,5,9,5,4,6,6,4
land_surface_condition,t,o,t,t,t,t,n,t,t,t
foundation_type,r,r,r,r,r,r,r,w,r,i


In [28]:
values.isnull().values.any()

False

In [29]:
labels.isnull().values.any()

False

In [30]:
values.dtypes 

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [31]:
values["building_id"].count() == values["building_id"].drop_duplicates().count()

True

In [32]:
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [33]:
to_be_categorized = ["land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for row in to_be_categorized:
    values[row] = values[row].astype("category")
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  int64   
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [34]:
datatypes = dict(values.dtypes)
for row in values.columns:
    if datatypes[row] != "int64" and datatypes[row] != "int32" and \
       datatypes[row] != "int16" and datatypes[row] != "int8":
        continue
    if values[row].nlargest(1).item() > 32767 and values[row].nlargest(1).item() < 2**31:
        values[row] = values[row].astype(np.int32)
    elif values[row].nlargest(1).item() > 127:
        values[row] = values[row].astype(np.int16)
    else:
        values[row] = values[row].astype(np.int8)
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  int8    
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [35]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [36]:
labels["building_id"] = labels["building_id"].astype(np.int32)
labels["damage_grade"] = labels["damage_grade"].astype(np.int8)
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


# Nuevo Modelo

In [64]:
important_values = values\
                .merge(labels, on="building_id")
important_values.drop(columns=["building_id"], inplace = True)
important_values["geo_level_1_id"] = important_values["geo_level_1_id"].astype("category")
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,2


In [65]:
important_values.shape

(260601, 39)

In [66]:

X_train, X_test, y_train, y_test = train_test_split(important_values.drop(columns = 'damage_grade'),
                                                    important_values['damage_grade'], test_size = 0.2, random_state = 123)

In [67]:
#OneHotEncoding
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    X_train = encode_and_bind(X_train, feature)
    X_test = encode_and_bind(X_test, feature)

In [68]:
X_train

Unnamed: 0,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
103291,1274,4190,2,25,8,5,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
233923,1207,12014,1,10,9,3,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
166653,944,8232,3,40,7,6,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
150634,488,12448,2,0,7,5,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
82720,302,5339,1,10,5,3,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192476,217,10644,1,25,4,6,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
17730,600,4813,2,20,13,8,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
28030,463,4692,2,10,9,4,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
15725,600,157,2,50,5,8,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
# Busco los mejores tres parametros indicados abajo.
max_features = [35, 40, 45]
min_samples_split = [10, 15, 20]
min_impurity_decrease = [0.0, 0.01, 0.025, 0.05, 0.1]

hyperF = {'min_impurity_decrease': min_impurity_decrease,
          'max_features': max_features,  
          'min_samples_split': min_samples_split}

gridF = GridSearchCV(estimator = RandomForestClassifier(random_state = 123,
                                                        n_estimators = 150,
                                                        max_depth = None,
                                                        min_samples_leaf = 1,
                                                        criterion = "gini"),
                     scoring = 'f1_micro',
                     param_grid = hyperF,
                     cv = 3,
                     verbose = 1, 
                     n_jobs = -1)

bestF = gridF.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [78]:
res = pd.DataFrame(bestF.cv_results_)
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,168.8942,0.348185,9.13668,0.070775,25,2,100,"{'max_features': 25, 'min_samples_split': 2, '...",0.722149,0.719828,0.720231,0.720736,0.001013,27
1,208.924055,2.001771,11.789438,0.11273,25,2,125,"{'max_features': 25, 'min_samples_split': 2, '...",0.722998,0.719252,0.720231,0.720827,0.001586,26
2,249.261491,2.136242,14.58621,0.869812,25,2,150,"{'max_features': 25, 'min_samples_split': 2, '...",0.72389,0.720044,0.720634,0.721522,0.001691,24
3,158.569144,1.199241,9.221139,0.270588,25,5,100,"{'max_features': 25, 'min_samples_split': 5, '...",0.72966,0.728894,0.727771,0.728775,0.000776,18
4,208.194351,3.467084,11.281549,0.920234,25,5,125,"{'max_features': 25, 'min_samples_split': 5, '...",0.729847,0.729599,0.728577,0.729341,0.00055,17
5,248.114718,6.535707,13.411221,1.021981,25,5,150,"{'max_features': 25, 'min_samples_split': 5, '...",0.730049,0.729743,0.728894,0.729562,0.000489,15
6,158.231525,1.424682,7.723133,0.860115,25,10,100,"{'max_features': 25, 'min_samples_split': 10, ...",0.731502,0.731081,0.733009,0.731864,0.000828,9
7,200.198144,1.690419,10.574529,0.198641,25,10,125,"{'max_features': 25, 'min_samples_split': 10, ...",0.731919,0.73157,0.732952,0.732147,0.000586,8
8,239.661673,2.22466,10.85423,1.235204,25,10,150,"{'max_features': 25, 'min_samples_split': 10, ...",0.732034,0.731642,0.733383,0.732353,0.000746,7
9,240.061188,1.417414,8.108287,1.012318,40,2,100,"{'max_features': 40, 'min_samples_split': 2, '...",0.723789,0.720461,0.721972,0.722074,0.001361,21


In [75]:
# Utilizo los mejores parametros segun el GridSearch
rf_model = RandomForestClassifier(n_estimators = 150,
                                  max_depth = None,
                                  max_features = 40,
                                  min_samples_split = 10,
                                  min_samples_leaf = 1,
                                  criterion = "gini",
                                  verbose=True)
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  2.8min finished


RandomForestClassifier(max_features=40, min_samples_split=10, n_estimators=200,
                       verbose=True)

In [76]:
rf_model.score(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   12.8s finished


0.8981676899462778

In [77]:
# Calculo el F1 score para mi training set.
y_preds = rf_model.predict(X_test)
f1_score(y_test, y_preds, average='micro')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    3.1s finished


0.7385698662727116

In [23]:
# rf_model.feature_importances_

In [24]:
# test_values = pd.read_csv('../csv/test_values.csv', index_col = "building_id")
# test_values

In [25]:
# test_values_subset = test_values
# test_values_subset["geo_level_1_id"] = test_values_subset["geo_level_1_id"].astype("category")
# test_values_subset

In [26]:
# def encode_and_bind(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res) 

# features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
#                      "position", "ground_floor_type", "other_floor_type",\
#                      "plan_configuration", "legal_ownership_status"]
# for feature in features_to_encode:
#     test_values_subset = encode_and_bind(test_values_subset, feature)
# test_values_subset

In [27]:
# Genero las predicciones para los test.
# preds = rf_model.predict(test_values_subset)

In [28]:
# submission_format = pd.read_csv('../csv/submission_format.csv', index_col = "building_id")

In [29]:
# my_submission = pd.DataFrame(data=preds,
#                              columns=submission_format.columns,
#                              index=submission_format.index)

In [30]:
# my_submission.head()

In [31]:
# my_submission.to_csv('../csv/predictions/jf-model-4-submission-all-params.csv')

In [32]:
# !head ../csv/predictions/jf-model-4-submission-all-params.csv