# Importo Pandas y Numpy

In [1]:
import numpy as np
import pandas as pd

# Gráficos

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocesado y modelado

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [4]:
labels = pd.read_csv('../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [5]:
values = pd.read_csv('../csv/train_values.csv')
values.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500
geo_level_1_id,6,8,21,22,11,8,9,20,0,26
geo_level_2_id,487,900,363,418,131,558,475,323,757,886
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1
age,30,10,10,10,30,10,25,0,15,0
area_percentage,6,8,5,6,8,9,3,8,8,13
height_percentage,5,7,5,5,9,5,4,6,6,4
land_surface_condition,t,o,t,t,t,t,n,t,t,t
foundation_type,r,r,r,r,r,r,r,w,r,i


Primero observamos que los tipos de las columnas corresponden a lo que deberían ser.

# Datos Nulos

Vemos que los dos dataframes utilizados no poseen datos nulos.

In [6]:
values.isnull().values.any()

False

In [7]:
labels.isnull().values.any()

False

# Preprocesamiento de Datos

## Id's de edificios únicos.

In [8]:
values.dtypes 

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

Todos los building_id son unicos

In [9]:
values["building_id"].count() == values["building_id"].drop_duplicates().count()

True

## Memoria

In [10]:
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

Transformamos los "object" type a "category" type para no ocupar tanta memoria.

In [11]:
to_be_categorized = ["land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for row in to_be_categorized:
    values[row] = values[row].astype("category")
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  int64   
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

Hacemos que cada dato numérico se castee al tipo de menor capacidad que pueda contenerlo.

In [12]:
datatypes = dict(values.dtypes)
for row in values.columns:
    if datatypes[row] != "int64" and datatypes[row] != "int32" and \
       datatypes[row] != "int16" and datatypes[row] != "int8":
        continue
    if values[row].nlargest(1).item() > 32767 and values[row].nlargest(1).item() < 2**31:
        values[row] = values[row].astype(np.int32)
    elif values[row].nlargest(1).item() > 127:
        values[row] = values[row].astype(np.int16)
    else:
        values[row] = values[row].astype(np.int8)
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  int8    
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

## Repetimos lo mismo para el primer DataFrame

In [13]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [14]:
labels["building_id"] = labels["building_id"].astype(np.int32)
labels["damage_grade"] = labels["damage_grade"].astype(np.int8)
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


# Primer Modelo

In [15]:
important_values = values[["building_id", "count_floors_pre_eq", "age",\
                           "area_percentage", "height_percentage", "has_superstructure_adobe_mud",\
                           "has_superstructure_mud_mortar_stone","has_superstructure_stone_flag",\
                           "has_superstructure_cement_mortar_stone","has_superstructure_mud_mortar_brick",\
                           "has_superstructure_cement_mortar_brick","has_superstructure_timber",\
                           "has_superstructure_bamboo","has_superstructure_rc_non_engineered",\
                           "has_superstructure_rc_engineered","has_superstructure_other"]]\
                .merge(labels, on="building_id")
important_values.drop(columns=["building_id"], inplace = True)
important_values

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
0,2,30,6,5,1,1,0,0,0,0,0,0,0,0,0,3
1,2,10,8,7,0,1,0,0,0,0,0,0,0,0,0,2
2,2,10,5,5,0,1,0,0,0,0,0,0,0,0,0,3
3,2,10,6,5,0,1,0,0,0,0,1,1,0,0,0,2
4,3,30,8,9,1,0,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,1,55,6,3,0,1,0,0,0,0,0,0,0,0,0,2
260597,2,0,6,5,0,1,0,0,0,0,0,0,0,0,0,3
260598,3,55,6,7,0,1,0,0,0,0,0,0,0,0,0,3
260599,2,10,14,6,0,0,0,0,0,1,0,0,0,0,0,2


In [16]:
X_train, X_test, y_train, y_test = train_test_split(important_values.drop(columns = 'damage_grade'),
                                                    important_values['damage_grade'], random_state = 123)

In [17]:
modelo = RandomForestClassifier(n_estimators = 150, max_depth = 15, criterion = "gini", verbose=True)
modelo.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   46.5s finished


RandomForestClassifier(max_depth=15, n_estimators=150, verbose=True)

In [18]:
modelo.score(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    6.2s finished


0.6213609618828345

In [19]:
preds = modelo.predict(X_test)
preds

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    2.0s finished


array([2, 2, 1, ..., 2, 3, 2], dtype=int8)

In [20]:
print(
    classification_report(
        y_true = y_test,
        y_pred = modelo.predict(X_test)
    )
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           1       0.60      0.31      0.41      6325
           2       0.60      0.90      0.72     36951
           3       0.57      0.16      0.25     21875

    accuracy                           0.59     65151
   macro avg       0.59      0.46      0.46     65151
weighted avg       0.59      0.59      0.53     65151



[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    2.7s finished


# Segundo Modelo

In [21]:
param_grid = {
                 'n_estimators': [150],
                 'max_depth'   : [None, 3, 10, 20],
                 'criterion'   : ['gini', 'entropy']
             }

grid = GridSearchCV(
                    estimator  = RandomForestClassifier(random_state = 123),
                    param_grid = param_grid,
                    scoring    = 'accuracy',
                    n_jobs     = multiprocessing.cpu_count() - 1,
                    cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
                    refit      = True,
                    verbose    = 1,
                    return_train_score = True
                   )

grid.fit(X = X_train, y = y_train)

resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param*|mean_t|std_t)')\
          .drop(columns = 'params')\
          .sort_values('mean_test_score', ascending = False)\
          .head(4)

Fitting 15 folds for each of 8 candidates, totalling 120 fits


Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
2,gini,10,150,0.5904,0.001956,0.595806,0.000563
3,gini,20,150,0.589958,0.002119,0.663071,0.000723
6,entropy,10,150,0.589557,0.001645,0.594589,0.0006
7,entropy,20,150,0.589119,0.002107,0.665376,0.000604


In [22]:
modelo_final = grid.best_estimator_
modelo_final

RandomForestClassifier(max_depth=10, n_estimators=150, random_state=123)

In [23]:
print(classification_report(y_true = y_test,y_pred = modelo_final.predict(X_test)))

              precision    recall  f1-score   support

           1       0.63      0.27      0.38      6325
           2       0.59      0.94      0.72     36951
           3       0.58      0.09      0.16     21875

    accuracy                           0.59     65151
   macro avg       0.60      0.43      0.42     65151
weighted avg       0.59      0.59      0.50     65151

