In [None]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_blobs
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import multiprocessing
from keras.models import Sequential
from keras.layers.core import Dense
import sklearn as sk
from sklearn.model_selection import StratifiedKFold
import sklearn.neural_network
import datetime

# PREPARO CSV


In [None]:
# Read the training targets (building_id / damage_grade) and preview the first rows.
labels = pd.read_csv(filepath_or_buffer="train_labels.csv")
labels.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [None]:
# Read the per-building feature table and preview the first rows.
values = pd.read_csv(filepath_or_buffer="train_values.csv")
values.head(n=5)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# True if at least one cell of the feature table is missing.
values.isna().to_numpy().any()

False

In [None]:
# Discard every row that contains at least one missing value.
values = values.dropna(axis="index", how="any")

In [None]:
# True if at least one cell of the label table is missing.
labels.isna().to_numpy().any()

False

In [None]:
# Show each column's dtype and non-null count for the feature table.
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [None]:
# Columns holding discrete codes rather than quantities; they are stored
# with the pandas "category" dtype.
to_be_categorized = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "position",
    "ground_floor_type",
    "other_floor_type",
    "plan_configuration",
    "legal_ownership_status",
    "geo_level_1_id",
]
# A single dict-based astype converts all listed columns at once and leaves
# every other column untouched.
values = values.astype({column: "category" for column in to_be_categorized})
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  category
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [None]:
# Shrink every signed-integer column to the narrowest integer dtype that can
# represent all of its values, to reduce memory usage.
#
# The previous hand-written thresholds only looked at the column maximum:
# a maximum >= 2**31 fell through to the int16 branch (silent wrap-around),
# and negative minimums were never checked. pd.to_numeric(downcast="integer")
# considers the full value range and leaves a column at its current width
# when no smaller dtype fits.
datatypes = dict(values.dtypes)  # dtypes as they were before downcasting
integer_columns = values.select_dtypes(
    include=["int8", "int16", "int32", "int64"]
).columns
for row in integer_columns:
    values[row] = pd.to_numeric(values[row], downcast="integer")
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  category
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [None]:
# Show each column's dtype and non-null count for the label table.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [None]:
# Narrow both label columns in one pass: building_id to 32-bit and
# damage_grade to 8-bit integers.
labels = labels.astype({"building_id": np.int32, "damage_grade": np.int8})
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


In [None]:
# Attach the damage_grade label to each building's features via building_id,
# then drop the identifier so it is not part of the modelling table.
important_values = (
    values
    .merge(labels, on="building_id")
    .drop(columns=["building_id"])
)

In [None]:
# Show each column's dtype and non-null count for the merged modelling table.
important_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  category
 1   geo_level_2_id                          260601 non-null  int16   
 2   geo_level_3_id                          260601 non-null  int16   
 3   count_floors_pre_eq                     260601 non-null  int8    
 4   age                                     260601 non-null  int16   
 5   area_percentage                         260601 non-null  int8    
 6   height_percentage                       260601 non-null  int8    
 7   land_surface_condition                  260601 non-null  category
 8   foundation_type                         260601 non-null  category
 9   roof_type                               260601 non-null  category
 10  ground_floor_type               

In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    important_values.drop(columns='damage_grade'),
    important_values['damage_grade'],
    test_size=0.2,
    random_state=123,
)

cat_cols = X_train.select_dtypes(include=['category', "bool"]).columns.to_list()
numeric_cols = X_train.select_dtypes(include=['int16', 'int8', "float64"]).columns.to_list()

# One-hot encode the categorical columns and standardize the numeric ones.
# Previously `numeric_cols` was computed but never used, so numeric features
# were passed through unscaled; MLP training is sensitive to feature scale.
# Any column matched by neither list is still passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('scale', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

# Fit the preprocessing on the training split only, then apply it to the
# held-out split. NOTE: it is fitted before cross-validation, so each CV
# fold's validation rows contribute to the encoder/scaler statistics.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Candidate layer widths. Only elements [1:] of each list are used as
# hidden-layer sizes below; the leading entry is ignored.
nn = [2, 10, 20, 10]  # number of neurons per layer.
nr = [3, 5, 20, 15, 10]
nd = [10, 4, 16, 20, 6]

# Seed the network so weight initialization and batch shuffling are repeatable.
modelo_1 = MLPClassifier(random_state=1001)

params = {
    'activation': ['logistic'],
    'alpha': [0.0001],
    'learning_rate': ['adaptive'],
    'hidden_layer_sizes': [tuple(nn[1:]), tuple(nr[1:]), tuple(nd[1:])],
    'learning_rate_init': [0.001],
    'n_iter_no_change': [100],
    'batch_size': [400, 300, 350],
}

folds = 3
param_comb = 5  # number of parameter settings sampled from `params`

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
# Pass the splitter itself rather than a one-shot `skf.split(...)` generator,
# so the search object owns its CV strategy and can be re-fitted or cloned.
random_search = RandomizedSearchCV(
    modelo_1,
    param_distributions=params,
    n_iter=param_comb,
    n_jobs=-1,
    cv=skf,
    verbose=3,
    random_state=1001,
)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 26.5min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f7c4087e2d0>,
                   error_score=nan,
                   estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100,),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_fun=15000, max_iter=200,
                                           momentum=0.9, n_iter_no_chan...
                   iid='deprecated', n_iter=5, n_jobs=-1,
                   param_distributions={'activation': ['logistic'],
                                        'alpha': [0.0001],
                                        'batch_size': [400, 

In [None]:
# Summarize the randomized search.
#
# The search was run with the default scorer, which for a classifier is mean
# accuracy. The former "normalized gini" line (best_score_ * 2 - 1) is only
# meaningful for an AUC score, so the accuracy is now reported as-is.
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best mean cross-validated accuracy for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)

# All per-candidate results as a table; this replaces printing the raw
# cv_results_ dict, which contains exactly the same data.
print('\n All results:')
results = pd.DataFrame(random_search.cv_results_)
results


 All results:
{'mean_fit_time': array([223.42220577, 199.86959187, 190.83474986, 211.94076641,
       196.30728666]), 'std_fit_time': array([ 1.79231589,  0.87244965,  0.43937409,  2.41412178, 36.8055556 ]), 'mean_score_time': array([0.23116652, 0.20751111, 0.19440675, 0.23730493, 0.17677172]), 'std_score_time': array([0.00928642, 0.00222269, 0.00104933, 0.00689308, 0.05234747]), 'param_n_iter_no_change': masked_array(data=[100, 100, 100, 100, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate_init': masked_array(data=[0.001, 0.001, 0.001, 0.001, 0.001],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=['adaptive', 'adaptive', 'adaptive', 'adaptive',
                   'adaptive'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_hidden_layer_sizes

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_iter_no_change,param_learning_rate_init,param_learning_rate,param_hidden_layer_sizes,param_batch_size,param_alpha,param_activation,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,223.422206,1.792316,0.231167,0.009286,100,0.001,adaptive,"(5, 20, 15, 10)",350,0.0001,logistic,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.646128,0.646094,0.660815,0.651012,0.006932,4
1,199.869592,0.87245,0.207511,0.002223,100,0.001,adaptive,"(4, 16, 20, 6)",400,0.0001,logistic,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.655481,0.652325,0.657692,0.655166,0.002203,2
2,190.83475,0.439374,0.194407,0.001049,100,0.001,adaptive,"(10, 20, 10)",350,0.0001,logistic,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.668,0.663923,0.607586,0.646503,0.027569,5
3,211.940766,2.414122,0.237305,0.006893,100,0.001,adaptive,"(5, 20, 15, 10)",400,0.0001,logistic,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.662978,0.669305,0.640712,0.657665,0.012263,1
4,196.307287,36.805556,0.176772,0.052347,100,0.001,adaptive,"(4, 16, 20, 6)",300,0.0001,logistic,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.654028,0.646151,0.659793,0.653324,0.005591,3
