In [19]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier

In [2]:
train_values = pd.read_csv('train_values.csv', dtype= {'building_id': np.int32,\
                                              'geo_level_1_id': np.int8,\
                                              'geo_level_2_id': np.int16,\
                                              'geo_level_3_id': np.int16,\
                                              'count_floors_pre_eq': np.int16,\
                                              'age': np.int16,\
                                              'area_percentage': np.int8,\
                                              'height_percentage': np.int8,\
                                              'land_surface_condition': 'category',\
                                              'foundation_type': 'category',\
                                              'roof_type': 'category',\
                                              'ground_floor_type':'category',\
                                              'other_floor_type': 'category',\
                                              'position': 'category',\
                                              'plan_configuration':'category',\
                                              'has_superstructure_adobe_mud':'boolean',\
                                              'has_superstructure_mud_mortar_stone':'boolean',\
                                              'has_superstructure_stone_flag':'boolean',\
                                              'has_superstructure_cement_mortar_stone':'boolean',\
                                              'has_superstructure_mud_mortar_brick':'boolean',\
                                              'has_superstructure_cement_mortar_brick':'boolean',\
                                              'has_superstructure_timber':'boolean',\
                                              'has_superstructure_bamboo':'boolean',\
                                              'has_superstructure_rc_non_engineered':'boolean',\
                                              'has_superstructure_rc_engineered':'boolean',\
                                              'has_superstructure_other':'boolean',\
                                              'legal_ownership_status':'category',\
                                              'count_families': np.int16,\
                                              'has_secondary_use':'boolean',\
                                              'has_secondary_use_agriculture':'boolean',\
                                              'has_secondary_use_hotel':'boolean',\
                                              'has_secondary_use_rental':'boolean',\
                                              'has_secondary_use_institution':'boolean',\
                                              'has_secondary_use_school':'boolean',\
                                              'has_secondary_use_industry':'boolean',\
                                              'has_secondary_use_health_post':'boolean',\
                                              'has_secondary_use_gov_office':'boolean',\
                                              'has_secondary_use_use_police':'boolean',\
                                              'has_secondary_use_other':'boolean'
                                              })
train_labels = pd.read_csv("train_labels.csv")
test_values = pd.read_csv("test_values.csv")

In [3]:
df = train_values.set_index("building_id").loc[:,["count_floors_pre_eq","count_families"]]

In [4]:
#normalize for nn
x = df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled,columns = df.columns,index = df.index)


In [5]:
df.reset_index()

Unnamed: 0,building_id,count_floors_pre_eq,count_families
0,802906,0.125,0.111111
1,28830,0.125,0.111111
2,94947,0.125,0.111111
3,590882,0.125,0.111111
4,201944,0.250,0.111111
...,...,...,...
260596,688636,0.000,0.111111
260597,669485,0.125,0.111111
260598,602512,0.250,0.111111
260599,151409,0.125,0.111111


In [6]:
df2 = train_values.merge(df, on = "building_id")

In [7]:
df2

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq_x,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,count_floors_pre_eq_y,count_families_y
0,802906,6,487,12198,2,30,6,5,t,r,...,False,False,False,False,False,False,False,False,0.125,0.111111
1,28830,8,900,2812,2,10,8,7,o,r,...,False,False,False,False,False,False,False,False,0.125,0.111111
2,94947,21,363,8973,2,10,5,5,t,r,...,False,False,False,False,False,False,False,False,0.125,0.111111
3,590882,22,418,10694,2,10,6,5,t,r,...,False,False,False,False,False,False,False,False,0.125,0.111111
4,201944,11,131,1488,3,30,8,9,t,r,...,False,False,False,False,False,False,False,False,0.250,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,n,r,...,False,False,False,False,False,False,False,False,0.000,0.111111
260597,669485,17,715,2060,2,0,6,5,t,r,...,False,False,False,False,False,False,False,False,0.125,0.111111
260598,602512,17,51,8163,3,55,6,7,t,r,...,False,False,False,False,False,False,False,False,0.250,0.111111
260599,151409,26,39,1851,2,10,14,6,t,r,...,False,False,False,False,False,False,False,False,0.125,0.111111


In [8]:
df2["volume_percentage"] = df2["height_percentage"] * df2["area_percentage"]

In [9]:
columns_importances =["geo_level_1_id",
                      "geo_level_2_id",
                      "has_superstructure_mud_mortar_stone",
                      "geo_level_3_id",
                      "foundation_type_r",
                      "has_superstructure_cement_mortar_brick",
                      "other_floor_type_q",
                      "age",
                      "foundation_type_i",
                      "has_superstructure_timber",
                      "roof_type_n",
                      'roof_type_x',
                      'has_superstructure_mud_mortar_brick',
                      'position_s',
                      'ground_floor_type_f',
                      'other_floor_type_x',
                      'has_superstructure_adobe_mud',
                      'ground_floor_type_v',
                      'roof_type_q',
                      'has_secondary_use',
                      'foundation_type_u',
                      'land_surface_condition_t',
                      'count_families',
                      'building_id']

In [10]:
df2 = pd.get_dummies(df2)
df3 = df2.loc[:,columns_importances]

In [11]:
x_pre = df3.set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']

x = x_pre
y = y_pre
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 23)

In [12]:
hiper = {"hidden_layer_sizes" : [10,20,30,40,50,100,200], 
         "activation": ["tanh", "relu"],
        "solver": ["sgd","adam"],
        "learning_rate":['constant', "invscaling", "adaptive"],
         "max_iter":[500]}
nn_mlp = MLPClassifier()
gd_sr = GridSearchCV(nn_mlp,param_grid = hiper, scoring = 'f1_micro',cv=5,n_jobs=-1)

In [13]:
gd_sr.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=MLPClassifier(), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'hidden_layer_sizes': [10, 20, 30, 40, 50, 100, 200],
                         'learning_rate': ['constant', 'invscaling',
                                           'adaptive'],
                         'max_iter': [500], 'solver': ['sgd', 'adam']},
             scoring='f1_micro')

In [14]:
print(gd_sr.best_score_)
bst_parms = gd_sr.best_params_
print(bst_parms)

0.6015397160399079
{'activation': 'relu', 'hidden_layer_sizes': 200, 'learning_rate': 'constant', 'max_iter': 500, 'solver': 'adam'}


In [15]:
mlp_class = MLPClassifier(hidden_layer_sizes = bst_parms["hidden_layer_sizes"],
                         activation = bst_parms["activation"],
                         solver = bst_parms["solver"],
                         learning_rate = bst_parms["learning_rate"])
model0 = mlp_class.fit(x_train, y_train)

model0.predict(x_test)
print("Training Score: {}".format(mlp_class.score(x_train, y_train)))
print("Test Score: {}".format(mlp_class.score(x_test, y_test)))

Training Score: 0.6074635456638526
Test Score: 0.6064733984382494


In [16]:
#hago una primera prediccion y se la agrego como columna nueva para predecir de nuevo
df3["first_prediction"] = model0.predict(df3.set_index("building_id"))

In [17]:
x_pre = df3.set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']

x = x_pre
y = y_pre
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [24]:
rf = RandomForestClassifier(n_estimators =  ,
                                  min_samples_split =  )
model1 = rf.fit(x_train, y_train)

model1.predict(x_test)
print("Training Score: {}".format(rf.score(x_train, y_train)))
print("Test Score: {}".format(rf.score(x_test, y_test)))

Training Score: 0.8113344205679202
Test Score: 0.7235279445904722
