In [2]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier

In [3]:
# Column dtypes for train_values.csv, grouped by kind. Requesting narrow
# integer widths and category/boolean dtypes at parse time avoids a later
# conversion pass over the frame.
integer_dtypes = {
    'building_id': np.int32,
    'geo_level_1_id': np.int8,
    'geo_level_2_id': np.int16,
    'geo_level_3_id': np.int16,
    'count_floors_pre_eq': np.int16,
    'age': np.int16,
    'area_percentage': np.int8,
    'height_percentage': np.int8,
    'count_families': np.int16,
}

# Columns parsed as pandas 'category'.
category_columns = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Columns parsed with pandas' nullable 'boolean' dtype.
boolean_columns = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

train_dtypes = {
    **integer_dtypes,
    **dict.fromkeys(category_columns, 'category'),
    **dict.fromkeys(boolean_columns, 'boolean'),
}

train_values = pd.read_csv('train_values.csv', dtype=train_dtypes)
# Labels and test features are read with pandas' inferred dtypes.
train_labels = pd.read_csv("train_labels.csv")
test_values = pd.read_csv("test_values.csv")

In [5]:
# Pull out the two count columns, keyed by building_id, so they can be rescaled.
count_columns = ["count_floors_pre_eq", "count_families"]
df = train_values.set_index("building_id")[count_columns]

In [6]:
# Min-max scale the count columns for the neural network; the scaler is used
# with its default settings.
x = df.to_numpy()  # plain ndarray view of the two count columns
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
# Re-wrap the scaled array so it keeps the original column names and the
# building_id index.
df = pd.DataFrame(data=x_scaled, index=df.index, columns=df.columns)

In [7]:
# Join the scaled counts back onto the full feature table by building_id.
# Both frames carry count_floors_pre_eq / count_families, so the overlapping
# names get suffixed: "_x" is the raw value from train_values, "_y" is the
# scaled value from df.
df2 = pd.merge(
    train_values,
    df,
    how="inner",
    on="building_id",
    suffixes=("_x", "_y"),
)

In [12]:
# Inspect the column names of df2 (the merge result carries the _x/_y
# suffixed count columns).
df2.columns


Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq_x', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families_x', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other',
       'count_floors_

In [13]:
# Hand-picked feature subset used to train the models below, plus building_id,
# which is kept so it can serve as the index later. Names such as
# "foundation_type_r" are dummy columns and only exist after pd.get_dummies has
# been applied to the merged frame.
# NOTE(review): "count_families_x" is the left-hand (unscaled) side of the
# merge; the min-max scaled copy is "count_families_y" - confirm the unscaled
# column is the one intended here.
columns_importances = [
    "geo_level_1_id",
    "geo_level_2_id",
    "has_superstructure_mud_mortar_stone",
    "geo_level_3_id",
    "foundation_type_r",
    "has_superstructure_cement_mortar_brick",
    "other_floor_type_q",
    "age",
    "foundation_type_i",
    "has_superstructure_timber",
    "roof_type_n",
    "roof_type_x",
    "has_superstructure_mud_mortar_brick",
    "position_s",
    "ground_floor_type_f",
    "other_floor_type_x",
    "has_superstructure_adobe_mud",
    "ground_floor_type_v",
    "roof_type_q",
    "has_secondary_use",
    "foundation_type_u",
    "land_surface_condition_t",
    "count_families_x",
    "building_id",
]

In [14]:
# One-hot encode df2 with pandas' default column selection, producing dummy
# columns named "<column>_<level>".
df2 = pd.get_dummies(data=df2)
# Keep only the selected features (and building_id); .loc raises KeyError if
# any requested column is missing from the encoded frame.
df3 = df2.loc[:, list(columns_importances)]

In [15]:
# Features indexed by building_id; target is the damage_grade column.
# NOTE(review): x and y are paired purely by row position - this assumes
# train_labels.csv lists buildings in the same order as train_values.csv.
x_pre = df3.set_index('building_id')
y_pre = train_labels['damage_grade']

x, y = x_pre, y_pre
# Hold out 20% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=23,
)

In [18]:
# Search space for the MLP. Every key is a list of candidate values that
# RandomizedSearchCV samples from.
# NOTE(review): per the scikit-learn docs the "learning_rate" schedule is only
# used by the "sgd" solver, so those candidates are redundant when "adam" is
# drawn - confirm whether that is acceptable.
hiper = {
    "hidden_layer_sizes": [10, 20, 30, 40, 50, 100, 200],
    "activation": ["tanh", "relu"],
    "solver": ["sgd", "adam"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "max_iter": [500],
}
# Seed both the network initialisation and the candidate sampling so that a
# Restart & Run All reproduces the same search and the same best parameters.
nn_mlp = MLPClassifier(random_state=23)
rn_sr = RandomizedSearchCV(
    nn_mlp,
    param_distributions=hiper,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    random_state=23,
)

In [None]:
rn_sr.fit(x_train, y_train)

In [None]:
# Report the best cross-validated score and the winning parameter set from the
# randomized search fitted above (the search object is named rn_sr).
print(rn_sr.best_score_)
bst_parms = rn_sr.best_params_
print(bst_parms)

In [None]:
# Refit an MLP with the complete winning parameter set. Unpacking bst_parms
# forwards every searched parameter - including max_iter - so the final model
# is trained under the same iteration budget that was scored during the search.
mlp_class = MLPClassifier(**bst_parms)
model0 = mlp_class.fit(x_train, y_train)

# Mean accuracy on the training rows and on the held-out rows.
print("Training Score: {}".format(mlp_class.score(x_train, y_train)))
print("Test Score: {}".format(mlp_class.score(x_test, y_test)))

In [None]:
# Add the MLP's prediction as an extra feature for the second-stage model.
# Any existing "first_prediction" column is dropped before predicting so the
# cell can be re-run: model0 must always see exactly the columns it was fitted
# on.
# NOTE(review): model0 was fitted on the x_train rows only, so the predictions
# stored for those rows are in-sample.
stage1_features = df3.set_index("building_id").drop(
    columns="first_prediction", errors="ignore"
)
df3["first_prediction"] = model0.predict(stage1_features)

In [None]:
# Second-stage design matrix: every column currently in df3 (including
# first_prediction once it has been added), indexed by building_id.
x_pre = df3.set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']

x = x_pre
y = y_pre
# Use the same seed as the first split (23) so the held-out rows here are
# exactly the rows model0 never trained on. With a different seed, rows model0
# was fitted on land in this test set carrying in-sample first_prediction
# values, which leaks label information and inflates the test score.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 23)

In [None]:
# Second-stage model: a random forest fitted on the current x_train/y_train.
# random_state pins the bootstrap and feature sampling so the reported scores
# are reproducible; n_jobs=-1 builds the trees on all cores, matching the
# search cell.
rf = RandomForestClassifier(n_estimators=500,
                            min_samples_split=20,
                            criterion="gini",
                            n_jobs=-1,
                            random_state=42)
model1 = rf.fit(x_train, y_train)

# Mean accuracy on the training rows and on the held-out rows.
print("Training Score: {}".format(rf.score(x_train, y_train)))
print("Test Score: {}".format(rf.score(x_test, y_test)))