In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

In [4]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [5]:
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import BaggingClassifier

In [6]:
from xgboost import XGBClassifier

In [7]:
# Read the training labels CSV and preview its first five rows.
labels = pd.read_csv(filepath_or_buffer='../../csv/train_labels.csv')
labels.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [8]:
# Read the training feature CSV; display it transposed so that each feature
# is shown as a row.
values = pd.read_csv(filepath_or_buffer='../../csv/train_values.csv')
values.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260591,260592,260593,260594,260595,260596,260597,260598,260599,260600
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500,...,560805,207683,226421,159555,827012,688636,669485,602512,151409,747594
geo_level_1_id,6,8,21,22,11,8,9,20,0,26,...,20,10,8,27,8,25,17,17,26,21
geo_level_2_id,487,900,363,418,131,558,475,323,757,886,...,368,1382,767,181,268,1335,715,51,39,9
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994,...,5980,1903,8613,1537,4718,1621,2060,8163,1851,9101
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1,...,1,2,2,6,2,1,2,3,2,3
age,30,10,10,10,30,10,25,0,15,0,...,25,25,5,0,20,55,0,55,10,10
area_percentage,6,8,5,6,8,9,3,8,8,13,...,5,5,13,13,8,6,6,6,14,7
height_percentage,5,7,5,5,9,5,4,6,6,4,...,3,5,5,12,5,3,5,7,6,6
land_surface_condition,t,o,t,t,t,t,n,t,t,t,...,n,t,t,t,t,n,t,t,t,n
foundation_type,r,r,r,r,r,r,r,w,r,i,...,r,r,r,r,r,r,r,r,r,r


In [9]:
# Columns to store with the pandas "category" dtype.
to_be_categorized = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "position",
    "ground_floor_type",
    "other_floor_type",
    "plan_configuration",
    "legal_ownership_status",
]
# Cast all listed columns in a single astype call; other columns keep their dtype.
values = values.astype({column: "category" for column in to_be_categorized})
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  int64   
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [10]:
# Shrink every signed-integer column of `values` to the narrowest signed
# integer dtype (int8 at the smallest) that can represent all of its values.
#
# pd.to_numeric(..., downcast="integer") only narrows when the cast
# round-trips without changing any value, so:
#   * a column whose maximum is >= 2**31 stays int64 instead of being
#     squeezed into int16 and silently overflowing;
#   * negative minimums are taken into account, not just the maximum;
#   * an empty column does not raise.
datatypes = dict(values.dtypes)
signed_int_dtypes = ("int64", "int32", "int16", "int8")
for column, dtype in datatypes.items():
    # Categorical, float, object, ... columns are left untouched.
    if dtype not in signed_int_dtypes:
        continue
    values[column] = pd.to_numeric(values[column], downcast="integer")

In [11]:
# Narrow both label columns in one astype call, then print the frame summary.
labels = labels.astype({"building_id": np.int32, "damage_grade": np.int8})
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


In [12]:
# Add two int8 indicator columns derived from `age`:
#   age_is_leq_than_100      -> 1 when age <= 100
#   age_is_greater_than_200  -> 1 when age > 200
# Rows with 100 < age <= 200 get 0 in both flags; no separate indicator is
# created for that middle band.
values = values.assign(
    age_is_leq_than_100=lambda frame: frame['age'].le(100).astype(np.int8),
    age_is_greater_than_200=lambda frame: frame['age'].gt(200).astype(np.int8),
)
# Preview the rows with age >= 100 to eyeball the new flags.
values.loc[values['age'].ge(100)]

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,age_is_leq_than_100,age_is_greater_than_200
216,612661,11,883,2822,5,100,12,11,t,r,...,0,0,0,0,0,0,0,0,1,0
324,737045,21,477,4348,3,190,6,7,t,r,...,0,0,0,0,0,0,0,0,0,0
386,435557,14,1120,3789,3,995,12,5,t,r,...,0,0,0,0,0,0,0,0,0,1
419,370360,6,1253,5667,4,105,11,9,t,r,...,0,0,0,0,0,0,0,0,0,0
472,393373,10,310,3637,2,100,4,4,t,r,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260093,34321,17,1393,5939,2,995,8,4,t,r,...,0,0,0,0,0,0,0,0,0,1
260469,1038559,3,1387,9059,5,105,9,11,t,r,...,0,0,0,0,0,0,0,0,0,0
260541,798100,13,1365,11849,1,100,6,3,t,r,...,0,0,0,0,0,0,0,0,1,0
260542,156434,8,696,7863,2,995,8,8,t,r,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Attach the labels to the features on building_id (merge's default inner
# join), drop the identifier column, and store geo_level_1_id as a category.
important_values = (
    values
    .merge(labels, on="building_id")
    .drop(columns=["building_id"])
    .astype({"geo_level_1_id": "category"})
)
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,age_is_leq_than_100,age_is_greater_than_200,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,1,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,1,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,1,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,1,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,1,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,1,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,1,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,1,0,2


In [14]:
# Independent working frame holding only the three geo-level ids and the target.
geo_and_target_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'damage_grade']
df = important_values[geo_and_target_columns].copy()
df

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,damage_grade
0,6,487,12198,3
1,8,900,2812,2
2,21,363,8973,3
3,22,418,10694,2
4,11,131,1488,3
...,...,...,...,...
260596,25,1335,1621,2
260597,17,715,2060,3
260598,17,51,8163,3
260599,26,39,1851,2


In [17]:
# Broadcast the mean damage_grade of each geo_level_2_id group back onto
# every row of that group.
# NOTE(review): the group mean includes each row's own damage_grade; if this
# column is later fed to a model as a feature it leaks the target — confirm
# the intended use.
geo_level_2_groups = df.groupby('geo_level_2_id')
df['damage_grade_mean_geo_level_2_id'] = geo_level_2_groups['damage_grade'].transform('mean')
df

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,damage_grade,damage_grade_mean_geo_level_2_id
0,6,487,12198,3,2.740741
1,8,900,2812,2,2.487437
2,21,363,8973,3,2.518750
3,22,418,10694,2,2.107317
4,11,131,1488,3,2.348748
...,...,...,...,...,...
260596,25,1335,1621,2,1.931034
260597,17,715,2060,3,2.931034
260598,17,51,8163,3,2.894895
260599,26,39,1851,2,1.532194


In [18]:
# Broadcast the mean damage_grade of each geo_level_3_id group back onto
# every row of that group.
# NOTE(review): the group mean includes each row's own damage_grade; if this
# column is later fed to a model as a feature it leaks the target — confirm
# the intended use.
geo_level_3_groups = df.groupby('geo_level_3_id')
df['damage_grade_mean_geo_level_3_id'] = geo_level_3_groups['damage_grade'].transform('mean')
df

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,damage_grade,damage_grade_mean_geo_level_2_id,damage_grade_mean_geo_level_3_id
0,6,487,12198,3,2.740741,2.837838
1,8,900,2812,2,2.487437,2.062500
2,21,363,8973,3,2.518750,2.580882
3,22,418,10694,2,2.107317,2.096774
4,11,131,1488,3,2.348748,2.368852
...,...,...,...,...,...,...
260596,25,1335,1621,2,1.931034,2.071429
260597,17,715,2060,3,2.931034,2.979592
260598,17,51,8163,3,2.894895,2.818182
260599,26,39,1851,2,1.532194,1.792373


In [25]:
# Show the rows ordered from the lowest to the highest geo_level_2 mean damage grade.
df.sort_values(by='damage_grade_mean_geo_level_2_id', ascending=True)

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,damage_grade,damage_grade_mean_geo_level_2_id,damage_grade_mean_geo_level_3_id
115793,19,688,2449,1,1.000000,1.000000
16837,23,1263,4542,1,1.000000,1.000000
88421,19,1043,3079,1,1.000000,1.000000
247368,19,115,10136,1,1.000000,1.000000
124666,1,14,5,1,1.218182,1.142857
...,...,...,...,...,...,...
212372,14,552,721,3,3.000000,3.000000
124195,2,1213,8714,3,3.000000,3.000000
98139,2,509,3467,3,3.000000,3.000000
243505,2,509,4910,3,3.000000,3.000000
