In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [2]:
train_values = pd.read_csv('train_values.csv', dtype= {'building_id': np.int32,\
                                              'geo_level_1_id': np.int8,\
                                              'geo_level_2_id': np.int16,\
                                              'geo_level_3_id': np.int16,\
                                              'count_floors_pre_eq': np.int16,\
                                              'age': np.int16,\
                                              'area_percentage': np.int8,\
                                              'height_percentage': np.int8,\
                                              'land_surface_condition': 'category',\
                                              'foundation_type': 'category',\
                                              'roof_type': 'category',\
                                              'ground_floor_type':'category',\
                                              'other_floor_type': 'category',\
                                              'position': 'category',\
                                              'plan_configuration':'category',\
                                              'has_superstructure_adobe_mud':'boolean',\
                                              'has_superstructure_mud_mortar_stone':'boolean',\
                                              'has_superstructure_stone_flag':'boolean',\
                                              'has_superstructure_cement_mortar_stone':'boolean',\
                                              'has_superstructure_mud_mortar_brick':'boolean',\
                                              'has_superstructure_cement_mortar_brick':'boolean',\
                                              'has_superstructure_timber':'boolean',\
                                              'has_superstructure_bamboo':'boolean',\
                                              'has_superstructure_rc_non_engineered':'boolean',\
                                              'has_superstructure_rc_engineered':'boolean',\
                                              'has_superstructure_other':'boolean',\
                                              'legal_ownership_status':'category',\
                                              'count_families': np.int16,\
                                              'has_secondary_use':'boolean',\
                                              'has_secondary_use_agriculture':'boolean',\
                                              'has_secondary_use_hotel':'boolean',\
                                              'has_secondary_use_rental':'boolean',\
                                              'has_secondary_use_institution':'boolean',\
                                              'has_secondary_use_school':'boolean',\
                                              'has_secondary_use_industry':'boolean',\
                                              'has_secondary_use_health_post':'boolean',\
                                              'has_secondary_use_gov_office':'boolean',\
                                              'has_secondary_use_use_police':'boolean',\
                                              'has_secondary_use_other':'boolean'
                                              })
train_labels = pd.read_csv("train_labels.csv")
test_values = pd.read_csv("test_values.csv")

In [3]:
x_pre = pd.get_dummies(train_values).set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']

x = x_pre
y = y_pre
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
hiper = {"criterion":["entropy","gini"],"splitter":["best","random"],"min_samples_split":range(2,40,5)\
         ,"max_features":[int,"log2"]}
dt_clf = DecisionTreeClassifier()
gd_sr = GridSearchCV(dt_clf,param_grid=hiper,scoring='f1_micro',cv=5,n_jobs=-1)
gd_sr.fit(x_train, y_train)

In [None]:
gd_sr.best_score_

In [None]:
gd_sr.best_params_

In [None]:
hiper = {"criterion":["entropy","gini"],"splitter":["best","random"],"min_samples_split":range(37,70)\
         ,"max_features":[int,"auto","sqrt","log2"]}
dt_clf = DecisionTreeClassifier()
gd_sr = GridSearchCV(dt_clf,param_grid=hiper,scoring='f1_micro',cv=5,n_jobs=-1)

In [None]:
gd_sr.best_score_

In [None]:
gd_sr.best_params_

In [None]:
hiper = {"criterion":["entropy","gini"],"splitter":["best","random"],"min_samples_split":range(57,100)\
         ,"presort" : [True, False]}
dt_clf = DecisionTreeClassifier()
gd_sr = GridSearchCV(dt_clf,param_grid=hiper,scoring='f1_micro',cv=5,n_jobs=-1)
gd_sr.fit(x_train, y_train)

In [None]:
gd_sr.best_score_

In [None]:
gd_sr.best_params_

In [None]:
hiper = {"criterion":["entropy","gini"],"splitter":["best","random"],"min_samples_split":range(99,200)\
         ,"presort": [True, False]}
dt_clf = DecisionTreeClassifier()
gd_sr = GridSearchCV(dt_clf,param_grid=hiper,scoring='f1_micro',cv=5,n_jobs=-1)
gd_sr.fit(x_train, y_train)

In [None]:
gd_sr.best_score_

In [None]:
gd_sr.best_params_

In [None]:
dt = DecisionTreeClassifier(criterion = 'gini', min_samples_split= 127, splitter = 'best',presort = True)
model = dt.fit(x_train, y_train)

model.predict(x_test)
print("Training Score: {}".format(dt.score(x_train, y_train)))
print("Test Score: {}".format(dt.score(x_test, y_test)))

In [None]:
results = permutation_importance(model, x_test, y_test, n_repeats = 10, random_state = 42, n_jobs = 2)

In [None]:
for i in results.importances_mean.argsort()[::-1]:
     if results.importances_mean[i] - 2 * results.importances_std[i] > 0:
         print(f"{x_pre.columns[i]:<8}"
               f"{results.importances_mean[i]:.3f}"
               f" +/- {results.importances_std[i]:.3f}")

In [None]:
rf = RandomForestClassifier(min_samples_split= 127, n_estimators=500,criterion = "gini")
model = rf.fit(x_train, y_train)

model.predict(x_test)
print("Training Score: {}".format(rf.score(x_train, y_train)))
print("Test Score: {}".format(rf.score(x_test, y_test)))

In [None]:
adropear = ['count_floors_pre_eq','has_secondary_use_agriculture','has_secondary_use_hotel',
            'has_secondary_use_rental','has_secondary_use_institution',
           'has_secondary_use_school','has_secondary_use_industry','has_secondary_use_health_post',
            'has_secondary_use_gov_office','has_secondary_use_use_police','has_secondary_use_other',
            'has_superstructure_bamboo','area_percentage','height_percentage']

train_values['volume_percentage']=train_values['area_percentage']* train_values['height_percentage']

train_values.drop(columns = adropear, inplace = True)

In [6]:
columns_importances =["geo_level_1_id",
                      "geo_level_2_id",
                      "has_superstructure_mud_mortar_stone",
                      "geo_level_3_id",
                      "foundation_type_r",
                      "has_superstructure_cement_mortar_brick",
                      "other_floor_type_q",
                      "age",
                      "foundation_type_i",
                      "has_superstructure_timber",
                      "roof_type_n",
                      'roof_type_x',
                      'has_superstructure_mud_mortar_brick',
                      'position_s',
                      'ground_floor_type_f',
                      'other_floor_type_x',
                      'has_superstructure_adobe_mud',
                      'ground_floor_type_v',
                      'roof_type_q',
                      'has_secondary_use',
                      'foundation_type_u',
                      'land_surface_condition_t',
                      'count_families',
                      'building_id']

In [7]:
train_dummies = pd.get_dummies(train_values)
train_dummies.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n

In [8]:
train_importants = train_dummies.loc[:,columns_importances]

for column in train_importants.select_dtypes(include = "bool"):
    train_importants[column] = train_importants[column].astype(np.uint8)
train_importants

Unnamed: 0,geo_level_1_id,geo_level_2_id,has_superstructure_mud_mortar_stone,geo_level_3_id,foundation_type_r,has_superstructure_cement_mortar_brick,other_floor_type_q,age,foundation_type_i,has_superstructure_timber,...,ground_floor_type_f,other_floor_type_x,has_superstructure_adobe_mud,ground_floor_type_v,roof_type_q,has_secondary_use,foundation_type_u,land_surface_condition_t,count_families,building_id
0,6,487,1,12198,1,0,1,30,0,0,...,1,0,1,0,0,0,0,1,1,802906
1,8,900,1,2812,1,0,1,10,0,0,...,0,0,0,0,0,0,0,0,1,28830
2,21,363,1,8973,1,0,0,10,0,0,...,1,1,0,0,0,0,0,1,1,94947
3,22,418,1,10694,1,0,0,10,0,1,...,1,1,0,0,0,0,0,1,1,590882
4,11,131,0,1488,1,0,0,30,0,0,...,1,1,1,0,0,0,0,1,1,201944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1,1621,1,0,0,55,0,0,...,1,0,0,0,0,0,0,0,1,688636
260597,17,715,1,2060,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,1,669485
260598,17,51,1,8163,1,0,1,55,0,0,...,1,0,0,0,1,0,0,1,1,602512
260599,26,39,0,1851,1,1,0,10,0,0,...,0,0,0,1,0,0,0,1,1,151409


In [9]:
x_pre = train_importants.set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']
x = x_pre
y = y_pre
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
rf = RandomForestClassifier(min_samples_split= 20, n_estimators=500,criterion = "gini")
model = rf.fit(x_train, y_train)

model.predict(x_test)
print("Training Score: {}".format(rf.score(x_train, y_train)))
print("Test Score: {}".format(rf.score(x_test, y_test)))

In [None]:
test_values["volume_percentage"] = test_values["area_percentage"] * test_values["height_percentage"]
test_values_subset = pd.get_dummies(test_values.drop(columns = adropear))\
                        .loc[:,columns_importances]\
                        .set_index('building_id')

predictions = model.predict(test_values_subset)
test_values_subset['damage_grade'] = predictions
test_values_subset.loc[:,'damage_grade'].to_csv("submission_5.csv")

In [None]:
rf2 = RandomForestClassifier(min_samples_split= 20, n_estimators=250,criterion = "gini")
model2 = rf2.fit(x_train, y_train)

model2.predict(x_test)
print("Training Score: {}".format(rf2.score(x_train, y_train)))
print("Test Score: {}".format(rf2.score(x_test, y_test)))

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
hiper = {"min_samples_split": range(20,100,10), "n_estimators" : [100,250,500],
         "criterion" : ["gini","entropy"]}
rf_clf = RandomForestClassifier()
rn_sr = RandomizedSearchCV(rf_clf,param_distributions = hiper,scoring='f1_micro',cv=5,n_jobs=-1)

In [None]:
search = rn_sr.fit(x_train, y_train)

In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
rf3 = RandomForestClassifier(min_samples_split= 20, n_estimators=100,criterion = "gini")
model3 = rf3.fit(x_train, y_train)

model3.predict(x_test)
print("Training Score: {}".format(rf3.score(x_train, y_train)))
print("Test Score: {}".format(rf3.score(x_test, y_test)))

In [None]:
hiper = {"min_samples_split": range(1,20), "n_estimators" : range(10,100,10),
         "criterion" : ["gini","entropy"]}
rf_clf = RandomForestClassifier()
rn_sr = RandomizedSearchCV(rf_clf,param_distributions = hiper,scoring='f1_micro',cv=5,n_jobs=-1)

In [None]:
search = rn_sr.fit(x_train, y_train)

In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
rf4 = RandomForestClassifier(min_samples_split= search.best_params_["min_samples_split"],
                             n_estimators=search.best_params_["n_estimators"],
                             criterion = search.best_params_["criterion"])
model4 = rf4.fit(x_train, y_train)

model4.predict(x_test)
print("Training Score: {}".format(rf4.score(x_train, y_train)))
print("Test Score: {}".format(rf4.score(x_test, y_test)))

In [None]:
hiper = {"min_samples_split": range(1,20), "n_estimators" : range(10,100,10),
         "criterion" : ["gini","entropy"]}
rf_clf = RandomForestClassifier()
gd_sr = GridSearchCV(rf_clf,param_grid=hiper,scoring='f1_micro',cv=5,n_jobs=-1)


In [None]:
search = gd_sr.fit(x_train, y_train)

In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
rf5 = RandomForestClassifier(min_samples_split= search.best_params_["min_samples_split"],
                             n_estimators=search.best_params_["n_estimators"],
                             criterion = search.best_params_["criterion"])
model5 = rf5.fit(x_train, y_train)

model5.predict(x_test)
print("Training Score: {}".format(rf5.score(x_train, y_train)))
print("Test Score: {}".format(rf5.score(x_test, y_test)))

In [None]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints='()', n_estimators=2000, n_jobs=4, nthread=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8, tree_method='hist', validate_parameters=1,
              verbosity=None)

model = clf.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=50, early_stopping_rounds=100)

In [None]:
print("Training Score: {}".format(clf.score(x_train, y_train)))
print("Test Score: {}".format(clf.score(x_test, y_test)))

In [None]:
train_values["first_prediction"] = model.predict(train_importants.set_index('building_id'))

In [None]:
train_values_pred = train_values.copy()

In [None]:
for column in train_values_pred.select_dtypes(include = "category").columns:
    train_values_pred[column] = train_values_pred[column].astype("bool")
for column in train_values_pred.select_dtypes(include = "bool").columns:
    train_values_pred[column] = train_values_pred[column].astype(np.uint8)

In [None]:
x_pre = train_values_pred.set_index('building_id')
y_pre = train_labels.loc[:,'damage_grade']
x = x_pre
y = y_pre
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints='()', n_estimators=2000, n_jobs=4, nthread=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8, tree_method='hist', validate_parameters=1,
              verbosity=None)

model = clf.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=50, early_stopping_rounds=100)

In [None]:
print("Training Score: {}".format(clf.score(x_train, y_train)))
print("Test Score: {}".format(clf.score(x_test, y_test)))

In [10]:

X = pd.get_dummies(train_values).drop(columns = 'building_id')
y = train_labels.loc[:,'damage_grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

geo_levels_train = X_train.copy().loc[:,['geo_level_1_id','geo_level_2_id','geo_level_3_id']]
geo_levels_test = X_test.copy().loc[:,['geo_level_1_id','geo_level_2_id','geo_level_3_id']]
#me quedo con las columnas para los calculos 
encode_geo_train = X_train.merge(y_train.rename("damage_grade"), right_index=True,left_index=True).loc[:,['geo_level_1_id','geo_level_2_id','geo_level_3_id','damage_grade']]
encode_geo_test = X_test.merge(y_test.rename("damage_grade"), right_index=True,left_index=True).loc[:,['geo_level_1_id','geo_level_2_id','geo_level_3_id','damage_grade']]

# cantidad muestras en el training set 
nTR = len(encode_geo_train)
# cantidad muestras en el test set
nTS = len(encode_geo_test)

# weighting function
def peso(n,m):
    return (n)/(m+n)

#cantidad de cada damage_grade

#training_set
nDG1Tr = len(encode_geo_train.loc[encode_geo_train['damage_grade']==1])
nDG2Tr = len(encode_geo_train.loc[encode_geo_train['damage_grade']==2])
nDG3Tr = len(encode_geo_train.loc[encode_geo_train['damage_grade']==3])
#test set
nDG1Ts = len(encode_geo_test.loc[encode_geo_test['damage_grade']==1])
nDG2Ts = len(encode_geo_test.loc[encode_geo_test['damage_grade']==2])
nDG3Ts = len(encode_geo_test.loc[encode_geo_test['damage_grade']==3])

var_total = encode_geo_test['damage_grade'].var()

#geo_level_1
geo_1 = encode_geo_train.loc[:,['geo_level_1_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
#var_total_geo1 = geo_1['damage_grade'].var()
varianzaDG1 = encode_geo_train.loc[:,['geo_level_1_id','damage_grade']].groupby('geo_level_1_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo1 = geo_1.pivot_table(values=['count'], index=['geo_level_1_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo1 = geo1.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo1 = geo1.merge(varianzaDG1, left_index = True, right_on='geo_level_1_id').drop(columns='geo_level_1_id')

#geo_level_2
geo_2 = encode_geo_train.loc[:,['geo_level_2_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
varianzaDG2 = encode_geo_train.loc[:,['geo_level_2_id','damage_grade']].groupby('geo_level_2_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo2 = geo_2.pivot_table(values=['count'], index=['geo_level_2_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo2 = geo2.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo2 = geo2.merge(varianzaDG2, left_index = True, right_on='geo_level_2_id').drop(columns='geo_level_2_id')


#geo_level_3
geo_3 = encode_geo_train.loc[:,['geo_level_3_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
varianzaDG3 = encode_geo_train.loc[:,['geo_level_3_id','damage_grade']].groupby('geo_level_3_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo3 = geo_3.pivot_table(values=['count'], index=['geo_level_3_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo3 = geo3.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo3 = geo3.merge(varianzaDG3, left_index = True, right_on='geo_level_3_id').drop(columns='geo_level_3_id')

#calculos probs psoteriori
#para cada caso puedo calcular solo dos, la tercera puedo calcularla con las dos primeras (1-prob1-prob2)

geo1['prob_post_DG1_geo1'] = geo1['count_DG1']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])
geo1['prob_post_DG2_geo1'] = geo1['count_DG2']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])
geo1['prob_post_DG3_geo1'] = geo1['count_DG3']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])

geo2['prob_post_DG1_geo2'] = geo2['count_DG1']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])
geo2['prob_post_DG2_geo2'] = geo2['count_DG2']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])
geo2['prob_post_DG3_geo2'] = geo2['count_DG3']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])

geo3['prob_post_DG1_geo3'] = geo3['count_DG1']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])
geo3['prob_post_DG2_geo3'] = geo3['count_DG2']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])
geo3['prob_post_DG3_geo3'] = geo3['count_DG3']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])

#geo_1
geo1['est_proba_DG1_geo1'] = geo1['prob_post_DG1_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG1Tr/nTR)
geo1['est_proba_DG2_geo1'] = geo1['prob_post_DG2_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG2Tr/nTR)
geo1['est_proba_DG3_geo1'] = geo1['prob_post_DG3_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG3Tr/nTR)

#geo_2
geo2['est_proba_DG1_geo2'] = geo2['prob_post_DG1_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG1Tr/nTR)
geo2['est_proba_DG2_geo2'] = geo2['prob_post_DG2_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG2Tr/nTR)
geo2['est_proba_DG3_geo2'] = geo2['prob_post_DG3_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG3Tr/nTR)

#geo_3
geo3['est_proba_DG1_geo3'] = geo3['prob_post_DG1_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG1Tr/nTR)
geo3['est_proba_DG2_geo3'] = geo3['prob_post_DG2_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG2Tr/nTR)
geo3['est_proba_DG3_geo3'] = geo3['prob_post_DG3_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG3Tr/nTR)

to_merge_1 = geo1.loc[:,['est_proba_DG1_geo1','est_proba_DG2_geo1','est_proba_DG3_geo1']]
to_merge_2 = geo2.loc[:,['est_proba_DG1_geo2','est_proba_DG2_geo2','est_proba_DG3_geo2']]
to_merge_3 = geo3.loc[:,['est_proba_DG1_geo3','est_proba_DG2_geo3','est_proba_DG3_geo3']]

X_train = X_train.merge(to_merge_1, left_on = 'geo_level_1_id', right_index = True, how = 'left').drop(columns = 'geo_level_1_id')
X_train = X_train.merge(to_merge_2, left_on = 'geo_level_2_id', right_index = True, how = 'left').drop(columns = 'geo_level_2_id')
X_train = X_train.merge(to_merge_3, left_on = 'geo_level_3_id', right_index = True, how = 'left').drop(columns = 'geo_level_3_id')

del geo1
del geo2
del geo3
del geo_1
del geo_2
del geo_3

var_total = encode_geo_test['damage_grade'].var()

#geo_level_1
geo_1 = encode_geo_test.loc[:,['geo_level_1_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
#var_total_geo1 = geo_1['damage_grade'].var()
varianzaDG1 = encode_geo_test.loc[:,['geo_level_1_id','damage_grade']].groupby('geo_level_1_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo1 = geo_1.pivot_table(values=['count'], index=['geo_level_1_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo1 = geo1.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo1 = geo1.merge(varianzaDG1, left_index = True, right_on='geo_level_1_id').drop(columns='geo_level_1_id')

#geo_level_2
geo_2 = encode_geo_test.loc[:,['geo_level_2_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
varianzaDG2 = encode_geo_test.loc[:,['geo_level_2_id','damage_grade']].groupby('geo_level_2_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo2 = geo_2.pivot_table(values=['count'], index=['geo_level_2_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo2 = geo2.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo2 = geo2.merge(varianzaDG2, left_index = True, right_on='geo_level_2_id').drop(columns='geo_level_2_id')


#geo_level_3
geo_3 = encode_geo_test.loc[:,['geo_level_3_id','damage_grade']]\
        .value_counts().to_frame().reset_index().rename(columns={0:'count'})
varianzaDG3 = encode_geo_test.loc[:,['geo_level_3_id','damage_grade']].groupby('geo_level_3_id').agg('var').reset_index().rename(columns = {'damage_grade':'var_DG'})
geo3 = geo_3.pivot_table(values=['count'], index=['geo_level_3_id'],columns=['damage_grade'], aggfunc= lambda x: x).fillna(0)
geo3 = geo3.reset_index().droplevel(level=0,axis=1).rename(columns = {1:'count_DG1', 2:'count_DG2',3:'count_DG3'}).drop(columns='')
geo3 = geo3.merge(varianzaDG3, left_index = True, right_on='geo_level_3_id').drop(columns='geo_level_3_id')

#calculos probs psoteriori
#para cada caso puedo calcular solo dos, la tercera puedo calcularla con las dos primeras (1-prob1-prob2)

geo1['prob_post_DG1_geo1'] = geo1['count_DG1']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])
geo1['prob_post_DG2_geo1'] = geo1['count_DG2']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])
geo1['prob_post_DG3_geo1'] = geo1['count_DG3']/(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'])

geo2['prob_post_DG1_geo2'] = geo2['count_DG1']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])
geo2['prob_post_DG2_geo2'] = geo2['count_DG2']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])
geo2['prob_post_DG3_geo2'] = geo2['count_DG3']/(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'])

geo3['prob_post_DG1_geo3'] = geo3['count_DG1']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])
geo3['prob_post_DG2_geo3'] = geo3['count_DG2']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])
geo3['prob_post_DG3_geo3'] = geo3['count_DG3']/(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'])

#geo_1
geo1['est_proba_DG1_geo1'] = geo1['prob_post_DG1_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG1Ts/nTS)
geo1['est_proba_DG2_geo1'] = geo1['prob_post_DG2_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG2Ts/nTS)
geo1['est_proba_DG3_geo1'] = geo1['prob_post_DG3_geo1']*peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total) + (1-peso(geo1['count_DG1']+geo1['count_DG2']+geo1['count_DG3'],geo1['var_DG']/var_total))*(nDG3Ts/nTS)

#geo_2
geo2['est_proba_DG1_geo2'] = geo2['prob_post_DG1_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG1Ts/nTS)
geo2['est_proba_DG2_geo2'] = geo2['prob_post_DG2_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG2Ts/nTS)
geo2['est_proba_DG3_geo2'] = geo2['prob_post_DG3_geo2']*peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total) + (1-peso(geo2['count_DG1']+geo2['count_DG2']+geo2['count_DG3'],geo2['var_DG']/var_total))*(nDG3Ts/nTS)

#geo_3
geo3['est_proba_DG1_geo3'] = geo3['prob_post_DG1_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG1Ts/nTS)
geo3['est_proba_DG2_geo3'] = geo3['prob_post_DG2_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG2Ts/nTS)
geo3['est_proba_DG3_geo3'] = geo3['prob_post_DG3_geo3']*peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total) + (1-peso(geo3['count_DG1']+geo3['count_DG2']+geo3['count_DG3'],geo3['var_DG']/var_total))*(nDG3Ts/nTS)

to_merge_1 = geo1.loc[:,['est_proba_DG1_geo1','est_proba_DG2_geo1','est_proba_DG3_geo1']]
to_merge_2 = geo2.loc[:,['est_proba_DG1_geo2','est_proba_DG2_geo2','est_proba_DG3_geo2']]
to_merge_3 = geo3.loc[:,['est_proba_DG1_geo3','est_proba_DG2_geo3','est_proba_DG3_geo3']]

X_test = X_test.merge(to_merge_1, left_on = 'geo_level_1_id', right_index = True, how = 'left').drop(columns = 'geo_level_1_id')
X_test = X_test.merge(to_merge_2, left_on = 'geo_level_2_id', right_index = True, how = 'left').drop(columns = 'geo_level_2_id')
X_test = X_test.merge(to_merge_3, left_on = 'geo_level_3_id', right_index = True, how = 'left').drop(columns = 'geo_level_3_id')

X_test.fillna(0, inplace = True)
X_train.fillna(0, inplace = True)

X_train

for column in X_train.select_dtypes("object").columns:

    X_train[column]  = X_train[column].astype("bool")
    X_train[column]  = X_train[column].astype(np.uint8)

X_train.dtypes

for column in X_train.select_dtypes("bool").columns:

    
    X_train[column]  = X_train[column].astype(np.uint8)

In [11]:
X_train

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,...,legal_ownership_status_w,est_proba_DG1_geo1,est_proba_DG2_geo1,est_proba_DG3_geo1,est_proba_DG1_geo2,est_proba_DG2_geo2,est_proba_DG3_geo2,est_proba_DG1_geo3,est_proba_DG2_geo3,est_proba_DG3_geo3
170377,3,25,2,5,1,0,0,0,0,0,...,0,0.203585,0.650798,0.145617,0.029385,0.712534,0.258081,0.000000,0.789474,0.210526
184895,3,40,10,8,0,1,0,0,0,0,...,0,0.083863,0.666646,0.249492,0.064190,0.822474,0.113336,0.000000,0.333333,0.666667
26740,2,45,8,5,0,1,0,0,0,0,...,1,0.054090,0.591803,0.354107,0.001865,0.902478,0.095657,0.000000,0.000000,0.000000
198210,2,30,7,4,0,1,0,1,0,0,...,0,0.054090,0.591803,0.354107,0.018599,0.879725,0.101675,0.017491,0.921366,0.061143
254313,3,30,6,7,0,1,0,0,0,0,...,0,0.013119,0.179196,0.807685,0.000496,0.729097,0.270407,0.000000,0.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,2,25,6,5,0,1,0,0,0,0,...,0,0.203585,0.650798,0.145617,0.058668,0.432457,0.508875,0.000000,0.000000,1.000000
103694,2,5,5,4,0,1,0,0,0,0,...,0,0.054499,0.548534,0.396967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
131932,3,20,22,7,0,1,0,0,0,0,...,0,0.034576,0.451078,0.514347,0.188497,0.809024,0.002479,0.003795,0.278682,0.717523
146867,3,10,11,7,0,1,0,0,0,0,...,0,0.054499,0.548534,0.396967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
columns_importances =["has_superstructure_mud_mortar_stone",
                      "foundation_type_r",
                      "area_percentage",
                      "has_superstructure_cement_mortar_brick",
                      "other_floor_type_q",
                      "age",
                      "height_percentage",
                      "foundation_type_i",
                      "has_superstructure_timber",
                      "roof_type_n",
                      "count_floors_pre_eq",
                      'roof_type_x',
                      'has_superstructure_mud_mortar_brick',
                      'position_s',
                      'ground_floor_type_f',
                      'other_floor_type_x',
                      'has_superstructure_adobe_mud',
                      'ground_floor_type_v',
                      'roof_type_q',
                      'has_secondary_use',
                      'foundation_type_u',
                      'land_surface_condition_t',
                      'est_proba_DG1_geo1',
                      'est_proba_DG2_geo1',
                      'est_proba_DG3_geo1',
                      'est_proba_DG1_geo2',
                      'est_proba_DG2_geo2',
                      'est_proba_DG3_geo2',
                      'est_proba_DG1_geo3',
                      'est_proba_DG2_geo3',
                      'est_proba_DG3_geo3',
                      ]

In [13]:
x_train_importants_encoded = X_train.loc[:,columns_importances]
x_train_importants_encoded

Unnamed: 0,has_superstructure_mud_mortar_stone,foundation_type_r,area_percentage,has_superstructure_cement_mortar_brick,other_floor_type_q,age,height_percentage,foundation_type_i,has_superstructure_timber,roof_type_n,...,land_surface_condition_t,est_proba_DG1_geo1,est_proba_DG2_geo1,est_proba_DG3_geo1,est_proba_DG1_geo2,est_proba_DG2_geo2,est_proba_DG3_geo2,est_proba_DG1_geo3,est_proba_DG2_geo3,est_proba_DG3_geo3
170377,0,1,2,0,0,25,5,0,0,1,...,0,0.203585,0.650798,0.145617,0.029385,0.712534,0.258081,0.000000,0.789474,0.210526
184895,1,1,10,0,1,40,8,0,0,1,...,0,0.083863,0.666646,0.249492,0.064190,0.822474,0.113336,0.000000,0.333333,0.666667
26740,1,1,8,0,1,45,5,0,0,1,...,1,0.054090,0.591803,0.354107,0.001865,0.902478,0.095657,0.000000,0.000000,0.000000
198210,1,1,7,0,1,30,4,0,0,1,...,1,0.054090,0.591803,0.354107,0.018599,0.879725,0.101675,0.017491,0.921366,0.061143
254313,1,1,6,0,1,30,7,0,0,0,...,1,0.013119,0.179196,0.807685,0.000496,0.729097,0.270407,0.000000,0.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,1,1,6,0,0,25,5,0,0,0,...,1,0.203585,0.650798,0.145617,0.058668,0.432457,0.508875,0.000000,0.000000,1.000000
103694,1,1,5,0,1,5,4,0,1,1,...,0,0.054499,0.548534,0.396967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
131932,1,1,22,0,1,20,7,0,1,0,...,0,0.034576,0.451078,0.514347,0.188497,0.809024,0.002479,0.003795,0.278682,0.717523
146867,1,1,11,0,1,10,7,0,0,0,...,1,0.054499,0.548534,0.396967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [14]:
x_test_importants_encoded = X_test.loc[:,columns_importances]
x_test_importants_encoded

Unnamed: 0,has_superstructure_mud_mortar_stone,foundation_type_r,area_percentage,has_superstructure_cement_mortar_brick,other_floor_type_q,age,height_percentage,foundation_type_i,has_superstructure_timber,roof_type_n,...,land_surface_condition_t,est_proba_DG1_geo1,est_proba_DG2_geo1,est_proba_DG3_geo1,est_proba_DG1_geo2,est_proba_DG2_geo2,est_proba_DG3_geo2,est_proba_DG1_geo3,est_proba_DG2_geo3,est_proba_DG3_geo3
111801,True,1,7,False,0,20,3,0,False,1,...,1,0.056273,0.547612,0.396115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
91817,True,1,7,False,1,60,6,0,False,1,...,1,0.122468,0.740992,0.136539,0.601800,0.314308,0.083892,0.000000,0.000000,0.000000
251661,False,1,6,False,1,50,5,0,False,1,...,1,0.357898,0.559043,0.083060,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
205479,True,1,7,False,1,50,6,0,True,0,...,0,0.033070,0.425992,0.540937,0.000000,0.666667,0.333333,0.056447,0.752877,0.190676
22618,True,1,8,False,0,15,4,0,False,1,...,1,0.096961,0.660103,0.242935,0.001586,0.114478,0.883936,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209800,True,1,9,False,1,5,7,0,False,0,...,1,0.012867,0.180269,0.806864,0.319336,0.580429,0.100236,0.000000,0.000000,0.000000
45779,True,1,8,False,0,10,5,0,True,0,...,1,0.197493,0.674822,0.127686,0.022306,0.902344,0.075349,0.007350,0.041923,0.950727
177645,True,1,6,False,1,35,5,0,False,1,...,1,0.055580,0.600255,0.344166,0.038492,0.510127,0.451381,0.000000,0.000000,0.000000
36857,True,1,5,False,0,15,5,0,False,0,...,0,0.056273,0.547612,0.396115,0.002434,0.710641,0.286925,0.000000,1.000000,0.000000


In [15]:
for column in x_test_importants_encoded.select_dtypes("object").columns:

    x_test_importants_encoded[column]  = x_test_importants_encoded[column].astype("bool")
    
for column in x_test_importants_encoded.select_dtypes("bool").columns:
    x_test_importants_encoded[column]  = x_test_importants_encoded[column].astype(np.uint8)

In [16]:
x_test_importants_encoded

Unnamed: 0,has_superstructure_mud_mortar_stone,foundation_type_r,area_percentage,has_superstructure_cement_mortar_brick,other_floor_type_q,age,height_percentage,foundation_type_i,has_superstructure_timber,roof_type_n,...,land_surface_condition_t,est_proba_DG1_geo1,est_proba_DG2_geo1,est_proba_DG3_geo1,est_proba_DG1_geo2,est_proba_DG2_geo2,est_proba_DG3_geo2,est_proba_DG1_geo3,est_proba_DG2_geo3,est_proba_DG3_geo3
111801,1,1,7,0,0,20,3,0,0,1,...,1,0.056273,0.547612,0.396115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
91817,1,1,7,0,1,60,6,0,0,1,...,1,0.122468,0.740992,0.136539,0.601800,0.314308,0.083892,0.000000,0.000000,0.000000
251661,0,1,6,0,1,50,5,0,0,1,...,1,0.357898,0.559043,0.083060,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
205479,1,1,7,0,1,50,6,0,1,0,...,0,0.033070,0.425992,0.540937,0.000000,0.666667,0.333333,0.056447,0.752877,0.190676
22618,1,1,8,0,0,15,4,0,0,1,...,1,0.096961,0.660103,0.242935,0.001586,0.114478,0.883936,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209800,1,1,9,0,1,5,7,0,0,0,...,1,0.012867,0.180269,0.806864,0.319336,0.580429,0.100236,0.000000,0.000000,0.000000
45779,1,1,8,0,0,10,5,0,1,0,...,1,0.197493,0.674822,0.127686,0.022306,0.902344,0.075349,0.007350,0.041923,0.950727
177645,1,1,6,0,1,35,5,0,0,1,...,1,0.055580,0.600255,0.344166,0.038492,0.510127,0.451381,0.000000,0.000000,0.000000
36857,1,1,5,0,0,15,5,0,0,0,...,0,0.056273,0.547612,0.396115,0.002434,0.710641,0.286925,0.000000,1.000000,0.000000


In [17]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints='()', n_estimators=2000, n_jobs=4, nthread=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8, tree_method='hist', validate_parameters=1,
              verbosity=None)

model = clf.fit(x_train_importants_encoded,y_train,eval_set=[(x_test_importants_encoded,y_test)],verbose=50, early_stopping_rounds=100)



[0]	validation_0-mlogloss:1.09034
[50]	validation_0-mlogloss:0.84711
[100]	validation_0-mlogloss:0.76795
[150]	validation_0-mlogloss:0.73475
[200]	validation_0-mlogloss:0.72011
[250]	validation_0-mlogloss:0.71298
[300]	validation_0-mlogloss:0.70945
[350]	validation_0-mlogloss:0.70818
[400]	validation_0-mlogloss:0.70779
[450]	validation_0-mlogloss:0.70788
[500]	validation_0-mlogloss:0.70853
[524]	validation_0-mlogloss:0.70886


In [18]:
print("Training Score: {}".format(clf.score(x_train_importants_encoded, y_train)))
print("Test Score: {}".format(clf.score(x_test_importants_encoded, y_test)))

Training Score: 0.810941097467383
Test Score: 0.6654323593177414


In [None]:
geo_levels_train

In [None]:
geo_levels_test

In [None]:
x_train_importants_encoded["geo_level_1_id"] = geo_levels_train['geo_level_1_id']
x_train_importants_encoded["geo_level_2_id"] = geo_levels_train['geo_level_2_id']
x_train_importants_encoded["geo_level_3_id"] = geo_levels_train['geo_level_3_id']
x_test_importants_encoded["geo_level_1_id"] = geo_levels_test['geo_level_1_id']
x_test_importants_encoded["geo_level_2_id"] = geo_levels_test['geo_level_2_id']
x_test_importants_encoded["geo_level_3_id"] = geo_levels_test['geo_level_3_id']


x_train_importants_encoded

In [None]:
x_test_importants_encoded

In [None]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints='()', n_estimators=2000, n_jobs=4, nthread=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8, tree_method='hist', validate_parameters=1,
              verbosity=None)

model = clf.fit(x_train_importants_encoded,y_train,eval_set=[(x_test_importants_encoded,y_test)],
                verbose=50)

In [None]:
print("Training Score: {}".format(clf.score(x_train_importants_encoded, y_train)))
print("Test Score: {}".format(clf.score(x_test_importants_encoded, y_test)))

In [None]:
X_train

In [None]:
for column in X_train.select_dtypes(include = "object").columns:

    X_train[column]  = X_train[column].astype("bool")
    
for column in X_test.select_dtypes("bool").columns:
    X_train[column]  = X_train[column].astype(np.uint8)
    
for column in X_test.select_dtypes("object").columns:

    X_test[column]  = X_test[column].astype("bool")
    
for column in X_test.select_dtypes("bool").columns:
    X_test[column]  = X_test[column].astype(np.uint8)

In [None]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints='()', n_estimators=2000, n_jobs=4, nthread=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8, tree_method='hist', validate_parameters=1,
              verbosity=None)

model = clf.fit(X_train,y_train,eval_set=[(X_test,y_test)],
                verbose=50,early_stopping_rounds=100)

In [None]:
print("Training Score: {}".format(clf.score(X_train, y_train)))
print("Test Score: {}".format(clf.score(X_test, y_test)))