In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

In [3]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [4]:
from xgboost import XGBClassifier

In [5]:
labels = pd.read_csv('../../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [6]:
values = pd.read_csv('../../csv/train_values.csv')
values.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260591,260592,260593,260594,260595,260596,260597,260598,260599,260600
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500,...,560805,207683,226421,159555,827012,688636,669485,602512,151409,747594
geo_level_1_id,6,8,21,22,11,8,9,20,0,26,...,20,10,8,27,8,25,17,17,26,21
geo_level_2_id,487,900,363,418,131,558,475,323,757,886,...,368,1382,767,181,268,1335,715,51,39,9
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994,...,5980,1903,8613,1537,4718,1621,2060,8163,1851,9101
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1,...,1,2,2,6,2,1,2,3,2,3
age,30,10,10,10,30,10,25,0,15,0,...,25,25,5,0,20,55,0,55,10,10
area_percentage,6,8,5,6,8,9,3,8,8,13,...,5,5,13,13,8,6,6,6,14,7
height_percentage,5,7,5,5,9,5,4,6,6,4,...,3,5,5,12,5,3,5,7,6,6
land_surface_condition,t,o,t,t,t,t,n,t,t,t,...,n,t,t,t,t,n,t,t,t,n
foundation_type,r,r,r,r,r,r,r,w,r,i,...,r,r,r,r,r,r,r,r,r,r


In [7]:
values.isnull().values.any()

False

In [8]:
labels.isnull().values.any()

False

In [9]:
values.dtypes 

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [10]:
values["building_id"].count() == values["building_id"].drop_duplicates().count()

True

In [11]:
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [12]:
to_be_categorized = ["land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for row in to_be_categorized:
    values[row] = values[row].astype("category")
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  int64   
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [13]:
datatypes = dict(values.dtypes)
for row in values.columns:
    if datatypes[row] != "int64" and datatypes[row] != "int32" and \
       datatypes[row] != "int16" and datatypes[row] != "int8":
        continue
    if values[row].nlargest(1).item() > 32767 and values[row].nlargest(1).item() < 2**31:
        values[row] = values[row].astype(np.int32)
    elif values[row].nlargest(1).item() > 127:
        values[row] = values[row].astype(np.int16)
    else:
        values[row] = values[row].astype(np.int8)
values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  int8    
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [14]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [15]:
labels["building_id"] = labels["building_id"].astype(np.int32)
labels["damage_grade"] = labels["damage_grade"].astype(np.int8)
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


# Nuevo Modelo

In [16]:
important_values = values\
                .merge(labels, on="building_id")
important_values.drop(columns=["building_id"], inplace = True)
important_values["geo_level_1_id"] = important_values["geo_level_1_id"].astype("category")
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,2


In [17]:

X_train, X_test, y_train, y_test = train_test_split(important_values.drop(columns = 'damage_grade'),
                                                    important_values['damage_grade'], test_size = 0.2, random_state = 123)

In [18]:
#OneHotEncoding
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    X_train = encode_and_bind(X_train, feature)
    X_test = encode_and_bind(X_test, feature)

In [19]:
X_train

Unnamed: 0,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
103291,1274,4190,2,25,8,5,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
233923,1207,12014,1,10,9,3,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
166653,944,8232,3,40,7,6,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
150634,488,12448,2,0,7,5,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
82720,302,5339,1,10,5,3,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192476,217,10644,1,25,4,6,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
17730,600,4813,2,20,13,8,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
28030,463,4692,2,10,9,4,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
15725,600,157,2,50,5,8,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# # Busco los mejores tres parametros indicados abajo.
# import time
# print(time.gmtime())
# n_estimators = [350]
# learning_rate = [0.45]
# gamma = [0, 1, 2]
# subsample = [0.2, 0.5, 0.8]
# # max_depth = [None, 3, 6, 10]
# # min_child_weight = [0, 1, 2]
# # max_delta_step = [0, 5, 10]

# hyperF = {'learning_rate': learning_rate,
#           'n_estimators': n_estimators,
#           'gamma': gamma,
#           'subsample': subsample         
#          }

# gridF = GridSearchCV(estimator = XGBClassifier(label_encoder = False),
#                      scoring = make_scorer(f1_score, average = 'micro'),
#                      param_grid = hyperF,
#                      cv = 3,
#                      verbose = 4, 
#                      n_jobs = -1)

# bestF = gridF.fit(X_train, y_train)
# print(time.gmtime())

In [21]:
# res = pd.DataFrame(bestF.cv_results_)
# res.loc[res['rank_test_score'] <= 10]

In [30]:
import time

def my_grid_search():
    print(time.gmtime())
    i = 1
    df = pd.DataFrame({'subsample': [],
                       'gamma': [],
                       'learning_rate': [],
                       'max_depth': [],
                       'score': []})
    for subsample in [0.75, 0.825, 0.885, 0.95]:
        for gamma in [0.5, 1, 1.5]:
            for learning_rate in [0.425, 0.45, 0.55]:
                 for max_depth in [3, 6, 10, 15]:
                    model = XGBClassifier(n_estimators = 350,
                                          booster = 'gbtree',
                                          subsample = subsample,
                                          gamma = gamma,
                                          max_depth = max_depth,
                                          learning_rate = learning_rate,
                                          label_encoder = False,
                                          verbosity = 0)
                    model.fit(X_train, y_train)
                    y_preds = model.predict(X_test)
                    score = f1_score(y_test, y_preds, average = 'micro')
                    df = df.append(pd.Series(
                        data={'subsample': subsample,
                              'gamma': gamma,
                              'learning_rate': learning_rate,
                              'max_depth': max_depth,
                              'score': score},
                    name = i))
                    print(i, time.gmtime())
                    i += 1

    return df.sort_values('score', ascending = False)

df = my_grid_search()
df.to_csv('grid-search/res-1.csv')
df

time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=6, tm_min=54, tm_sec=0, tm_wday=0, tm_yday=193, tm_isdst=0)
1 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=6, tm_min=57, tm_sec=21, tm_wday=0, tm_yday=193, tm_isdst=0)




2 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=1, tm_sec=11, tm_wday=0, tm_yday=193, tm_isdst=0)




3 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=5, tm_sec=1, tm_wday=0, tm_yday=193, tm_isdst=0)




4 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=8, tm_sec=51, tm_wday=0, tm_yday=193, tm_isdst=0)




5 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=12, tm_sec=41, tm_wday=0, tm_yday=193, tm_isdst=0)




6 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=16, tm_sec=30, tm_wday=0, tm_yday=193, tm_isdst=0)




7 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=20, tm_sec=19, tm_wday=0, tm_yday=193, tm_isdst=0)




8 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=24, tm_sec=9, tm_wday=0, tm_yday=193, tm_isdst=0)




9 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=27, tm_sec=59, tm_wday=0, tm_yday=193, tm_isdst=0)




10 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=31, tm_sec=49, tm_wday=0, tm_yday=193, tm_isdst=0)




11 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=35, tm_sec=37, tm_wday=0, tm_yday=193, tm_isdst=0)




12 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=39, tm_sec=27, tm_wday=0, tm_yday=193, tm_isdst=0)




13 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=43, tm_sec=15, tm_wday=0, tm_yday=193, tm_isdst=0)




14 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=47, tm_sec=5, tm_wday=0, tm_yday=193, tm_isdst=0)




15 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=50, tm_sec=54, tm_wday=0, tm_yday=193, tm_isdst=0)




16 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=54, tm_sec=43, tm_wday=0, tm_yday=193, tm_isdst=0)




17 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=7, tm_min=58, tm_sec=32, tm_wday=0, tm_yday=193, tm_isdst=0)




18 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=2, tm_sec=22, tm_wday=0, tm_yday=193, tm_isdst=0)




19 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=6, tm_sec=11, tm_wday=0, tm_yday=193, tm_isdst=0)




20 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=10, tm_sec=1, tm_wday=0, tm_yday=193, tm_isdst=0)




21 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=13, tm_sec=50, tm_wday=0, tm_yday=193, tm_isdst=0)




22 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=17, tm_sec=40, tm_wday=0, tm_yday=193, tm_isdst=0)




23 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=21, tm_sec=29, tm_wday=0, tm_yday=193, tm_isdst=0)




24 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=25, tm_sec=18, tm_wday=0, tm_yday=193, tm_isdst=0)




25 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=29, tm_sec=7, tm_wday=0, tm_yday=193, tm_isdst=0)




26 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=32, tm_sec=50, tm_wday=0, tm_yday=193, tm_isdst=0)




27 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=36, tm_sec=29, tm_wday=0, tm_yday=193, tm_isdst=0)




28 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=40, tm_sec=14, tm_wday=0, tm_yday=193, tm_isdst=0)




29 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=43, tm_sec=57, tm_wday=0, tm_yday=193, tm_isdst=0)




30 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=47, tm_sec=40, tm_wday=0, tm_yday=193, tm_isdst=0)




31 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=51, tm_sec=25, tm_wday=0, tm_yday=193, tm_isdst=0)




32 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=55, tm_sec=10, tm_wday=0, tm_yday=193, tm_isdst=0)




33 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=8, tm_min=58, tm_sec=56, tm_wday=0, tm_yday=193, tm_isdst=0)




34 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=2, tm_sec=40, tm_wday=0, tm_yday=193, tm_isdst=0)




35 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=6, tm_sec=36, tm_wday=0, tm_yday=193, tm_isdst=0)




36 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=10, tm_sec=23, tm_wday=0, tm_yday=193, tm_isdst=0)




37 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=14, tm_sec=7, tm_wday=0, tm_yday=193, tm_isdst=0)




38 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=17, tm_sec=52, tm_wday=0, tm_yday=193, tm_isdst=0)




39 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=21, tm_sec=38, tm_wday=0, tm_yday=193, tm_isdst=0)




40 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=25, tm_sec=23, tm_wday=0, tm_yday=193, tm_isdst=0)




41 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=29, tm_sec=8, tm_wday=0, tm_yday=193, tm_isdst=0)




42 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=32, tm_sec=51, tm_wday=0, tm_yday=193, tm_isdst=0)




43 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=36, tm_sec=35, tm_wday=0, tm_yday=193, tm_isdst=0)




44 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=40, tm_sec=19, tm_wday=0, tm_yday=193, tm_isdst=0)




45 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=44, tm_sec=6, tm_wday=0, tm_yday=193, tm_isdst=0)




46 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=47, tm_sec=50, tm_wday=0, tm_yday=193, tm_isdst=0)




47 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=51, tm_sec=34, tm_wday=0, tm_yday=193, tm_isdst=0)




48 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=55, tm_sec=18, tm_wday=0, tm_yday=193, tm_isdst=0)




49 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=9, tm_min=59, tm_sec=2, tm_wday=0, tm_yday=193, tm_isdst=0)




50 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=2, tm_sec=47, tm_wday=0, tm_yday=193, tm_isdst=0)




51 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=6, tm_sec=26, tm_wday=0, tm_yday=193, tm_isdst=0)




52 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=10, tm_sec=6, tm_wday=0, tm_yday=193, tm_isdst=0)




53 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=13, tm_sec=44, tm_wday=0, tm_yday=193, tm_isdst=0)




54 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=17, tm_sec=23, tm_wday=0, tm_yday=193, tm_isdst=0)




55 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=21, tm_sec=3, tm_wday=0, tm_yday=193, tm_isdst=0)




56 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=24, tm_sec=43, tm_wday=0, tm_yday=193, tm_isdst=0)




57 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=28, tm_sec=22, tm_wday=0, tm_yday=193, tm_isdst=0)




58 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=32, tm_sec=0, tm_wday=0, tm_yday=193, tm_isdst=0)




59 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=35, tm_sec=40, tm_wday=0, tm_yday=193, tm_isdst=0)




60 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=39, tm_sec=21, tm_wday=0, tm_yday=193, tm_isdst=0)




61 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=43, tm_sec=1, tm_wday=0, tm_yday=193, tm_isdst=0)




62 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=46, tm_sec=41, tm_wday=0, tm_yday=193, tm_isdst=0)




63 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=50, tm_sec=20, tm_wday=0, tm_yday=193, tm_isdst=0)




64 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=54, tm_sec=0, tm_wday=0, tm_yday=193, tm_isdst=0)




65 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=10, tm_min=57, tm_sec=39, tm_wday=0, tm_yday=193, tm_isdst=0)




66 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=1, tm_sec=19, tm_wday=0, tm_yday=193, tm_isdst=0)




67 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=4, tm_sec=59, tm_wday=0, tm_yday=193, tm_isdst=0)




68 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=8, tm_sec=38, tm_wday=0, tm_yday=193, tm_isdst=0)




69 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=12, tm_sec=17, tm_wday=0, tm_yday=193, tm_isdst=0)




70 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=15, tm_sec=57, tm_wday=0, tm_yday=193, tm_isdst=0)




71 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=19, tm_sec=38, tm_wday=0, tm_yday=193, tm_isdst=0)




72 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=23, tm_sec=18, tm_wday=0, tm_yday=193, tm_isdst=0)




73 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=26, tm_sec=59, tm_wday=0, tm_yday=193, tm_isdst=0)




74 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=30, tm_sec=40, tm_wday=0, tm_yday=193, tm_isdst=0)




75 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=34, tm_sec=20, tm_wday=0, tm_yday=193, tm_isdst=0)




76 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=37, tm_sec=53, tm_wday=0, tm_yday=193, tm_isdst=0)




77 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=41, tm_sec=26, tm_wday=0, tm_yday=193, tm_isdst=0)




78 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=45, tm_sec=0, tm_wday=0, tm_yday=193, tm_isdst=0)




79 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=48, tm_sec=34, tm_wday=0, tm_yday=193, tm_isdst=0)




80 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=52, tm_sec=7, tm_wday=0, tm_yday=193, tm_isdst=0)




81 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=55, tm_sec=41, tm_wday=0, tm_yday=193, tm_isdst=0)




82 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=11, tm_min=59, tm_sec=14, tm_wday=0, tm_yday=193, tm_isdst=0)




83 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=2, tm_sec=47, tm_wday=0, tm_yday=193, tm_isdst=0)




84 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=6, tm_sec=20, tm_wday=0, tm_yday=193, tm_isdst=0)




85 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=9, tm_sec=54, tm_wday=0, tm_yday=193, tm_isdst=0)




86 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=13, tm_sec=27, tm_wday=0, tm_yday=193, tm_isdst=0)




87 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=17, tm_sec=0, tm_wday=0, tm_yday=193, tm_isdst=0)




88 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=20, tm_sec=33, tm_wday=0, tm_yday=193, tm_isdst=0)




89 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=24, tm_sec=15, tm_wday=0, tm_yday=193, tm_isdst=0)




90 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=27, tm_sec=47, tm_wday=0, tm_yday=193, tm_isdst=0)




91 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=31, tm_sec=19, tm_wday=0, tm_yday=193, tm_isdst=0)




92 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=34, tm_sec=51, tm_wday=0, tm_yday=193, tm_isdst=0)




93 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=38, tm_sec=24, tm_wday=0, tm_yday=193, tm_isdst=0)




94 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=41, tm_sec=58, tm_wday=0, tm_yday=193, tm_isdst=0)




95 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=45, tm_sec=31, tm_wday=0, tm_yday=193, tm_isdst=0)




96 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=49, tm_sec=5, tm_wday=0, tm_yday=193, tm_isdst=0)




97 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=52, tm_sec=37, tm_wday=0, tm_yday=193, tm_isdst=0)




98 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=56, tm_sec=9, tm_wday=0, tm_yday=193, tm_isdst=0)




99 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=12, tm_min=59, tm_sec=41, tm_wday=0, tm_yday=193, tm_isdst=0)




100 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=12, tm_hour=13, tm_min=3, tm_sec=13, tm_wday=0, tm_yday=193, tm_isdst=0)


Unnamed: 0,subsample,gamma,learning_rate,score
84,0.95,1.0,0.55,0.743098
8,0.80,1.0,0.45,0.743059
9,0.80,1.0,0.55,0.742810
58,0.90,1.0,0.45,0.742676
33,0.85,1.0,0.45,0.742580
...,...,...,...,...
71,0.90,4.0,0.25,0.735750
91,0.95,3.0,0.25,0.735500
21,0.80,4.0,0.25,0.735462
46,0.85,4.0,0.25,0.735327


In [60]:
import time

def my_grid_search():
    print(time.gmtime())
    i = 1
    df = pd.DataFrame({'subsample': [],
                       'gamma': [],
                       'learning_rate': [],
                       'max_depth': [],
                       'score': []})
    for subsample in [0.8, 0.885, 0.925, 0.95]:
        for gamma in [1]:
            for learning_rate in [0.525, 0.55, 0.575]:
                for max_depth in [6]:
                    model = XGBClassifier(n_estimators = 350,
                                          booster = 'gbtree',
                                          subsample = subsample,
                                          gamma = gamma,
                                          max_depth = max_depth,
                                          learning_rate = learning_rate,
                                          label_encoder = False,
                                          verbosity = 0)
                    model.fit(X_train, y_train)
                    y_preds = model.predict(X_test)
                    score = f1_score(y_test, y_preds, average = 'micro')
                    df = df.append(pd.Series(
                        data={'subsample': subsample,
                              'gamma': gamma,
                              'learning_rate': learning_rate,
                              'max_depth': max_depth,
                              'score': score},
                    name = i))
                    print(i, time.gmtime())
                    i += 1

    return df.sort_values('score', ascending = False)

df = my_grid_search()
df.to_csv('grid-search/res-red-4.csv')
df

time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=3, tm_min=40, tm_sec=55, tm_wday=1, tm_yday=194, tm_isdst=0)




1 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=3, tm_min=43, tm_sec=48, tm_wday=1, tm_yday=194, tm_isdst=0)




2 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=3, tm_min=47, tm_sec=38, tm_wday=1, tm_yday=194, tm_isdst=0)




3 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=3, tm_min=52, tm_sec=58, tm_wday=1, tm_yday=194, tm_isdst=0)




4 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=3, tm_min=57, tm_sec=10, tm_wday=1, tm_yday=194, tm_isdst=0)




5 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=0, tm_sec=54, tm_wday=1, tm_yday=194, tm_isdst=0)




6 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=4, tm_sec=44, tm_wday=1, tm_yday=194, tm_isdst=0)




7 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=8, tm_sec=20, tm_wday=1, tm_yday=194, tm_isdst=0)




8 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=11, tm_sec=55, tm_wday=1, tm_yday=194, tm_isdst=0)




9 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=15, tm_sec=31, tm_wday=1, tm_yday=194, tm_isdst=0)




10 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=19, tm_sec=24, tm_wday=1, tm_yday=194, tm_isdst=0)




11 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=23, tm_sec=23, tm_wday=1, tm_yday=194, tm_isdst=0)




12 time.struct_time(tm_year=2021, tm_mon=7, tm_mday=13, tm_hour=4, tm_min=27, tm_sec=15, tm_wday=1, tm_yday=194, tm_isdst=0)


Unnamed: 0,subsample,gamma,learning_rate,max_depth,score
11,0.95,1.0,0.55,6.0,0.743098
2,0.8,1.0,0.55,6.0,0.74281
5,0.885,1.0,0.55,6.0,0.742465
4,0.885,1.0,0.525,6.0,0.74233
10,0.95,1.0,0.525,6.0,0.741966
7,0.925,1.0,0.525,6.0,0.741851
6,0.885,1.0,0.575,6.0,0.741371
12,0.95,1.0,0.575,6.0,0.741371
8,0.925,1.0,0.55,6.0,0.741313
3,0.8,1.0,0.575,6.0,0.740949


In [61]:
pd.read_csv('grid-search/res-red-1.csv')\
    .append(pd.read_csv('grid-search/res-red-2.csv'))\
    .append(pd.read_csv('grid-search/res-red-3.csv'))\
    .append(pd.read_csv('grid-search/res-red-4.csv'))\
    .drop_duplicates(['subsample', 'gamma', 'learning_rate', 'max_depth'])\
    .nlargest(15, 'score')\
    .sort_values('score', ascending = False)

Unnamed: 0.1,Unnamed: 0,subsample,gamma,learning_rate,max_depth,score
0,12,0.885,1.0,0.45,6.0,0.743577
1,2,0.75,1.0,0.45,6.0,0.743232
2,19,0.95,1.0,0.55,6.0,0.743098
1,4,0.885,1.0,0.425,6.0,0.743079
3,7,0.8,1.0,0.45,6.0,0.743059
4,9,0.8,1.0,0.55,6.0,0.74281
5,16,0.95,1.0,0.4,6.0,0.742503
6,14,0.885,1.0,0.55,6.0,0.742465
2,1,0.88,1.0,0.425,6.0,0.742369
3,4,0.885,1.0,0.525,6.0,0.74233


In [56]:
# Utilizo los mejores parametros segun el GridSearch
xgb_model = XGBClassifier(n_estimators = 350,
                          subsample = 0.885,
                          booster = 'gbtree',
                          gamma = 1,
                          learning_rate = 0.45,
                          label_encoder = False,
                          verbosity = 2)
xgb_model.fit(X_train, y_train)



Parameters: { "label_encoder" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[00:31:39] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 4 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 120 extra nodes, 4 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 120 extra nodes, 4 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 8 pruned nodes, max_depth=6
[00:31:40] INFO: ../src/t

[00:31:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 80 extra nodes, 14 pruned nodes, max_depth=6
[00:31:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 10 pruned nodes, max_depth=6
[00:31:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 64 extra nodes, 28 pruned nodes, max_depth=6
[00:31:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 80 extra nodes, 22 pruned nodes, max_depth=6
[00:31:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 40 extra nodes, 8 pruned nodes, max_depth=6
[00:31:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 108 extra nodes, 8 pruned nodes, max_depth=6
[00:31:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 94 extra nodes, 14 pruned nodes, max_depth=6
[00:31:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 96 extra nodes, 4 pruned nodes, max_depth=6
[00:31:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 10 p

[00:32:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 4 pruned nodes, max_depth=6
[00:32:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 70 extra nodes, 22 pruned nodes, max_depth=6
[00:32:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 100 extra nodes, 16 pruned nodes, max_depth=6
[00:32:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 86 extra nodes, 28 pruned nodes, max_depth=6
[00:32:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 10 pruned nodes, max_depth=6
[00:32:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 92 extra nodes, 16 pruned nodes, max_depth=6
[00:32:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 98 extra nodes, 22 pruned nodes, max_depth=6
[00:32:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 4 pruned nodes, max_depth=6
[00:32:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 94 extra nodes, 16 

[00:32:10] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 98 extra nodes, 14 pruned nodes, max_depth=6
[00:32:10] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 40 pruned nodes, max_depth=6
[00:32:10] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 104 extra nodes, 8 pruned nodes, max_depth=6
[00:32:10] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 42 extra nodes, 28 pruned nodes, max_depth=6
[00:32:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 74 extra nodes, 22 pruned nodes, max_depth=6
[00:32:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 36 extra nodes, 6 pruned nodes, max_depth=6
[00:32:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 84 extra nodes, 32 pruned nodes, max_depth=6
[00:32:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 96 extra nodes, 6 pruned nodes, max_depth=6
[00:32:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 14 pr

[00:32:20] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 106 extra nodes, 10 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 106 extra nodes, 14 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 92 extra nodes, 18 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 18 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 16 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 92 extra nodes, 18 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 94 extra nodes, 24 pruned nodes, max_depth=6
[00:32:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 8 pruned nodes, max_depth=6
[00:32:22] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 76 extra nodes, 2

[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 86 extra nodes, 20 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 68 extra nodes, 14 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 102 extra nodes, 20 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 84 extra nodes, 20 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 50 extra nodes, 22 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 84 extra nodes, 12 pruned nodes, max_depth=6
[00:32:32] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 104 extra nodes, 14 pruned nodes, max_depth=6
[00:32:33] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 10 pruned nodes, max_depth=6
[00:32:33] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 98 extra nodes, 1

[00:32:43] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 24 pruned nodes, max_depth=6
[00:32:43] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 28 pruned nodes, max_depth=6
[00:32:43] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 88 extra nodes, 4 pruned nodes, max_depth=6
[00:32:43] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 66 extra nodes, 34 pruned nodes, max_depth=6
[00:32:43] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 40 extra nodes, 26 pruned nodes, max_depth=6
[00:32:44] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 52 extra nodes, 32 pruned nodes, max_depth=6
[00:32:44] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 70 extra nodes, 20 pruned nodes, max_depth=6
[00:32:44] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 64 extra nodes, 22 pruned nodes, max_depth=6
[00:32:44] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 74 extra nodes, 14 p

[00:32:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 96 extra nodes, 16 pruned nodes, max_depth=6
[00:32:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 32 pruned nodes, max_depth=6
[00:32:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 88 extra nodes, 20 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 32 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 94 extra nodes, 20 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 30 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 76 extra nodes, 20 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 74 extra nodes, 30 pruned nodes, max_depth=6
[00:32:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 88 extra nodes, 14 

[00:33:05] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 64 extra nodes, 40 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 44 extra nodes, 24 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 86 extra nodes, 26 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 24 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 88 extra nodes, 16 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 14 pruned nodes, max_depth=6
[00:33:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 68 extra nodes, 30 pruned nodes, max_depth=6
[00:33:07] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 28 extra nodes, 34 pruned nodes, max_depth=6
[00:33:07] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 108 extra nodes, 12

[00:33:17] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 30 pruned nodes, max_depth=6
[00:33:17] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 98 extra nodes, 22 pruned nodes, max_depth=6
[00:33:17] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 16 pruned nodes, max_depth=6
[00:33:17] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 38 pruned nodes, max_depth=6
[00:33:17] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 18 pruned nodes, max_depth=6
[00:33:18] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 84 extra nodes, 12 pruned nodes, max_depth=6
[00:33:18] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 42 extra nodes, 36 pruned nodes, max_depth=6
[00:33:18] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 32 pruned nodes, max_depth=6
[00:33:18] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 78 extra nodes, 24 

[00:33:28] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 90 extra nodes, 12 pruned nodes, max_depth=6
[00:33:28] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 48 extra nodes, 20 pruned nodes, max_depth=6
[00:33:28] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 14 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 74 extra nodes, 26 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 68 extra nodes, 26 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 32 extra nodes, 16 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 94 extra nodes, 18 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 62 extra nodes, 24 pruned nodes, max_depth=6
[00:33:29] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 18

[00:33:39] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 72 extra nodes, 34 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 92 extra nodes, 28 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 48 extra nodes, 22 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 70 extra nodes, 34 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 80 extra nodes, 28 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 48 extra nodes, 14 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 80 extra nodes, 22 pruned nodes, max_depth=6
[00:33:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 74 extra nodes, 26 pruned nodes, max_depth=6
[00:33:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 78 extra nodes, 24 

[00:33:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 50 extra nodes, 20 pruned nodes, max_depth=6
[00:33:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 40 extra nodes, 56 pruned nodes, max_depth=6
[00:33:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 88 extra nodes, 28 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 36 extra nodes, 34 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 16 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 56 extra nodes, 26 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 70 extra nodes, 34 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 66 extra nodes, 32 pruned nodes, max_depth=6
[00:33:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 22 

[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 36 extra nodes, 30 pruned nodes, max_depth=6
[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 68 extra nodes, 22 pruned nodes, max_depth=6
[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 52 extra nodes, 44 pruned nodes, max_depth=6
[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 42 extra nodes, 56 pruned nodes, max_depth=6
[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 40 extra nodes, 30 pruned nodes, max_depth=6
[00:34:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 36 pruned nodes, max_depth=6
[00:34:04] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 78 extra nodes, 30 pruned nodes, max_depth=6
[00:34:04] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 52 extra nodes, 52 pruned nodes, max_depth=6
[00:34:04] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 40 

[00:34:14] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 76 extra nodes, 24 pruned nodes, max_depth=6
[00:34:14] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 50 extra nodes, 40 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 82 extra nodes, 18 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 14 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 44 extra nodes, 24 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 64 extra nodes, 46 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 50 extra nodes, 24 pruned nodes, max_depth=6
[00:34:15] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 26 extra nodes, 34 pruned nodes, max_depth=6
[00:34:16] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 66 extra nodes, 24 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              label_encoder=False, learning_rate=0.45, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=350, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.885,
              tree_method='exact', validate_parameters=1, verbosity=2)

In [57]:
xgb_model.score(X_train, y_train)

0.8035542977743668

In [58]:
# Calculo el F1 score para mi training set.
y_preds = xgb_model.predict(X_test)
f1_score(y_test, y_preds, average='micro')

0.7435774447919264

In [26]:
test_values = pd.read_csv('../../csv/test_values.csv', index_col = "building_id")
test_values

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,t,r,n,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,t,r,q,...,1,0,0,0,0,0,0,0,0,0
663567,10,1407,11907,3,25,6,7,n,r,n,...,0,0,0,0,0,0,0,0,0,0
1049160,22,1136,7712,1,50,3,3,t,r,n,...,0,0,0,0,0,0,0,0,0,0
442785,6,1041,912,2,5,9,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [27]:
test_values_subset = test_values
test_values_subset["geo_level_1_id"] = test_values_subset["geo_level_1_id"].astype("category")
test_values_subset

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,t,r,n,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,t,r,q,...,1,0,0,0,0,0,0,0,0,0
663567,10,1407,11907,3,25,6,7,n,r,n,...,0,0,0,0,0,0,0,0,0,0
1049160,22,1136,7712,1,50,3,3,t,r,n,...,0,0,0,0,0,0,0,0,0,0
442785,6,1041,912,2,5,9,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [28]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    test_values_subset = encode_and_bind(test_values_subset, feature)
test_values_subset

Unnamed: 0_level_0,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,596,11307,3,20,7,6,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
99355,141,11987,2,25,13,5,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
890251,19,10044,2,5,4,5,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
745817,39,633,1,0,19,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,289,7970,3,15,8,7,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,605,3623,3,70,20,6,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
663567,1407,11907,3,25,6,7,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1049160,1136,7712,1,50,3,3,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
442785,1041,912,2,5,9,5,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [29]:
test_values_subset.shape

(86868, 98)

In [30]:
# Genero las predicciones para los test.
preds = xgb_model.predict(test_values_subset)

In [31]:
submission_format = pd.read_csv('../../csv/submission_format.csv', index_col = "building_id")

In [32]:
my_submission = pd.DataFrame(data=preds,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [33]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [34]:
my_submission.to_csv('../../csv/predictions/jf/xg-boost/jf-model-3-submission.csv')

In [35]:
!head ../../csv/predictions/jf/xg-boost/jf-model-3-submission.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
871976,2
691228,1
896100,3
343471,2
