# CAT BOOST

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [16]:
labels = pd.read_csv('../../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [17]:
values = pd.read_csv('../../csv/train_values.csv')
values.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260591,260592,260593,260594,260595,260596,260597,260598,260599,260600
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500,...,560805,207683,226421,159555,827012,688636,669485,602512,151409,747594
geo_level_1_id,6,8,21,22,11,8,9,20,0,26,...,20,10,8,27,8,25,17,17,26,21
geo_level_2_id,487,900,363,418,131,558,475,323,757,886,...,368,1382,767,181,268,1335,715,51,39,9
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994,...,5980,1903,8613,1537,4718,1621,2060,8163,1851,9101
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1,...,1,2,2,6,2,1,2,3,2,3
age,30,10,10,10,30,10,25,0,15,0,...,25,25,5,0,20,55,0,55,10,10
area_percentage,6,8,5,6,8,9,3,8,8,13,...,5,5,13,13,8,6,6,6,14,7
height_percentage,5,7,5,5,9,5,4,6,6,4,...,3,5,5,12,5,3,5,7,6,6
land_surface_condition,t,o,t,t,t,t,n,t,t,t,...,n,t,t,t,t,n,t,t,t,n
foundation_type,r,r,r,r,r,r,r,w,r,i,...,r,r,r,r,r,r,r,r,r,r


In [18]:
values["building_id"].count() == values["building_id"].drop_duplicates().count()

True

In [19]:
to_be_categorized = ["land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for row in to_be_categorized:
    values[row] = values[row].astype("category")

In [20]:
datatypes = dict(values.dtypes)
for row in values.columns:
    if datatypes[row] != "int64" and datatypes[row] != "int32" and \
       datatypes[row] != "int16" and datatypes[row] != "int8":
        continue
    if values[row].nlargest(1).item() > 32767 and values[row].nlargest(1).item() < 2**31:
        values[row] = values[row].astype(np.int32)
    elif values[row].nlargest(1).item() > 127:
        values[row] = values[row].astype(np.int16)
    else:
        values[row] = values[row].astype(np.int8)

In [21]:
labels["building_id"] = labels["building_id"].astype(np.int32)
labels["damage_grade"] = labels["damage_grade"].astype(np.int8)

In [22]:
important_values = values\
                .merge(labels, on="building_id")
important_values.drop(columns=["building_id"], inplace = True)
important_values["geo_level_1_id"] = important_values["geo_level_1_id"].astype("category")
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,2


In [23]:
X_train, X_test, y_train, y_test = train_test_split(important_values.drop(columns = 'damage_grade'),
                                                    important_values['damage_grade'], test_size = 0.2, random_state = 123)

In [24]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    X_train = encode_and_bind(X_train, feature)
    X_test = encode_and_bind(X_test, feature)

In [25]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [26]:
from catboost import CatBoostClassifier

In [None]:
model = CatBoostClassifier( iterations = 5000,
                            eval_metric='Accuracy',
                            loss_function='MultiClass',
                            use_best_model=True,
                            random_seed=1,
                            max_depth = None,
                            learning_rate = 0.15,
                            boost_from_average = False,
                            verbose=True)
model.fit(X_train, y_train, 
          cat_features=['geo_level_2_id', 'geo_level_3_id'],
          eval_set=(X_train,y_train))

0:	learn: 0.6930305	test: 0.6992565	best: 0.6992565 (0)	total: 124ms	remaining: 10m 18s
1:	learn: 0.6974242	test: 0.7030411	best: 0.7030411 (1)	total: 235ms	remaining: 9m 48s
2:	learn: 0.7017076	test: 0.7073724	best: 0.7073724 (2)	total: 313ms	remaining: 8m 41s
3:	learn: 0.7021105	test: 0.7080439	best: 0.7080439 (3)	total: 426ms	remaining: 8m 52s
4:	learn: 0.7024415	test: 0.7083557	best: 0.7083557 (4)	total: 531ms	remaining: 8m 50s
5:	learn: 0.7037989	test: 0.7100489	best: 0.7100489 (5)	total: 625ms	remaining: 8m 40s
6:	learn: 0.7044944	test: 0.7103223	best: 0.7103223 (6)	total: 749ms	remaining: 8m 54s
7:	learn: 0.7046623	test: 0.7102983	best: 0.7103223 (6)	total: 881ms	remaining: 9m 9s
8:	learn: 0.7049213	test: 0.7106629	best: 0.7106629 (8)	total: 1.03s	remaining: 9m 30s
9:	learn: 0.7050796	test: 0.7109219	best: 0.7109219 (9)	total: 1.2s	remaining: 10m
10:	learn: 0.7054490	test: 0.7113200	best: 0.7113200 (10)	total: 1.33s	remaining: 10m 5s
11:	learn: 0.7086627	test: 0.7151142	best: 0.

94:	learn: 0.7363008	test: 0.7568016	best: 0.7568016 (94)	total: 11.3s	remaining: 9m 45s
95:	learn: 0.7363056	test: 0.7568640	best: 0.7568640 (95)	total: 11.5s	remaining: 9m 47s
96:	learn: 0.7364735	test: 0.7568927	best: 0.7568927 (96)	total: 11.7s	remaining: 9m 49s
97:	learn: 0.7365838	test: 0.7569167	best: 0.7569167 (97)	total: 11.8s	remaining: 9m 51s
98:	learn: 0.7365359	test: 0.7568256	best: 0.7569167 (97)	total: 11.9s	remaining: 9m 51s
99:	learn: 0.7365695	test: 0.7569503	best: 0.7569503 (99)	total: 12.1s	remaining: 9m 51s
100:	learn: 0.7366798	test: 0.7571422	best: 0.7571422 (100)	total: 12.2s	remaining: 9m 53s
101:	learn: 0.7366702	test: 0.7570942	best: 0.7571422 (100)	total: 12.4s	remaining: 9m 53s
102:	learn: 0.7366894	test: 0.7571805	best: 0.7571805 (102)	total: 12.5s	remaining: 9m 56s
103:	learn: 0.7367805	test: 0.7571997	best: 0.7571997 (103)	total: 12.7s	remaining: 9m 57s
104:	learn: 0.7369100	test: 0.7573005	best: 0.7573005 (104)	total: 12.9s	remaining: 10m
105:	learn: 0.

185:	learn: 0.7410495	test: 0.7607109	best: 0.7607300 (178)	total: 26.1s	remaining: 11m 15s
186:	learn: 0.7412030	test: 0.7608212	best: 0.7608212 (186)	total: 26.2s	remaining: 11m 15s
187:	learn: 0.7411598	test: 0.7608356	best: 0.7608356 (187)	total: 26.4s	remaining: 11m 15s
188:	learn: 0.7412989	test: 0.7609267	best: 0.7609267 (188)	total: 26.6s	remaining: 11m 15s
189:	learn: 0.7413853	test: 0.7609411	best: 0.7609411 (189)	total: 26.7s	remaining: 11m 16s
190:	learn: 0.7413565	test: 0.7609507	best: 0.7609507 (190)	total: 26.9s	remaining: 11m 17s
191:	learn: 0.7414428	test: 0.7610178	best: 0.7610178 (191)	total: 27.1s	remaining: 11m 19s
192:	learn: 0.7414716	test: 0.7610370	best: 0.7610370 (192)	total: 27.3s	remaining: 11m 18s
193:	learn: 0.7415675	test: 0.7611090	best: 0.7611090 (193)	total: 27.4s	remaining: 11m 19s
194:	learn: 0.7415723	test: 0.7611234	best: 0.7611234 (194)	total: 27.6s	remaining: 11m 20s
195:	learn: 0.7415915	test: 0.7610562	best: 0.7611234 (194)	total: 27.8s	remaini

276:	learn: 0.7443208	test: 0.7632291	best: 0.7632339 (275)	total: 41s	remaining: 11m 39s
277:	learn: 0.7443640	test: 0.7632051	best: 0.7632339 (275)	total: 41.2s	remaining: 11m 40s
278:	learn: 0.7443832	test: 0.7632243	best: 0.7632339 (275)	total: 41.4s	remaining: 11m 40s
279:	learn: 0.7444551	test: 0.7632962	best: 0.7632962 (279)	total: 41.5s	remaining: 11m 39s
280:	learn: 0.7444887	test: 0.7632771	best: 0.7632962 (279)	total: 41.7s	remaining: 11m 40s
281:	learn: 0.7445127	test: 0.7632818	best: 0.7632962 (279)	total: 41.8s	remaining: 11m 39s
282:	learn: 0.7445750	test: 0.7633442	best: 0.7633442 (282)	total: 42s	remaining: 11m 40s
283:	learn: 0.7445990	test: 0.7634929	best: 0.7634929 (283)	total: 42.2s	remaining: 11m 40s
284:	learn: 0.7445894	test: 0.7634641	best: 0.7634929 (283)	total: 42.3s	remaining: 11m 40s
285:	learn: 0.7445271	test: 0.7634449	best: 0.7634929 (283)	total: 42.5s	remaining: 11m 41s
286:	learn: 0.7445942	test: 0.7634162	best: 0.7634929 (283)	total: 42.7s	remaining: 

367:	learn: 0.7465656	test: 0.7651429	best: 0.7651429 (366)	total: 56s	remaining: 11m 44s
368:	learn: 0.7465704	test: 0.7651333	best: 0.7651429 (366)	total: 56.1s	remaining: 11m 44s
369:	learn: 0.7465368	test: 0.7651429	best: 0.7651429 (366)	total: 56.3s	remaining: 11m 45s
370:	learn: 0.7466232	test: 0.7652053	best: 0.7652053 (370)	total: 56.5s	remaining: 11m 44s
371:	learn: 0.7466855	test: 0.7652197	best: 0.7652197 (371)	total: 56.7s	remaining: 11m 45s
372:	learn: 0.7467143	test: 0.7652485	best: 0.7652485 (372)	total: 56.8s	remaining: 11m 44s
373:	learn: 0.7467911	test: 0.7653540	best: 0.7653540 (373)	total: 57s	remaining: 11m 44s
374:	learn: 0.7467719	test: 0.7653012	best: 0.7653540 (373)	total: 57.1s	remaining: 11m 44s
375:	learn: 0.7468150	test: 0.7653156	best: 0.7653540 (373)	total: 57.3s	remaining: 11m 44s
376:	learn: 0.7468007	test: 0.7653348	best: 0.7653540 (373)	total: 57.5s	remaining: 11m 45s
377:	learn: 0.7468726	test: 0.7653828	best: 0.7653828 (377)	total: 57.7s	remaining: 

457:	learn: 0.7485706	test: 0.7668170	best: 0.7668601 (455)	total: 1m 10s	remaining: 11m 39s
458:	learn: 0.7485178	test: 0.7667162	best: 0.7668601 (455)	total: 1m 10s	remaining: 11m 39s
459:	learn: 0.7485610	test: 0.7667306	best: 0.7668601 (455)	total: 1m 10s	remaining: 11m 40s
460:	learn: 0.7485370	test: 0.7668074	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 40s
461:	learn: 0.7486234	test: 0.7668266	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 39s
462:	learn: 0.7486378	test: 0.7667930	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 39s
463:	learn: 0.7486378	test: 0.7667498	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 39s
464:	learn: 0.7486617	test: 0.7667354	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 39s
465:	learn: 0.7486042	test: 0.7667786	best: 0.7668601 (455)	total: 1m 11s	remaining: 11m 40s
466:	learn: 0.7486426	test: 0.7668649	best: 0.7668649 (466)	total: 1m 12s	remaining: 11m 39s
467:	learn: 0.7485946	test: 0.7668026	best: 0.7668649 (466)	total: 1m 

547:	learn: 0.7505180	test: 0.7683327	best: 0.7683567 (546)	total: 1m 25s	remaining: 11m 32s
548:	learn: 0.7505468	test: 0.7683855	best: 0.7683855 (548)	total: 1m 25s	remaining: 11m 32s
549:	learn: 0.7505516	test: 0.7683903	best: 0.7683903 (549)	total: 1m 25s	remaining: 11m 32s
550:	learn: 0.7506092	test: 0.7683711	best: 0.7683903 (549)	total: 1m 25s	remaining: 11m 32s
551:	learn: 0.7506140	test: 0.7682799	best: 0.7683903 (549)	total: 1m 25s	remaining: 11m 32s
552:	learn: 0.7506475	test: 0.7683087	best: 0.7683903 (549)	total: 1m 26s	remaining: 11m 31s
553:	learn: 0.7506763	test: 0.7682799	best: 0.7683903 (549)	total: 1m 26s	remaining: 11m 31s
554:	learn: 0.7507147	test: 0.7683998	best: 0.7683998 (554)	total: 1m 26s	remaining: 11m 31s
555:	learn: 0.7506907	test: 0.7684334	best: 0.7684334 (555)	total: 1m 26s	remaining: 11m 31s
556:	learn: 0.7506380	test: 0.7684478	best: 0.7684478 (556)	total: 1m 26s	remaining: 11m 31s
557:	learn: 0.7507435	test: 0.7684478	best: 0.7684478 (556)	total: 1m 

636:	learn: 0.7525710	test: 0.7696805	best: 0.7696805 (636)	total: 1m 39s	remaining: 11m 23s
637:	learn: 0.7526238	test: 0.7696662	best: 0.7696805 (636)	total: 1m 40s	remaining: 11m 23s
638:	learn: 0.7526909	test: 0.7697093	best: 0.7697093 (638)	total: 1m 40s	remaining: 11m 23s
639:	learn: 0.7526765	test: 0.7697285	best: 0.7697285 (639)	total: 1m 40s	remaining: 11m 23s
640:	learn: 0.7526669	test: 0.7696662	best: 0.7697285 (639)	total: 1m 40s	remaining: 11m 23s
641:	learn: 0.7526429	test: 0.7695942	best: 0.7697285 (639)	total: 1m 40s	remaining: 11m 23s
642:	learn: 0.7526717	test: 0.7696038	best: 0.7697285 (639)	total: 1m 40s	remaining: 11m 22s
643:	learn: 0.7526333	test: 0.7696086	best: 0.7697285 (639)	total: 1m 40s	remaining: 11m 22s
644:	learn: 0.7526094	test: 0.7696038	best: 0.7697285 (639)	total: 1m 41s	remaining: 11m 22s
645:	learn: 0.7525758	test: 0.7697045	best: 0.7697285 (639)	total: 1m 41s	remaining: 11m 22s
646:	learn: 0.7525566	test: 0.7696757	best: 0.7697285 (639)	total: 1m 

726:	learn: 0.7538565	test: 0.7704240	best: 0.7704384 (706)	total: 1m 54s	remaining: 11m 11s
727:	learn: 0.7538277	test: 0.7704480	best: 0.7704480 (727)	total: 1m 54s	remaining: 11m 11s
728:	learn: 0.7538661	test: 0.7705248	best: 0.7705248 (728)	total: 1m 54s	remaining: 11m 11s
729:	learn: 0.7538181	test: 0.7705200	best: 0.7705248 (728)	total: 1m 54s	remaining: 11m 11s
730:	learn: 0.7538037	test: 0.7705679	best: 0.7705679 (730)	total: 1m 54s	remaining: 11m 10s
731:	learn: 0.7537989	test: 0.7705008	best: 0.7705679 (730)	total: 1m 55s	remaining: 11m 10s
732:	learn: 0.7538709	test: 0.7704864	best: 0.7705679 (730)	total: 1m 55s	remaining: 11m 10s
733:	learn: 0.7539428	test: 0.7704528	best: 0.7705679 (730)	total: 1m 55s	remaining: 11m 10s
734:	learn: 0.7539668	test: 0.7704576	best: 0.7705679 (730)	total: 1m 55s	remaining: 11m 9s
735:	learn: 0.7539956	test: 0.7704720	best: 0.7705679 (730)	total: 1m 55s	remaining: 11m 9s
736:	learn: 0.7539956	test: 0.7704528	best: 0.7705679 (730)	total: 1m 55

816:	learn: 0.7550077	test: 0.7717815	best: 0.7719206 (812)	total: 2m 9s	remaining: 11m 3s
817:	learn: 0.7551084	test: 0.7718582	best: 0.7719206 (812)	total: 2m 9s	remaining: 11m 3s
818:	learn: 0.7551084	test: 0.7718486	best: 0.7719206 (812)	total: 2m 10s	remaining: 11m 3s
819:	learn: 0.7551708	test: 0.7719302	best: 0.7719302 (819)	total: 2m 10s	remaining: 11m 3s
820:	learn: 0.7551708	test: 0.7718054	best: 0.7719302 (819)	total: 2m 10s	remaining: 11m 3s
821:	learn: 0.7552283	test: 0.7717959	best: 0.7719302 (819)	total: 2m 10s	remaining: 11m 3s
822:	learn: 0.7552379	test: 0.7718246	best: 0.7719302 (819)	total: 2m 10s	remaining: 11m 3s
823:	learn: 0.7552187	test: 0.7718630	best: 0.7719302 (819)	total: 2m 10s	remaining: 11m 3s
824:	learn: 0.7552571	test: 0.7719493	best: 0.7719493 (824)	total: 2m 11s	remaining: 11m 3s
825:	learn: 0.7553003	test: 0.7719541	best: 0.7719541 (825)	total: 2m 11s	remaining: 11m 3s
826:	learn: 0.7552523	test: 0.7719446	best: 0.7719541 (825)	total: 2m 11s	remainin

906:	learn: 0.7566385	test: 0.7729710	best: 0.7730142 (902)	total: 2m 25s	remaining: 10m 57s
907:	learn: 0.7567153	test: 0.7729758	best: 0.7730142 (902)	total: 2m 25s	remaining: 10m 56s
908:	learn: 0.7566721	test: 0.7729998	best: 0.7730142 (902)	total: 2m 25s	remaining: 10m 56s
909:	learn: 0.7567057	test: 0.7729950	best: 0.7730142 (902)	total: 2m 26s	remaining: 10m 56s
910:	learn: 0.7566577	test: 0.7730286	best: 0.7730286 (910)	total: 2m 26s	remaining: 10m 57s
911:	learn: 0.7566577	test: 0.7730238	best: 0.7730286 (910)	total: 2m 26s	remaining: 10m 56s
912:	learn: 0.7566961	test: 0.7730286	best: 0.7730286 (910)	total: 2m 26s	remaining: 10m 56s
913:	learn: 0.7566673	test: 0.7729950	best: 0.7730286 (910)	total: 2m 26s	remaining: 10m 56s
914:	learn: 0.7567105	test: 0.7730670	best: 0.7730670 (914)	total: 2m 27s	remaining: 10m 56s
915:	learn: 0.7566625	test: 0.7730142	best: 0.7730670 (914)	total: 2m 27s	remaining: 10m 55s
916:	learn: 0.7566721	test: 0.7730238	best: 0.7730670 (914)	total: 2m 

995:	learn: 0.7580679	test: 0.7741126	best: 0.7741126 (990)	total: 2m 40s	remaining: 10m 44s
996:	learn: 0.7580775	test: 0.7741174	best: 0.7741174 (996)	total: 2m 40s	remaining: 10m 44s
997:	learn: 0.7581351	test: 0.7741126	best: 0.7741174 (996)	total: 2m 40s	remaining: 10m 44s
998:	learn: 0.7581782	test: 0.7741606	best: 0.7741606 (998)	total: 2m 40s	remaining: 10m 43s
999:	learn: 0.7581734	test: 0.7741750	best: 0.7741750 (999)	total: 2m 40s	remaining: 10m 43s
1000:	learn: 0.7582118	test: 0.7742325	best: 0.7742325 (1000)	total: 2m 41s	remaining: 10m 43s
1001:	learn: 0.7581591	test: 0.7742709	best: 0.7742709 (1001)	total: 2m 41s	remaining: 10m 43s
1002:	learn: 0.7582214	test: 0.7742805	best: 0.7742805 (1002)	total: 2m 41s	remaining: 10m 43s
1003:	learn: 0.7582166	test: 0.7742613	best: 0.7742805 (1002)	total: 2m 41s	remaining: 10m 43s
1004:	learn: 0.7582358	test: 0.7742613	best: 0.7742805 (1002)	total: 2m 41s	remaining: 10m 43s
1005:	learn: 0.7582550	test: 0.7741846	best: 0.7742805 (1002

1082:	learn: 0.7595405	test: 0.7748369	best: 0.7748609 (1081)	total: 2m 55s	remaining: 10m 34s
1083:	learn: 0.7595405	test: 0.7748609	best: 0.7748609 (1081)	total: 2m 55s	remaining: 10m 34s
1084:	learn: 0.7596076	test: 0.7748945	best: 0.7748945 (1084)	total: 2m 55s	remaining: 10m 34s
1085:	learn: 0.7596172	test: 0.7748801	best: 0.7748945 (1084)	total: 2m 55s	remaining: 10m 34s
1086:	learn: 0.7596652	test: 0.7749041	best: 0.7749041 (1086)	total: 2m 56s	remaining: 10m 34s
1087:	learn: 0.7596604	test: 0.7749089	best: 0.7749089 (1087)	total: 2m 56s	remaining: 10m 33s
1088:	learn: 0.7596364	test: 0.7748705	best: 0.7749089 (1087)	total: 2m 56s	remaining: 10m 33s
1089:	learn: 0.7596316	test: 0.7749185	best: 0.7749185 (1089)	total: 2m 56s	remaining: 10m 33s
1090:	learn: 0.7596316	test: 0.7749520	best: 0.7749520 (1090)	total: 2m 56s	remaining: 10m 33s
1091:	learn: 0.7596748	test: 0.7749616	best: 0.7749616 (1091)	total: 2m 56s	remaining: 10m 33s


In [14]:
model_pred = model.predict(X_test)
f1_score(y_test, model_pred, average='micro')

0.7415820878340784

In [75]:
test_values = pd.read_csv('../../csv/test_values.csv', index_col = "building_id")

In [76]:
test_values_subset = test_values
test_values_subset["geo_level_1_id"] = test_values_subset["geo_level_1_id"].astype("category")

In [77]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    test_values_subset = encode_and_bind(test_values_subset, feature)

In [78]:
preds = model.predict(test_values_subset)

In [79]:
submission_format = pd.read_csv('../../csv/submission_format.csv', index_col = "building_id")

In [80]:
my_submission = pd.DataFrame(data=preds,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [81]:
my_submission.to_csv('../../csv/predictions/jf/cat-boost/jf-model-2-2-submission.csv')