# CAT BOOST

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

In [2]:
labels = pd.read_csv('../../csv/train_labels.csv')
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [3]:
values = pd.read_csv('../../csv/train_values.csv')
values.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260591,260592,260593,260594,260595,260596,260597,260598,260599,260600
building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500,...,560805,207683,226421,159555,827012,688636,669485,602512,151409,747594
geo_level_1_id,6,8,21,22,11,8,9,20,0,26,...,20,10,8,27,8,25,17,17,26,21
geo_level_2_id,487,900,363,418,131,558,475,323,757,886,...,368,1382,767,181,268,1335,715,51,39,9
geo_level_3_id,12198,2812,8973,10694,1488,6089,12066,12236,7219,994,...,5980,1903,8613,1537,4718,1621,2060,8163,1851,9101
count_floors_pre_eq,2,2,2,2,3,2,2,2,2,1,...,1,2,2,6,2,1,2,3,2,3
age,30,10,10,10,30,10,25,0,15,0,...,25,25,5,0,20,55,0,55,10,10
area_percentage,6,8,5,6,8,9,3,8,8,13,...,5,5,13,13,8,6,6,6,14,7
height_percentage,5,7,5,5,9,5,4,6,6,4,...,3,5,5,12,5,3,5,7,6,6
land_surface_condition,t,o,t,t,t,t,n,t,t,t,...,n,t,t,t,t,n,t,t,t,n
foundation_type,r,r,r,r,r,r,r,w,r,i,...,r,r,r,r,r,r,r,r,r,r


In [4]:
values["building_id"].count() == values["building_id"].drop_duplicates().count()

True

In [5]:
to_be_categorized = ["land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for row in to_be_categorized:
    values[row] = values[row].astype("category")

In [6]:
datatypes = dict(values.dtypes)
for row in values.columns:
    if datatypes[row] != "int64" and datatypes[row] != "int32" and \
       datatypes[row] != "int16" and datatypes[row] != "int8":
        continue
    if values[row].nlargest(1).item() > 32767 and values[row].nlargest(1).item() < 2**31:
        values[row] = values[row].astype(np.int32)
    elif values[row].nlargest(1).item() > 127:
        values[row] = values[row].astype(np.int16)
    else:
        values[row] = values[row].astype(np.int8)

In [7]:
labels["building_id"] = labels["building_id"].astype(np.int32)
labels["damage_grade"] = labels["damage_grade"].astype(np.int8)

In [8]:
important_values = values\
                .merge(labels, on="building_id")
important_values.drop(columns=["building_id"], inplace = True)
important_values["geo_level_1_id"] = important_values["geo_level_1_id"].astype("category")
important_values

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,0,0,0,0,0,0,0,0,0,2
260597,17,715,2060,2,0,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
260598,17,51,8163,3,55,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,3
260599,26,39,1851,2,10,14,6,t,r,x,...,0,0,0,0,0,0,0,0,0,2


In [9]:
X_train, X_test, y_train, y_test = train_test_split(important_values.drop(columns = 'damage_grade'),
                                                    important_values['damage_grade'], test_size = 0.2, random_state = 123)

In [10]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

features_to_encode = ["geo_level_1_id", "land_surface_condition", "foundation_type", "roof_type",\
                     "position", "ground_floor_type", "other_floor_type",\
                     "plan_configuration", "legal_ownership_status"]
for feature in features_to_encode:
    X_train = encode_and_bind(X_train, feature)
    X_test = encode_and_bind(X_test, feature)

In [11]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
from catboost import CatBoostClassifier

In [None]:
import time

print(time.gmtime())
i = 1
df = pd.DataFrame({'subsample': [],
                   'max_depth': [],
                   'learning_rate': [],
                   'score': []})
for n_estimators in [1600, 1700, 1800]:
    for max_depth in [11, 16, None]:
        for learning_rate in [0.14, 0.15, 0.16]:
            model = CatBoostClassifier(n_estimators = n_estimators,
                            max_depth = max_depth,
                           learning_rate = learning_rate,
                            verbose=True)
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            
            score = f1_score(y_test, y_preds, average = 'micro')
            
            df = df.append(pd.Series(
                data={'n_estimators': n_estimators,
                      'max_depth': max_depth,
                      'learning_rate': learning_rate,
                      'score': score},
            name = i))
            print(i, time.gmtime())
            i += 1

df = df.sort_values('score', ascending = False)
df

time.struct_time(tm_year=2021, tm_mon=7, tm_mday=15, tm_hour=3, tm_min=21, tm_sec=34, tm_wday=3, tm_yday=196, tm_isdst=0)
0:	learn: 1.0163680	total: 495ms	remaining: 13m 11s
1:	learn: 0.9569423	total: 886ms	remaining: 11m 47s
2:	learn: 0.9125987	total: 1.28s	remaining: 11m 21s
3:	learn: 0.8771151	total: 1.67s	remaining: 11m 8s
4:	learn: 0.8488214	total: 2.07s	remaining: 11m
5:	learn: 0.8250601	total: 2.5s	remaining: 11m 4s
6:	learn: 0.8043154	total: 2.91s	remaining: 11m 1s
7:	learn: 0.7879719	total: 3.32s	remaining: 11m
8:	learn: 0.7752173	total: 3.73s	remaining: 10m 59s
9:	learn: 0.7635107	total: 4.14s	remaining: 10m 57s
10:	learn: 0.7538627	total: 4.59s	remaining: 11m 2s
11:	learn: 0.7454367	total: 4.98s	remaining: 10m 59s
12:	learn: 0.7380199	total: 5.4s	remaining: 10m 59s
13:	learn: 0.7316375	total: 5.8s	remaining: 10m 56s
14:	learn: 0.7249791	total: 6.21s	remaining: 10m 55s
15:	learn: 0.7203790	total: 6.62s	remaining: 10m 54s
16:	learn: 0.7157801	total: 7.03s	remaining: 10m 54s
17

153:	learn: 0.5915797	total: 1m 7s	remaining: 10m 30s
154:	learn: 0.5912720	total: 1m 7s	remaining: 10m 30s
155:	learn: 0.5907043	total: 1m 8s	remaining: 10m 29s
156:	learn: 0.5902948	total: 1m 8s	remaining: 10m 28s
157:	learn: 0.5897098	total: 1m 8s	remaining: 10m 28s
158:	learn: 0.5892499	total: 1m 9s	remaining: 10m 27s
159:	learn: 0.5889144	total: 1m 9s	remaining: 10m 26s
160:	learn: 0.5884353	total: 1m 10s	remaining: 10m 26s
161:	learn: 0.5880200	total: 1m 10s	remaining: 10m 25s
162:	learn: 0.5877203	total: 1m 10s	remaining: 10m 25s
163:	learn: 0.5872040	total: 1m 11s	remaining: 10m 24s
164:	learn: 0.5867933	total: 1m 11s	remaining: 10m 24s
165:	learn: 0.5862644	total: 1m 12s	remaining: 10m 24s
166:	learn: 0.5858947	total: 1m 12s	remaining: 10m 23s
167:	learn: 0.5854885	total: 1m 13s	remaining: 10m 23s
168:	learn: 0.5849935	total: 1m 13s	remaining: 10m 23s
169:	learn: 0.5846113	total: 1m 14s	remaining: 10m 24s
170:	learn: 0.5840451	total: 1m 14s	remaining: 10m 23s
171:	learn: 0.583

305:	learn: 0.5381125	total: 2m 12s	remaining: 9m 20s
306:	learn: 0.5376939	total: 2m 13s	remaining: 9m 20s
307:	learn: 0.5374572	total: 2m 13s	remaining: 9m 19s
308:	learn: 0.5370640	total: 2m 13s	remaining: 9m 19s
309:	learn: 0.5367236	total: 2m 14s	remaining: 9m 18s
310:	learn: 0.5365021	total: 2m 14s	remaining: 9m 18s
311:	learn: 0.5362086	total: 2m 15s	remaining: 9m 17s
312:	learn: 0.5358888	total: 2m 15s	remaining: 9m 17s
313:	learn: 0.5355635	total: 2m 15s	remaining: 9m 16s
314:	learn: 0.5352442	total: 2m 16s	remaining: 9m 16s
315:	learn: 0.5349795	total: 2m 16s	remaining: 9m 15s
316:	learn: 0.5346345	total: 2m 17s	remaining: 9m 15s
317:	learn: 0.5343114	total: 2m 17s	remaining: 9m 14s
318:	learn: 0.5339851	total: 2m 18s	remaining: 9m 14s
319:	learn: 0.5336920	total: 2m 18s	remaining: 9m 13s
320:	learn: 0.5333993	total: 2m 18s	remaining: 9m 13s
321:	learn: 0.5331006	total: 2m 19s	remaining: 9m 12s
322:	learn: 0.5328753	total: 2m 19s	remaining: 9m 12s
323:	learn: 0.5325406	total:

458:	learn: 0.4979430	total: 3m 14s	remaining: 8m 4s
459:	learn: 0.4977519	total: 3m 15s	remaining: 8m 3s
460:	learn: 0.4976165	total: 3m 15s	remaining: 8m 3s
461:	learn: 0.4974992	total: 3m 16s	remaining: 8m 2s
462:	learn: 0.4973248	total: 3m 16s	remaining: 8m 2s
463:	learn: 0.4970312	total: 3m 16s	remaining: 8m 2s
464:	learn: 0.4968917	total: 3m 17s	remaining: 8m 1s
465:	learn: 0.4966392	total: 3m 17s	remaining: 8m 1s
466:	learn: 0.4964707	total: 3m 18s	remaining: 8m
467:	learn: 0.4961372	total: 3m 18s	remaining: 8m
468:	learn: 0.4958488	total: 3m 18s	remaining: 7m 59s
469:	learn: 0.4956418	total: 3m 19s	remaining: 7m 59s
470:	learn: 0.4953837	total: 3m 19s	remaining: 7m 58s
471:	learn: 0.4952653	total: 3m 20s	remaining: 7m 58s
472:	learn: 0.4950402	total: 3m 20s	remaining: 7m 57s
473:	learn: 0.4947619	total: 3m 20s	remaining: 7m 57s
474:	learn: 0.4945392	total: 3m 21s	remaining: 7m 56s
475:	learn: 0.4943478	total: 3m 21s	remaining: 7m 56s
476:	learn: 0.4939723	total: 3m 22s	remainin

In [None]:
df.to_csv('gs1.csv')