In [12]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, PredefinedSplit, train_test_split
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import optuna


from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
plt.style.use('seaborn')

import os
from os import listdir
from os.path import isfile, join
import shutil

In [13]:
# Folder names used by the notebook.
SRC_PATH = 'data/'
DEST_PATH = 'submission/'


def _read_indexed(file_name):
    """Read a parquet file from SRC_PATH and index it by building_id."""
    frame = pd.read_parquet(SRC_PATH + file_name)
    return frame.set_index('building_id')


X_train = _read_indexed('train_values.parquet')
y_train = _read_indexed('train_labels.parquet')
X_test = _read_indexed('test_values.parquet')
submission = _read_indexed('submission_format.parquet')

# Training features and labels side by side, joined on the building_id index.
df = X_train.join(y_train)
df.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3
590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,2
201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,3


In [14]:
# Numeric columns that are NOT passed to CatBoost as categorical features.
not_cat = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]
# Subset of the categorical columns that hold string codes.
str_cat = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
]
# Every remaining training column is treated as categorical.
cat = X_train.drop(columns=not_cat).columns.tolist()
cat

['geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_stone_flag',
 'has_superstructure_cement_mortar_stone',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_bamboo',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'has_superstructure_other',
 'legal_ownership_status',
 'has_secondary_use',
 'has_secondary_use_agriculture',
 'has_secondary_use_hotel',
 'has_secondary_use_rental',
 'has_secondary_use_institution',
 'has_secondary_use_school',
 'has_secondary_use_industry',
 'has_secondary_use_health_post',
 'has_secondary_use_gov_office',
 'has_secondary_use_use_police',
 'has_secondary_use_other']

## Stratification

In [15]:
# 0/1 indicator columns later combined into the stratification label:
# age_0 marks rows whose age equals 0, family_1 marks rows with exactly one family.
df['age_0'] = df['age'].eq(0).astype('int64')
df['family_1'] = df['count_families'].eq(1).astype('int64')

In [16]:
# Combine the two indicator columns and the target into one string key per row
# (e.g. '0_1_3'), then turn each distinct key into an integer stratum code.
strat_columns = ['age_0', 'family_1', 'damage_grade']
strat_keys = df[strat_columns].astype('str').apply('_'.join, axis=1)
strat_codes = pd.factorize(strat_keys)[0]

# Keep the codes aligned with the rows of df.
strat = pd.Series(strat_codes, index=df.index)

# Create the cross-validation folds as a predefined split
def make_folds(X, strat, n_splits):
    """Assign every row of X to one stratified test fold.

    Parameters
    ----------
    X : array-like
        Training data; passed to the splitter and used for its length.
    strat : array-like
        Per-row stratum labels that the folds are stratified on.
    n_splits : int
        Number of folds to create.

    Returns
    -------
    PredefinedSplit
        Splitter whose i-th test fold is the i-th StratifiedKFold test set
        (shuffled with random_state=42).
    """
    strat_folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # 1-D fold assignment, pre-filled with -1: PredefinedSplit treats -1 as
    # "never in a test fold", so an unassigned row could not silently carry
    # uninitialised memory as its fold id.
    cv_fold = np.full(len(X), -1, dtype=np.int32)

    for i, (_, test_index) in enumerate(strat_folds.split(X, strat)):
        cv_fold[test_index] = i

    return PredefinedSplit(cv_fold)

# Three stratified folds over the training rows, reused by every tuning trial.
cv_splits = make_folds(X=X_train, strat=strat, n_splits=3)

## Optuna Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: mean micro-F1 of a CatBoost classifier over cv_splits.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold 'f1_micro' scores from cross_val_score.
    """
    parameters = {
        'depth': trial.suggest_int('depth', 2, 12),
        # suggest_int is documented with integer bounds, so the upper limit is
        # written as 5000 rather than the float literal 5e3.
        'iterations': trial.suggest_int('iterations', 10, 5000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 20e-3, 70e-3),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 3, 9),
        'border_count': trial.suggest_int('border_count', 11, 17),
    }

    cat_clf = CatBoostClassifier(eval_metric='TotalF1', task_type="GPU", cat_features=cat, silent=True, random_seed=42, **parameters)
    fold_scores = cross_val_score(cat_clf, X_train, y_train, cv=cv_splits, scoring='f1_micro', verbose=1)
    return fold_scores.mean()

# Seed the sampler so a fresh kernel proposes the same sequence of trials;
# TPESampler is Optuna's default single-objective sampler, so only the seed is new.
# NOTE(review): GPU training may still add run-to-run noise to the scores.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=2)

[32m[I 2022-08-05 19:40:46,447][0m A new study created in memory with name: no-name-8b0707ce-462a-4347-ab59-a51920096ee2[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


## Re-train model on best parameters
Fit the model on the whole training set using the best parameters found during tuning, then predict the test values and generate the submission file.

In [None]:
# Refit on the full training set with the best hyperparameters found by the study.
best_params = study.best_trial.params
cat_clf = CatBoostClassifier(eval_metric='TotalF1', task_type="GPU", cat_features=cat, silent=True, random_seed=42, **best_params)
cat_clf.fit(X_train, y_train)

# predict() may return a column-shaped (n, 1) array for multiclass models;
# flatten it so the column assignment is always 1-D.
submission['damage_grade'] = np.ravel(cat_clf.predict(X_test))

# Write the submission to the output folder (DEST_PATH) instead of the input
# data folder, creating the folder first if it does not exist yet.
os.makedirs(DEST_PATH, exist_ok=True)
submission.to_csv(DEST_PATH + 'fine_tuned_catboost_strat2.csv')