# Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from catboost import CatBoostClassifier, Pool 

In [2]:
CLEAN_TRAIN_FEATURES = '../data/processed/train_features_clean_18-08-2021 10:42:11'
TRAIN_LABELS = '../data/external/pump-it-up-training-labels.csv'
SEED = 42

clean_train_features = pd.read_csv(CLEAN_TRAIN_FEATURES)
train_labels = pd.read_csv(TRAIN_LABELS)

# Feature Reduction

In [3]:
features_to_reduce = [
'Unnamed: 0', # artifact from importing from file
'id', # different for every well
'date_recorded', # redundant with years_elapsed synthetic feature
'wpt_name', # drop results in no change in accuracy
'num_private', # litte variation in feature values
'subvillage', # ward and lga capture similar info with greater feature importance and lower training runtimes
'region', # redundant, same as region_code
'public_meeting', # drop results in no change in accuracy
'recorded_by', # same for every observation
'scheme_name', # drop results in no change in accuracy
'scheme_management', # drop results in increase in accuracy
'permit', # drop results in no change in accuracy
'construction_year', # redundant with years_elapsed synthetic feature
'extraction_type_group', # similar to extraction but with less variation
'extraction_type_class', # similar to extraction but with less variation
'management_group', # similar to management but with less variation
'payment_type', # same as payment
'water_quality', # drop results in no change in accuracy
'quality_group', # drop results in no change in accuracy
'quantity_group', # same as quantity
'source_type', # similar to source but with less variation
'source_class', # similar to source but with less variation
'waterpoint_type_group' # similar to waterpoint_type but with less variation
]
reduced_clean_train_features = clean_train_features.drop(columns=features_to_reduce)

In [4]:
reduced_clean_train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   amount_tsh       59400 non-null  float64
 1   funder           59400 non-null  object 
 2   gps_height       59400 non-null  float64
 3   installer        59400 non-null  object 
 4   longitude        59400 non-null  float64
 5   latitude         59400 non-null  float64
 6   basin            59400 non-null  object 
 7   region_code      59400 non-null  int64  
 8   district_code    59400 non-null  int64  
 9   lga              59400 non-null  object 
 10  ward             59400 non-null  object 
 11  population       59400 non-null  int64  
 12  extraction_type  59400 non-null  object 
 13  management       59400 non-null  object 
 14  payment          59400 non-null  object 
 15  quantity         59400 non-null  object 
 16  source           59400 non-null  object 
 17  waterpoint_t

In [5]:
cat_features = ['basin', 
                'extraction_type', 
                'management', 
                'payment', 
                'quantity', 
                'source', 
                'waterpoint_type']

text_features= ['funder', 
                'installer',
                'lga',
                'ward']

train_data = reduced_clean_train_features

train_labels = train_labels['status_group']

test_data = catboost_pool = Pool(train_data, 
                                 label=train_labels,
                                 cat_features=cat_features,
                                 text_features=text_features)

In [6]:
"""optional CatBoostClassifier params: cat_features=cat_features,
text_features=text_features, max_ctr_complexity=5, iterations=10000,
eval_metric='AUC', od_type='Iter', od_wait=500
"""

model = CatBoostClassifier(cat_features=cat_features,
                           text_features=text_features)

model.fit(train_data, train_labels, verbose=False)

preds_class = model.predict(test_data)

In [7]:
labels_array = train_labels.values
preds_list = [label[0] for label in preds_class]
preds_array = np.array(preds_list)
print('Accuracy:', np.sum(preds_array == labels_array)/labels_array.size)

Accuracy: 0.7947811447811448


In [8]:
feature_importance = model.get_feature_importance().tolist()
feature_names = reduced_clean_train_features.columns.tolist()

for i in range(len(feature_names)):
    print(feature_names[i], feature_importance[i])

amount_tsh 1.658094646188841
funder 4.304082939804251
gps_height 2.3528139062040254
installer 3.1189273609730948
longitude 3.722180736461092
latitude 2.7944659058329266
basin 1.9701171225772716
region_code 1.661950655880465
district_code 1.9844223188213845
lga 10.719882430422798
ward 5.421317213450845
population 2.1337352228876103
extraction_type 7.7369354475768
management 1.9235691315887076
payment 6.002150061554366
quantity 21.89618763134418
source 8.035001594885415
waterpoint_type 7.4981305738832065
years_elapsed 5.066035099662801


In [9]:
now = datetime.now().strftime("%d-%m-%Y %Hhr%Mmin%Ssec")
model.save_model('../models/catboost_model {}'.format(now))