# Reduce Features and Train Model

# Imports

In [53]:
import pandas as pd
import numpy as np
from datetime import datetime
from catboost import CatBoostClassifier, Pool 

In [54]:
CLEAN_TRAIN_FEATURES = '../data/processed/train_features_clean_18-08-2021 10:42:11'
TRAIN_LABELS = '../data/external/pump-it-up-training-labels.csv'
SEED = 42

clean_train_features = pd.read_csv(CLEAN_TRAIN_FEATURES)
train_labels = pd.read_csv(TRAIN_LABELS)

# Feature Reduction

In [55]:
# trying dropping amount_tsh, region code, district_code, basin

features_to_reduce = [
'Unnamed: 0', # artifact from importing from file
'id', # different for every well
'date_recorded', # redundant with years_elapsed synthetic feature
'wpt_name', # drop results in no change in accuracy
'num_private', # litte variation in feature values
'subvillage', # ward and lga capture similar info with greater feature importance and lower training runtimes
'region', # redundant, same as region_code
'public_meeting', # drop results in no change in accuracy
'recorded_by', # same for every observation
'scheme_name', # drop results in no change in accuracy
'scheme_management', # drop results in increase in accuracy
'permit', # drop results in no change in accuracy
'construction_year', # redundant with years_elapsed synthetic feature
'extraction_type_group', # similar to extraction but with less variation
'extraction_type_class', # similar to extraction but with less variation
'management_group', # similar to management but with less variation
'payment_type', # same as payment
'water_quality', # drop results in no change in accuracy
'quality_group', # drop results in no change in accuracy
'quantity_group', # same as quantity
'source_type', # similar to source but with less variation
'source_class', # similar to source but with less variation
'waterpoint_type_group' # similar to waterpoint_type but with less variation
]
reduced_clean_train_features = clean_train_features.drop(columns=features_to_reduce)

In [56]:
reduced_clean_train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   funder           59400 non-null  object 
 1   gps_height       59400 non-null  float64
 2   installer        59400 non-null  object 
 3   longitude        59400 non-null  float64
 4   latitude         59400 non-null  float64
 5   lga              59400 non-null  object 
 6   ward             59400 non-null  object 
 7   population       59400 non-null  int64  
 8   extraction_type  59400 non-null  object 
 9   management       59400 non-null  object 
 10  payment          59400 non-null  object 
 11  quantity         59400 non-null  object 
 12  source           59400 non-null  object 
 13  waterpoint_type  59400 non-null  object 
 14  years_elapsed    59400 non-null  int64  
dtypes: float64(3), int64(2), object(10)
memory usage: 6.8+ MB


In [57]:
cat_features = ['basin', 
                'extraction_type', 
                'management', 
                'payment', 
                'quantity', 
                'source', 
                'waterpoint_type']

text_features= ['funder', 
                'installer',
                'lga',
                'ward']

train_data = reduced_clean_train_features

train_labels = train_labels['status_group']

test_data = catboost_pool = Pool(train_data, 
                                 label=train_labels,
                                 cat_features=cat_features,
                                 text_features=text_features)

In [58]:
"""optional CatBoostClassifier params: cat_features=cat_features,
text_features=text_features, max_ctr_complexity=5, iterations=10000,
eval_metric='AUC', od_type='Iter', od_wait=500
"""

model = CatBoostClassifier(cat_features=cat_features,
                           text_features=text_features)

model.fit(train_data, train_labels, verbose=False)

preds_class = model.predict(test_data)

In [59]:
labels_array = train_labels.values
preds_list = [label[0] for label in preds_class]
preds_array = np.array(preds_list)
print('Accuracy:', np.sum(preds_array == labels_array)/labels_array.size)

Accuracy: 0.793956228956229


In [60]:
feature_importance = model.get_feature_importance().tolist()
feature_names = reduced_clean_train_features.columns.tolist()

for i in range(len(feature_names)):
    print(feature_names[i], feature_importance[i])

funder 4.178941993308041
gps_height 2.574655285081021
installer 3.7440952844433952
longitude 5.40341291371409
latitude 4.13736958378615
lga 13.454370759673766
ward 5.777419001351448
population 1.7779578592011192
extraction_type 7.940770441689969
management 1.9162752222266775
payment 6.637703344831527
quantity 21.2581265762441
source 8.052754745448341
waterpoint_type 7.498998104666096
years_elapsed 5.647148884334419


In [34]:
now = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
model.save_model('../models/catboost_model {}'.format(now))