# Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from catboost import CatBoostClassifier, Pool 

In [4]:
# Notebook configuration: seed plus the on-disk locations of the inputs.
SEED = 42  # NOTE(review): not referenced by any cell visible in this notebook
MODEL = '../models/catboost_model 19-08-2021 11hr57min52sec'
CLEAN_TEST_FEATURES = '../data/processed/test_features_clean_19-08-2021 11:20:07'

# Read the cleaned test features (CSV content despite the extension-less name).
clean_test_features = pd.read_csv(filepath_or_buffer=CLEAN_TEST_FEATURES)

# Feature Reduction

In [5]:
# Columns removed before inference; each entry notes why it is dropped.
features_to_reduce = [
    'Unnamed: 0',             # artifact from importing from file
    'id',                     # different for every well
    'date_recorded',          # redundant with the synthetic years_elapsed feature
    'wpt_name',               # dropping it results in no change in accuracy
    'num_private',            # little variation in feature values
    'subvillage',             # ward and lga capture similar info with greater
                              # feature importance and lower training runtimes
    'region',                 # redundant, same as region_code
    'public_meeting',         # dropping it results in no change in accuracy
    'recorded_by',            # same for every observation
    'scheme_name',            # dropping it results in no change in accuracy
    'scheme_management',      # dropping it results in an increase in accuracy
    'permit',                 # dropping it results in no change in accuracy
    'construction_year',      # redundant with the synthetic years_elapsed feature
    'extraction_type_group',  # similar to extraction but with less variation
    'extraction_type_class',  # similar to extraction but with less variation
    'management_group',       # similar to management but with less variation
    'payment_type',           # same as payment
    'water_quality',          # dropping it results in no change in accuracy
    'quality_group',          # dropping it results in no change in accuracy
    'quantity_group',         # same as quantity
    'source_type',            # similar to source but with less variation
    'source_class',           # similar to source but with less variation
    'waterpoint_type_group',  # similar to waterpoint_type but with less variation
]

# drop() returns a new frame, so clean_test_features (including its 'id'
# column) stays available for building the submission later on.
reduced_clean_test_features = clean_test_features.drop(
    labels=features_to_reduce,
    axis='columns',
)

In [8]:
# Print the column names, non-null counts and dtypes of the reduced frame
# to confirm which features remain after the drop.
reduced_clean_test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   amount_tsh       14850 non-null  float64
 1   funder           14850 non-null  object 
 2   gps_height       14850 non-null  float64
 3   installer        14850 non-null  object 
 4   longitude        14850 non-null  float64
 5   latitude         14850 non-null  float64
 6   basin            14850 non-null  object 
 7   region_code      14850 non-null  int64  
 8   district_code    14850 non-null  int64  
 9   lga              14850 non-null  object 
 10  ward             14850 non-null  object 
 11  population       14850 non-null  int64  
 12  extraction_type  14850 non-null  object 
 13  management       14850 non-null  object 
 14  payment          14850 non-null  object 
 15  quantity         14850 non-null  object 
 16  source           14850 non-null  object 
 17  waterpoint_t

In [10]:
# Spot-check a single observation: the row at integer position 1235.
reduced_clean_test_features.iloc[1235]

amount_tsh                   250.0
funder                       tasaf
gps_height                  1191.0
installer           local contract
longitude                31.851166
latitude                 -1.046189
basin                lake victoria
region_code                     18
district_code                    2
lga                   bukoba rural
ward                        rubafu
population                     300
extraction_type            gravity
management                     vwc
payment                  never pay
quantity                    enough
source                      spring
waterpoint_type    improved spring
years_elapsed                   10
Name: 1235, dtype: object

# CatBoost

In [11]:
# Create an empty classifier and populate it from the serialized model at MODEL.
clf = CatBoostClassifier()
# NOTE(review): load_model's return value is kept as the fitted model handle;
# it is presumably the same object as clf — confirm against the CatBoost docs.
model_from_file = clf.load_model(fname=MODEL)

In [12]:
# Predict a class label for every row of the reduced test features.
preds_class = model_from_file.predict(reduced_clean_test_features)

# Each prediction is indexed with [0] to pull out the bare label
# (assumes predict() yields one single-element row per sample — TODO confirm).
preds_list = [row[0] for row in preds_class]

# Collect the labels into a flat NumPy array and display it.
preds_array = np.asarray(preds_list)
preds_array

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype='<U23')

In [15]:
# Pair each original well id with its predicted status, then promote 'id'
# to the index so it is written out as the leading column of the CSV.
submission = pd.DataFrame(
    {
        'id': clean_test_features['id'],
        'status_group': pd.Series(preds_array),
    }
).set_index('id')

In [16]:
# Stamp the output file name with the current local time (to the second).
now = datetime.now().strftime("%d-%m-%Y %Hhr%Mmin%Ssec")
submission_path = '../submissions/catboost_submission_{}'.format(now)

# Write the predictions as CSV; the 'id' index becomes the first column.
submission.to_csv(submission_path)