# Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

from catboost import CatBoostClassifier, Pool 

In [3]:
# --- Configuration ---------------------------------------------------------
# Random seed constant (not referenced anywhere else in this cell).
SEED = 42
# Path of the cleaned test-set features; read as CSV below.
TESTING_FEATURES = "../data/interim/testing_features_clean_11-08-2021 08:53:44"
# Path of the saved model that is loaded in the CatBoost section.
MODEL = "../models/catboost_model 12-08-2021 17:40:39"

# --- Load data -------------------------------------------------------------
testing_features = pd.read_csv(filepath_or_buffer=TESTING_FEATURES)

# Feature Reduction

In [4]:
# Columns removed from the test features before prediction, each annotated
# with the reason it is dropped.
features_to_reduce = [
    "Unnamed: 0",             # NOTE(review): presumably a leftover index column from a CSV export - confirm
    "id",                     # unique to each well
    "date_recorded",          # superseded by years_elapsed
    "wpt_name",               # large text feature; dropped to speed up testing
    "num_private",            # undocumented feature with few values
    "subvillage",             # large text feature; dropped to speed up testing
    "region",                 # exact duplicate of region_code
    "ward",                   # large text feature; dropped to speed up testing
    "public_meeting",         # TODO: reason for dropping was never recorded
    "recorded_by",            # constant across all observations
    "scheme_name",            # too many values that are hard to impute
    "scheme_management",      # dropping it raised accuracy by .01%
    "permit",                 # dropped; apparently no effect on accuracy (unconfirmed)
    "construction_year",      # superseded by years_elapsed
    "extraction_type_group",  # coarser version of extraction
    "extraction_type_class",  # coarser version of extraction
    "management_group",       # coarser version of management
    "payment_type",           # duplicate of payment
    # NOTE: 'water_quality' is intentionally absent from this list - its entry
    # was commented out as an experiment, so the column is kept.
    "quality_group",          # coarser version of water_quality
    "quantity_group",         # exact duplicate of quantity
    "source_type",            # coarser version of source
    "source_class",           # coarser version of source
    "waterpoint_type_group",  # coarser version of waterpoint_type
]

# Produce a new frame without those columns; testing_features itself is left
# untouched because the submission cell still needs its 'id' column.
reduced_testing_features = testing_features.drop(features_to_reduce, axis="columns")

In [5]:
# Print a summary of the reduced frame (column names, non-null counts, dtypes)
# to check which features remain after the drop.
reduced_testing_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   amount_tsh       14850 non-null  float64
 1   funder           14850 non-null  object 
 2   gps_height       14850 non-null  float64
 3   installer        14850 non-null  object 
 4   longitude        14850 non-null  float64
 5   latitude         14850 non-null  float64
 6   basin            14850 non-null  object 
 7   region_code      14850 non-null  int64  
 8   district_code    14850 non-null  int64  
 9   lga              14850 non-null  object 
 10  population       14850 non-null  float64
 11  extraction_type  14850 non-null  object 
 12  management       14850 non-null  object 
 13  payment          14850 non-null  object 
 14  water_quality    14850 non-null  object 
 15  quantity         14850 non-null  object 
 16  source           14850 non-null  object 
 17  waterpoint_t

In [6]:
# Display the single row at positional index 1235 of the reduced frame.
reduced_testing_features.iloc[1235]

amount_tsh                   250.0
funder                       tasaf
gps_height                  1191.0
installer           local contract
longitude                31.851166
latitude                 -1.046189
basin                lake victoria
region_code                     18
district_code                    2
lga                   bukoba rural
population                   300.0
extraction_type            gravity
management                     vwc
payment                  never pay
water_quality                 soft
quantity                    enough
source                      spring
waterpoint_type    improved spring
years_elapsed                   10
Name: 1235, dtype: object

# CatBoost

In [7]:
# Create a classifier with default settings, then load the saved model found
# at the MODEL path into it.
clf = CatBoostClassifier()
# NOTE(review): load_model presumably returns the classifier itself, in which
# case model_from_file and clf are the same object - confirm in the CatBoost docs.
model_from_file = clf.load_model(MODEL)

In [8]:
# Predict a class for every row of the reduced test features.
preds_class = model_from_file.predict(reduced_testing_features)

# Keep only element 0 of each prediction entry.
# NOTE(review): this assumes predict() returns one single-element entry per
# row (e.g. shape (n_rows, 1)) - confirm against the CatBoost docs.
preds_list = [
    prediction[0]
    for prediction in preds_class
]

# Convert the flat list of labels into a NumPy array and display it.
preds_array = np.array(preds_list)
preds_array

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype='<U23')

In [9]:
# Build the submission table: one predicted status_group per well id.
#
# The predictions are passed as a plain array so that they are paired with the
# ids strictly by position. Wrapping them in a fresh pd.Series (which carries
# its own 0..n-1 index) would make pandas align on index labels instead, and
# that silently yields NaN / misplaced rows if testing_features ever has a
# non-default index or the lengths differ.
assert len(preds_array) == len(testing_features), \
    "number of predictions does not match the number of test rows"
submission = pd.DataFrame(
    {'id': testing_features['id'], 'status_group': preds_array}
).set_index('id')  # chained instead of inplace=True; 'id' becomes the index

# Write the submission under a timestamped file name.
# NOTE(review): the name has no .csv extension and contains a space and colons
# (colons are not valid in Windows file names). It is left unchanged here to
# match the naming of the other timestamped files this notebook reads.
now = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
submission.to_csv('../submissions/catboost_submission_{}'.format(now))