# Imports

In [1]:
import pickle
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier 

In [2]:
# Notebook-level settings.
# SEED is not referenced by any cell visible in this notebook; kept for parity
# with the rest of the project.
SEED = 42
# Relative path of the CSV file holding the test-set features.
TESTING_FEATURES = '../data/interim/testing_features_clean_11-08-2021 08:53:44'

# Read the test-set features into a DataFrame.
testing_features = pd.read_csv(filepath_or_buffer=TESTING_FEATURES)

# Feature Reduction

In [3]:
# Columns removed from the test features before prediction.
# Each entry carries the author's recorded reason for dropping it.
features_to_reduce = [
    'Unnamed: 0',             # presumably a leftover index column from an earlier CSV export - TODO confirm
    'id',                     # unique per well, carries no signal
    'date_recorded',          # redundant now that years_elapsed exists
    'wpt_name',               # large text feature; dropped to improve testing time
    'num_private',            # undocumented feature with few values
    'subvillage',             # large text feature; dropped to improve testing time
    'region',                 # exact duplicate of region_code
    'ward',                   # large text feature; dropped to improve testing time
    'public_meeting',         # no reason recorded - TODO document why this is dropped
    'recorded_by',            # identical for every observation
    'scheme_name',            # too many values that are hard to impute
    'scheme_management',      # dropping it gave a .01% accuracy increase
    'permit',                 # dropping it did not appear to affect accuracy
    'construction_year',      # redundant now that years_elapsed exists
    'extraction_type_group',  # coarser version of extraction
    'extraction_type_class',  # coarser version of extraction
    'management_group',       # coarser version of management
    'payment_type',           # same as payment
    'water_quality',          # removed experimentally to see the effect
    'quality_group',          # coarser version of water_quality
    'quantity_group',         # exact duplicate of quantity
    'source_type',            # coarser version of source
    'source_class',           # coarser version of source
    'waterpoint_type_group',  # coarser version of waterpoint_type
]

# Build a new frame without those columns; testing_features itself is left intact.
reduced_testing_features = testing_features.drop(features_to_reduce, axis='columns')

In [4]:
# Summarise the columns left after the drop: names, non-null counts and dtypes.
reduced_testing_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   amount_tsh       14850 non-null  float64
 1   funder           14850 non-null  object 
 2   gps_height       14850 non-null  float64
 3   installer        14850 non-null  object 
 4   longitude        14850 non-null  float64
 5   latitude         14850 non-null  float64
 6   basin            14850 non-null  object 
 7   region_code      14850 non-null  int64  
 8   district_code    14850 non-null  int64  
 9   lga              14850 non-null  object 
 10  population       14850 non-null  float64
 11  extraction_type  14850 non-null  object 
 12  management       14850 non-null  object 
 13  payment          14850 non-null  object 
 14  quantity         14850 non-null  object 
 15  source           14850 non-null  object 
 16  waterpoint_type  14850 non-null  object 
 17  years_elapse

# Feature Processing

## Factorize categorical features

In [5]:
# Replace every object-dtype column with integer codes, in place.
# NOTE(review): factorize numbers the categories by order of first appearance in
# *this* frame, so the codes are derived from the test data alone. Whether they
# match the encoding the model was trained on cannot be told from this notebook
# - TODO confirm against the training pipeline.
object_features = list(reduced_testing_features.select_dtypes(include=["object"]).columns)
print("Names of object columns : ", object_features)
for feature in object_features:
    # Keep only the integer codes; the array of unique labels is discarded.
    codes, _uniques = reduced_testing_features[feature].factorize()
    reduced_testing_features[feature] = codes

Names of object columns :  ['funder', 'installer', 'basin', 'lga', 'extraction_type', 'management', 'payment', 'quantity', 'source', 'waterpoint_type']


## Scale numerical data

In [6]:
# Min-max scale the numeric columns into the range 0-20, in place.
# NOTE(review): the scaler is fitted on the test data here, so the per-column
# min/max come from this frame rather than from the training set - TODO confirm
# this matches how the training features were scaled.
features = ['amount_tsh', 'gps_height', 'population', 'years_elapsed']
numeric_block = reduced_testing_features[features]

scaler = MinMaxScaler(feature_range=(0, 20))
scaler.fit(numeric_block)
reduced_testing_features[features] = scaler.transform(numeric_block)

# Display summary statistics of the scaled columns as the cell output.
reduced_testing_features[features].describe()

Unnamed: 0,amount_tsh,gps_height,population,years_elapsed
count,14850.0,14850.0,14850.0,14850.0
mean,0.063768,7.993565,0.611595,5.011245
std,0.270468,3.623208,0.786536,3.89723
min,0.0,0.0,0.0,0.0
25%,0.00498,6.210303,0.258132,2.641509
50%,0.02498,8.807339,0.519752,3.773585
75%,0.04998,9.858857,0.694166,6.037736
max,20.0,20.0,20.0,20.0


# Random Forest Classifier

In [7]:
# Feature matrix for inference: the reduced, encoded and scaled test frame.
X = reduced_testing_features

# Path of the pickled model (presumably a fitted random forest classifier,
# judging by the file name - TODO confirm).
filename = '../models/rfc_model 11-08-2021 08:52:40'

# Open the file in a context manager so the handle is closed even when
# unpickling raises; the previous bare open() call left it open.
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load model files that come from a trusted source.
with open(filename, 'rb') as model_file:
    rfc_model = pickle.load(model_file)

# Predict one label per row of the test set.
y_pred = rfc_model.predict(X)

In [8]:
# Pair each id from the test file with its predicted status_group and use the
# id as the index, so the CSV's first column is `id`.
submission = pd.DataFrame({'id': testing_features['id'],
                           'status_group': pd.Series(y_pred)}).set_index('id')

# Timestamp the output file so successive runs do not overwrite each other.
now = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
output_path = '../submissions/submission {}'.format(now)
submission.to_csv(output_path)