# Imports

In [1]:
import pickle
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier 

In [5]:
# Seed constant (not referenced by the cells visible in this notebook).
SEED = 42

# Input artifacts, relative to the notebook's directory:
# the cleaned test-set features and the pickled model used for prediction.
CLEAN_TEST_FEATURES = '../data/processed/test_features_clean_19-08-2021 11:20:07'
MODEL = '../models/rfc_model 19-08-2021 08hr01min50sec copy'

# Load the cleaned test features into a DataFrame.
clean_test_features = pd.read_csv(filepath_or_buffer=CLEAN_TEST_FEATURES)

# Feature Reduction

In [7]:
# Columns removed before prediction; each entry notes why it is dropped.
features_to_reduce = [
    'Unnamed: 0',             # artifact from importing from file
    'id',                     # different for every well
    'date_recorded',          # redundant with years_elapsed synthetic feature
    'wpt_name',               # drop results in no change in accuracy
    'num_private',            # little variation in feature values
    'subvillage',             # ward and lga capture similar info with greater
                              # feature importance and lower training runtimes
    'region',                 # redundant, same as region_code
    'public_meeting',         # drop results in no change in accuracy
    'recorded_by',            # same for every observation
    'scheme_name',            # drop results in no change in accuracy
    'scheme_management',      # drop results in increase in accuracy
    'permit',                 # drop results in no change in accuracy
    'construction_year',      # redundant with years_elapsed synthetic feature
    'extraction_type_group',  # similar to extraction but with less variation
    'extraction_type_class',  # similar to extraction but with less variation
    'management_group',       # similar to management but with less variation
    'payment_type',           # same as payment
    'water_quality',          # drop results in no change in accuracy
    'quality_group',          # drop results in no change in accuracy
    'quantity_group',         # same as quantity
    'source_type',            # similar to source but with less variation
    'source_class',           # similar to source but with less variation
    'waterpoint_type_group',  # similar to waterpoint_type but with less variation
]

# drop() returns a new frame, so clean_test_features itself is left untouched.
reduced_clean_test_features = clean_test_features.drop(
    labels=features_to_reduce,
    axis='columns',
)

In [10]:
# Print the column names, non-null counts and dtypes of the reduced frame
# to check which features remain after the drop.
reduced_clean_test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   amount_tsh       14850 non-null  float64
 1   funder           14850 non-null  object 
 2   gps_height       14850 non-null  float64
 3   installer        14850 non-null  object 
 4   longitude        14850 non-null  float64
 5   latitude         14850 non-null  float64
 6   basin            14850 non-null  object 
 7   region_code      14850 non-null  int64  
 8   district_code    14850 non-null  int64  
 9   lga              14850 non-null  object 
 10  ward             14850 non-null  object 
 11  population       14850 non-null  int64  
 12  extraction_type  14850 non-null  object 
 13  management       14850 non-null  object 
 14  payment          14850 non-null  object 
 15  quantity         14850 non-null  object 
 16  source           14850 non-null  object 
 17  waterpoint_t

# Feature Processing

## Factorize categorical features

In [12]:
# Collect the names of every object-dtype (string-like) column.
object_features = list(
    reduced_clean_test_features.select_dtypes(include=["object"]).columns
)
print("Names of object columns : ", object_features)

# Replace each object column, in place, with its integer factorization codes.
# NOTE(review): pd.factorize numbers values by order of first appearance in
# THIS frame. Presumably the model was trained on codes derived from the
# training data -- confirm that the two mappings actually agree.
for feature in object_features:
    codes, _uniques = pd.factorize(reduced_clean_test_features[feature])
    reduced_clean_test_features[feature] = codes

Names of object columns :  ['funder', 'installer', 'basin', 'lga', 'ward', 'extraction_type', 'management', 'payment', 'quantity', 'source', 'waterpoint_type']


## Scale numerical data

In [14]:
# Numerical columns rescaled to the range [0, 20].
features = [
    'amount_tsh',
    'gps_height',
    'population',
    'years_elapsed',
]
scaler = MinMaxScaler(feature_range=(0, 20))

# NOTE(review): the scaler is fit on the test data itself, so its min/max come
# from this frame -- confirm this matches how the training data was scaled.
scaler.fit(reduced_clean_test_features[features])
reduced_clean_test_features[features] = scaler.transform(
    reduced_clean_test_features[features]
)

# Summary statistics of the scaled columns (displayed as the cell output).
reduced_clean_test_features[features].describe()

Unnamed: 0,amount_tsh,gps_height,population,years_elapsed
count,14850.0,14850.0,14850.0,14850.0
mean,0.063768,7.993565,0.611576,5.011245
std,0.270468,3.623208,0.786549,3.89723
min,0.0,0.0,0.0,0.0
25%,0.00498,6.210303,0.258132,2.641509
50%,0.02498,8.807339,0.519752,3.773585
75%,0.04998,9.858857,0.694166,6.037736
max,20.0,20.0,20.0,20.0


# Random Forest Classifier

In [15]:
# Feature matrix handed to the model: the reduced, factorized and scaled test frame.
X = reduced_clean_test_features

filename = MODEL
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded
# (the previous bare open() call left it open).
with open(filename, 'rb') as model_file:
    rfc_model = pickle.load(model_file)

# Predicted label for every row of X.
y_pred = rfc_model.predict(X)

In [16]:
# Pair each well id with its predicted status.
# The ids are read from clean_test_features: the 'id' column was dropped only
# from the reduced frame, and the previously referenced `testing_features` is
# never defined in this notebook (NameError on a fresh kernel).
# The predictions are given the same index as the ids so rows pair up
# positionally regardless of the frame's index.
submission = pd.DataFrame({
    'id': clean_test_features['id'],
    'status_group': pd.Series(y_pred, index=clean_test_features.index),
}).set_index('id')

# Timestamp the output file so earlier submissions are not overwritten.
now = datetime.now().strftime("%d-%m-%Y %Hhr%Mmin%Ssec")
submission.to_csv('../submissions/submission {}'.format(now))