In [196]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_selection import mutual_info_classif, SelectKBest

from sklearn.pipeline import Pipeline
import category_encoders as ce
from category_encoders import LeaveOneOutEncoder, TargetEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


from data_prep_functions import *

In [355]:
# Load the training labels and the training feature values.
# NOTE: despite its name, `dftest` holds the *training* features
# (data/train_values.csv), not a held-out test set.
dftrain = pd.read_csv('data/train_target.csv')
dftest = pd.read_csv('data/train_values.csv')

# Attach each label row to its feature row via the shared `id` key (inner join).
df = dftrain.merge(dftest, how='inner', on='id')

In [356]:
# Columns retained for modelling: the label (status_group) followed by the
# selected features. The order matters: later cells split columns by position.
keep_columns = [
    'status_group',
    'amount_tsh',
    'gps_height',
    'num_private',
    'region',
    'district_code',
    'lga',
    'public_meeting',
    'scheme_management',
    'permit',
    'construction_year',
    'extraction_type',
    'management_group',
    'water_quality',
    'quantity',
    'source',
    'waterpoint_type',
    'population',
]

df = df[keep_columns]

In [357]:
# Drop every row that has a missing value in any retained column.
# Rebind to an independent copy instead of using inplace=True: `df` was
# produced by a column selection, so mutating it in place (here, and when a
# new column is assigned to it later) raises SettingWithCopyWarning.
df = df.dropna().copy()

In [358]:
# Relative frequency of each target class after dropping incomplete rows.
df['status_group'].value_counts(normalize=True)

functional                 0.549367
non functional             0.377019
functional needs repair    0.073614
Name: status_group, dtype: float64

In [359]:
# baseline 2
# Rule-of-thumb classifier: call a well "functional" when amount_tsh > 0 and
# "non functional" when amount_tsh == 0. It never predicts
# "functional needs repair", so those rows always count as errors.
status = df['status_group']
tsh = df['amount_tsh']

func_correct = len(df[(tsh > 0) & (status == 'functional')])
nonfunc_correct = len(df[(tsh == 0) & (status == 'non functional')])

total = len(df)
total_correct = func_correct + nonfunc_correct
print(total_correct/total)

0.5313496920206255


In [200]:
# Summary statistics for amount_tsh, followed by a free-text note recording
# the open question about the column's units / scale.
print(df['amount_tsh'].describe())

print("""
I am concerned about the units in the Total Static Head Column.
The mean value is 350 but ranges up to hundreds of thousands. 
Could it be reported in both meters and centimeters?
Leaves me uncertain how to deal with this data or to scale/trim.""")

count     49841.000000
mean        352.215854
std        2793.448736
min           0.000000
25%           0.000000
50%           0.000000
75%          30.000000
max      250000.000000
Name: amount_tsh, dtype: float64

I am concerned about the units in the Total Static Head Column.
The mean value is 350 but ranges up to hundreds of thousands. 
Could it be reported in both meters and centimeters?
Leaves me uncertain how to deal with this data or to scale/trim.


In [201]:
# How many wells report amount_tsh == 0, and how many of those are non functional?
tsh_is_zero = df['amount_tsh'] == 0
is_non_functional = df['status_group'] == 'non functional'

num_tsh_zero = len(df[tsh_is_zero])
num_tsh_zero_nonfunc = len(df[tsh_is_zero & is_non_functional])

print(f"""
There are {num_tsh_zero} wells that have a Total Static Head of zero.
Out of these, {num_tsh_zero_nonfunc} are non functional.
""")


There are 33667 wells that have a Total Static Head of zero.
Out of these, 15153 are non functional.



In [202]:
# Same breakdown for wells with amount_tsh > 0, to compare against the zero group.
tsh_is_positive = df['amount_tsh'] > 0
is_non_functional = df['status_group'] == 'non functional'

num_tsh_positive = len(df[tsh_is_positive])
num_tsh_pos_nonfunc = len(df[tsh_is_positive & is_non_functional])

print(f"""
There are {num_tsh_positive} wells that have a Total Static Head greater than zero.
Out of these, {num_tsh_pos_nonfunc} are non functional.

-------------------

So, when a well has a TSH of zero, it is much more likely to be non-functional.
""")


There are 16174 wells that have a Total Static Head greater than zero.
Out of these, 3638 are non functional.

-------------------

So, when a well has a TSH of zero, it is much more likely to be non-functional.



In [203]:
# Count rows flagged public_meeting == True and how many of them are non functional.
had_meeting = df['public_meeting'] == True
is_non_functional = df['status_group'] == 'non functional'

num_meeting = len(df[had_meeting])
num_meeting_nonfunc = len(df[had_meeting & is_non_functional])

print(f"""
There are {num_meeting} rows that have had a public meeting.
Out of these, {num_meeting_nonfunc} are non functional.
----------------------------------
So if there has been a meeting or not isn't a perfect indicator of non-functionality on its own.

""")


There are 45360 rows that have had a public meeting.
Out of these, 16559 are non functional.
----------------------------------
So if there has been a meeting or not isn't a perfect indicator of non-functionality on its own.




In [204]:
# Inspect the rows whose amount_tsh exceeds 1000 (the upper tail of the column).
df[df['amount_tsh']>1000]

Unnamed: 0,status_group,amount_tsh,gps_height,num_private,region,district_code,lga,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,water_quality,quantity,source,waterpoint_type,population
0,functional,6000.0,1390,0,Iringa,5,Ludewa,True,VWC,False,1999,gravity,vwc,user-group,soft,enough,spring,communal standpipe,109
38,functional,4000.0,1955,0,Iringa,7,Kilolo,True,VWC,True,2003,gravity,vwc,user-group,soft,enough,river,communal standpipe,100
48,functional,4000.0,1982,0,Iringa,7,Kilolo,True,VWC,True,2003,gravity,vwc,user-group,soft,enough,spring,communal standpipe,70
49,functional,1500.0,2169,0,Iringa,4,Njombe,True,VWC,True,2007,gravity,vwc,user-group,soft,enough,spring,communal standpipe,20
147,functional,2000.0,1603,0,Iringa,4,Njombe,True,VWC,False,2010,other - rope pump,vwc,user-group,soft,enough,hand dtw,hand pump,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59367,functional,2000.0,1977,0,Iringa,3,Makete,True,VWC,False,1976,gravity,vwc,user-group,soft,enough,spring,communal standpipe,0
59373,functional,5000.0,1137,0,Rukwa,1,Mpanda,True,VWC,True,1988,india mark ii,vwc,user-group,soft,enough,machine dbh,hand pump,96
59375,functional,40000.0,54,0,Tanga,4,Tanga,True,VWC,True,2005,gravity,vwc,user-group,soft,enough,river,communal standpipe,609
59380,non functional,6000.0,1439,0,Iringa,5,Ludewa,True,VWC,False,1999,gravity,vwc,user-group,soft,dry,spring,communal standpipe,50


In [205]:
# List the distinct categories present in the `quantity` column.
df['quantity'].unique()

array(['enough', 'dry', 'seasonal', 'insufficient', 'unknown'],
      dtype=object)

In [361]:
# Integer-encode the three status labels and store the codes in df['target'].
# LabelEncoder orders its classes alphabetically, so the codes are:
# 0 = functional, 1 = functional needs repair, 2 = non functional.
status_labels = ['non functional', 'functional', 'functional needs repair']
le = LabelEncoder().fit(status_labels)

target = le.transform(df['status_group'])
df['target'] = target

In [362]:
# Absolute class counts for the string label (compare with the encoded target below).
df['status_group'].value_counts()

functional                 27381
non functional             18791
functional needs repair     3669
Name: status_group, dtype: int64

In [363]:
# Absolute class counts for the integer-encoded label; should mirror status_group.
df['target'].value_counts()

0    27381
2    18791
1     3669
Name: target, dtype: int64

In [364]:
# Split the cleaned frame into features and label.
# Both label columns are removed from X; y keeps the original string labels
# (the integer `target` column is not used as the training label).
label_columns = ['status_group', 'target']
X = df.drop(columns=label_columns)
y = df['status_group']

In [365]:
# District codes are identifiers, not quantities: store them as `object`
# so the dtype-based column bucketing treats them as categorical.
X = X.astype({'district_code': 'object'})

In [366]:
# Sort feature columns into three buckets by dtype and cardinality:
#   num_cols  - float64 / int64 columns
#   ohe_cols  - object columns with at most `thresh` distinct values
#   freq_cols - object columns with more than `thresh` distinct values
# Columns of any other dtype are left out of all three lists.
num_cols, ohe_cols, freq_cols = [], [], []
thresh = 7

for col in X.columns:
    col_dtype = X[col].dtype

    if col_dtype in ['float64', 'int64']:
        num_cols.append(col)
    elif col_dtype == 'object':
        n_levels = len(X[col].unique())
        bucket = freq_cols if n_levels > thresh else ohe_cols
        bucket.append(col)



In [367]:
# Numeric columns as bucketed by dtype (construction_year is still included here).
num_cols

['amount_tsh', 'gps_height', 'num_private', 'construction_year', 'population']

In [368]:
# The low-cardinality bucket contains the two True/False columns.
# One-hot encoding those would be redundant, so keep them in their own list
# and out of ohe_cols; the ColumnTransformer then passes them through as-is.
#
# The columns are picked by name rather than by position (ohe_cols[0:2]) so
# the cell does not depend on column order and can be re-run safely: slicing
# by position would strip two more real columns from ohe_cols on every re-run.
TF_cols = ['public_meeting', 'permit']
ohe_cols = [col for col in ohe_cols if col not in TF_cols]

In [370]:
# construction_year gets its own transformer, so take it out of the numeric
# bucket. Filtering (instead of index() + del) makes the cell idempotent:
# the original raised ValueError when run a second time because the name was
# already gone. The scratch index variable `a` is no longer created.
num_cols = [col for col in num_cols if col != 'construction_year']

In [371]:
# High-cardinality categorical columns (more than `thresh` distinct values).
freq_cols

['region',
 'district_code',
 'lga',
 'scheme_management',
 'extraction_type',
 'water_quality',
 'source']

In [372]:
# Numeric columns after construction_year has been removed.
num_cols

['amount_tsh', 'gps_height', 'num_private', 'population']

In [375]:
# Low-cardinality categorical columns left for one-hot encoding
# (the two True/False columns have already been split off).
ohe_cols

['management_group', 'quantity', 'waterpoint_type']

In [376]:
# construction_year is handled by its own transformer, so it gets its own column list.
year_cols = ['construction_year']

In [377]:
# Check dtypes and non-null counts of the feature frame before building the pipeline.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49841 entries, 0 to 59399
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         49841 non-null  float64
 1   gps_height         49841 non-null  int64  
 2   num_private        49841 non-null  int64  
 3   region             49841 non-null  object 
 4   district_code      49841 non-null  object 
 5   lga                49841 non-null  object 
 6   public_meeting     49841 non-null  object 
 7   scheme_management  49841 non-null  object 
 8   permit             49841 non-null  object 
 9   construction_year  49841 non-null  int64  
 10  extraction_type    49841 non-null  object 
 11  management_group   49841 non-null  object 
 12  water_quality      49841 non-null  object 
 13  quantity           49841 non-null  object 
 14  source             49841 non-null  object 
 15  waterpoint_type    49841 non-null  object 
 16  population         498

In [386]:
# Treat construction_year as a categorical label: cast it to str so the year
# transformer can match the '0' placeholder and one-hot encode each year.
X = X.astype({'construction_year': 'str'})

In [387]:
# Numeric pipeline: fill missing values with the column median, then rescale
# with MinMaxScaler.
num_steps = [
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
num_transformer = Pipeline(steps=num_steps)

In [388]:
# Low-cardinality categorical pipeline: replace missing values with the
# constant 'Unknown', then one-hot encode. handle_unknown='ignore' keeps
# transform from failing on categories that were not present during fit.
ohe_transformer = Pipeline(steps=[
    ('ohe_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohencoder', OneHotEncoder(handle_unknown='ignore'))])

In [389]:
# construction_year pipeline: the string '0' is treated as the missing-value
# marker and replaced with 'Unknown', then every year is one-hot encoded.
# This relies on construction_year having been cast to str beforehand.
# NOTE(review): the step name 'yeahohe' looks like a typo for 'yearohe';
# left unchanged because step names are part of the pipeline's parameter keys.
year_transformer = Pipeline(steps=[
    ('year_imp', SimpleImputer(missing_values='0', strategy='constant', fill_value='Unknown')),
    ('yeahohe', OneHotEncoder(handle_unknown='ignore'))
])

In [390]:
# High-cardinality categorical pipeline: replace missing values with the
# constant 'Unknown', then encode each category with CountEncoder.
# normalize=True requests relative frequencies rather than raw counts.
# NOTE(review): handle_unknown=0 presumably assigns 0 to categories unseen
# during fit, and min_group_size / min_group_name presumably pool categories
# rarer than 0.1% of rows under 'Other' - confirm against the
# category_encoders documentation.
freq_transformer = Pipeline(steps=[
    ('freq_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
#     Alternative encoder step, left disabled:
#     ('loo_enc', LeaveOneOutEncoder()
    ('freq_enc', ce.count.CountEncoder(normalize=True, 
                                       handle_unknown=0,
                                       min_group_size=0.001,
                                       min_group_name='Other'
                                      ))])

In [391]:
# Route each column list to its transformer. Columns that appear in none of
# the lists (the True/False columns) are passed through unchanged.
column_steps = [
    ('num', num_transformer, num_cols),
    ('ohe', ohe_transformer, ohe_cols),
    ('freq', freq_transformer, freq_cols),
    ('year', year_transformer, year_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [392]:
# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced ('functional needs repair' is roughly 7% of rows) - consider
# whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [393]:
# Full model: shared preprocessing followed by a random forest with default
# hyperparameters (seeded for reproducibility). The step name 'classifier'
# is the prefix used by the grid-search parameter names.
clf_rndf = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', RandomForestClassifier(random_state=1234))])

In [394]:
# Fit preprocessing and forest on the training split only (trailing ';' hides the repr).
clf_rndf.fit(X_train, y_train);

In [395]:
# Evaluate the default random forest: report on the training split first,
# then on the held-out split, to expose the train/test gap.
train_preds = clf_rndf.predict(X_train)
test_preds = clf_rndf.predict(X_test)

for y_true, y_pred in ((y_train, train_preds), (y_test, test_preds)):
    print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

             functional       0.93      0.97      0.95     20470
functional needs repair       0.91      0.74      0.82      2750
         non functional       0.95      0.93      0.94     14160

               accuracy                           0.94     37380
              macro avg       0.93      0.88      0.90     37380
           weighted avg       0.94      0.94      0.94     37380

                         precision    recall  f1-score   support

             functional       0.80      0.87      0.83      6911
functional needs repair       0.48      0.30      0.37       919
         non functional       0.80      0.76      0.78      4631

               accuracy                           0.78     12461
              macro avg       0.69      0.64      0.66     12461
           weighted avg       0.78      0.78      0.78     12461



In [398]:
# random forest grid search
# Exhaustive 3-fold search over forest size, depth and split/leaf limits.
# Keys use the 'classifier__' prefix to address the forest step of clf_rndf.
param_grid = dict(
    classifier__n_estimators=[160, 190, 200, 210],
    classifier__max_depth=[20, 29, 35, 50, 60, 70],
    classifier__min_samples_split=[2, 4, 6],
    classifier__min_samples_leaf=[2, 3],
)

# Track both metrics; the final model is refit using weighted F1.
scoring_metrics = {metric: metric for metric in ('f1_weighted', 'accuracy')}

forest_GS = GridSearchCV(
    estimator=clf_rndf,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='f1_weighted',
    cv=3,
)
forest_GS.fit(X_train, y_train);

In [399]:
# fifth grid search
# Evaluate the refit best estimator from the grid search on the training
# split and then on the held-out split.
train_preds = forest_GS.predict(X_train)
test_preds = forest_GS.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.82      0.96      0.89     20470
functional needs repair       0.81      0.31      0.44      2750
         non functional       0.91      0.81      0.86     14160

               accuracy                           0.85     37380
              macro avg       0.85      0.69      0.73     37380
           weighted avg       0.86      0.85      0.84     37380

                         precision    recall  f1-score   support

             functional       0.78      0.91      0.84      6911
functional needs repair       0.66      0.23      0.35       919
         non functional       0.83      0.73      0.78      4631

               accuracy                           0.80     12461
              macro avg       0.76      0.63      0.66     12461
           weighted avg       0.79      0.80      0.78     12461



In [400]:
# fifth grid search
# Hyperparameter combination that achieved the best weighted F1 in cross-validation.
forest_GS.best_params_

{'classifier__max_depth': 60,
 'classifier__min_samples_leaf': 2,
 'classifier__min_samples_split': 2,
 'classifier__n_estimators': 160}

In [254]:
# fifth grid search
# NOTE(review): duplicate of the grid-search evaluation cell, with an earlier
# execution count; its saved output comes from a previous search and no
# longer corresponds to the current param_grid. Candidate for removal.
train_preds = forest_GS.predict(X_train)
test_preds = forest_GS.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.84      0.96      0.90     20470
functional needs repair       0.84      0.37      0.51      2750
         non functional       0.91      0.84      0.87     14160

               accuracy                           0.87     37380
              macro avg       0.87      0.72      0.76     37380
           weighted avg       0.87      0.87      0.86     37380

                         precision    recall  f1-score   support

             functional       0.79      0.90      0.84      6911
functional needs repair       0.65      0.24      0.35       919
         non functional       0.82      0.75      0.78      4631

               accuracy                           0.80     12461
              macro avg       0.75      0.63      0.66     12461
           weighted avg       0.79      0.80      0.78     12461



In [255]:
# fifth grid search
# NOTE(review): stale duplicate - the saved output lists n_estimators=138,
# a value that is not in the current param_grid. Candidate for removal.
forest_GS.best_params_

{'classifier__max_depth': 50, 'classifier__n_estimators': 138}

In [57]:
# Comparison model: the same preprocessing feeding a k-nearest-neighbours
# classifier with default settings. Note the estimator step is named 'knn'
# here, not 'classifier' as in the random forest pipeline.
clf_knn = Pipeline(steps=[('preprocessor', preprocessor),
                         ('knn', KNeighborsClassifier())])

In [58]:
# Fit the KNN pipeline on the training split (trailing ';' hides the repr).
clf_knn.fit(X_train, y_train);

In [59]:
# Evaluate the KNN pipeline: training-split report first, held-out report second.
train_preds = clf_knn.predict(X_train)
test_preds = clf_knn.predict(X_test)

for y_true, y_pred in ((y_train, train_preds), (y_test, test_preds)):
    print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

             functional       0.81      0.90      0.85     20470
functional needs repair       0.58      0.36      0.44      2750
         non functional       0.85      0.77      0.81     14160

               accuracy                           0.81     37380
              macro avg       0.74      0.68      0.70     37380
           weighted avg       0.81      0.81      0.80     37380

                         precision    recall  f1-score   support

             functional       0.77      0.86      0.81      6911
functional needs repair       0.46      0.29      0.36       919
         non functional       0.77      0.71      0.74      4631

               accuracy                           0.76     12461
              macro avg       0.67      0.62      0.64     12461
           weighted avg       0.75      0.76      0.75     12461



## How to generate contest predictions

In [None]:
# Load the unlabeled feature values to be scored for submission and preview them.
df_cont = pd.read_csv('data/test_values.csv')
df_cont.head()

In [None]:
# Same feature columns as used for training, without the 'status_group' label.
# NOTE: this rebinds `keep_columns`, replacing the training-time list that
# included the label.
keep_columns = ['amount_tsh', 'gps_height', 'num_private', 'region', 'district_code', 'lga',
                'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type',
                'management_group', 'water_quality', 'quantity', 'source', 'waterpoint_type', 
                'population']

contest = df_cont[keep_columns]

In [None]:
# Fill the two True/False columns, which the pipeline passes through without
# an imputer, so missing flags become False.
# Rebinding with DataFrame.fillna replaces the chained
# `contest[col].fillna(..., inplace=True)` form: that operates on a temporary
# Series of a sliced frame, raises SettingWithCopyWarning, and leaves
# `contest` unchanged under pandas copy-on-write.
contest = contest.fillna({'permit': False, 'public_meeting': False})

# Apply the same dtype conversions the training features received before the
# pipeline was fitted; without them the year transformer (string '0' marker,
# 'Unknown' fill value) is handed integer years at predict time.
contest = contest.astype({'district_code': 'object', 'construction_year': 'str'})

In [None]:
# Score the contest rows with the grid-searched random forest pipeline.
# The original referenced `forest_clf`, a name that is not defined anywhere in
# this notebook (NameError on a fresh kernel); `forest_GS` is the fitted
# search object, and it predicts with its refit best estimator.
cont_preds = forest_GS.predict(contest)

In [None]:
# Row identifiers for the submission file, copied out as a NumPy array.
ids = df_cont['id'].to_numpy(copy=True)

In [None]:
# Assemble the submission table: one predicted status_group per contest id,
# then preview the first rows.
final = pd.DataFrame({'id': ids, 'status_group': cont_preds})
final.head()

In [None]:
# Write the submission without the index column.
# NOTE(review): 'name.csv' looks like a placeholder filename - rename before submitting.
final.to_csv('name.csv', index=False)