# Phase III Project Technical Notebook

#### Authors: Kyle Dufrane and Brad Horn

In [1]:
# Import needed libraries

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve, plot_confusion_matrix

pd.set_option('display.max_columns', 999)

### Business Understanding

#### Flatiron LLC has recently been awarded a contract to maintain wells in Tanzania. They're looking for a system to help develop preventative maintenance schedules by predicting pump failures and replacement schedules to better serve their client.

### Overview

#### Given the business problem we hope to identify the following features through our EDA:
* Are wells failing by geographic location?
* Does well type or source effect pump longevity? 
* Does well management or payment effect pump longevity?

### Data Understanding

#### This dataset comes from the Government of Tanzania and contains over ~59,000 wells with the earliest recorded construction year being 1966. Below you will see our data cleaning process.

#### This dataset comes in three files, test_set, training_set_labels, and training_set_values. We will exclude the test set until the final model has been completed then predict and submit our findings. 

#### To start we will look at the training_set_labels:

In [2]:
# Import training labels CSV
df_training_labels = pd.read_csv('data/Training_set_labels.csv')

In [3]:
df_training_labels.shape

(59400, 2)

In [4]:
df_training_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [5]:
df_training_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


### Checking NA values

In [6]:
df_training_labels.isna().sum()

id              0
status_group    0
dtype: int64

### Class Imbalance

#### Based on our counts, we can see that we will have to counter the class imbalance. We will fix this issue later on in our model building process. 

In [7]:
df_training_labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

### Training_set_values

In [8]:
# Import training values CSV
df_training_values = pd.read_csv('data/Training_set_values.csv')

In [9]:
df_training_values.shape

(59400, 40)

#### Looking at the above cells output we can see that we have 40 predictive features to chose from being: 

* amount_tsh : Total static head (amount water available to waterpoint)
* date_recorded : The date the row was entered
* funder : Who funded the well
* gps_height : Altitude of the well
* installer : Organization that installed the well
* longitude : GPS coordinate
* latitude : GPS coordinate
* wpt_name : Name of the waterpoint if there is one
* num_private :Private use or not
* basin : Geographic water basin
* subvillage : Geographic location
* region : Geographic location
* region_code : Geographic location (coded)
* district_code : Geographic location (coded)
* lga : Geographic location
* ward : Geographic location
* population : Population around the well
* public_meeting : True/False
* recorded_by : Group entering this row of data
* scheme_management : Who operates the waterpoint
* scheme_name : Who operates the waterpoint
* permit : If the waterpoint is permitted
* construction_year : Year the waterpoint was constructed
* extraction_type : The kind of extraction the waterpoint uses
* extraction_type_group : The kind of extraction the waterpoint uses
* extraction_type_class : The kind of extraction the waterpoint uses
* management : How the waterpoint is managed
* management_group : How the waterpoint is managed
* payment : What the water costs
* payment_type : What the water costs
* water_quality : The quality of the water
* quality_group : The quality of the water
* quantity : The quantity of water
* quantity_group : The quantity of water
* source : The source of the water
* source_type : The source of the water
* source_class : The source of the water
* waterpoint_type : The kind of waterpoint
* waterpoint_type_group : The kind of waterpoint

In [10]:
df_training_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

#### A quick review of the Non-Null column shows that we are missing values in this data set. Below we will dive deeper into which columns are the most effected. 

In [11]:
df_training_values.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

#### Out of the 40 features 7 of them are missing values. A few items stand out:

* Funder and installer have close to equal amounts of missing values
* subvillage has the least amount of missing values
* scheme_name is missing almost half of the values - we will drop this column

In [12]:
# Dropping column from dataframe
df_training_values.drop('scheme_name', axis = 1, inplace = True)

#### We need to explore more to see how we should handle these values.

In [13]:
# creating a list of columns with missing values
missing_values = ['funder', 'installer', 'subvillage', 'public_meeting',\
                  'scheme_management', 'permit']

# creating a dataframe with above missing_values
df_training_values[missing_values].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   funder             55765 non-null  object
 1   installer          55745 non-null  object
 2   subvillage         59029 non-null  object
 3   public_meeting     56066 non-null  object
 4   scheme_management  55523 non-null  object
 5   permit             56344 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB


In [14]:
df_training_values[missing_values].isna().sum()

funder               3635
installer            3655
subvillage            371
public_meeting       3334
scheme_management    3877
permit               3056
dtype: int64

#### We can now see that all of these features are of the dtype object which narrows down our options to dealing with the missing values. What are these features composed of? 

#### To start, lets take a look at our previous mentioned insite of funders and installers having close to the same amount of missing values. 

##### Note: prior to running the below cells I misread the value counts and thought that both of these columns had the same amount of NA values. The below lines raised the red flag of 'why are the true values the same but the false values differ?'

In [15]:
df_training_values[df_training_values['funder'].isna()]['installer'].isna().value_counts()

True     3582
False      53
Name: installer, dtype: int64

In [16]:
df_training_values[df_training_values['installer'].isna()]['funder'].isna().value_counts()

True     3582
False      73
Name: funder, dtype: int64

#### Looking at the above counts it looks like our counts vary minimally but enough so where we cannot attack these two columns as the same. 

In [17]:
df_training_values['funder'].value_counts()

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Dv                           1
Village Communi              1
Water Department             1
Misheni                      1
Mikumi G                     1
Name: funder, Length: 1897, dtype: int64

In [18]:
df_training_values[df_training_values['funder'].isna()]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
34,41583,0.0,2011-02-23,,-41,,39.812912,-7.889986,Msikitini Wa Ijumaa,0,Rufiji,Kilombero B,Pwani,60,63,Mafia,Baleni,100,True,GeoData Consultants Ltd,VWC,False,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,salty,salty,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
43,19282,0.0,2013-01-15,,1642,,34.967789,-4.628921,Mvae Primary,0,Internal,Mwarufyu,Singida,13,2,Singida Rural,Merya,1,True,GeoData Consultants Ltd,VWC,,1980,mono,mono,motorpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe
47,13620,0.0,2011-07-27,,0,,33.540607,-9.172905,Mahakamani,0,Lake Nyasa,Mpandapanda,Mbeya,12,4,Rungwe,Kiwira,0,True,GeoData Consultants Ltd,VWC,,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
65,51072,0.0,2013-02-09,,1415,,34.621598,-5.173136,Nyambi,0,Internal,Mfumbu,Singida,13,2,Singida Rural,Naintiri,1,True,GeoData Consultants Ltd,VWC,,1970,mono,mono,motorpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe
71,17386,0.0,2011-03-31,,0,,34.462228,-8.575780,Kwa Manyusi Mlilo,0,Rufiji,Lembuka,Mbeya,12,7,Mbarali,Mawindi,0,True,GeoData Consultants Ltd,VWC,False,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59357,46563,0.0,2013-02-19,,1635,,34.971841,-5.098362,Shabani,0,Internal,Mampando A,Singida,13,2,Singida Rural,Ntuntu,1,True,GeoData Consultants Ltd,VWC,,1980,nira/tanira,nira/tanira,handpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,shallow well,shallow well,groundwater,hand pump,hand pump
59366,55232,0.0,2013-02-02,,1541,,34.765729,-5.027725,Joshoni,0,Internal,Mungumaho,Singida,13,2,Singida Rural,Puma,200,True,GeoData Consultants Ltd,VWC,,2000,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump
59370,14796,200.0,2013-01-29,,1154,,30.058731,-4.902633,Village Office,0,Lake Tanganyika,Mazungwe,Kigoma,16,2,Kasulu,Rusesa,1,True,GeoData Consultants Ltd,,False,0,other,other,other,unknown,unknown,pay monthly,monthly,unknown,unknown,unknown,unknown,unknown,other,unknown,other,other
59376,34716,0.0,2013-02-03,,1581,,34.821039,-5.076258,Nasingo,0,Internal,Hu,Singida,13,2,Singida Rural,Dung'unyi,1,True,GeoData Consultants Ltd,VWC,,1990,other,other,other,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,shallow well,shallow well,groundwater,other,other


In [19]:
df_training_values['installer'].value_counts()

DWE                17402
Government          1825
RWE                 1206
Commu               1060
DANIDA              1050
                   ...  
MAIVARU                1
Government /SDA        1
Swalehe Rajabu         1
WAMBA                  1
Villi                  1
Name: installer, Length: 2145, dtype: int64

In [20]:
df_training_values[df_training_values['installer'].isna()]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
34,41583,0.0,2011-02-23,,-41,,39.812912,-7.889986,Msikitini Wa Ijumaa,0,Rufiji,Kilombero B,Pwani,60,63,Mafia,Baleni,100,True,GeoData Consultants Ltd,VWC,False,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,salty,salty,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
43,19282,0.0,2013-01-15,,1642,,34.967789,-4.628921,Mvae Primary,0,Internal,Mwarufyu,Singida,13,2,Singida Rural,Merya,1,True,GeoData Consultants Ltd,VWC,,1980,mono,mono,motorpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe
47,13620,0.0,2011-07-27,,0,,33.540607,-9.172905,Mahakamani,0,Lake Nyasa,Mpandapanda,Mbeya,12,4,Rungwe,Kiwira,0,True,GeoData Consultants Ltd,VWC,,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
65,51072,0.0,2013-02-09,,1415,,34.621598,-5.173136,Nyambi,0,Internal,Mfumbu,Singida,13,2,Singida Rural,Naintiri,1,True,GeoData Consultants Ltd,VWC,,1970,mono,mono,motorpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe
71,17386,0.0,2011-03-31,,0,,34.462228,-8.575780,Kwa Manyusi Mlilo,0,Rufiji,Lembuka,Mbeya,12,7,Mbarali,Mawindi,0,True,GeoData Consultants Ltd,VWC,False,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59357,46563,0.0,2013-02-19,,1635,,34.971841,-5.098362,Shabani,0,Internal,Mampando A,Singida,13,2,Singida Rural,Ntuntu,1,True,GeoData Consultants Ltd,VWC,,1980,nira/tanira,nira/tanira,handpump,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,shallow well,shallow well,groundwater,hand pump,hand pump
59366,55232,0.0,2013-02-02,,1541,,34.765729,-5.027725,Joshoni,0,Internal,Mungumaho,Singida,13,2,Singida Rural,Puma,200,True,GeoData Consultants Ltd,VWC,,2000,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump
59370,14796,200.0,2013-01-29,,1154,,30.058731,-4.902633,Village Office,0,Lake Tanganyika,Mazungwe,Kigoma,16,2,Kasulu,Rusesa,1,True,GeoData Consultants Ltd,,False,0,other,other,other,unknown,unknown,pay monthly,monthly,unknown,unknown,unknown,unknown,unknown,other,unknown,other,other
59376,34716,0.0,2013-02-03,,1581,,34.821039,-5.076258,Nasingo,0,Internal,Hu,Singida,13,2,Singida Rural,Dung'unyi,1,True,GeoData Consultants Ltd,VWC,,1990,other,other,other,vwc,user-group,unknown,unknown,unknown,unknown,dry,dry,shallow well,shallow well,groundwater,other,other


In [21]:
df_training_values['subvillage'].value_counts()

Madukani     508
Shuleni      506
Majengo      502
Kati         373
Mtakuja      262
            ... 
Mugombe        1
Bushini A      1
Godauni B      1
Kangwa         1
Tembe          1
Name: subvillage, Length: 19287, dtype: int64

In [22]:
df_training_values['subvillage'].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
59395    False
59396    False
59397    False
59398    False
59399    False
Name: subvillage, Length: 59400, dtype: bool

In [23]:
df_training_values['public_meeting'].value_counts()

True     51011
False     5055
Name: public_meeting, dtype: int64

In [24]:
df_training_values[df_training_values['public_meeting'].isna()]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466e+00,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
18,34169,0.0,2011-07-22,Hesawa,1162,DWE,32.920154,-1.947868e+00,Ngomee,0,Lake Victoria,Center,Mwanza,19,1,Ukerewe,Ilangala,1000,,GeoData Consultants Ltd,,True,1999,other,other,other,vwc,user-group,never pay,never pay,milky,milky,insufficient,insufficient,spring,spring,groundwater,other,other
21,6091,0.0,2013-02-10,Dwsp,0,DWE,0.000000,-2.000000e-08,Muungano,0,Lake Victoria,Ibabachegu,Shinyanga,17,1,Bariadi,Ikungulyabashashi,0,,GeoData Consultants Ltd,WUG,False,0,swn 80,swn 80,handpump,wug,user-group,unknown,unknown,unknown,unknown,unknown,unknown,shallow well,shallow well,groundwater,hand pump,hand pump
58,24593,0.0,2013-01-22,Kkkt,1703,KKKT,35.561346,-3.806879e+00,Kwa Iyora Mgirigisi,0,Internal,Ants B,Manyara,21,3,Mbulu,Bargish,456,,GeoData Consultants Ltd,WUG,True,2000,nira/tanira,nira/tanira,handpump,wug,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
73,35715,0.0,2013-02-27,Hesawa,1565,HESAWA,34.657034,-1.888020e+00,Machumbe,0,Lake Victoria,Mtakuja,Mara,20,2,Serengeti,Manchira,200,,GeoData Consultants Ltd,Other,True,1991,nira/tanira,nira/tanira,handpump,wug,user-group,other,other,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59306,71419,0.0,2013-01-16,Cocern,1296,TWESA,30.930385,-3.300592e+00,Kwa Mhunda,0,Lake Tanganyika,Nguvu Kazi A,Kigoma,16,1,Kibondo,Kakonko,1,,GeoData Consultants Ltd,,False,2013,gravity,gravity,gravity,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,spring,spring,groundwater,improved spring,improved spring
59310,32815,0.0,2013-04-04,Tasaf,892,TASAF,37.744090,-4.062296e+00,Ward Office,0,Pangani,Same,Kilimanjaro,3,3,Same,Same Urban,65,,GeoData Consultants Ltd,,,0,other,other,other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,other,unknown,other,other
59324,748,0.0,2013-01-22,World Vision,0,World Vision,0.000000,-2.000000e-08,Mwazwilo,0,Lake Victoria,Mbita,Shinyanga,17,1,Bariadi,Mbita,0,,GeoData Consultants Ltd,WUG,False,0,nira/tanira,nira/tanira,handpump,wug,user-group,unknown,unknown,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
59344,6450,0.0,2013-10-03,,1303,,36.900911,-3.111477e+00,Pentecoste Swidish,0,Internal,Madukani,Arusha,2,7,Meru,Ngarenanyuki,230,,GeoData Consultants Ltd,,,0,gravity,gravity,gravity,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,spring,spring,groundwater,communal standpipe,communal standpipe


In [25]:
df_training_values[df_training_values['public_meeting'].isna()]['recorded_by'].value_counts()

GeoData Consultants Ltd    3334
Name: recorded_by, dtype: int64

#### Inspecting the above dataframe you can see that all the items have been recorded by GeoData Consultants Ltd. Lets take a look at the whole dataframe. 

In [26]:
df_training_values[df_training_values['recorded_by'] == 'GeoData Consultants Ltd']['recorded_by'].value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

#### Seeing how all of the data has been recorded by the same vendor this will have no impact on our modeling. This is another column that we can drop. 

In [27]:
df_training_values.drop('recorded_by', axis = 1, inplace = True)

In [28]:
df_training_values['scheme_management'].value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [29]:
df_training_values['permit'].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

## Data Preparation

### For each column we will create two variables for modeling. One with the mode value for each column and one with a newly created variable denoted 'other'.

In [30]:
# Creating new dataframe
df_training_val_mode = df_training_values.copy()
df_training_val_other = df_training_values.copy()


In [31]:
# Filling NAN values to 'Other'

df_training_val_other['funder'] = df_training_val_other['funder']\
                            .replace(np.nan, 'Other', regex = True)

df_training_val_other['installer'] = df_training_val_other['installer']\
                                .replace(np.nan, 'Other', regex = True)

df_training_val_other['subvillage'] = df_training_val_other['subvillage']\
                                    .replace(np.nan, 'Other', regex = True)

df_training_val_other['public_meeting'] = df_training_val_other['public_meeting']\
                                            .replace(np.nan, 'Other', regex = True)

df_training_val_other['scheme_management'] = df_training_val_other['scheme_management']\
                                                .replace(np.nan, 'Other', regex = True)

df_training_val_other['permit'] = df_training_val_other['permit']\
                            .replace(np.nan, 'Other', regex = True)


In [32]:
# Filling NAN values with most common feature based on count

df_training_val_mode['funder'].fillna(df_training_val_mode['funder']\
                        .value_counts().index[0], inplace = True)

df_training_val_mode['installer'].fillna(df_training_val_mode['installer']\
                                .value_counts().index[0], inplace = True)

df_training_val_mode['subvillage'].fillna(df_training_val_mode['subvillage']\
                                    .value_counts().index[0], inplace = True)

df_training_val_mode['public_meeting'].fillna(df_training_val_mode['public_meeting']\
                                            .value_counts().index[0], inplace = True)

df_training_val_mode['scheme_management'].fillna(df_training_val_mode['scheme_management']\
                                                 .value_counts().index[0], inplace = True)

df_training_val_mode['permit'].fillna(df_training_val_mode['permit']\
                            .value_counts().index[0], inplace = True)

In [33]:
df_training_val_mode.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
w

In [34]:
df_training_val_other.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
w

### Joining Tables

#### Now lets merge the tables so we only have two data sets to work with. To start, both dataframes have an ID column so we will create a new column on our target set and drop the identical column.

In [35]:
df_training_labels['id_2'] = df_training_labels['id']
df_training_labels.drop('id', axis = 1, inplace = True)

#### Next we will join our tables and create two dataframes for mode and other

In [36]:
df_mode = pd.concat([df_training_val_mode, df_training_labels], join = 'inner', axis = 1)
df_other = pd.concat([df_training_val_other, df_training_labels], join = 'inner', axis = 1)

In [37]:
df_mode

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,id_2
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,69572
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,True,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,8776
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,34310
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,67743
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,VWC,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,19728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,Pangani,Kiduruni,Kilimanjaro,3,5,Hai,Masama Magharibi,125,True,Water Board,True,1999,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,60739
59396,27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,Rufiji,Igumbilo,Iringa,11,4,Njombe,Ikondo,56,True,VWC,True,1996,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,27263
59397,37057,0.0,2011-04-11,Government Of Tanzania,0,DWE,34.017087,-8.750434,Mashine,0,Rufiji,Madungulu,Mbeya,12,7,Mbarali,Chimala,0,True,VWC,False,0,swn 80,swn 80,handpump,vwc,user-group,pay monthly,monthly,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional,37057
59398,31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,Rufiji,Mwinyi,Dodoma,1,4,Chamwino,Mvumi Makulu,0,True,VWC,True,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional,31282


In [38]:
df_mode[df_mode['id'] == df_mode['id_2']]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,id_2
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,69572
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,True,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,8776
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,34310
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,67743
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,VWC,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,19728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,Pangani,Kiduruni,Kilimanjaro,3,5,Hai,Masama Magharibi,125,True,Water Board,True,1999,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,60739
59396,27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,Rufiji,Igumbilo,Iringa,11,4,Njombe,Ikondo,56,True,VWC,True,1996,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,27263
59397,37057,0.0,2011-04-11,Government Of Tanzania,0,DWE,34.017087,-8.750434,Mashine,0,Rufiji,Madungulu,Mbeya,12,7,Mbarali,Chimala,0,True,VWC,False,0,swn 80,swn 80,handpump,vwc,user-group,pay monthly,monthly,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional,37057
59398,31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,Rufiji,Mwinyi,Dodoma,1,4,Chamwino,Mvumi Makulu,0,True,VWC,True,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional,31282


In [39]:
df_other[df_other['id'] == df_other['id_2']]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,id_2
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,69572
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,Other,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,8776
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,34310
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,67743
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,Other,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,19728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,Pangani,Kiduruni,Kilimanjaro,3,5,Hai,Masama Magharibi,125,True,Water Board,True,1999,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,60739
59396,27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,Rufiji,Igumbilo,Iringa,11,4,Njombe,Ikondo,56,True,VWC,True,1996,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,27263
59397,37057,0.0,2011-04-11,Other,0,Other,34.017087,-8.750434,Mashine,0,Rufiji,Madungulu,Mbeya,12,7,Mbarali,Chimala,0,True,VWC,False,0,swn 80,swn 80,handpump,vwc,user-group,pay monthly,monthly,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional,37057
59398,31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,Rufiji,Mwinyi,Dodoma,1,4,Chamwino,Mvumi Makulu,0,True,VWC,True,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional,31282


#### As seen above our total rows equal that of the normal dataframe so we can conclude that our merges have been successful and we can drop our id_2 column.

In [40]:
df_mode.drop(['id_2'], axis = 1, inplace = True)
df_other.drop(['id_2'], axis = 1, inplace = True)

### Additional Columns to Drop

#### The Id columns and date_recorded are considered admin columns and will not have much predictive power in our model therefore we can drop these columns. 

In [41]:
df_mode.drop(['id', 'date_recorded'], axis = 1, inplace = True)
df_other.drop(['id', 'date_recorded'], axis = 1, inplace = True)

In [42]:
# mask = df.applymap(type) != bool
# d = {True: 'True', False: 'False'}

# df = df.where(mask, df.replace(d))

In [43]:
# pf = PolynomialFeatures(degree=2)

# X = df_no_cats

# pf.fit(X)
# pdf = pd.DataFrame(pf.transform(X), columns=pf.get_feature_names(X.columns))
# pdf
# corr_df = pdf.corr()
# corr_df[corr_df['x0'] > .75]['x0']

In [44]:
def get_totals(dataframe, filter_column, filter_groupby):

        '''
        **** filter_column & filter_groupby need to be passed
        as strings ****

        1. get_totals will calculate the sum of the variables
        within a column and return a new column with the 
        sum of their total occurances in the dataframe
        
        2. get_totals will calulate the percentage of the 
        values column vs the total values

        dataframe = pandas dataframe
        filter_column = column to filter by
        filter_groupby = groupby column to filter by

        '''

        df_new = pd.DataFrame(dataframe.groupby(filter_groupby)[filter_column].value_counts())
        df_new[f'{filter_groupby}_values'] = df_new[filter_column]
        df_new.drop(filter_column, axis = 1, inplace = True)
        df_new.reset_index(inplace = True)

        types = set()

        for idx, value in enumerate(df_new[f'{filter_groupby}_values']):
            for type_ in df_new[filter_column]:
                types.add(type_)
            
        total_values = {}
            
        for value in types:
            total_values[value] = df_new[df_new[filter_column] == value][f'{filter_groupby}_values'].sum()

        df_new[f'{filter_groupby}_total_values'] = df_new[filter_column].map(total_values)

        df_new[f'{filter_groupby}_percentage'] = df_new[f'{filter_groupby}_values'] / df_new[f'{filter_groupby}_total_values']
            
        return df_new


In [45]:
# function_df = df.drop('status_group', axis = 1)

# percentage_dict = {}

# for idx, column in enumerate(function_df.columns):
#     percentage_dict[column] = get_totals(df, column, 'status_group')

# pickle_out = open('percentage_dict.pickle', 'wb')
# pickle.dump(percentage_dict, pickle_out)

In [46]:
pickle_in = open('percentage_dict.pickle', 'rb')

percentage_dict = pickle.load(pickle_in)

In [47]:
# stat_group = df['status_group'].value_counts().index.to_list()

# for column in df:
#     fig, ax = plt.subplots(figsize = (5,5))
#     x = []
#     y = []
#     for stat in stat_group:
#         y.append(df.groupby('status_group')[column].value_counts()[stat].values.sum())
#         x.append(stat)
#     plt.title(column)
#     ax.bar(x, y)
#     plt.xticks(rotation=45, ha = 'right')

In [48]:
# for i in df:
#     print('------- {} -------'.format(i))
#     print(df.groupby('status_group')[i].value_counts())

### First Simple Model

#### To start our modeling process we will use only our integers and floats.

In [49]:
X_mode_fsm = df_mode.select_dtypes(['int64', 'float64'])
y_mode_fsm = df_mode['status_group']

X_other_fsm = df_other.select_dtypes(['int64','float64'])
y_other_fsm = df_other['status_group']

In [50]:
X_mode_train, X_mode_test, y_mode_train, y__mode_test = train_test_split(X_mode_fsm,y_mode_fsm, random_state = 42, stratify = y_mode_fsm)

X_other_train, X_other_test, y_other_train, y_other_test = train_test_split(X_other_fsm,y_other_fsm, random_state = 42, stratify = y_mode_fsm)


dtc_mode = DecisionTreeClassifier()
dtc_other = DecisionTreeClassifier()

dtc_mode.fit(X_mode_train, y_mode_train)
dtc_other.fit(X_other_train, y_other_train)

DecisionTreeClassifier()

In [51]:
print(dtc_mode.score(X_mode_train, y_mode_train))
print(dtc_other.score(X_other_train, y_other_train))

0.9858810325476992
0.9858810325476992


In [52]:
y_hat_mode = dtc_mode.predict(X_mode_train)
y_hat_other = dtc_other.predict(X_other_train)

In [53]:
print('mode recall:', recall_score(y_mode_train, y_hat_mode, average = 'macro'))
print('mode precision:', precision_score(y_mode_train, y_hat_mode, average = 'macro'))
print('mode f1 score:', f1_score(y_mode_train, y_hat_mode, average = 'macro'))

print('---------------------------------------------------------')

print('ohter recall:', recall_score(y_other_train, y_hat_other, average = 'macro'))
print('other precision:', precision_score(y_other_train, y_hat_other, average = 'macro'))
print('other f1 score:', f1_score(y_other_train, y_hat_other, average = 'macro'))

mode recall: 0.9646151149443224
mode precision: 0.9910484321431677
mode f1 score: 0.9771278730975221
---------------------------------------------------------
ohter recall: 0.9646151149443224
other precision: 0.9910484321431677
other f1 score: 0.9771278730975221


In [54]:
cross_val_score(dtc_mode, X_mode_train, y_mode_train, cv = 3, scoring = 'recall_macro')

array([0.53867397, 0.54440174, 0.54297815])

In [55]:
cross_val_score(dtc_other, X_mode_train, y_mode_train, cv = 3, scoring = 'recall_macro')

array([0.53528106, 0.54461303, 0.54689067])

### The cross val scores are pretty consitent across the folds. This doesnt give us much insight as far as our NAN replacements in during the EDA. 

In [56]:
# confusion_matrix(y_mode_train, y_hat_mode)

In [57]:
# roc_auc_score()

In [58]:
# fpr, tpr, thresholds = roc_curve(y_mode_train, y_hat_mode)

In [59]:
# # Seaborn's beautiful styling
# sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

# print('AUC: {}'.format(auc(fpr, tpr)))
# plt.figure(figsize=(10, 8))
# lw = 2
# plt.plot(fpr, tpr, color='darkorange',
#          lw=lw, label='ROC curve')
# plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.yticks([i/20.0 for i in range(21)])
# plt.xticks([i/20.0 for i in range(21)])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic (ROC) Curve')
# plt.legend(loc='lower right')
# plt.show()

### Model Exploration

#### Now that we have our baseline established we will loop through other models to see if we can get better results.

In [60]:
# model_selection = [LogisticRegression(random_state = 42, max_iter = 1000, n_jobs = -1),\
#                    RandomForestClassifier(random_state = 42, n_jobs = -1),\
#                    DecisionTreeClassifier(), KNeighborsClassifier(n_jobs = -1), 
#                   SVC(random_state = 42)]

# vanilla_models = {}

# for idx_mode, model in enumerate(model_selection):
#     vanilla_models[idx_mode] = model.fit(X_mode_train, y_mode_train)

In [61]:
# for key, val in enumerate(vanilla_models.values()):
#     print(val, val.score(X_mode_train, y_mode_train))

#### Based on the scores above, our scores are the best using RandomForestClassifier and DecisionTreeClassifier. Let's did deeper into these two models.

In [62]:
# # Select models from dictionary
# rfc = vanilla_models[1]
# dtc = vanilla_models[2]

# # predict on each model

# rfc_mode_yhat = rfc.predict(X_mode_train)
# dtc_mode_yhat = dtc.predict(X_mode_train)

In [63]:
# # Review scores for both models

# print('rfc recall:', recall_score(y_mode_train, rfc_mode_yhat, average = 'macro'))
# print('rfc precision:', precision_score(y_mode_train, rfc_mode_yhat, average = 'macro'))
# print('rfc f1 score:', f1_score(y_mode_train, rfc_mode_yhat, average = 'macro'))

# print('---------------------------------------------------------')

# print('dtc recall:', recall_score(y_mode_train, dtc_mode_yhat, average = 'macro'))
# print('dtc precision:', precision_score(y_mode_train, dtc_mode_yhat, average = 'macro'))
# print('dtc f1 score:', f1_score(y_mode_train, dtc_mode_yhat, average = 'macro'))


### Small advantage do the decision tree classifier. Lets see if our cross val & auc score shows anymore insights. 

In [64]:
# cross_val_score(rfc, X_mode_train, y_mode_train, cv = 5, n_jobs=-1, scoring = 'recall_macro')

In [65]:
# cross_val_score(dtc, X_mode_train, y_mode_train, cv = 5, n_jobs = -1, scoring = 'recall_macro')

### Since our stakeholder is considered with pump failures we need to avoid False Negatives. I.E. we do not want to say the bump is broken when it in fact it is operational. Therefore we need to focus on our recall score and tune our model appropriately which is why we're using the recall_macro score. As seen above our Random Forest is performing the best. We will move forward with tuning this model going forward.

### Our models above only utilized our numerical values. We will now begin using our categorical features and identify feature importance. 

In [82]:
df_cat = df_mode[df_mode.select_dtypes('object').columns.to_list()]
X_cat = df_cat.drop('status_group', axis = 1)
y_cat = df_cat['status_group']

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat, y_cat, random_state = 42, stratify = y_cat)

In [83]:
ohe = OneHotEncoder(drop = 'first')

df_feat_import = X_train_cat[['extraction_type', 'management', 'payment', 'water_quality', 'source', 'source_class']]

X_mode_train_enc = ohe.fit_transform(df_feat_import)

rfc_feat_import = RandomForestClassifier(random_state = 42, class_weight= 'balanced', n_jobs = -1)

rfc_feat_import.fit(X_mode_train_enc, y_train_cat)

RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)

In [84]:
rfc_feat_import.feature_importances_

array([1.12264983e-03, 7.45681054e-04, 6.12692594e-02, 1.36817285e-02,
       2.41278133e-03, 1.50494838e-02, 1.92466772e-02, 2.97543020e-02,
       1.21732087e-01, 5.61920493e-05, 1.89213060e-03, 5.60360444e-03,
       3.83114486e-03, 2.10226355e-02, 1.58139151e-02, 2.19375836e-03,
       3.60147349e-03, 8.47253820e-03, 2.08824910e-03, 1.42299922e-02,
       1.70338662e-02, 2.98065384e-03, 7.54380133e-03, 4.38027490e-02,
       1.05497826e-02, 2.16806437e-02, 1.81333493e-02, 3.00531093e-02,
       1.31605143e-02, 3.43603877e-02, 4.80824563e-02, 6.36812398e-02,
       2.11839935e-02, 3.57710270e-02, 3.59437413e-03, 7.63120107e-04,
       6.32408928e-03, 1.71804885e-02, 1.08932398e-02, 2.89928174e-02,
       3.16307135e-02, 6.53772029e-03, 1.10326236e-02, 2.24595732e-02,
       2.48962415e-03, 2.05711253e-02, 2.85139946e-02, 2.43527250e-02,
       3.17593959e-02, 1.64403497e-03, 3.69875015e-02, 2.43498066e-03])

In [86]:
ohe.get_feature_names()

array(['x0_cemo', 'x0_climax', 'x0_gravity', 'x0_india mark ii',
       'x0_india mark iii', 'x0_ksb', 'x0_mono', 'x0_nira/tanira',
       'x0_other', 'x0_other - mkulima/shinyanga', 'x0_other - play pump',
       'x0_other - rope pump', 'x0_other - swn 81', 'x0_submersible',
       'x0_swn 80', 'x0_walimi', 'x0_windmill', 'x1_other',
       'x1_other - school', 'x1_parastatal', 'x1_private operator',
       'x1_trust', 'x1_unknown', 'x1_vwc', 'x1_water authority',
       'x1_water board', 'x1_wua', 'x1_wug', 'x2_other',
       'x2_pay annually', 'x2_pay monthly', 'x2_pay per bucket',
       'x2_pay when scheme fails', 'x2_unknown', 'x3_fluoride',
       'x3_fluoride abandoned', 'x3_milky', 'x3_salty',
       'x3_salty abandoned', 'x3_soft', 'x3_unknown', 'x4_hand dtw',
       'x4_lake', 'x4_machine dbh', 'x4_other', 'x4_rainwater harvesting',
       'x4_river', 'x4_shallow well', 'x4_spring', 'x4_unknown',
       'x5_surface', 'x5_unknown'], dtype=object)

In [88]:
rfc_feat_import.score(X_mode_train_enc, y_train_cat)

0.5537598204264871

In [None]:
X_mode_train_enc_array = ohe.fit_transform(X_mode_train).toarray()


In [None]:
rfc_feat_import.

In [74]:
X_train = X_train[['region_code', 'source_type', 'basin', 'region', 'region_code', 'district_code',\
                   'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type',\
                  'extraction_type_group', 'extraction_type_class','management',\
                   'management_group', 'payment', 'payment_type', 'water_quality',\
                  'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
                   'source_class', 'waterpoint_type', 'waterpoint_type_group']]
X_test = X_test[['region_code', 'source_type', 'basin', 'region', 'region_code', 'district_code',\
                 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type',\
                'extraction_type_group', 'extraction_type_class','management',
                   'management_group', 'payment', 'payment_type', 'water_quality',\
                'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
                   'source_class', 'waterpoint_type', 'waterpoint_type_group']]

def prepare_inputs(X_train, X_test):
	oe = OrdinalEncoder()
	oe.fit(X_train)
	X_train_enc = oe.transform(X_train)
	X_test_enc = oe.transform(X_test)
	return X_train_enc, X_test_enc

X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)

NameError: name 'X_train' is not defined

In [75]:
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

NameError: name 'y_train' is not defined

In [None]:
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X_train_enc, y_train)
X_train_fs = fs.transform(X_train_enc)
X_test_fs = fs.transform(X_test_enc)

In [None]:
for i in range(len(fs.scores_)):
	print('Feature %d: %f' % (i, fs.scores_[i]))
# plot the scores
plt.barh([i for i in range(len(fs.scores_))], fs.scores_)
plt.show()

In [None]:
feature_names = ['region_code', 'source_type', 'basin', 'region', 'district_code',\
                   'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type',\
                  'extraction_type_group', 'extraction_type_class','management',\
                   'management_group', 'payment', 'payment_type', 'water_quality',\
                  'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
                   'source_class', 'waterpoint_type', 'waterpoint_type_group']

In [None]:
fs.scores_ > 1000

In [None]:
mask = fs.scores_ > 400
new_features = []
for bool, feature in zip(mask, feature_names):
    if bool:
        new_features.append(feature)

In [None]:
new_features

In [None]:
sum(df['extraction_type'] == df['extraction_type_group'])

In [None]:
new_features = ['region_code',
 'region',
 'district_code',
 'public_meeting',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'payment_type',
 'quantity_group',
 'source',
 'source_class',
 'waterpoint_type_group']

In [None]:
X_train

In [None]:
X = df[new_features]
y = df['status_group']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

In [None]:
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

preprocessor = ColumnTransformer([('cat', categorical_transformer, new_features)])

clf = Pipeline([('preprocessor', preprocessor), 
               ('classifier', RandomForestClassifier(verbose = 1, random_state = 42))])

clf.fit(X_train, y_train)

In [None]:
clf.score(X_train, y_train)

In [None]:
RandomForestClassifier()

In [None]:
param_grid = {
 'classifier__max_depth': [3,10, None],
 'classifier__criterion': ['gini', 'entropy'],
 'classifier__min_samples_leaf': [1, 2, 4],
 'classifier__n_estimators': [100, 500]
}

grid_search = GridSearchCV(clf, param_grid, n_jobs=-1, cv = 3, return_train_score=True)

grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score

In [None]:
yhat = grid_search.predict(X_train)

In [None]:
recall_score(y_train, yhat, average='macro')

In [None]:
precision_score(y_train, yhat, average='macro')

In [None]:
f1_score(y_train, yhat, average='macro')

In [None]:
param_grid = {
 'classifier__max_depth': [3,10, None],
 'classifier__criterion': ['gini', 'entropy'],
 'classifier__min_samples_leaf': [1, 2, 4],
 'classifier__n_estimators': [100, 500, 750, 1000]
}

grid_search = GridSearchCV(clf, param_grid, n_jobs=-1, cv = 3, return_train_score=True)

grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_estimator_['classifier'].feature_importances_

In [None]:
X_train = X_train.iloc[:, 1:]

In [None]:
just_to_a_var = grid_search.best_estimator_['preprocessor']

In [None]:
just_to_a_var.fit(X_train)

In [None]:
just_to_a_var