In [1]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/main/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [2]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# cross validation
from sklearn.model_selection import cross_val_score
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score


This project focused on creating and improving a model for the Tanazania Water Pump dataset. The goal was to create a model to predict whether a water pump is functional, non-functional, or needs repair.

Dataset source: [DrivenData.org](https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/).

## Directions

The tasks for this project are as follows:

- **Task 1:** Use `wrangle` function to import training and test data.
- **Task 2:** Split training data into feature matrix `X` and target vector `y`.
- **Task 3:** Establish the baseline accuracy score for your dataset.
- **Task 4:** Build `clf_dt`.
- **Task 5:** Build `clf_rf`.
- **Task 6:** Evaluate classifiers using k-fold cross-validation.
- **Task 7:** Tune hyperparameters for best performing classifier.
- **Task 8:** Print out best score and params for model.
- **Task 9:** Create `submission.csv` and upload to Kaggle.

You should limit yourself to the following libraries for this project:

- `category_encoders`
- `matplotlib`
- `pandas`
- `pandas-profiling`
- `sklearn`

# I. Wrangle Data

In [3]:
def wrangle(fm_path, tv_path=None):
    if tv_path:
        df = pd.merge(pd.read_csv(fm_path, parse_dates = ['date_recorded'],
                                  na_values=[0, -2.000000e-08]),
                      pd.read_csv(tv_path)).set_index('id')
    else:
        df = pd.read_csv(fm_path, parse_dates = ['date_recorded'],
                         na_values=[0, -2.000000e-08],
                         index_col='id')
    

    # Drop constant columns
    df.drop(columns=['recorded_by'], inplace=True)

    # Drop HCCCs
    #cutoff = 50
    #drop_cols = [col for col in df.select_dtypes('object').columns
       #          if df[col].nunique() > cutoff]
    #df.drop(columns=drop_cols, inplace=True)
    HCCV = [col for col in df.select_dtypes('object') if df[col].nunique() > 50]
    df.drop(columns=HCCV, inplace=True)
    # Drop duplicate columns
    dupe_cols = [col for col in df.head(15).T.duplicated().index
                 if df.head(15).T.duplicated()[col]]
    df.drop(columns=dupe_cols, inplace=True)         


# Feature engineer
    #df['water_per_person'] =  df['amount_tsh'] / df['population']   
    #Pump Age
    df['date_recorded'] = pd.to_datetime(df['date_recorded'], infer_datetime_format=True)

    
    df['year'] = df['date_recorded'].dt.year
    df['month'] = df['date_recorded'].dt.month
    df['day'] = df['date_recorded'].dt.day
    df = df.drop(columns='date_recorded')
    df['pump_age'] = df['year'] - df['construction_year']
    df.drop(columns = ['waterpoint_type_group', 'num_private', 'payment_type',
                       'payment', 'public_meeting'], inplace= True)
    #Water per person
    df['water/person'] = df['amount_tsh'] / df['population']
     #population per age 
    #df['pop/year'] = df['population'] / df['pump_age']
    #df = df.dropna()
    
    return df

**Task 1:** Used the above `wrangle` function to read `train_features.csv` and `train_labels.csv` into the DataFrame `df`, and `test_features.csv` into the DataFrame `X_test`.

In [4]:
train_feature_path = DATA_PATH+'waterpumps/train_features.csv'
train_target_path = DATA_PATH+'waterpumps/train_labels.csv'
test_feature_path = DATA_PATH+'waterpumps/test_features.csv'

In [5]:
df = wrangle(train_feature_path, train_target_path)

In [6]:
X_test = wrangle(test_feature_path)

In [7]:
df = wrangle(DATA_PATH +'waterpumps/train_features.csv', DATA_PATH + 'waterpumps/train_labels.csv' )
X_test = wrangle(DATA_PATH +'waterpumps/test_features.csv')

In [8]:
#Get a series with the cardinality of the nonnumeric features
cardinality = df.select_dtypes(exclude='number').nunique()
# Get a list of all categorical features with cardinality <= 50
low_categorical_features = cardinality[cardinality <= 50].index.tolist()
#get a list of high categorical features with cardinality >= 50
high_categorical_features = cardinality[cardinality >= 50].index.tolist()

# II. Split Data

**Task 2:** Split my DataFrame `df` into a feature matrix `X` and the target vector `y`. I want to predict `'status_group'`.

**Note:** I won't need to do a train-test split because you'll use cross-validation instead.

In [9]:
target = 'status_group'
X = df.drop(columns=target)
y = df[target]


# III. Establish Baseline

**Task 3:** Since this is a **classification** problem, I need to  establish a baseline accuracy score. 

In [10]:
baseline_acc = baseline_acc = (y.value_counts(normalize=True).max())
print('Baseline Accuracy Score:', baseline_acc)

Baseline Accuracy Score: 0.5430899510092763


# IV. Build Models

**Task 4:** Build a `Pipeline` named `clf_dt`. My `Pipeline` should include:

- an `OrdinalEncoder` transformer for categorical features.
- a `SimpleImputer` transformer fot missing values.
- a `DecisionTreeClassifier` Predictor.

**Note:** Do not train `clf_dt`. You'll do that in a subsequent task. 

In [11]:
clf_dt = make_pipeline(OrdinalEncoder(),                 
                         SimpleImputer(),
                         DecisionTreeClassifier (random_state = 200))


In [12]:
#cv_scores_dt = cross_val_score(clf_dt, X_train, y_train, cv=3, scoring = 'accuracy')
#print(cv_scores_dt)

**Task 5:** Build a `Pipeline` named `clf_rf`. Your `Pipeline` should include:

- an `OrdinalEncoder` transformer for categorical features.
- a `SimpleImputer` transformer fot missing values.
- a `RandomForestClassifier` predictor.

**Note:** Do not train `clf_rf`. You'll do that in a subsequent task. 

In [13]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.feature_selection import f_regression, SelectKBest


clf_rf = make_pipeline(OrdinalEncoder(),                 
                         SimpleImputer(),
                         RandomForestClassifier(n_estimators = 300, criterion='gini',min_samples_split = 4,
                                                min_samples_leaf = 3,
                                                max_depth = 25, random_state = 200 , n_jobs = -1))


In [14]:
#cv_scores_rf = cross_val_score(clf_rf, X_train, y_train, cv=3, scoring = 'accuracy')
#print(cv_scores_rf)

# V. Check Metrics

**Task 6:** Evaluate the performance of both of your classifiers using k-fold cross-validation.

In [15]:
from sklearn.model_selection import KFold, cross_val_score


In [16]:
#from sklearn import tree
#from sklearn.cross_validation import cross_val_score
#from pprint import pprint
cv_scores_dt = cross_val_score(clf_dt, X, y, cv=3, scoring = 'accuracy')
cv_scores_rf = cross_val_score(clf_rf, X, y, cv=3, scoring = 'accuracy')

In [17]:
print('CV scores DecisionTreeClassifier')
print(cv_scores_dt)
print('Mean CV accuracy score:', cv_scores_dt.mean())
print('STD CV accuracy score:', cv_scores_dt.std())

CV scores DecisionTreeClassifier
[0.75035354 0.7480303  0.74306783]
Mean CV accuracy score: 0.7471505566975035
STD CV accuracy score: 0.003038731600273231


In [18]:
print('CV score RandomForestClassifier')
print(cv_scores_rf)
print('Mean CV accuracy score:', cv_scores_rf.mean())
print('STD CV accuracy score:', cv_scores_rf.std())

CV score RandomForestClassifier
[0.80338384 0.80762626 0.80246477]
Mean CV accuracy score: 0.8044916239860429
STD CV accuracy score: 0.002248057056135457


# VI. Tune Model

**Task 7:** Choose the best performing of my two models and tune its hyperparameters using a `RandomizedSearchCV` named `model`. Include cross-validation and that `n_iter` is set to at least `25`.



In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

In [20]:
max_depth = [10, 30, 25] # 3 choices
n_estimators = range(300, 250, 400) # 5 choices
min_sample_split = range(2,4, 2) # 4 choices
criterion = ['gini', 'entropy'] # 2 choices
min_samples_leaf = [3, 2, 4, 5]

param_grid = {
    'simpleimputer__strategy': ['mean', 'median', 'most_frequent'], 
    'randomforestclassifier__max_depth': [10, 30, 50],
    'randomforestclassifier__n_estimators': range(150, 300, 400),
    'randomforestclassifier__min_samples_split': range(5, 21, 5),
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__min_samples_leaf': [3,2,4, 5]}

In [21]:
#model_gs = GridSearchCV(clf_rf, param_grid=param_grid, n_jobs = -1, cv=3,  verbose=10)
#model_gs.fit(X, y)

In [22]:
model = RandomizedSearchCV(
    clf_rf, 
    param_distributions=param_grid, 
    n_iter=25, 
    cv=5,  
    n_jobs=-1,
    verbose=10)

model.fit(X, y)



Fitting 5 folds for each of 25 candidates, totalling 125 fits




RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder()),
                                             ('simpleimputer', SimpleImputer()),
                                             ('randomforestclassifier',
                                              RandomForestClassifier(max_depth=25,
                                                                     min_samples_leaf=3,
                                                                     min_samples_split=4,
                                                                     n_estimators=300,
                                                                     n_jobs=-1,
                                                                     random_state=200))]),
                   n_iter=25, n_jobs=-1,
                   param_distributions={'randomforestclassifier__criterion': ['gini',
                                              

**Task 8:** Print out the best score and best params for `model`.

In [23]:
best_score = model.best_score_
best_params = model.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

Best score for `model`: 0.8094917111740918
Best params for `model`: {'simpleimputer__strategy': 'mean', 'randomforestclassifier__n_estimators': 150, 'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__max_depth': 30, 'randomforestclassifier__criterion': 'entropy'}
