# Cross-Validation

- Do **k-fold cross-validation** with an independent test set
- Use scikit-learn for **hyperparameter optimization**

In [None]:
%%capture

import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = '/Users/keila/Documents/Lambda/Units_Git/DS-Unit-2-Kaggle-Challenge/data/'

In [None]:
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd

  import pandas.util.testing as tm


# I. Wrangle Data

In [None]:
def wrangle(fm_path, tv_path=None):
    """Load the feature-matrix CSV, optionally join the labels, and clean it.

    Parameters
    ----------
    fm_path : str or file-like
        Location of the feature-matrix CSV, passed straight to ``pd.read_csv``.
        It must contain ``id``, ``construction_year``, ``date_recorded`` and
        the three columns dropped below.
    tv_path : str or file-like, optional
        Location of the target-vector CSV. When truthy, it is read and merged
        onto the features with ``pd.merge`` defaults (join on shared columns).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``id`` with a ``pump_age`` column added, the listed
        columns removed, and every column with fewer than 60% non-missing
        values dropped.
    """
    # Zeros and the -2e-08 sentinel are read as missing values.
    features = pd.read_csv(fm_path,
                           na_values=[0, -2.000000e-08],
                           parse_dates=['construction_year', 'date_recorded'])
    if tv_path:
        features = pd.merge(features, pd.read_csv(tv_path))
    indexed = features.set_index('id')

    # Feature engineering (Credit: Mena and Keila):
    # pump age = year the row was recorded minus year of construction.
    year_recorded = indexed['date_recorded'].dt.year
    year_built = indexed['construction_year'].dt.year
    with_age = indexed.assign(pump_age=year_recorded - year_built)

    # Columns the notebook treats as constant or repeated, plus the two date
    # columns that were only needed to derive pump_age.
    unwanted = ['recorded_by', 'extraction_type_group', 'quantity_group',
                'construction_year', 'date_recorded']

    # Keep a column only if at least 60% of its rows are non-missing.
    min_non_missing = len(with_age)*.6
    return with_age.drop(columns=unwanted).dropna(axis=1, thresh=min_non_missing)

# Training data: feature matrix merged with its labels.
df = wrangle(DATA_PATH + 'waterpumps/train_features.csv',
             DATA_PATH + 'waterpumps/train_labels.csv')

# Test data: features only, since no label file is passed.
X_test = wrangle(DATA_PATH + 'waterpumps/test_features.csv')

# II. Split Data

## Split Target Vector (TV) from Feature Matrix (FM)

In [None]:
target = 'status_group'

# Feature matrix = every column except the target; target vector = that column.
X_train, y_train = df.drop(columns=[target]), df[target]

# Training-Validation Split

- Since we're doing k-fold CV, there's no need for a validation set.

# III. Establish Baseline

This is a **classification** problem, so our baseline will be the **accuracy** of always predicting the majority class.

In [None]:
# Relative frequency of the most common class in the training labels.
majority_share = y_train.value_counts(normalize=True).max()
print('Baseline Accuracy:', majority_share)

Baseline Accuracy: 0.5430899510092763


# IV. Build Models

- `DecisionTreeClassifier`
- `RandomForestClassifier`

In [None]:
# Decision-tree pipeline: encode categoricals -> impute medians -> classify.
model_dt = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    DecisionTreeClassifier(random_state=42),
)

# Optional for this notebook's purposes; the trailing ';' hides the repr.
model_dt.fit(X_train, y_train);

In [None]:
# Random-forest pipeline: encode categoricals -> impute medians -> classify.
model_rf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=30, random_state=42),
)

# Deliberately not fitted here: an explicit fit is optional for what this
# notebook does next (cross-validation).

**Check cross-validation scores**

In [None]:
# Cross-validated scores for the decision-tree pipeline, using all CPU cores.
cv_scores_dt = cross_val_score(estimator=model_dt, X=X_train, y=y_train,
                               n_jobs=-1)

In [None]:
# Cross-validated scores for the random-forest pipeline, using all CPU cores.
cv_scores_rf = cross_val_score(estimator=model_rf, X=X_train, y=y_train,
                               n_jobs=-1)

In [None]:
# Per-fold scores, then their mean and spread, for the decision tree.
print('CV score DecisionTreeClassifier', cv_scores_dt, sep='\n')
print('Mean CV accuracy score:', np.mean(cv_scores_dt))
print('STD CV accuracy score:', np.std(cv_scores_dt))

CV score DecisionTreeClassifier
[0.74604377 0.74713805 0.75159933 0.73678451 0.70957151]
Mean CV accuracy score: 0.738227433863851
STD CV accuracy score: 0.015116517522006122


In [None]:
# Per-fold scores, then their mean and spread, for the random forest.
print('CV score RandomForestClassifier', cv_scores_rf, sep='\n')
print('Mean CV accuracy score:', np.mean(cv_scores_rf))
print('STD CV accuracy score:', np.std(cv_scores_rf))

CV score RandomForestClassifier
[0.80218855 0.80042088 0.80328283 0.80294613 0.79914134]
Mean CV accuracy score: 0.8015959451404354
STD CV accuracy score: 0.001576427451542507


# V. Tune Model

- What are important hyperparameters for `RandomForestClassifier`?

  - `max_depth`
  - `n_estimators`
  - imputation strategy

**`GridSearchCV`:** Very thorough, but it can take a long time.

In [None]:
# Untuned pipeline; the imputer strategy and the forest's size and depth are
# supplied by the search.
estimator = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(random_state=42),
)

# Keys are '<pipeline step name>__<parameter name>'.
# 2 strategies x 4 forest sizes x 7 depths = 56 candidate combinations.
params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestclassifier__n_estimators': [25, 50, 75, 100],
    'randomforestclassifier__max_depth': range(5, 36, 5),
}

# Exhaustive search: every candidate is scored with 5-fold cross-validation.
model_gs = GridSearchCV(estimator=estimator, param_grid=params,
                        cv=5, n_jobs=-1, verbose=1)

# The trailing ';' suppresses the long estimator repr.
model_gs.fit(X_train, y_train);

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 19.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ordinalencoder',
                                        OrdinalEncoder(cols=None,
                                                       drop_invariant=False,
                                                       handle_missing='value',
                                                       handle_unknown='value',
                                                       mapping=None,
                                                       return_df=True,
                                                       verbose=0)),
                                       ('simpleimputer',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
          

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the grid search.
model_gs.best_params_

{'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__n_estimators': 100,
 'simpleimputer__strategy': 'mean'}

In [None]:
# Mean cross-validated score of the best grid-search candidate.
model_gs.best_score_

0.8090203292855032

**`RandomizedSearchCV`:** Quicker but less exhaustive, since it only tries `n_iter` randomly sampled parameter combinations; usually good enough.

In [None]:
# Randomized search: score only `n_iter` candidates sampled from `params`
# instead of every combination, each with 5-fold cross-validation.
model_rs = RandomizedSearchCV(estimator,
                              param_distributions = params,
                              n_iter = 3,
                              cv = 5,
                              n_jobs = -1,
                              verbose = 1,
                              # Seed the sampling so the same candidates are
                              # drawn on every run, matching the seed used
                              # for the models in this notebook.
                              random_state = 42)

# The trailing ';' suppresses the long estimator repr.
model_rs.fit(X_train, y_train);

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   59.3s finished


In [None]:
# Sanity check: show the class of the fitted search object.
type(model_rs)

sklearn.model_selection._search.RandomizedSearchCV

In [None]:
# Best parameter combination among the randomly sampled candidates.
model_rs.best_params_

{'randomforestclassifier__max_depth': 35,
 'randomforestclassifier__n_estimators': 100,
 'simpleimputer__strategy': 'mean'}

In [None]:
# Mean cross-validated score of the best randomized-search candidate.
model_rs.best_score_

0.806983190209472

In [None]:
# Pipeline configured with the best parameters found by the randomized search.
# NOTE(review): presumably refit on the full training set, since no `refit` argument was passed — confirm.
model_rs.best_estimator_

# Make Submission

In [None]:
# Predict test-set labels with the grid-search model.
# NOTE(review): relies on GridSearchCV's default refit behaviour, since no `refit` argument was passed — confirm.
y_pred = model_gs.predict(X_test)

In [None]:
# One-column frame of predictions, keyed by the test set's index.
submission = pd.DataFrame(y_pred, index=X_test.index, columns=['status_group'])
submission.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [None]:
# Write the predictions (index included) to a CSV in the current working directory.
submission.to_csv('2021-02-17_submission.csv')

# VI. Communicate Results

**Showing Feature Importance**

Plot the feature importance for our `RandomForest` model.