# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot
%matplotlib inline

## Baseline
How the different models compare without doing any scrubbing beyond whats needed to get it to work

In [2]:
features_df = pd.read_csv('../data/raw/X_train.csv')
target_df = pd.read_csv('../data/raw/y_train.csv')

In [3]:
print(features_df.shape)
features_df.head()

(59400, 40)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
target_df.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [5]:
X = features_df

In [6]:
y = target_df['status_group'].replace({'functional': 0, 'non functional': 1, 'functional needs repair': 2}).astype(int)

In [7]:
target_df['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [9]:
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(exclude=np.number).columns)

In [10]:
numerical_cols

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year']

### Creating Pipeline

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scale', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Models
- Xgboost
- LightGBM
- Random Forest
- Extra Trees
- k-NN
- Logistic Regression
- Naïve Bayes Classifier Algorithm
- K Means Clustering Algorithm
- Support Vector Machine Algorithm
- Apriori Algorithm
- Decision Trees

In [117]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from  sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

models = {
    'xgb': { 'model': XGBClassifier(),
             'params': {} },
    'lgbm': { 'model': LGBMClassifier(),
             'params': {} },
    'rfc': { 'model': RandomForestClassifier(),
            'params': {} },
    'knn': { 'model': KNeighborsClassifier(),
            'params': {} },
    'lgr': { 'model': LogisticRegression(),
            'params': { 'model__C': np.logspace(-4, 4, 4),
                        'model__max_iter': [10000] } },
    'svc': { 'model': SVC(),
            'params': {} },
    'dtc': { 'model': DecisionTreeClassifier(),
             'params': {} },
    'gnb': { 'model': GaussianNB(),
             'params': {} }
}

### Predict

In [105]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

def fit_predict(model, X, y):
    '''fit pipeline using given model, and return predictions'''
    
    param_grid = model['params']
    model = model['model']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
    
    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)
                                 ])
    
    search = GridSearchCV(estimator=my_pipeline,
             param_grid=param_grid, n_jobs=-1)
    
    search.fit(X_train, y_train)
    
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    # Preprocessing of validation data, get predictions
    test_preds = search.predict(X_test)
    train_preds = search.predict(X_train)
    return test_preds, train_preds

In [118]:
# test fit_predict()
test_preds, train_preds = fit_predict(models['lgr'], X_small, y_small)

Best parameter (CV score=0.751):
{'model__C': 0.046415888336127774, 'model__max_iter': 10000}


## Evaluate
What are the most important things to look for with regression?

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def metrics(model_name, y_train, y_test, y_train_pred, y_test_pred):
    '''Print out the evaluation metrics for a given models predictions'''
    print(f'Model: {model_name}', )
    print('-'*60)
    print(f'test accuracy: {accuracy_score(y_test, y_test_pred)}')
    print(f'train accuracy: {accuracy_score(y_train, y_train_pred)}')
    print('-'*60)
    print('\ntest report:\n' + classification_report(y_test, y_test_pred))
    print('~'*60)
    print('\ntrain report:\n' + classification_report(y_train, y_train_pred))    
    print('-'*60)


In [119]:
# test metrics()
metrics('lgr', y_train, y_test, train_preds, test_preds)

Model: lgr
------------------------------------------------------------
test accuracy: 0.4144
train accuracy: 0.39226666666666665
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.56      0.70      0.62      1372
           1       0.52      0.01      0.03       932
           2       0.09      0.35      0.14       196

    accuracy                           0.41      2500
   macro avg       0.39      0.35      0.26      2500
weighted avg       0.51      0.41      0.36      2500

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.54      0.67      0.60      4089
           1       0.42      0.01      0.02      2875
           2       0.08      0.35      0.13       536

    accuracy                           0.39      7500
   macro avg       0.35      0.34      0.25      7500
weighted avg

In [106]:
for name, model in models.items():
    try:
        test_preds, train_preds = fit_predict(model, X, y)
        metrics(name, y_train, y_test, train_preds, test_preds)
    except:
        print(f'error fitting {model}')

error fitting {'model': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None), 'params': {}}
error fitting {'model': LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, 

Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 747, in _queue_management_worker
    recursive_terminate(p)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(process)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/utils.py", line 53, in _recursive_terminate_without_psutil
    _recursive_terminate(process.pid)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/u

error fitting {'model': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'params': {}}
error fitting {'model': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 'params': {}}
error fitting {'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                  

Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 747, in _queue_management_worker
    recursive_terminate(p)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(process)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/utils.py", line 53, in _recursive_terminate_without_psutil
    _recursive_terminate(process.pid)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/joblib/externals/loky/backend/u

error fitting {'model': GaussianNB(priors=None, var_smoothing=1e-09), 'params': {}}


## Sample
Smaller dataset for faster fitting

In [68]:
subsample_df

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31629,50.0,2011-02-18,W.B,91,Canop,39.127857,-7.544019,Msikitini,0,Rufiji,...,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,non functional
5176,1200.0,2011-03-07,Rc Ch,1739,DWE,34.994396,-8.474558,none,0,Rufiji,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
5879,2000.0,2011-07-15,H,1235,H,32.978062,-2.515321,Kwa Buswelu B,0,Lake Victoria,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
44011,0.0,2012-10-27,Hsw,1217,HSW,33.787274,-1.732290,Kwa Mzee Nyatihu,0,Lake Victoria,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,non functional
57727,500.0,2013-03-23,Government Of Tanzania,1833,DWE,36.644697,-3.256113,Kwa Mwalimu Lumayani,0,Pangani,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40198,50.0,2013-02-23,Anglican Church,1257,Anglica Church,36.294321,-5.488812,Sorongine,0,Wami / Ruvu,...,soft,good,insufficient,insufficient,machine dbh,borehole,groundwater,other,other,non functional
22540,0.0,2012-10-22,Mileniam Project,0,Mileniam project,32.607653,-5.056114,Idd Omary Said,0,Lake Tanganyika,...,salty,salty,enough,enough,other,other,unknown,hand pump,hand pump,functional
934,0.0,2013-03-14,Lawate Fuka Water Suppl,1418,Lawate fuka water su,37.110720,-3.174713,Kwa Nelson B Munuo,0,Pangani,...,soft,good,enough,enough,other,other,unknown,other,other,functional
40485,1200.0,2011-03-26,Danida,1392,DWE,34.849950,-8.575696,none,0,Rufiji,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional


In [107]:
subsample_df = features_df.set_index('id').join(target_df.set_index('id')).reset_index()

In [108]:
subsample_df = subsample_df.sample(10000)
X_small = subsample_df.drop('status_group', axis=1)
y_small = subsample_df['status_group'].astype('category').cat.codes

In [109]:
for name, model in models.items():
    try:
        test_preds, train_preds = fit_predict(model, X_small, y_small)
        metrics(name, y_train, y_test, train_preds, test_preds)
    except:
        print(f'error fitting {model}')

Best parameter (CV score=0.760):
{}
Model: xgb
------------------------------------------------------------
test accuracy: 0.4068
train accuracy: 0.38066666666666665
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.55      0.67      0.61      1372
           1       0.42      0.02      0.05       932
           2       0.09      0.36      0.14       196

    accuracy                           0.41      2500
   macro avg       0.35      0.35      0.27      2500
weighted avg       0.47      0.41      0.36      2500

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.54      0.63      0.58      4089
           1       0.36      0.03      0.06      2875
           2       0.08      0.35      0.12       536

    accuracy                           0.38      7500
   macro avg       0.32      0