# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot
%matplotlib inline

### Datasets

In [2]:
clean_df = pd.read_csv('../data/clean/tanzania.csv', index_col=0)

### Drop Lat and Long

In [3]:
clean_df = clean_df.drop(['latitude', 'longitude'], axis=1)

In [4]:
y = clean_df['target']
X = clean_df.drop(['target'], axis=1)

In [5]:
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(exclude=np.number).columns)

### Creating Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scale', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Models
- LinearSVC
- k-NN
- Support Vector Machine Algorithm
- XGBoost
- Random Forest

### Predict
- added SMOTE option
- added PCA to pipeline and see if time and score increase

In [67]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

def fit_predict(model, X_train, X_test, y_train, y_test, smote=False):
    '''fit pipeline using given model, and return predictions'''
    
    param_grid = model['params']
    model = model['model']
        
    if smote:
        # Bundle preprocessing and modeling code in a pipeline
        my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('smote', SMOTE()),
                                      ('under', RandomUnderSampler()),
                                      ('pca', TruncatedSVD()),
                                      ('model', model)
                                     ])
    else:
        # Bundle preprocessing and modeling code in a pipeline
        my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('pca', TruncatedSVD()),
                                      ('model', model)
                                     ])

    search = GridSearchCV(estimator=my_pipeline,
             param_grid=param_grid, n_jobs=-1, verbose=2, cv=3)
    
    search.fit(X_train, y_train)
    
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    # Preprocessing of validation data, get predictions
    test_preds = search.predict(X_test)
    train_preds = search.predict(X_train)
    return test_preds, train_preds

## Evaluate
What are the most important things to look for with regression?

- TODO: add graphs

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def metrics(model_name, y_train, y_test, y_train_pred, y_test_pred):
    '''Print out the evaluation metrics for a given models predictions'''
    print(f'Model: {model_name}', )
    print('-'*60)
    print(f'test accuracy: {accuracy_score(y_test, y_test_pred)}')
    print(f'train accuracy: {accuracy_score(y_train, y_train_pred)}')
    print('-'*60)
    print('\ntest report:\n' + classification_report(y_test, y_test_pred))
    print('~'*60)
    print('\ntrain report:\n' + classification_report(y_train, y_train_pred))    
    print('-'*60)


## LinearSVC

### Why LinearSVC
The objective of a Linear SVC (Support Vector Classifier) is to fit to the data you provide, returning a "best fit"
hyperplane that divides, or categorizes, your data. 

It has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

LinearSVC is another (faster) implementation of Support Vector Classification for the case of a linear kernel.

In [32]:
smallest_num = clean_df['target'].value_counts().sort_values().values[0]
target_0 = clean_df[clean_df['target'] == 0].sample(smallest_num)
target_1 = clean_df[clean_df['target'] == 1].sample(smallest_num)
target_2 = clean_df[clean_df['target'] == 2].sample(smallest_num)

sampled_df = pd.concat([target_0, target_1, target_2])

In [33]:
sampled_df['target'].value_counts()

2    2315
1    2315
0    2315
Name: target, dtype: int64

In [34]:
X_sampled_df = sampled_df.drop('target', axis=1)
y_sampled_df = sampled_df['target']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_sampled_df, y_sampled_df, test_size=.3, random_state=42)

In [36]:
X_sampled_df.head()

Unnamed: 0,funder,gps_height,installer,basin,subvillage,region,district_code,population,scheme_management,permit,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,age
48899,Unicef,1485,DWE,Rufiji,Chafwimi,Iringa,4,120,VWC,False,gravity,vwc,monthly,soft,enough,river,communal standpipe,36
28922,Government Of Tanzania,1227,DWE,Pangani,Oru,Kilimanjaro,4,55,VWC,True,gravity,vwc,unknown,soft,enough,spring,other,57
24811,Losaa-kia Water Supply,1599,Losaa-Kia water supp,Pangani,Lobuo,Kilimanjaro,7,1,Water Board,True,gravity,water board,monthly,soft,enough,spring,communal standpipe,21
27918,Plan Int,62,DDCA,Wami / Ruvu,Vihingo,Pwani,3,250,VWC,True,india mark ii,vwc,per bucket,salty,enough,machine dbh,hand pump,13
31746,Dhv,266,DWE,Rufiji,Kidatu A,Morogoro,3,600,Company,True,gravity,vwc,never pay,soft,seasonal,river,communal standpipe,25


In [37]:
X_sampled_df.shape[0]

6945

### Hyperparameters

In [56]:
param_dict = {
    # 
    'model__C': [1, 10, 100], 
    'model__tol': [1,0.1,0.01],
    'model__max_iter': [10000, 20000],
    'model__penalty': ['l1', 'l2'],
    'model__multi_class': ['ovr', 'crammer_singer']
}

### Fitting LinearSVC

In [57]:
from sklearn.svm import LinearSVC

lsvc = { 'model': LinearSVC(), 'params': param_dict }

test_preds, train_preds = fit_predict(lsvc, X_train, X_test, y_train, y_test)

Best parameter (CV score=0.658):
{'model__C': 1, 'model__max_iter': 10000, 'model__multi_class': 'ovr', 'model__penalty': 'l2', 'model__tol': 0.01}


### Evaluating LinearSVC

In [58]:
metrics('LinearSVC', y_train, y_test, train_preds, test_preds)

Model: LinearSVC
------------------------------------------------------------
test accuracy: 0.654510556621881
train accuracy: 0.9487759720222176
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.63      0.60      0.62       727
           1       0.64      0.66      0.65       677
           2       0.70      0.70      0.70       680

    accuracy                           0.65      2084
   macro avg       0.65      0.66      0.66      2084
weighted avg       0.65      0.65      0.65      2084

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1588
           1       0.93      0.96      0.94      1638
           2       0.97      0.96      0.96      1635

    accuracy                           0.95      4861
   macro avg       0.95      0.95      0.95      4

There is a debate that LinearSVC is not a good fit for multi-class classification problems.

There are some ways to adjust for it: 
- using the 'crammer_singer' algorithm

## KNN

### Why KNN
KNNs are a non-parametric, lazy learning algorithm. It makes no underlying assumptions about the distribution of data. 

No training is necessary! 

KNN makes predictions just-in-time by calculating the similarity between an input sample and each training instance.

It is a Simple algorithm — to explain and understand/interpret. It is versatile — useful for classification or regression.

### Hyperparameters

In [65]:
param_dict = {
    'pca__n_components': range(80, 120, 20),
    # amount of neighbors
    'model__n_neighbors': range(1, 20, 2),
    # leaf size
    'model__leaf_size': range(1, 50, 5),
    'model__weights': ['distance'],
    'model__p': [1]
}

### Fitting KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier
knn = { 'model': KNeighborsClassifier(), 'params': param_dict }

test_preds, train_preds = fit_predict(knn, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  9.6min finished


Best parameter (CV score=0.657):
{'model__leaf_size': 26, 'model__n_neighbors': 19, 'model__p': 1, 'model__weights': 'distance', 'pca__n_components': 80}


### Evaluating KNN

In [69]:
metrics('k-NN', y_train, y_test, train_preds, test_preds)

Model: k-NN
------------------------------------------------------------
test accuracy: 0.682341650671785
train accuracy: 0.9934169923883974
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.69      0.64      0.66       727
           1       0.65      0.73      0.69       677
           2       0.72      0.68      0.70       680

    accuracy                           0.68      2084
   macro avg       0.68      0.68      0.68      2084
weighted avg       0.68      0.68      0.68      2084

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1588
           1       0.99      1.00      0.99      1638
           2       1.00      0.99      1.00      1635

    accuracy                           0.99      4861
   macro avg       0.99      0.99      0.99      4861
w

In [77]:
param_dict = {
    'pca__n_components': range(20, 60, 20),
    'model__colsample_bytree': [1.0],
    'model__eta': [.3, .2, .1, .05, .01, .005],
    'model__max_depth': [10],
    'model__min_child_weight': [6],
    'model__subsample': [0.8]
}

In [78]:
from xgboost import XGBClassifier

xgb = { 'model': XGBClassifier(), 'params': param_dict }

test_preds, train_preds = fit_predict(xgb, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:   44.7s remaining:   12.8s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   56.8s finished


Best parameter (CV score=0.666):
{'model__colsample_bytree': 1.0, 'model__eta': 0.01, 'model__max_depth': 10, 'model__min_child_weight': 6, 'model__subsample': 0.8, 'pca__n_components': 40}


In [79]:
metrics('XGBoost', y_train, y_test, train_preds, test_preds)

Model: XGBoost
------------------------------------------------------------
test accuracy: 0.6861804222648752
train accuracy: 0.8792429541246657
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.68      0.65      0.66       727
           1       0.69      0.71      0.70       677
           2       0.69      0.70      0.70       680

    accuracy                           0.69      2084
   macro avg       0.69      0.69      0.69      2084
weighted avg       0.69      0.69      0.69      2084

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1588
           1       0.85      0.89      0.87      1638
           2       0.91      0.88      0.89      1635

    accuracy                           0.88      4861
   macro avg       0.88      0.88      0.88      48