# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot
%matplotlib inline

### Datasets

In [2]:
# Load the cleaned dataset; the CSV's first column becomes the row index.
clean_df = pd.read_csv(
    '../data/clean/tanzania.csv',
    index_col=0,
)

### Drop Lat and Long

In [3]:
# Drop the raw coordinate columns from the working frame.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `clean_df`, a second execution would otherwise raise KeyError for
# columns that were already removed.
clean_df = clean_df.drop(columns=['latitude', 'longitude'], errors='ignore')

In [4]:
# Separate the feature columns (X) from the label column (y).
X = clean_df.drop(columns=['target'])
y = clean_df['target']

In [5]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing branch.
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

### Creating Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: fill missing values with a constant, then standardize.
# NOTE(review): no fill_value is given, so the imputer's default constant is
# used (presumably 0 for numeric data) - confirm that is the intended fill.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('scale', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

## Models
- LinearSVC
- k-NN
- Support Vector Machine Algorithm
- XGBoost
- Random Forest

### Predict
- TODO: add SMOTE option
- TODO: add PCA to pipeline and see if time and score increase

In [57]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

def fit_predict(model, X_train, X_test, y_train, y_test, smote=False, random_state=None):
    '''Grid-search a preprocessing + model pipeline and return its predictions.

    Parameters
    ----------
    model : dict
        Must provide two keys: 'model' (the estimator placed in the final
        pipeline step) and 'params' (the grid handed to GridSearchCV; keys
        address pipeline steps, e.g. 'model__C').
    X_train, y_train :
        Data the grid search is fitted on.
    X_test :
        Held-out features to predict on.
    y_test :
        Accepted so call sites can pass the full split, but not used here.
    smote : bool, default False
        When True, a SMOTE step followed by a RandomUnderSampler step is
        inserted between the preprocessor and the model.
    random_state : int or None, default None
        Seed forwarded to both resampling steps so a smote=True run can be
        repeated. None keeps the previous, unseeded behaviour.

    Returns
    -------
    (test_preds, train_preds) : tuple
        Predictions of the fitted search on X_test and X_train, in that order.
    '''
    param_grid = model['params']
    estimator = model['model']

    # Build the step list once; the resampling steps are the only difference
    # between the smote and non-smote variants.
    steps = [('preprocessor', preprocessor)]
    if smote:
        steps.append(('smote', SMOTE(random_state=random_state)))
        steps.append(('under', RandomUnderSampler(random_state=random_state)))
    steps.append(('model', estimator))

    # Bundle preprocessing, optional resampling and the model in one pipeline.
    my_pipeline = Pipeline(steps=steps)

    search = GridSearchCV(estimator=my_pipeline,
                          param_grid=param_grid, n_jobs=-1)

    search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    # Predict on both splits so callers can compare train vs. test scores.
    test_preds = search.predict(X_test)
    train_preds = search.predict(X_train)
    return test_preds, train_preds

## Evaluate
What are the most important things to look for with classification?

- TODO: add graphs

In [49]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def metrics(model_name, y_train, y_test, y_train_pred, y_test_pred):
    '''Print accuracy and a classification report for the test and train splits.

    model_name is only used as a label in the printed header. Each
    (y_*, y_*_pred) pair is passed as (true labels, predicted labels) to
    accuracy_score and classification_report. Nothing is returned.
    '''
    rule = '-' * 60

    print(f'Model: {model_name}')
    print(rule)
    print(f'test accuracy: {accuracy_score(y_test, y_test_pred)}')
    print(f'train accuracy: {accuracy_score(y_train, y_train_pred)}')
    print(rule)
    print(f'\ntest report:\n{classification_report(y_test, y_test_pred)}')
    print('~' * 60)
    print(f'\ntrain report:\n{classification_report(y_train, y_train_pred)}')
    print(rule)


## LinearSVC

### Why LinearSVC
The objective of a Linear SVC (Support Vector Classifier) is to fit to the data you provide, returning a "best fit"
hyperplane that divides, or categorizes, your data. 

Compared with `SVC(kernel='linear')`, it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

LinearSVC is another (faster) implementation of Support Vector Classification for the case of a linear kernel.

### Hyperparameters

In [36]:
# Search grid for the LinearSVC step (keys address the pipeline's 'model' step).
param_dict = {
    # Regularization parameter C and stopping tolerance.
    'model__C': [0.1, 1, 10, 100],
    'model__tol': [1, 0.1, 0.01, 0.001],
    # Single high iteration cap so the solver has room to converge.
    'model__max_iter': [10000],
    'model__penalty': ['l1', 'l2'],
    # penalty='l1' is only supported with the primal formulation. Without
    # pinning dual=False, every 'l1' + 'ovr' candidate fails to fit on
    # scikit-learn versions whose default is dual=True.
    'model__dual': [False],
    'model__multi_class': ['ovr', 'crammer_singer']
}

In [61]:
# Per-class row counts of the target, sorted in ascending order of count.
# NOTE(review): despite its name, `smallest` holds the whole counts Series, not
# a single number. The commented-out draft below passes it to `.sample`, which
# presumably needs one integer (e.g. the first entry) - confirm before reviving.
smallest = clean_df['target'].value_counts().sort_values()
# target_0 = df[df['target'] == 0].sample(smallest)

In [40]:
# Work on a 2,500-row subsample to keep the grid searches below tractable.
# The fixed seed makes the subsample, and therefore every score reported
# further down, repeatable across kernel restarts; it matches the seed used
# for the train/test split.
sampled_df = clean_df.sample(2500, random_state=42)
X_sampled_df = sampled_df.drop(columns='target')
y_sampled_df = sampled_df['target']

### Fitting LinearSVC

In [41]:
# Hold out 25% of the sampled rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled_df,
    y_sampled_df,
    test_size=0.25,
    random_state=42,
)

In [42]:
from sklearn.svm import LinearSVC

# Pair the estimator with its search grid in the dict layout fit_predict reads.
lsvc = dict(model=LinearSVC(), params=param_dict)

# Run the grid search with the resampling steps enabled and keep the
# predictions for both splits.
test_preds, train_preds = fit_predict(
    lsvc,
    X_train,
    X_test,
    y_train,
    y_test,
    smote=True,
)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-42-4ef66d5d641c>", line 5, in <module>
    test_preds, train_preds = fit_predict(lsvc, X_train, X_test, y_train, y_test, smote=True)
  File "<ipython-input-23-9ef1f5aeca8d>", line 29, in fit_predict
    search.fit(X_train, y_train)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 710, in fit
    self._run_search(evaluate_candidates)
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 1151, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/Users/yrgg/opt/anaconda3/envs/gpd/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 689, in evaluate_candidates
    cv.split(X, y, groups)))


KeyboardInterrupt: 

### Evaluating LinearSVC

In [26]:
# Report LinearSVC scores; keyword arguments make the true/predicted pairing explicit.
metrics(
    model_name='LinearSVC',
    y_train=y_train,
    y_test=y_test,
    y_train_pred=train_preds,
    y_test_pred=test_preds,
)

Model: LinearSVC
------------------------------------------------------------
test accuracy: 0.696
train accuracy: 0.9933333333333333
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.84      0.70      0.77       149
           1       0.28      0.26      0.27        19
           2       0.60      0.78      0.68        82

    accuracy                           0.70       250
   macro avg       0.57      0.58      0.57       250
weighted avg       0.72      0.70      0.70       250

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       409
           1       0.95      1.00      0.97        54
           2       0.99      1.00      1.00       287

    accuracy                           0.99       750
   macro avg       0.98      1.00      0.99       750
weighted

In [15]:
# Report LinearSVC scores; keyword arguments make the true/predicted pairing explicit.
metrics(
    model_name='LinearSVC',
    y_train=y_train,
    y_test=y_test,
    y_train_pred=train_preds,
    y_test_pred=test_preds,
)

Model: LinearSVC
------------------------------------------------------------
test accuracy: 0.708
train accuracy: 0.996
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.74      0.76      0.75       132
           1       0.50      0.38      0.43        21
           2       0.70      0.71      0.71        97

    accuracy                           0.71       250
   macro avg       0.65      0.62      0.63       250
weighted avg       0.70      0.71      0.70       250

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       425
           1       0.96      1.00      0.98        51
           2       1.00      0.99      1.00       274

    accuracy                           1.00       750
   macro avg       0.99      1.00      0.99       750
weighted avg       1.

There is some debate about whether LinearSVC is a good fit for multi-class classification problems.

There are some ways to adjust for it: 
- using the 'crammer_singer' algorithm

## KNN

### Why KNN
KNN is a non-parametric, lazy learning algorithm. It makes no underlying assumptions about the distribution of the data.

No training is necessary! 

KNN makes predictions just-in-time by calculating the similarity between an input sample and each training instance.

It is a simple algorithm to explain, understand, and interpret. It is also versatile — useful for classification or regression.

### Hyperparameters

In [45]:
# Search grid for the k-NN step (keys address the pipeline's 'model' step).
# leaf_size is not searched yet.
param_dict = dict(
    # odd neighbour counts from 1 to 9
    model__n_neighbors=range(1, 10, 2),
    # neighbour vote weighting schemes to compare
    model__weights=['uniform', 'distance'],
)

### Fitting KNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier
# Pair the estimator with its search grid in the dict layout fit_predict reads.
knn = dict(model=KNeighborsClassifier(), params=param_dict)

# Run the grid search with the resampling steps enabled and keep the
# predictions for both splits.
test_preds, train_preds = fit_predict(
    knn,
    X_train,
    X_test,
    y_train,
    y_test,
    smote=True,
)

Best parameter (CV score=0.628):
{'model__n_neighbors': 1, 'model__weights': 'uniform'}


### Evaluating KNN

In [59]:
# Report k-NN scores; keyword arguments make the true/predicted pairing explicit.
metrics(
    model_name='k-NN',
    y_train=y_train,
    y_test=y_test,
    y_train_pred=train_preds,
    y_test_pred=test_preds,
)

Model: k-NN
------------------------------------------------------------
test accuracy: 0.6336
train accuracy: 1.0
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.74      0.63      0.68       325
           1       0.22      0.40      0.28        45
           2       0.65      0.68      0.66       255

    accuracy                           0.63       625
   macro avg       0.54      0.57      0.54       625
weighted avg       0.67      0.63      0.65       625

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1026
           1       1.00      1.00      1.00       116
           2       1.00      1.00      1.00       733

    accuracy                           1.00      1875
   macro avg       1.00      1.00      1.00      1875
weighted avg       1.00    