# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot
%matplotlib inline

## Baseline
How the different models compare without any data scrubbing beyond what's needed to get them to run.

In [2]:
# Read the raw training features and their labels from the raw-data folder.
features_df, target_df = (
    pd.read_csv('../data/raw/X_train.csv'),
    pd.read_csv('../data/raw/y_train.csv'),
)

In [3]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(features_df.shape)
features_df.head()

(59400, 40)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Preview the label table (the output shows an `id` and a `status_group` column).
target_df.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [5]:
# Baseline feature matrix: every raw column is used as-is. X is another name for
# features_df, not a copy.
# NOTE(review): this keeps the `id` column as a numeric feature (it is listed in
# numerical_cols further down) — confirm that is intended for the baseline.
X = features_df

In [6]:
# Encode the three status labels as integer class ids.
# `.map` is used instead of `.replace`: turning an object column into numbers via
# `.replace` relies on implicit downcasting, which recent pandas versions deprecate.
# A label missing from the mapping becomes NaN and makes `astype(int)` raise, so an
# unexpected class cannot slip through unnoticed.
y = target_df['status_group'].map(
    {'functional': 0, 'non functional': 1, 'functional needs repair': 2}
).astype(int)

In [7]:
# Class balance of the raw labels: number of rows per status_group value.
target_df['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [8]:
# Split the feature columns by dtype: numeric ones vs. everything else.
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

In [9]:
# Show which columns were classified as numeric.
numerical_cols

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year']

### Creating Pipeline

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric features: fill missing values with a constant (no fill_value is given,
# so scikit-learn's default applies), then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scale', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen during fit are ignored rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

## Models
- Xgboost
- LightGBM
- Random Forest
- Extra Trees
- k-NN
- Logistic Regression
- Naïve Bayes Classifier Algorithm
- K Means Clustering Algorithm
- Support Vector Machine Algorithm
- Apriori Algorithm
- Decision Trees

### Predict

In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

def fit_predict(model, X_train, X_test, y_train, y_test):
    '''Grid-search a preprocessing + model pipeline and predict on both splits.

    Parameters
    ----------
    model : dict
        Holds the estimator under 'model' and its GridSearchCV parameter
        grid under 'params'.
    X_train, y_train
        Data the grid search is fitted on.
    X_test
        Held-out features to predict on.
    y_test
        Accepted for call-site symmetry; not used by this function.

    Returns
    -------
    tuple
        (test_preds, train_preds) -- note that test predictions come first.
    '''
    grid, estimator = model['params'], model['model']

    # Chain the module-level `preprocessor` with the estimator so the search
    # tunes and fits them as a single unit.
    full_pipeline = Pipeline([('preprocessor', preprocessor), ('model', estimator)])

    search = GridSearchCV(full_pipeline, grid, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    # Predict with the fitted search object: held-out split first, then training split.
    return search.predict(X_test), search.predict(X_train)

## Evaluate
What are the most important things to look for with classification?

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def metrics(model_name, y_train, y_test, y_train_pred, y_test_pred):
    '''Print accuracy and a classification report for the test and train splits.

    Parameters
    ----------
    model_name : str
        Label shown in the header line.
    y_train, y_test
        True labels for the train and test splits.
    y_train_pred, y_test_pred
        Predicted labels for the train and test splits (train first).
    '''
    divider = '-' * 60

    print(f'Model: {model_name}')
    print(divider)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f'test accuracy: {test_accuracy}')
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f'train accuracy: {train_accuracy}')
    print(divider)

    test_report = classification_report(y_test, y_test_pred)
    print('\ntest report:\n' + test_report)
    print('~' * 60)

    train_report = classification_report(y_train, y_train_pred)
    print('\ntrain report:\n' + train_report)
    print(divider)


## Sample
Smaller dataset for faster fitting

In [14]:
# Attach each row's label to its features by joining on `id`, then turn `id`
# back into a regular column.
subsample_df = (
    features_df.set_index('id')
    .join(target_df.set_index('id'))
    .reset_index()
)

In [15]:
# Draw a 10,000-row subsample with a fixed seed so the small dataset is the same
# on every run.
subsample_df = subsample_df.sample(10000, random_state=42)
X_small = subsample_df.drop('status_group', axis=1)
# Encode labels with the same explicit mapping used for the full-data target `y`,
# so a class id means the same thing in both. Category codes would number the
# labels alphabetically and swap the ids of the two non-functional classes.
y_small = subsample_df['status_group'].map(
    {'functional': 0, 'non functional': 1, 'functional needs repair': 2}
).astype(int)

## LinearSVC

### Why LinearSVC
The objective of a Linear SVC (Support Vector Classifier) is to fit to the data you provide, returning a "best fit"
hyperplane that divides, or categorizes, your data. 

Compared with `SVC(kernel='linear')`, LinearSVC has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

LinearSVC is another (faster) implementation of Support Vector Classification for the case of a linear kernel.

### Hyperparameters

In [20]:
# Search grid for GridSearchCV; the `model__` prefix addresses the pipeline step
# named 'model'.
param_dict = dict(model__max_iter=[500, 1000])

### Fitting LinearSVC

In [21]:
from sklearn.svm import LinearSVC

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

# Seed the classifier too: LinearSVC's solver uses a random number generator, so
# without random_state the fitted model (and the scores below) can vary between runs.
lsvc = { 'model': LinearSVC(random_state=42), 'params': param_dict }

# fit_predict returns the test-set predictions first, then the train-set predictions.
test_preds, train_preds = fit_predict(lsvc, X_train, X_test, y_train, y_test)



Best parameter (CV score=0.772):
{'model__max_iter': 500}


### Evaluating LinearSVC

In [22]:
# Report accuracy and per-class metrics. Predictions are passed train-first here to
# match the metrics() signature (fit_predict returned them test-first).
metrics('LinearSVC', y_train, y_test, train_preds, test_preds)

Model: LinearSVC
------------------------------------------------------------
test accuracy: 0.7785185185185185
train accuracy: 0.9828058361391695
------------------------------------------------------------

test report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.82      8098
           1       0.79      0.76      0.78      5678
           2       0.47      0.32      0.38      1074

    accuracy                           0.78     14850
   macro avg       0.69      0.65      0.66     14850
weighted avg       0.77      0.78      0.77     14850

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

train report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     24161
           1       0.99      0.98      0.98     17146
           2       0.98      0.94      0.96      3243

    accuracy                           0.98     44550
   macro avg       0.98      0.97      0.98     4

some words here