# AIM
- Re-run the baseline models using the revised preprocessing routine (as of Mar22).

Created: 23 Mar 2022

# LIBRARIES

In [7]:
##### MODULES
import os
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import matplotlib.dates
import matplotlib.patches
import datetime as dt
import ast

## custom modules
# import sys  
# sys.path.append('../../scripts')

import import_data
import clean_data
import mappings
import plotting
import report
import helper

##### OPTIONS
# Turn off pandas' chained-assignment check (no warning, no error) for this session.
pd.set_option('mode.chained_assignment', None)

# autoreload external modules after saving changes to disk:
# `%reload_ext` (re)loads the IPython autoreload extension, and mode 2 re-imports
# all modules before each cell is executed, so edits to the custom project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

##### DIRECTORIES
# All locations are relative: the project root is two levels above the
# current working directory, and the data/figure folders hang off it.
proj_dir = Path('..', '..')
source_data_dir = proj_dir.joinpath('data', 'source')
clean_data_dir = proj_dir.joinpath('data', 'clean')
viz_dir = proj_dir.joinpath('viz')

In [8]:
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

import random

In [9]:
import feather

ML libraries

In [10]:
from sklearn.model_selection import train_test_split

# preprocessing
from sklearn.preprocessing import StandardScaler


# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm

# tune
from sklearn.model_selection import GridSearchCV

# performance
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# PARAMETERS

In [15]:
# Input feather files (judging by the file names: the merged preprocessed data
# and the tsfresh feature selection -- confirm against the producing notebooks).
PATH_TO_PREPROCESSED_DATA = proj_dir.joinpath('data', 'clean', 'preprocessMar22', 'data_merge2.ftr')
PATH_TO_FEATURES_SELECTED = proj_dir.joinpath('data', 'clean', 'fsMar23all', 'feat_tsfresh_select.ftr')

# Shared random_state for the train/test split and the random forests.
SEED = 123

### INITIAL SPLIT
# Fraction of the samples held out as the test set.
TEST_SIZE = 0.3

### TUNING
# Passed as n_jobs to the grid search: -2 means all but 1 CPUs.
N_JOBS = -2
# Number of cross-validation folds used by the grid search.
KFOLD = 5

# IMPORT

In [12]:
# Load both inputs from feather: `df` holds the preprocessed data,
# `fs` the selected features (the two reads are independent of each other).
df = pd.read_feather(PATH_TO_PREPROCESSED_DATA)
fs = pd.read_feather(PATH_TO_FEATURES_SELECTED)

In [13]:
# Binary target: True where the `phq` value is strictly greater than 10.
df['target'] = df['phq'] > 10
# generate_ts_y returns two objects; only the second (the labels) is needed here.
_, y = clean_data.generate_ts_y(df)
# Select the labels by the keys stored in the feature table so that the order
# of `y` follows the row order of `fs`.
y = y[fs['index']]

# Fail loudly on misalignment: previously a mismatch was silent (nothing was
# printed), which would let the models train on labels paired with wrong rows.
if not np.array_equal(y.index.values, fs['index'].values):
    raise ValueError(
        "Primary key does not match between the labels (y) and the feature table (fs)."
    )
print("Primary key does match.")

Primary key does match.


In [None]:
# Feature matrix: every column of `fs` except the 'index' key column.
# drop() returns a new frame, so `fs` itself is left untouched.
X = fs.drop(columns='index', errors='ignore')

# TRAIN TEST SPLIT

In [14]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SIZE of the samples for testing; SEED makes the split repeatable.
# NOTE(review): the split is not stratified on `y` -- consider `stratify=y`
# if the two target classes are imbalanced.
split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
X_train, X_test, y_train, y_test = split_parts

# Report the shapes of the four resulting pieces.
report.report_train_test_split(*split_parts)

Training Features Shape: (2002, 422)
Training Labels Shape: (2002,)
Testing Features Shape: (858, 422)
Testing Labels Shape: (858,)


### RF hyperparameter tuning

In [24]:
# init grid
# 'auto' was removed from max_features: for classifiers it is an alias of
# 'sqrt', so it only duplicated grid points (one third of the tuning time),
# and recent scikit-learn releases reject the value altogether.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

# tune
# Exhaustive search over param_grid with KFOLD-fold cross-validation on the
# training set; train scores are kept so they can be inspected next to the
# validation scores. The timer covers construction and fitting.
with helper.Timer("Tune RF"):
    CV_rfc = GridSearchCV(
        estimator=RandomForestClassifier(random_state=SEED),
        param_grid=param_grid, cv=KFOLD, n_jobs=N_JOBS, return_train_score=True)
    CV_rfc.fit(X_train, y_train)

True

In [None]:
### Show best model from CV
# A bare expression is only displayed when it is the last one in the cell,
# so the best parameters are printed explicitly.
print(CV_rfc.best_params_)
plotting.plot_search_results(CV_rfc)

### Fit with best model
#! modify if necessary
rfc1 = RandomForestClassifier(random_state=SEED)
rfc1.set_params(**CV_rfc.best_params_)
# Actually train the estimator: set_params only configures it, and the later
# cells (permutation importance) need a fitted model.
rfc1.fit(X_train, y_train)

### Fit model

In [None]:
# Grid for the tuned random forest. 'auto' is omitted from max_features: for
# classifiers it is an alias of 'sqrt' (duplicate grid points) and recent
# scikit-learn releases reject it.
rf_param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

## SCALING?
# Every entry is a 3-tuple (name, scaler, estimator):
#   name      -- label used for reporting
#   scaler    -- transformer applied to the features first, or None for no scaling
#   estimator -- object exposing fit/predict; a GridSearchCV tunes while fitting
# Each SVM entry now carries an estimator whose kernel matches its name
# (previously 'SVMlinear' used kernel='poly' and the other three had none).
classifiers = [
    # name         # scaler          # estimator
    ('KNNdefault', StandardScaler(), KNeighborsClassifier()),
    ('RFdefault',  None,             RandomForestClassifier(random_state=SEED)),
    ('RFbest',     None,             GridSearchCV(
        estimator=RandomForestClassifier(random_state=SEED),
        param_grid=rf_param_grid, cv=KFOLD, n_jobs=N_JOBS, return_train_score=True)
    ),
    ('SVMlinear',  StandardScaler(), svm.SVC(kernel='linear')),
    ('SVMpoly',    StandardScaler(), svm.SVC(kernel='poly')),
    ('SVMrbf',     StandardScaler(), svm.SVC(kernel='rbf')),
    ('SVMsigmoid', StandardScaler(), svm.SVC(kernel='sigmoid')),
]

In [6]:
# Scratch check: standardise the training features with a throw-away scaler
# and show the transformed array as the cell output.
StandardScaler().fit(X_train).transform(X_train)

NameError: name 'X_train' is not defined

In [None]:
# Fit every configured classifier on the training set and predict the test set.
#   predictions[name]        -> (test-set predictions, fitted estimator)
#   classifiers_fitted[name] -> fitted estimator
predictions = {}
classifiers_fitted = {}
# Entries are (name, scaler, estimator); the star-unpacking tolerates entries
# that have no estimator yet so that they can be skipped with a message
# instead of aborting the whole loop.
for name, scaler, *estimator in classifiers:
    if not estimator:
        print(f"# SKIP {name}: no estimator configured")
        continue
    clf = estimator[0]

    if scaler is not None:
        print("# SCALE")
        # Learn the scaling on the training data only and apply the same
        # transformation to the test data, so both live in the same space.
        X_train_preprocessed = scaler.fit_transform(X_train)
        X_test_preprocessed = scaler.transform(X_test)
    else:
        # No scaling requested: use the raw features. Resetting both names
        # here avoids reusing scaled data left over from a previous iteration.
        X_train_preprocessed = X_train
        X_test_preprocessed = X_test

    if isinstance(clf, GridSearchCV):
        # The grid search runs inside fit(), so the timer below covers tuning.
        print("# TUNE")

    with helper.Timer(name):
        clf.fit(X_train_preprocessed, y_train.values.ravel())

    prediction = clf.predict(X_test_preprocessed)
    classifiers_fitted[name] = clf
    predictions[name] = (prediction, clf)

In [None]:
# `pred` is not assigned in any of the cells above, so it is derived here from
# the tuned random forest: the fitted grid search predicts with the best
# configuration, which it refits on the whole training set by default.
pred = CV_rfc.predict(X_test)
# Per-class metrics as a table: one row per class/average, metrics as columns.
pd.DataFrame(classification_report(y_test, pred, output_dict=True)).transpose().reset_index()

In [None]:
# Plain-text classification report for the test-set predictions in `pred`.
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Display the random-forest estimator `rfc1` as the cell output.
rfc1

In [None]:
### Feature importance
from sklearn.inspection import permutation_importance

# Permutation importance shuffles feature columns at random, so the seed is
# pinned to make the result reproducible, in line with the SEED used for the
# split and the forests.
with helper.Timer("RF permutation importance"):
    imp_perm = permutation_importance(rfc1, X_test, y_test, random_state=SEED)

# n=10 presumably limits the plot to ten features -- see plotting.plot_imp_perm.
# The trailing semicolon suppresses the return value in the cell output.
plotting.plot_imp_perm(imp_perm, features=X_test.columns, n=10);