In [1]:
pip install -U -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [173]:
#all packages and libraries required
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.preprocessing as preprocessing
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

import problem

In [220]:
#import data
# Train/test splits come from the challenge's local `problem` module.
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

# Sanity check on the feature count: the first 284 columns are the ROI features and the
# remaining 331695 columns are the VBM features (this is the split the extractors below use).
assert X_train.shape[1] == 284 + 331695

In [221]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(410, 331979) (103, 331979) (410,) (103,)


In [32]:
#check target label count (training set)
count = pd.DataFrame({'Target': y_train})
count['Target'].value_counts()

Target
control          222
schizophrenia    188
Name: count, dtype: int64

In [33]:
#check target label count (test set)
# NOTE: rebinds `count`, replacing the training-set frame built in the previous cell.
count = pd.DataFrame({'Target': y_test})
count['Target'].value_counts()

Target
control          55
schizophrenia    48
Name: count, dtype: int64

In [20]:
#feature extractor
class ROIsFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the ROI features, i.e. the first ``n_rois`` columns of ``X``.

    Parameters
    ----------
    n_rois : int, default=284
        Number of leading columns of ``X`` that hold ROI features.
    """
    def __init__(self, n_rois=284):
        self.n_rois = n_rois

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` work without
        targets, as expected for a scikit-learn transformer.
        """
        return self

    def transform(self, X):
        """Return the ROI columns ``X[:, :n_rois]`` (``X`` must support 2-D slicing)."""
        return X[:, :self.n_rois]

class VBMFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the VBM features, i.e. every column of ``X`` after the first ``n_rois``.

    Parameters
    ----------
    n_rois : int, default=284
        Number of leading ROI columns to skip; the remaining columns are returned.
    """
    def __init__(self, n_rois=284):
        self.n_rois = n_rois

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` work without
        targets, as expected for a scikit-learn transformer.
        """
        return self

    def transform(self, X):
        """Return the VBM columns ``X[:, n_rois:]`` (``X`` must support 2-D slicing)."""
        return X[:, self.n_rois:]


# Split the full design matrices into their two feature families.
# transform() is called without fit() because the extractors only slice columns of X.
fe = ROIsFeatureExtractor()
roi_xtrain = fe.transform(X_train)
roi_xtest = fe.transform(X_test)
# The targets are not passed through the extractors: they are 1-D label arrays,
# not feature matrices, so there is nothing to slice.

# NOTE: `fe` is rebound here; from this point on it refers to the VBM extractor.
fe = VBMFeatureExtractor()
vbm_xtrain = fe.transform(X_train)
vbm_xtest = fe.transform(X_test)

In [21]:
# Shapes of the ROI / VBM splits built in the previous cell (names must match that cell).
roi_xtrain.shape, vbm_xtrain.shape, roi_xtest.shape, vbm_xtest.shape

((410, 284), (410, 331695), (103, 284), (103, 331695))

In [22]:
y_train.shape,y_test.shape

((410,), (103,))

In [47]:
y_test

array(['schizophrenia', 'schizophrenia', 'schizophrenia', 'control',
       'schizophrenia', 'schizophrenia', 'control', 'control', 'control',
       'control', 'schizophrenia', 'control', 'control', 'schizophrenia',
       'schizophrenia', 'control', 'schizophrenia', 'schizophrenia',
       'schizophrenia', 'control', 'control', 'control', 'schizophrenia',
       'control', 'control', 'control', 'control', 'schizophrenia',
       'schizophrenia', 'control', 'schizophrenia', 'control', 'control',
       'control', 'control', 'control', 'schizophrenia', 'schizophrenia',
       'schizophrenia', 'schizophrenia', 'control', 'schizophrenia',
       'control', 'schizophrenia', 'control', 'control', 'schizophrenia',
       'control', 'control', 'schizophrenia', 'schizophrenia', 'control',
       'schizophrenia', 'schizophrenia', 'control', 'control',
       'schizophrenia', 'control', 'control', 'control', 'control',
       'control', 'control', 'control', 'schizophrenia', 'schizophrenia',
  

# Function to Train and Evaluate a Model with a Specified Feature Extractor

In [233]:
def evaluate(extractor, model, X_train, y_train, X_test, y_test, mode, groups=None):
    """Cross-validate an extractor + model pipeline, then score it on the test set.

    The pipeline is cross-validated on the training data, refit on the whole
    training set and scored on the test set. Balanced accuracy and ROC-AUC are
    printed for both stages; nothing is returned.

    Parameters
    ----------
    extractor : transformer
        First pipeline step, e.g. ``ROIsFeatureExtractor()``.
    model : classifier
        Final pipeline step. Test scores for ROC-AUC come from ``predict_proba``
        when the pipeline exposes it, otherwise from ``decision_function``.
    X_train, X_test : array-like
        Feature matrices passed to the pipeline.
    y_train, y_test : array-like
        Binary class labels. They are encoded to integers inside the function
        with an encoder fit on ``y_train`` only; the caller's arrays are untouched.
    mode : {'Common', 'Group Stratified'}
        'Common' uses the splitter returned by ``problem.get_cv``.
        'Group Stratified' uses ``GroupKFold(n_splits=2)`` with ``groups`` as the
        grouping variable. NOTE: despite the name, ``GroupKFold`` does not stratify.
    groups : array-like, optional
        One group label per training sample, used only by 'Group Stratified'.
        When omitted, the notebook-level ``participants_train['sex']`` is used,
        as in the original implementation (that frame must be defined by the caller's
        session; it is not created in the cells shown here).

    Raises
    ------
    ValueError
        If ``mode`` is unknown, or if 'Group Stratified' is requested and no
        grouping variable is available.
    """
    valid_modes = ('Common', 'Group Stratified')
    if mode not in valid_modes:
        # Fail early instead of printing a message and crashing later on `cv_results`.
        raise ValueError(f"Unknown cross-validation mode {mode!r}; expected one of {valid_modes}.")

    cv = problem.get_cv(X_train, y_train)
    estimator = make_pipeline(extractor, model)  # pipeline of extractor and model
    scoring = ['balanced_accuracy', 'roc_auc']

    # Encode the targets. The encoder is fit on the training labels only and the same
    # mapping is applied to the test labels so that predictions and truth are comparable.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    if mode == 'Common':  # common cv
        cv_results = cross_validate(estimator, X_train, y_train, scoring=scoring, cv=cv,
                                    verbose=1, return_train_score=True, n_jobs=5)
    else:  # 'Group Stratified'
        if groups is None:
            try:
                groups = participants_train['sex']
            except NameError:
                raise ValueError(
                    "mode='Group Stratified' needs a grouping variable: pass `groups=` "
                    "or define `participants_train` with a 'sex' column."
                ) from None
        # Separate encoder: refitting the target encoder here would discard the label mapping.
        group_ids = LabelEncoder().fit_transform(groups)
        cv = GroupKFold(n_splits=2)
        cv_results = cross_validate(estimator, X_train, y_train, cv=cv,
                                    groups=group_ids, scoring=scoring)

    print(f"# {mode} Cross Validation Results")
    print('bACC=%.2f' % cv_results['test_balanced_accuracy'].mean(),
          'ROC-AUC=%.2f' % cv_results['test_roc_auc'].mean())

    # Refit on all train
    estimator.fit(X_train, y_train)
    # Apply on test
    y_pred_test = estimator.predict(X_test)
    if hasattr(estimator, 'predict_proba'):
        # Column 1 is the probability of the class encoded as 1.
        score_pred_test = estimator.predict_proba(X_test)[:, 1]
    else:
        score_pred_test = estimator.decision_function(X_test)
    bacc_test = metrics.balanced_accuracy_score(y_test, y_pred_test)
    auc_test = metrics.roc_auc_score(y_test, score_pred_test)
    print("# Test")
    print('bACC=%.2f' % bacc_test,
          'ROC-AUC=%.2f' % auc_test)

# Regularized Linear Model - Logistic Regression

In [374]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 5, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  1.3min remaining:  2.0min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  1.4min finished


# Common Cross Validation Results
bACC=0.69 ROC-AUC=0.75
# Test
bACC=0.72 ROC-AUC=0.82
CPU times: user 55.6 s, sys: 1.33 s, total: 57 s
Wall time: 2min 16s


In [375]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 5, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.72
# Test
bACC=0.72 ROC-AUC=0.82
CPU times: user 1min 51s, sys: 2.01 s, total: 1min 53s
Wall time: 1min 46s


In [376]:
#requires >10 hrs
%%time
evaluate(VBMFeatureExtractor(),
         LogisticRegressionCV(Cs = 5, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 314.1min remaining: 471.2min


KeyboardInterrupt: 

In [377]:
#requires >10 hrs
%%time
evaluate(VBMFeatureExtractor(),
         LogisticRegressionCV(Cs = 5, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

KeyboardInterrupt: 

# Tree-based Ensemble Model - XGBoost

In [378]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.2, random_state=42),
         X_train, y_train,X_test, y_test,'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   19.0s remaining:   28.5s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   19.3s finished


# Common Cross Validation Results
bACC=0.74 ROC-AUC=0.83
# Test
bACC=0.76 ROC-AUC=0.83
CPU times: user 9.23 s, sys: 1.73 s, total: 11 s
Wall time: 21.4 s


In [379]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.2, random_state=42),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.71 ROC-AUC=0.81
# Test
bACC=0.76 ROC-AUC=0.83
CPU times: user 16.5 s, sys: 2.88 s, total: 19.4 s
Wall time: 9.15 s


In [384]:
%%time
evaluate(VBMFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.2, random_state=42),
         X_train, y_train,X_test, y_test,'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 19.3min remaining: 28.9min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 19.3min finished


# Common Cross Validation Results
bACC=0.74 ROC-AUC=0.84
# Test
bACC=0.76 ROC-AUC=0.84
CPU times: user 37min 47s, sys: 4min 27s, total: 42min 15s
Wall time: 25min 29s


In [385]:
%%time
evaluate(VBMFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.2, random_state=42),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.77
# Test
bACC=0.76 ROC-AUC=0.84
CPU times: user 1h 18min 43s, sys: 8min 14s, total: 1h 26min 57s
Wall time: 12min 12s


# Non-linear Model - SVC with RBF Kernel

In [386]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   14.2s remaining:   21.3s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   14.5s finished


# Common Cross Validation Results
bACC=0.69 ROC-AUC=0.77
# Test
bACC=0.72 ROC-AUC=0.76
CPU times: user 469 ms, sys: 682 ms, total: 1.15 s
Wall time: 14.9 s


In [235]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.74
# Test
bACC=0.72 ROC-AUC=0.76
CPU times: user 4.93 s, sys: 1.32 s, total: 6.25 s
Wall time: 7.45 s


In [236]:
%%time
evaluate(VBMFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 30.8min remaining: 46.1min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 30.9min finished


# Common Cross Validation Results
bACC=0.56 ROC-AUC=0.70
# Test
bACC=0.53 ROC-AUC=0.63
CPU times: user 16min 23s, sys: 1min 25s, total: 17min 48s
Wall time: 35min 53s


In [237]:
%%time
evaluate(VBMFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.51 ROC-AUC=0.75
# Test
bACC=0.53 ROC-AUC=0.63
CPU times: user 25min 44s, sys: 1min 59s, total: 27min 44s
Wall time: 7min 9s


# Hyperparameter Testing

### Logistic Regression Hyperparameter Testing

In [361]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(cv=5, random_state=42, max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.68 ROC-AUC=0.75
# Test
bACC=0.70 ROC-AUC=0.82
CPU times: user 1min 18s, sys: 10.7 s, total: 1min 28s
Wall time: 27.3 s


#### Experimenting with penalty methods other than the default L2 penalty
#### `Cs` value

In [365]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 1, cv=5, random_state=42, max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.54 ROC-AUC=0.73
# Test
bACC=0.58 ROC-AUC=0.64
CPU times: user 6.15 s, sys: 1.28 s, total: 7.42 s
Wall time: 6.12 s


In [366]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 15, cv=5, random_state=42, max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.74
# Test
bACC=0.71 ROC-AUC=0.81
CPU times: user 1min 48s, sys: 13.6 s, total: 2min 2s
Wall time: 35.2 s


In [367]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 30, cv=5, random_state=42, max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.68 ROC-AUC=0.76
# Test
bACC=0.70 ROC-AUC=0.79
CPU times: user 3min 31s, sys: 33.6 s, total: 4min 5s
Wall time: 1min 11s


In [261]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 4, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.4,0.7], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.65 ROC-AUC=0.71
# Test
bACC=0.73 ROC-AUC=0.80
CPU times: user 1min 31s, sys: 4.98 s, total: 1min 36s
Wall time: 1min 37s


In [262]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 4, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.7,0.3], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.65 ROC-AUC=0.71
# Test
bACC=0.73 ROC-AUC=0.80
CPU times: user 1min 29s, sys: 1.97 s, total: 1min 31s
Wall time: 1min 24s


In [264]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 4, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.2,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.68 ROC-AUC=0.75
# Test
bACC=0.72 ROC-AUC=0.80
CPU times: user 1min 33s, sys: 2.36 s, total: 1min 35s
Wall time: 1min 28s


In [265]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 10, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.72
# Test
bACC=0.74 ROC-AUC=0.80
CPU times: user 3min 26s, sys: 2.02 s, total: 3min 28s
Wall time: 3min 21s


In [268]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 30, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.76
# Test
bACC=0.74 ROC-AUC=0.80
CPU times: user 6min 3s, sys: 2.66 s, total: 6min 5s
Wall time: 5min 58s


In [364]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 30, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=500),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.68 ROC-AUC=0.75
# Test
bACC=0.74 ROC-AUC=0.80
CPU times: user 4min 57s, sys: 3.86 s, total: 5min 1s
Wall time: 4min 57s


In [368]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 30, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.76
# Test
bACC=0.74 ROC-AUC=0.80
CPU times: user 6min 6s, sys: 3.06 s, total: 6min 9s
Wall time: 6min 4s


In [369]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 15, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.69 ROC-AUC=0.75
# Test
bACC=0.73 ROC-AUC=0.79
CPU times: user 4min 24s, sys: 2.32 s, total: 4min 27s
Wall time: 4min 19s


In [370]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 10, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.72
# Test
bACC=0.74 ROC-AUC=0.80
CPU times: user 3min 33s, sys: 2.26 s, total: 3min 35s
Wall time: 3min 28s


In [371]:
%%time
evaluate(ROIsFeatureExtractor(),
         LogisticRegressionCV(Cs = 5, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, 
                              l1_ratios = [0.3,0.8], max_iter=1000),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.72
# Test
bACC=0.72 ROC-AUC=0.82
CPU times: user 1min 52s, sys: 2.32 s, total: 1min 55s
Wall time: 1min 48s


### XGBoost Hyperparameter Testing

#### Learning Rate 


In [281]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.001, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.67 ROC-AUC=0.71
# Test
bACC=0.64 ROC-AUC=0.73
CPU times: user 16.2 s, sys: 1.43 s, total: 17.6 s
Wall time: 7.76 s


In [282]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.000001, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.64 ROC-AUC=0.67
# Test
bACC=0.60 ROC-AUC=0.62
CPU times: user 15.8 s, sys: 1.44 s, total: 17.2 s
Wall time: 7.57 s


In [287]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.80
# Test
bACC=0.73 ROC-AUC=0.82
CPU times: user 13.7 s, sys: 1.34 s, total: 15.1 s
Wall time: 7.28 s


In [288]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.2, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.74 ROC-AUC=0.81
# Test
bACC=0.73 ROC-AUC=0.81
CPU times: user 12.2 s, sys: 1.38 s, total: 13.5 s
Wall time: 7.01 s


In [286]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.3, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.81
# Test
bACC=0.73 ROC-AUC=0.81
CPU times: user 11 s, sys: 1.22 s, total: 12.2 s
Wall time: 6.33 s


In [283]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.5, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.72 ROC-AUC=0.79
# Test
bACC=0.72 ROC-AUC=0.81
CPU times: user 10.2 s, sys: 1.51 s, total: 11.7 s
Wall time: 7.03 s


In [285]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.6, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.74 ROC-AUC=0.80
# Test
bACC=0.73 ROC-AUC=0.80
CPU times: user 9.91 s, sys: 1.44 s, total: 11.4 s
Wall time: 6.74 s


In [284]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.8, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.74 ROC-AUC=0.81
# Test
bACC=0.73 ROC-AUC=0.76
CPU times: user 9.84 s, sys: 1.29 s, total: 11.1 s
Wall time: 6.44 s


#### n_estimators

In [289]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=100, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.80
# Test
bACC=0.73 ROC-AUC=0.82
CPU times: user 13.6 s, sys: 1.31 s, total: 14.9 s
Wall time: 7.28 s


In [290]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.74 ROC-AUC=0.82
# Test
bACC=0.72 ROC-AUC=0.82
CPU times: user 25.7 s, sys: 2.04 s, total: 27.8 s
Wall time: 8.71 s


In [291]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=1000, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.82
# Test
bACC=0.73 ROC-AUC=0.82
CPU times: user 31.9 s, sys: 1.4 s, total: 33.3 s
Wall time: 9.18 s


In [292]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=10000, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.82
# Test
bACC=0.74 ROC-AUC=0.81
CPU times: user 2min 29s, sys: 5.73 s, total: 2min 35s
Wall time: 26 s


#### gamma (default = 0)

In [295]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.74 ROC-AUC=0.82
# Test
bACC=0.72 ROC-AUC=0.82
CPU times: user 23.5 s, sys: 1.56 s, total: 25.1 s
Wall time: 8.29 s


In [293]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 5, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.69 ROC-AUC=0.79
# Test
bACC=0.76 ROC-AUC=0.82
CPU times: user 54.8 s, sys: 3.08 s, total: 57.9 s
Wall time: 13.7 s


In [294]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 2, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.71 ROC-AUC=0.80
# Test
bACC=0.73 ROC-AUC=0.82
CPU times: user 52.5 s, sys: 3.76 s, total: 56.2 s
Wall time: 13.8 s


#### colsample_bytree (default=1)

In [302]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.5, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.73 ROC-AUC=0.81
# Test
bACC=0.72 ROC-AUC=0.83
CPU times: user 18.1 s, sys: 1.67 s, total: 19.8 s
Wall time: 7.93 s


In [303]:
#FINAL MODEL
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.2, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.71 ROC-AUC=0.81
# Test
bACC=0.76 ROC-AUC=0.83
CPU times: user 14 s, sys: 1.13 s, total: 15.1 s
Wall time: 6.47 s


In [304]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.80
# Test
bACC=0.73 ROC-AUC=0.83
CPU times: user 13.4 s, sys: 1.58 s, total: 14.9 s
Wall time: 6.94 s


#### subsample (default=1)

In [308]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.5,random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.79
# Test
bACC=0.71 ROC-AUC=0.83
CPU times: user 13.2 s, sys: 1.37 s, total: 14.6 s
Wall time: 7.27 s


In [311]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.8,random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.71 ROC-AUC=0.81
# Test
bACC=0.72 ROC-AUC=0.83
CPU times: user 13 s, sys: 1.19 s, total: 14.2 s
Wall time: 6.52 s


In [312]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.9,random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.80
# Test
bACC=0.70 ROC-AUC=0.82
CPU times: user 13.2 s, sys: 1.29 s, total: 14.5 s
Wall time: 6.84 s


#### max_depth (default=6)

In [316]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.8,
                       max_depth = 100, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.81
# Test
bACC=0.68 ROC-AUC=0.81
CPU times: user 13.2 s, sys: 1.37 s, total: 14.5 s
Wall time: 6.68 s


In [317]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.8,
                       max_depth = 50, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.81
# Test
bACC=0.68 ROC-AUC=0.81
CPU times: user 14.9 s, sys: 1.61 s, total: 16.5 s
Wall time: 7.12 s


In [322]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.8,
                       max_depth = 10, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.81
# Test
bACC=0.70 ROC-AUC=0.82
CPU times: user 14.2 s, sys: 1.85 s, total: 16.1 s
Wall time: 6.88 s


In [318]:
%%time
evaluate(ROIsFeatureExtractor(),
         XGBClassifier(learning_rate = 0.1, n_estimators=500, gamma = 0.01,colsample_bytree=0.1, subsample = 0.8,
                       max_depth = 1000, random_state=42),
         X_train, y_train,X_test, y_test, 'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.70 ROC-AUC=0.81
# Test
bACC=0.68 ROC-AUC=0.81
CPU times: user 14.2 s, sys: 1.94 s, total: 16.2 s
Wall time: 7.19 s


In [234]:
#Test Random Forest
%%time
evaluate(ROIsFeatureExtractor(),
         RandomForestClassifier(n_estimators =100, random_state=42),
         X_train, y_train,X_test, y_test,'Common')

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   11.9s remaining:   17.8s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   11.9s finished


# Common Cross Validation Results
bACC=0.71 ROC-AUC=0.79
# Test
bACC=0.71 ROC-AUC=0.79
CPU times: user 846 ms, sys: 695 ms, total: 1.54 s
Wall time: 12.6 s


### SVC Hyperparameter Tuning

#### gamma

In [338]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.74
# Test
bACC=0.72 ROC-AUC=0.76
CPU times: user 5.4 s, sys: 1.11 s, total: 6.51 s
Wall time: 6.55 s


In [348]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma='scale',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.59 ROC-AUC=0.73
# Test
bACC=0.64 ROC-AUC=0.68
CPU times: user 4.66 s, sys: 935 ms, total: 5.59 s
Wall time: 5.59 s


In [355]:
#FINAL MODEL
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma=0.003401,probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.66 ROC-AUC=0.74
# Test
bACC=0.73 ROC-AUC=0.77
CPU times: user 5.06 s, sys: 1.04 s, total: 6.1 s
Wall time: 6.11 s


In [360]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma=0.0003401,probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.60 ROC-AUC=0.74
# Test
bACC=0.63 ROC-AUC=0.70
CPU times: user 4.8 s, sys: 967 ms, total: 5.77 s
Wall time: 5.77 s


In [359]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=1.0, gamma=0.1,probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.50 ROC-AUC=0.53
# Test
bACC=0.50 ROC-AUC=0.48
CPU times: user 5.19 s, sys: 1.04 s, total: 6.23 s
Wall time: 6.24 s


In [339]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='rbf', C=0.01, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.50 ROC-AUC=0.67
# Test
bACC=0.50 ROC-AUC=0.66
CPU times: user 5.42 s, sys: 1.12 s, total: 6.54 s
Wall time: 6.56 s


#### Other kernels - poly

In [341]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='poly', degree=3, C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.61 ROC-AUC=0.67
# Test
bACC=0.68 ROC-AUC=0.78
CPU times: user 6.08 s, sys: 1.04 s, total: 7.12 s
Wall time: 7.17 s


In [340]:
%%time
evaluate(ROIsFeatureExtractor(),
         SVC(kernel='poly', degree=6, C=1.0, gamma='auto',probability=True),
         X_train, y_train,X_test, y_test,'Group Stratified')

# Group Stratified Cross Validation Results
bACC=0.64 ROC-AUC=0.69
# Test
bACC=0.63 ROC-AUC=0.74
CPU times: user 6.14 s, sys: 1.1 s, total: 7.24 s
Wall time: 7.27 s


# Playground

In [254]:
# Playground: ROI features + elastic-net logistic regression, l1_ratios = [0.3, 0.8]
# (SVC is not used in this cell; it is already imported in the import cell at the top.)
cv = problem.get_cv(X_train, y_train)


estimator = make_pipeline(
    ROIsFeatureExtractor(),
    LogisticRegressionCV(Cs = 4, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, l1_ratios = [0.3,0.8], max_iter=1000))

# NOTE: this overwrites the notebook-level y_train / y_test with their integer encoding;
# cells executed afterwards see encoded labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
# Reuse the mapping learned on the training labels instead of refitting the encoder on the test labels.
y_test = le.transform(y_test)

cv_results = cross_validate(estimator, X_train, y_train, scoring=['balanced_accuracy', 'roc_auc'], cv=cv,
                         verbose=1, return_train_score=True, n_jobs=5,error_score='raise')

print("# 5CV")
print('bACC=%.2f' % cv_results['test_balanced_accuracy'].mean(),
      'ROC-AUC=%.2f' % cv_results['test_roc_auc'].mean())


# Refit on all train
estimator.fit(X_train, y_train)
# Apply on test
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)
# Column 1 is the probability of the class encoded as 1.
score_pred_test = estimator.predict_proba(X_test)[:, 1]

bacc_test = metrics.balanced_accuracy_score(y_test, y_pred_test)
auc_test = metrics.roc_auc_score(y_test, score_pred_test)
print("# Test")
print('bACC=%.2f' % bacc_test,
      'ROC-AUC=%.2f' % auc_test)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   51.9s remaining:  1.3min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   53.2s finished


# 5CV
bACC=0.69 ROC-AUC=0.77
# Test
bACC=0.73 ROC-AUC=0.80


In [255]:
# Playground: ROI features + elastic-net logistic regression, l1_ratios = [0.5, 0.8]
# (SVC is not used in this cell; it is already imported in the import cell at the top.)
cv = problem.get_cv(X_train, y_train)


estimator = make_pipeline(
    ROIsFeatureExtractor(),
    LogisticRegressionCV(Cs = 4, cv=5, penalty = 'elasticnet', solver = 'saga', random_state=42, l1_ratios = [0.5,0.8], max_iter=1000))

# NOTE: this overwrites the notebook-level y_train / y_test with their integer encoding;
# cells executed afterwards see encoded labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
# Reuse the mapping learned on the training labels instead of refitting the encoder on the test labels.
y_test = le.transform(y_test)

cv_results = cross_validate(estimator, X_train, y_train, scoring=['balanced_accuracy', 'roc_auc'], cv=cv,
                         verbose=1, return_train_score=True, n_jobs=5,error_score='raise')

print("# 5CV")
print('bACC=%.2f' % cv_results['test_balanced_accuracy'].mean(),
      'ROC-AUC=%.2f' % cv_results['test_roc_auc'].mean())


# Refit on all train
estimator.fit(X_train, y_train)
# Apply on test
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)
# Column 1 is the probability of the class encoded as 1.
score_pred_test = estimator.predict_proba(X_test)[:, 1]

bacc_test = metrics.balanced_accuracy_score(y_test, y_pred_test)
auc_test = metrics.roc_auc_score(y_test, score_pred_test)
print("# Test")
print('bACC=%.2f' % bacc_test,
      'ROC-AUC=%.2f' % auc_test)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   52.2s remaining:  1.3min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   53.0s finished


# 5CV
bACC=0.68 ROC-AUC=0.74
# Test
bACC=0.73 ROC-AUC=0.79


In [270]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_0_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 223 kB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
Note: you may need to restart the kernel to use updated packages.


In [277]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.4 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


In [278]:
# Playground: ROI features + XGBoost with default hyper-parameters.
# XGBClassifier, make_pipeline, metrics and cross_validate come from the import cell at the top.
cv = problem.get_cv(X_train, y_train)

estimator = make_pipeline(
    ROIsFeatureExtractor(),
    XGBClassifier(random_state=1))

# Encode the labels locally so the cell does not depend on an earlier cell having
# overwritten y_train / y_test with integers. The encoder is fit on the training labels only.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

cv_results = cross_validate(estimator, X_train, y_train_enc, scoring=['balanced_accuracy', 'roc_auc'], cv=cv,
                         verbose=1, return_train_score=True, n_jobs=5)

print("# 5CV")
print('bACC=%.2f' % cv_results['test_balanced_accuracy'].mean(),
      'ROC-AUC=%.2f' % cv_results['test_roc_auc'].mean())

# Refit on all train
estimator.fit(X_train, y_train_enc)
# Apply on test
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)
# Column 1 is the probability of the class encoded as 1.
score_pred_test = estimator.predict_proba(X_test)[:, 1]

bacc_test = metrics.balanced_accuracy_score(y_test_enc, y_pred_test)
auc_test = metrics.roc_auc_score(y_test_enc, score_pred_test)
print("# Test")
print('bACC=%.2f' % bacc_test,
      'ROC-AUC=%.2f' % auc_test)



[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   14.1s remaining:   21.2s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   14.2s finished


# 5CV
bACC=0.73 ROC-AUC=0.81
# Test
bACC=0.73 ROC-AUC=0.81
