En este notebook
- Se optimizan hiperparámetros del pipeline por validación cruzada en cada sitio
- Se calculan métricas de entrenamiento y test para los mejores modelos.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.metrics import accuracy_score, make_scorer, recall_score, f1_score
from sklearn.feature_selection import SelectFromModel
pd.set_option('display.max_rows', 100, 'display.max_columns', 100)

pheno_cols = ['Age', 'Verbal.IQ', 'Performance.IQ', 'Full4.IQ']

In [68]:
def specificity_loss_func(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)
specificity = make_scorer(specificity_loss_func, greater_is_better=True)

# Optimización de hiperparámetros por validación cruzada para el pipeline completo (SMOTE-)elasticnet-SVM

def cv_elasticnet_svc(train_data_path, svc_C_values, en_l1_ratios, en_K=25, en_alpha_max = 1000, en_epsilon =.01):
    data_train = pd.read_csv(train_data_path)
    X_train, y_train = data_train.drop(columns=['DX_bin']), data_train['DX_bin']

    # modelos
    en = LogisticRegression(penalty='elasticnet', solver='saga', n_jobs=-1, max_iter=100)
    selector = SelectFromModel(en)
    svc = LinearSVC(random_state=11)

    pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                    ['selector', selector], 
                                    ['svc', svc]])

    # se define la grilla de hiperparámetros y la partición estratificada
    alpha_path = np.exp(np.linspace(np.log(en_alpha_max), np.log(en_alpha_max*en_epsilon), num = en_K))
    param_grid = {'selector__estimator__C':1/alpha_path, 'selector__estimator__l1_ratio':en_l1_ratios, 'svc__C':svc_C_values}

    stratified_kfold = StratifiedKFold(n_splits=5,
                                           shuffle=True,
                                           random_state=11)
    # se define el scoring
    scoring = {'accuracy':'accuracy', 'f1_macro':'f1_macro', 'sensitivity':'recall', 'specificity':specificity}
    
    # se define y corre la función de validación cruzada
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=scoring, cv=stratified_kfold, refit= 'f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    cols_to_keep = ['param_selector__estimator__C', 'param_selector__estimator__l1_ratio', 'param_svc__C', 'mean_test_accuracy', 'std_test_accuracy', 'mean_test_f1_macro', 'std_test_f1_macro', 'mean_test_sensitivity', 'mean_test_specificity']
    df = pd.DataFrame(grid_search.cv_results_)[cols_to_keep]

    return df, grid_search

def report(grid_search, train_data_path, test_data_path):
    data_train = pd.read_csv(train_data_path)
    X_train, y_train = data_train.drop(columns=['DX_bin']), data_train['DX_bin']
    data_test = pd.read_csv(test_data_path)
    X_test, y_test = data_test.drop(columns=['DX_bin']), data_test['DX_bin']

    print('Mejores parámetros')
    param_cols = ['param_selector__estimator__C', 'param_selector__estimator__l1_ratio', 'param_svc__C']
    df = pd.DataFrame(grid_search.cv_results_)
    display(df.loc[[df.mean_test_f1_macro.argmax()], param_cols].reset_index(drop=True))
    n_nonzero = grid_search.best_estimator_['selector'].get_support().sum()
    print(f'Número de variables seleccionadas: {n_nonzero}')

    score_cols = ['mean_test_accuracy', 'std_test_accuracy', 'mean_test_f1_macro', 'std_test_f1_macro', 'mean_test_sensitivity', 'mean_test_specificity']
    print('\nMétricas validación cruzada (train)')
    display(df.loc[[df.mean_test_f1_macro.argmax()], score_cols].reset_index(drop=True))

    print('\nMétricas train')
    y_pred = grid_search.best_estimator_.predict(X_train)
    acc = accuracy_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    sens = recall_score(y_train, y_pred)
    spec = specificity_loss_func(y_train, y_pred)
    df = pd.DataFrame({'accuracy':[acc], 'f1_macro':[f1], 'sensitivity':[sens], 'specificity':[spec]})
    display(df)

    print('\nMétricas test')
    y_pred = grid_search.best_estimator_.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    sens = recall_score(y_test, y_pred)
    spec = specificity_loss_func(y_test, y_pred)
    df = pd.DataFrame({'accuracy':[acc], 'f1_macro':[f1], 'sensitivity':[sens], 'specificity':[spec]})
    display(df)

In [67]:
# grilla de hiperparámetros

svc_C_values = [10** i for i in range(-5,5)]
en_l1_ratios = [0.05, 0.1, 0.2, 0.3]
en_K = 25

### Peking

In [62]:
train_data_path = 'data/data_raj_train_Peking_preproc.csv'
test_data_path = 'data/data_raj_test_Peking_preproc.csv'

In [None]:
df_raj_Peking, grid_search_raj_Peking = cv_elasticnet_svc(train_data_path, svc_C_values=svc_C_values, en_l1_ratios=en_l1_ratios, en_K=en_K)
df_raj_Peking.to_csv('cv_results_raj_Peking.csv')

In [69]:
report(grid_search_raj_Peking, train_data_path, test_data_path)

Mejores parámetros


Unnamed: 0,param_selector__estimator__C,param_selector__estimator__l1_ratio,param_svc__C
0,0.001778,0.05,0.01


Número de variables seleccionadas: 3

Métricas validación cruzada (train)


Unnamed: 0,mean_test_accuracy,std_test_accuracy,mean_test_f1_macro,std_test_f1_macro,mean_test_sensitivity,mean_test_specificity
0,0.811765,0.068599,0.769781,0.072546,0.67,0.867949



Métricas train


Unnamed: 0,accuracy,f1_macro,sensitivity,specificity
0,0.788235,0.689655,0.833333,0.770492



Métricas test


Unnamed: 0,accuracy,f1_macro,sensitivity,specificity
0,0.705882,0.666667,0.625,0.777778


### KKI

In [74]:
train_data_path = 'data/data_raj_train_KKI_preproc.csv'
test_data_path = 'data/data_raj_test_KKI_preproc.csv'

In [None]:
df_raj_KKI, grid_search_raj_KKI = cv_elasticnet_svc(train_data_path, svc_C_values=svc_C_values, en_l1_ratios=en_l1_ratios, en_K=en_K)
df_raj_KKI.to_csv('cv_results_raj_KKI.csv')

In [76]:
report(grid_search_raj_KKI, train_data_path, test_data_path)

Mejores parámetros


Unnamed: 0,param_selector__estimator__C,param_selector__estimator__l1_ratio,param_svc__C
0,0.014678,0.3,0.001


Número de variables seleccionadas: 64

Métricas validación cruzada (train)


Unnamed: 0,mean_test_accuracy,std_test_accuracy,mean_test_f1_macro,std_test_f1_macro,mean_test_sensitivity,mean_test_specificity
0,0.736765,0.083985,0.560952,0.101541,0.19,0.935897



Métricas train


Unnamed: 0,accuracy,f1_macro,sensitivity,specificity
0,0.963855,0.926829,0.863636,1.0



Métricas test


Unnamed: 0,accuracy,f1_macro,sensitivity,specificity
0,0.727273,0.4,0.333333,0.875
