# FXA protein
# Data load and preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%run ../modules/run_or_load_decorator.py
%run ../modules/plotting_metrics.py

In [2]:
%run ./1_Helper_functions.ipynb

In [3]:
# Target identifier; it is also embedded in the cache file names built further down.
protein_name = 'fxa'

# Pickled results table (presumably docking scores, going by 'DkSc' in the file
# name -- TODO confirm against the notebook that produced it).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
file_name = (
    '../../../FXa/ANALISIS/6_Machine_Learning_Models/'
    'df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
)
dksc_table = pd.read_pickle(file_name)

# Separate the 'activity' label column from the remaining (feature) columns.
y_true_merged = dksc_table['activity']
X_merged_dksc = dksc_table.drop(columns='activity')

# Short aliases holding the underlying arrays.
X = X_merged_dksc.values
y = y_true_merged.values

## Estimators
### ML Classifiers

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier using a single neighbour and 4 parallel jobs.
hyparams = dict(n_neighbors=1, n_jobs=4)
knn = KNeighborsClassifier(**hyparams)

In [5]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the liblinear solver.
hyparams = dict(
    C=1.0,
    penalty='l1',
    solver='liblinear',
    max_iter=400,
)
lr = LogisticRegression(**hyparams)

In [6]:
from xgboost import XGBClassifier

# Gradient-boosted trees; every hyper-parameter is spelled out so the
# configuration is visible in one place.
hyparams = dict(
    subsample=0.5,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    gamma=1,
    colsample_bytree=0.5,
    alpha=0.5,
)

xgb = XGBClassifier(**hyparams)

In [7]:
# Machine-learning classifiers, keyed by their short name with an 'ml_' prefix.
ml_classifiers = {
    'ml_knn': knn,
    'ml_lr': lr,
    'ml_xgb': xgb,
}

### Consensus Scorings

In [8]:
%run ../6_Machine_Learning_Models/5_Helper_Consensus_Scoring.ipynb

cs_functions = {
    'MEAN': get_mean_score,
    'MAX': get_max_score,
    'MIN': get_min_score,
    'VOTE': get_vote_score
}

# update names
cs_functions = {f'cs_{name}': func 
                for name, func in cs_functions.items()}

### Estimators dictionary 

In [9]:
# One mapping with every estimator: the ML classifiers first, then the
# consensus-scoring functions (same insertion order as a {**a, **b} merge).
estimators = dict(ml_classifiers)
estimators.update(cs_functions)

### Evaluation metrics 

In [10]:
# Keyword arguments describing each evaluation metric, keyed by a short label.
# The 'ef' entries differ only in the fraction (presumably the top-ranked share
# of the library at which enrichment is measured -- TODO confirm in the metrics
# module).
metrics = {
    'roc_auc': {'metric_name': 'roc_auc'},
    'nef_02': {'metric_name': 'ef', 'fraction': 0.02, 'method': 'normalized'},
    'nef_005': {'metric_name': 'ef', 'fraction': 0.005, 'method': 'normalized'},
    # NOTE(review): 'Ra' presumably stands for the ratio of actives (0.12) -- confirm.
    'nef_12_Ra': {'metric_name': 'ef', 'fraction': 0.12, 'method': 'normalized'},
}

# Hold-out Validation 

## 30 replicas
### AUC-ROC


In [11]:
# Save the results to a file to avoid repeating the analysis
@run_or_load_joblib
def n_hold_out_validation_SAVE(filename, **kwargs):
    """Run ``n_hold_out_validation`` behind the ``run_or_load_joblib`` decorator.

    Parameters
    ----------
    filename : str
        Not used by the body itself; presumably consumed by the decorator as
        the location of the cached result -- confirm in
        ../modules/run_or_load_decorator.py.
    **kwargs
        Forwarded unchanged to ``n_hold_out_validation``.

    Returns
    -------
    Whatever ``n_hold_out_validation`` returns.
    """
    results = n_hold_out_validation(**kwargs)
    return results

In [None]:
%%time
evaluation_name='ho30'
ho30 = n_hold_out_validation_SAVE(
    filename=f'./cachedir/{evaluation_name}_{protein_name}',
    estimators=estimators, X=X, y=y, metrics=metrics, 
                      n_reps=30, random_state=42)

metric='roc_auc'
# Normality
display(multi_norm_test(ho30, metric=metric))
# Homocedasticity
display(multi_homovar_test(ho30, metric=metric))

### Statistical Evaluation
#### Multiple classifiers

In [None]:
%run Friedman_and_Nemenyi_test.ipynb

In [None]:
# Keep only the entries labelled 'roc_auc' in the hold-out results; this subset
# is the input to the statistical tests and the CD / heatmap plots that follow.
ho30_auc = ho30.loc['roc_auc']

#### Friedman's test

In [None]:
# Run the friedmanTest helper on the ROC-AUC results; its return value is the
# cell output (helper presumably comes from Friedman_and_Nemenyi_test.ipynb).
friedmanTest(ho30_auc)

In [None]:
# Same ROC-AUC input, evaluated with the Iman-Davenport variant of the Friedman
# test (going by the helper's name -- confirm in Friedman_and_Nemenyi_test.ipynb).
friedman_imanDavenportTest(ho30_auc)

#### Nemenyi test - pairwise comparison

In [None]:
# Pairwise Nemenyi comparison on the ROC-AUC results. Only the first element of
# the helper's return value is shown; each cell is styled by _col_sig_p_values.
nemenyi_styler = pairwise_nemenyi(ho30_auc)[0].style
# Styler.applymap was renamed to Styler.map in pandas 2.1 (the old name is
# deprecated); prefer the new name and fall back on older pandas versions.
if hasattr(nemenyi_styler, 'map'):
    style_each_cell = nemenyi_styler.map
else:
    style_each_cell = nemenyi_styler.applymap
display(style_each_cell(_col_sig_p_values))

### Visualizations

#### Swarm plot

In [None]:
# Swarm plot of the ROC-AUC values obtained over the 30 hold-out repetitions.
metric = 'roc_auc'
plot_swarm_metrics(
    ho30,
    metric_name=metric,
    ascending=True,
    title_extra='- 30 reps',
)

#### Critical Differences plot

In [None]:
# Critical-differences plot built from the ROC-AUC results.
plot_cd(ho30_auc)

#### p-values heatmap

In [None]:
# Heatmap of p-values computed from the ROC-AUC results (presumably the pairwise
# comparison between estimators -- confirm in the plotting helper).
plot_p_heatmap(ho30_auc)

#### Boxplot: Pairwise statistical significance

In [None]:
# Box plot of the 'roc_auc' metric from the full hold-out results. Note that the
# helper receives ho30 plus the metric name, not the pre-sliced ho30_auc.
plot_box_signif(ho30, 'roc_auc', ascending=True)

# Y-randomization test

In [None]:
@run_or_load_joblib
def n_hout_val_scrambling(filename, y, random_chi, **kwargs):
    """Hold-out validation on randomized labels (y-randomization test).

    Parameters
    ----------
    filename : str
        Not used by the body; presumably consumed by ``run_or_load_joblib`` as
        the cache location -- confirm in the decorator module.
    y : array-like
        Original labels, handed to ``randomize_y_labels`` as ``y_target``.
    random_chi : float
        Passed through to ``randomize_y_labels``; its exact meaning is defined
        by that helper.
    **kwargs
        Remaining arguments forwarded to ``n_hold_out_validation``.

    Returns
    -------
    The result of ``n_hold_out_validation`` evaluated with the randomized labels.
    """
    # NOTE(review): no seed is passed to randomize_y_labels here; confirm whether
    # the scrambling itself is reproducible.
    return n_hold_out_validation(
        y=randomize_y_labels(y_target=y, random_chi=random_chi),
        **kwargs
    )

In [None]:
%%time
chi_fractions = [1, 0.75, 0.5, 0.25, 0.0]
n_reps=30
chi_results = {}
evaluation_name = f'y_scrambling_{n_reps}_reps'

for chi in chi_fractions:
    filename = f'./cachedir/{evaluation_name}_{protein_name}_chi-{chi}'
    result = n_hout_val_scrambling(filename=filename, 
                             random_chi=chi, estimators=estimators, 
                             X=X, y=y, metrics=metrics, 
                             n_reps=3, random_state=42)
    print(f'Fraction chi={chi} finished.')
    chi_results[f'chi_{chi}'] = result
    

In [None]:
# One swarm plot per y-scrambling level. `metric` is the name set by the earlier
# plotting cell.
for chi_key, chi_result in chi_results.items():
    # Keys look like 'chi_0.75'; strip the prefix so the title reads
    # 'chi=0.75' rather than 'chi=chi_0.75'.
    chi_value = chi_key.split('_', 1)[-1]
    plot_swarm_metrics(
        chi_result,
        metric_name=metric,
        ascending=True,
        title_extra=f'- chi={chi_value}',
    )