# load packages

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, balanced_accuracy_score
import sys
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import gc
from memory_profiler import memory_usage
import time
import inspect
from datetime import datetime
import argparse as ap
import shap
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# read in input files

## known genes and pathways

In [None]:
# Known AD gene list; the file is read without a header row, so pandas assigns integer column labels.
ad_genes = pd.read_csv(
    'advp/AD_known_gene_list.txt',
    header = None,
)
print(ad_genes.shape[0])
ad_genes.head(5)

In [None]:
# Known AD pathway table; its 'GO_ID' column is used further down to flag known pathways.
known_path = pd.read_csv(filepath_or_buffer = 'go_ad_pathways.csv')
print(known_path.shape[0])
known_path.head(5)

## pathway scores

In [None]:
# Gene-average pathway score table (per the file name); downstream cells use its 'ID' and 'AD' columns.
input_dir = 'ML/statistical_models/input/'
input_suffix = '.statistical_models_input.txt'
avg_gene = pd.read_csv(
    input_dir + 'AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.gene_average.pathway_scores.minmax_scaled.go.keep_quest_comb.covariates' + input_suffix,
    sep = '\t',
)
# Map +/-inf to NaN first so the column-wise dropna also removes columns containing infinities.
avg_gene = avg_gene.replace([np.inf, -np.inf], np.nan)
avg_gene = avg_gene.dropna(axis = 1)
print(avg_gene.shape)

In [None]:
# Pathway-average pathway score table (per the file name); downstream cells use its 'ID' and 'AD' columns.
input_dir = 'ML/statistical_models/input/'
input_suffix = '.statistical_models_input.txt'
avg_pathway = pd.read_csv(
    input_dir + 'AOU_ALL.UKBB.metasoft.gene_score.ROSMAP.RNAseq.methylation.somoscan_proteomics.MSBB.RNAseq.methylation.tmt_proteomics.ADSP.pathway_average.pathway_scores.minmax_scaled.go.keep_quest_comb.covariates' + input_suffix,
    sep = '\t',
)
# Map +/-inf to NaN first so the column-wise dropna also removes columns containing infinities.
avg_pathway = avg_pathway.replace([np.inf, -np.inf], np.nan)
avg_pathway = avg_pathway.dropna(axis = 1)
print(avg_pathway.shape)

In [None]:
# Count how many rows carry each value of the 'AD' label.
avg_pathway.loc[:, 'AD'].value_counts()

## pathway map

### reactome

In [None]:
# Reactome gene-to-pathway mapping (tab-separated).
pathway_map = pd.read_table(
    'pathway_score/pathway_annotation/reactome/AD_KMI.ADSP.ROSMAP.all_omics.somoscan.MSBB.all_omics.reactome.gene_to_pathway.pathway_mapping.txt',
)
pathway_map.head(5)

### go

In [None]:
# GO gene-to-pathway mapping (tab-separated); print row count and number of distinct pathway IDs.
pathway_map_go = pd.read_table(
    'pathway_score/pathway_annotation/go/AD_KMI.ADSP.ROSMAP.all_omics.MSBB.all_omics.go.gene_to_pathway.pathway_mapping.txt',
)
print(pathway_map_go.shape[0])
print(pathway_map_go['PATHWAY_ID'].nunique(dropna = False))
pathway_map_go.head(5)

### go pathway names

In [None]:
# Collect the raw 'id: ', 'name: ' and 'namespace: ' lines of the GO OBO file.
# The three lists are paired by position when the frame is built below, so the
# file is expected to yield the same number of lines for each tag.
pathway_ids = []
pathway_names = []
sources = []

line_buckets = (
    ('id: ', pathway_ids),
    ('name: ', pathway_names),
    ('namespace: ', sources),
)

with open('pathway_score/pathway_annotation/raw_databases/go-basic.obo.1', "r") as f:
    for line in f:
        for tag, bucket in line_buckets:
            if line.startswith(tag):
                bucket.append(line)
                break

#print(len(pathway_ids))
#print(len(sources))

go_source = pd.DataFrame({'PATHWAY_ID' : pathway_ids, 'PATHWAY_NAME' : pathway_names, 'SOURCE' : sources})

# Strip the newline and the tag text from every column.
for column, tag in (('PATHWAY_ID', 'id: '), ('PATHWAY_NAME', 'name: '), ('SOURCE', 'namespace: ')):
    go_source[column] = go_source[column].str.replace('\n', '').str.replace(tag, '')

print(go_source.shape[0])
print(go_source['PATHWAY_ID'].nunique(dropna = False))
print(go_source['PATHWAY_NAME'].nunique(dropna = False))
print(go_source['SOURCE'].unique())
go_source.head(5)

## test output metrics

In [None]:
# Saved test AUROC table for the avg_gene XGBoost runs; the first CSV column becomes the index.
avg_gene_auroc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.AUROC.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_auroc.shape)
avg_gene_auroc.head(5)

In [None]:
# Saved test AUPRC table for the avg_gene XGBoost runs; the first CSV column becomes the index.
avg_gene_auprc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.AUPRC.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_auprc.shape)
avg_gene_auprc.head(5)

In [None]:
# Saved test F1 table for the avg_gene XGBoost runs; the first CSV column becomes the index.
avg_gene_f1 = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.F1_SCORE.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_f1.shape)
avg_gene_f1.head(5)

In [None]:
# Saved test balanced-accuracy table for the avg_gene XGBoost runs; the first CSV column becomes the index.
avg_gene_balanced_acc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.BALANCED_ACCURACY.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_balanced_acc.shape)
avg_gene_balanced_acc.head(5)

In [None]:
# Saved test AUROC table for the avg_pathway XGBoost runs; the first CSV column becomes the index.
avg_pathway_auroc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.AUROC.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_auroc.shape)
avg_pathway_auroc.head(5)

In [None]:
# Saved test AUPRC table for the avg_pathway XGBoost runs; the first CSV column becomes the index.
avg_pathway_auprc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.AUPRC.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_auprc.shape)
avg_pathway_auprc.head(5)

In [None]:
# Saved test F1 table for the avg_pathway XGBoost runs; the first CSV column becomes the index.
avg_pathway_f1 = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.F1_SCORE.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_f1.shape)
avg_pathway_f1.head(5)

In [None]:
# Saved test balanced-accuracy table for the avg_pathway XGBoost runs; the first CSV column becomes the index.
avg_pathway_balanced_acc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TEST.BALANCED_ACCURACY.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_balanced_acc.shape)
avg_pathway_balanced_acc.head(5)

## train output metrics

In [None]:
# Saved train AUROC table for the avg_gene XGBoost runs (columns such as 'ITER_20' are read later).
avg_gene_all_train_auroc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.AUROC.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_all_train_auroc.shape)
avg_gene_all_train_auroc.head(5)

In [None]:
# Saved train AUPRC table for the avg_gene XGBoost runs (columns such as 'ITER_20' are read later).
avg_gene_all_train_auprc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.AUPRC.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_all_train_auprc.shape)
avg_gene_all_train_auprc.head(5)

In [None]:
# Saved train F1 table for the avg_gene XGBoost runs (columns such as 'ITER_20' are read later).
avg_gene_all_train_f1 = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.F1_SCORE.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_all_train_f1.shape)
avg_gene_all_train_f1.head(5)

In [None]:
# Saved train balanced-accuracy table for the avg_gene XGBoost runs (columns such as 'ITER_20' are read later).
avg_gene_all_train_balanced_acc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.BALANCED_ACCURACY.XGBoost.avg_gene.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_gene_all_train_balanced_acc.shape)

In [None]:
# Saved train AUROC table for the avg_pathway XGBoost runs (columns such as 'ITER_96' are read later).
avg_pathway_all_train_auroc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.AUROC.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_all_train_auroc.shape)
avg_pathway_all_train_auroc.head(5)

In [None]:
# Saved train AUPRC table for the avg_pathway XGBoost runs (columns such as 'ITER_96' are read later).
avg_pathway_all_train_auprc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.AUPRC.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_all_train_auprc.shape)
avg_pathway_all_train_auprc.head(5)

In [None]:
# Saved train F1 table for the avg_pathway XGBoost runs (columns such as 'ITER_96' are read later).
avg_pathway_all_train_f1 = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.F1_SCORE.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_all_train_f1.shape)
avg_pathway_all_train_f1.head(5)

In [None]:
# Saved train balanced-accuracy table for the avg_pathway XGBoost runs (columns such as 'ITER_96' are read later).
avg_pathway_all_train_balanced_acc = pd.read_csv(
    'ML/statistical_models/xgboost_output/indiv_metrics/TRAIN.BALANCED_ACCURACY.XGBoost.avg_pathway.minmax_scaled.go.keep_quest_comb.csv',
    index_col = 0,
)
print(avg_pathway_all_train_balanced_acc.shape)

## all omics

In [None]:
# Sample list whose 'ID' column marks the multi-omics samples used by the even-omics split.
all_omics = pd.read_table('ML/statistical_models/input/AD_KMI.multiomics.sample_list.txt')

# identify iteration with best metrics

In [None]:
# Largest value in the first row of the test AUROC table, then the column label holding it.
print(avg_gene_auroc.max(axis = 'columns').iloc[0])
avg_gene_auroc.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test AUPRC table, then the column label holding it.
print(avg_gene_auprc.max(axis = 'columns').iloc[0])
avg_gene_auprc.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test F1 table, then the column label holding it.
print(avg_gene_f1.max(axis = 'columns').iloc[0])
avg_gene_f1.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test balanced-accuracy table, then the column label holding it.
print(avg_gene_balanced_acc.max(axis = 'columns').iloc[0])
avg_gene_balanced_acc.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test AUROC table, then the column label holding it.
print(avg_pathway_auroc.max(axis = 'columns').iloc[0])
avg_pathway_auroc.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test AUPRC table, then the column label holding it.
print(avg_pathway_auprc.max(axis = 'columns').iloc[0])
avg_pathway_auprc.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test F1 table, then the column label holding it.
print(avg_pathway_f1.max(axis = 'columns').iloc[0])
avg_pathway_f1.idxmax(axis = 'columns').iloc[0]

In [None]:
# Largest value in the first row of the test balanced-accuracy table, then the column label holding it.
print(avg_pathway_balanced_acc.max(axis = 'columns').iloc[0])
avg_pathway_balanced_acc.idxmax(axis = 'columns').iloc[0]

# split dataset

### random

In [None]:
# Random split: 70% train, then the remainder is halved into validation and test.
avg_gene_train = avg_gene.sample(frac = 0.7, random_state = 53)
avg_gene_no_train = avg_gene.loc[~avg_gene.index.isin(avg_gene_train.index)]
avg_gene_val = avg_gene_no_train.sample(frac = 0.5, random_state = 53)
avg_gene_test = avg_gene_no_train.loc[~avg_gene_no_train.index.isin(avg_gene_val.index)]

In [None]:
# Random split: 70% train, then the remainder is halved into validation and test.
avg_pathway_train = avg_pathway.sample(frac = 0.7, random_state = 13)
avg_pathway_no_train = avg_pathway.loc[~avg_pathway.index.isin(avg_pathway_train.index)]
avg_pathway_val = avg_pathway_no_train.sample(frac = 0.5, random_state = 13)
avg_pathway_test = avg_pathway_no_train.loc[~avg_pathway_no_train.index.isin(avg_pathway_val.index)]

### even omics

In [None]:
# Even-omics split of avg_gene: a fixed number of multi-omics samples goes into
# training, the remaining multi-omics samples are halved into validation/test,
# and the non-omics samples fill the rest of each split.
df = avg_gene.copy()
# Random seed for every draw in this cell.
# NOTE: the name shadows the builtin `iter`; kept to match the other cells.
iter = 20

# Number of multi-omics samples placed in the training split.
n_target_train = 255

# split into omics and no omics
has_all_omics = df['ID'].isin(all_omics['ID'])
target = df[has_all_omics]
no_target = df[~has_all_omics]

# get even omics samples
target_train = target.sample(n = n_target_train, random_state = iter)
target_no_train = target.drop(target_train.index)
target_val = target_no_train.sample(frac = 0.5, random_state = iter)
target_test = target_no_train.drop(target_val.index)

# Training budget is 70% of all rows; subtract exactly the omics samples drawn
# above (previously 254 was subtracted while 255 were drawn, leaving the
# training split one row over budget).
train_total = int(len(df) * 0.7)
train_remaining = train_total - len(target_train)

# get rest of splits with gene score samples
no_target_train = no_target.sample(n = train_remaining, random_state = iter)
no_target_no_train = no_target.drop(no_target_train.index)
no_target_val = no_target_no_train.sample(frac = 0.5, random_state = iter)
no_target_test = no_target_no_train.drop(no_target_val.index)

# concat
avg_gene_train = pd.concat([target_train, no_target_train], axis = 0)
avg_gene_no_train = pd.concat([target_no_train, no_target_no_train], axis = 0)
avg_gene_val = pd.concat([target_val, no_target_val], axis = 0)
avg_gene_test = pd.concat([target_test, no_target_test], axis = 0)

In [None]:
# Even-omics split of avg_pathway: a fixed number of multi-omics samples goes
# into training, the remaining multi-omics samples are halved into
# validation/test, and the non-omics samples fill the rest of each split.
df = avg_pathway.copy()
# Random seed for every draw in this cell.
# NOTE: the name shadows the builtin `iter`; kept to match the other cells.
iter = 96

# Number of multi-omics samples placed in the training split.
n_target_train = 255

# split into omics and no omics
has_all_omics = df['ID'].isin(all_omics['ID'])
target = df[has_all_omics]
no_target = df[~has_all_omics]

# get even omics samples
target_train = target.sample(n = n_target_train, random_state = iter)
target_no_train = target.drop(target_train.index)
target_val = target_no_train.sample(frac = 0.5, random_state = iter)
target_test = target_no_train.drop(target_val.index)

# Training budget is 70% of all rows; subtract exactly the omics samples drawn
# above (previously 254 was subtracted while 255 were drawn, leaving the
# training split one row over budget).
train_total = int(len(df) * 0.7)
train_remaining = train_total - len(target_train)

# get rest of splits with gene score samples
no_target_train = no_target.sample(n = train_remaining, random_state = iter)
no_target_no_train = no_target.drop(no_target_train.index)
no_target_val = no_target_no_train.sample(frac = 0.5, random_state = iter)
no_target_test = no_target_no_train.drop(no_target_val.index)

# concat
avg_pathway_train = pd.concat([target_train, no_target_train], axis = 0)
avg_pathway_no_train = pd.concat([target_no_train, no_target_no_train], axis = 0)
avg_pathway_val = pd.concat([target_val, no_target_val], axis = 0)
avg_pathway_test = pd.concat([target_test, no_target_test], axis = 0)

# scale data

In [None]:
# Predictors are every column except the sample ID and the AD label.
predictors = avg_gene_train.columns.tolist()
for non_feature in ('ID', 'AD'):
    predictors.remove(non_feature)

# Standardise using statistics learned from the training split only,
# then apply the same transform to every split.
scaler = StandardScaler()
scaler.fit(avg_gene_train[predictors])
X_avg_gene_train_scaled = scaler.transform(avg_gene_train[predictors])
X_avg_gene_no_train_scaled = scaler.transform(avg_gene_no_train[predictors])
X_avg_gene_val_scaled = scaler.transform(avg_gene_val[predictors])
X_avg_gene_test_scaled = scaler.transform(avg_gene_test[predictors])

In [None]:
# Predictors are every column except the sample ID and the AD label.
# NOTE: this rebinds the shared `predictors` and `scaler` names.
predictors = avg_pathway_train.columns.tolist()
for non_feature in ('ID', 'AD'):
    predictors.remove(non_feature)

# Standardise using statistics learned from the training split only,
# then apply the same transform to every split.
scaler = StandardScaler()
scaler.fit(avg_pathway_train[predictors])
X_avg_pathway_train_scaled = scaler.transform(avg_pathway_train[predictors])
X_avg_pathway_no_train_scaled = scaler.transform(avg_pathway_no_train[predictors])
X_avg_pathway_val_scaled = scaler.transform(avg_pathway_val[predictors])
X_avg_pathway_test_scaled = scaler.transform(avg_pathway_test[predictors])

In [None]:
# Show the installed xgboost version.
print(xgb.__version__)

# train models with cross-validation

In [None]:
# 5-fold stratified CV on the avg_gene training split. Each fold fits a model
# on all features, keeps the features at or above the 75th percentile of
# importance, refits on those features and scores the held-out fold.
iter = 20

# One entry per fold: validation metrics and the boolean feature mask.
avg_gene_cv_train_auroc_list = []
avg_gene_cv_train_auprc_list = []
avg_gene_cv_train_f1_list = []
avg_gene_cv_train_balanced_acc_list = []
avg_gene_cv_train_selected_features = []

kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = iter)
for train_idx, val_idx in kf.split(X_avg_gene_train_scaled, avg_gene_train[['AD']]):
    X_cv_train_scaled = X_avg_gene_train_scaled[train_idx]
    X_cv_val_scaled = X_avg_gene_train_scaled[val_idx]
    y_cv_train = avg_gene_train['AD'].iloc[train_idx]
    y_cv_val = avg_gene_train['AD'].iloc[val_idx]

    # Stage 1: all-feature model, used only to rank features.
    train_model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)
    feature_selector = train_model.fit(X_cv_train_scaled, y_cv_train)

    importances = feature_selector.feature_importances_
    threshold = np.percentile(importances, 75)
    selector = SelectFromModel(feature_selector, threshold = threshold, prefit = True)

    X_cv_train_selected = selector.transform(X_cv_train_scaled)
    X_cv_val_selected = selector.transform(X_cv_val_scaled)

    selected_mask = selector.get_support()
    avg_gene_cv_train_selected_features.append(selected_mask)

    # Stage 2: fresh model on the selected features only.
    train_model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)
    train_model.fit(X_cv_train_selected,  y_cv_train)

    y_cv_val_pred_bin = train_model.predict(X_cv_val_selected)
    y_cv_val_pred_cont = train_model.predict_proba(X_cv_val_selected)[:, 1]

    # Ranking metrics use the class-1 probability, threshold metrics the hard labels.
    avg_gene_cv_train_auroc_list.append(roc_auc_score(y_cv_val, y_cv_val_pred_cont))
    avg_gene_cv_train_auprc_list.append(average_precision_score(y_cv_val, y_cv_val_pred_cont))
    avg_gene_cv_train_f1_list.append(f1_score(y_cv_val, y_cv_val_pred_bin))
    avg_gene_cv_train_balanced_acc_list.append(balanced_accuracy_score(y_cv_val, y_cv_val_pred_bin))

In [None]:
# 5-fold stratified CV on the avg_pathway training split. Each fold fits a
# model on all features, keeps the features at or above the 75th percentile of
# importance, refits on those features and scores the held-out fold.
iter = 96

# One entry per fold: validation metrics and the boolean feature mask.
avg_pathway_cv_train_auroc_list = []
avg_pathway_cv_train_auprc_list = []
avg_pathway_cv_train_f1_list = []
avg_pathway_cv_train_balanced_acc_list = []
avg_pathway_cv_train_selected_features = []

kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = iter)
for train_idx, val_idx in kf.split(X_avg_pathway_train_scaled, avg_pathway_train[['AD']]):
    X_cv_train_scaled = X_avg_pathway_train_scaled[train_idx]
    X_cv_val_scaled = X_avg_pathway_train_scaled[val_idx]
    y_cv_train = avg_pathway_train['AD'].iloc[train_idx]
    y_cv_val = avg_pathway_train['AD'].iloc[val_idx]

    # Stage 1: all-feature model, used only to rank features.
    train_model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)
    feature_selector = train_model.fit(X_cv_train_scaled, y_cv_train)

    importances = feature_selector.feature_importances_
    threshold = np.percentile(importances, 75)
    selector = SelectFromModel(feature_selector, threshold = threshold, prefit = True)

    X_cv_train_selected = selector.transform(X_cv_train_scaled)
    X_cv_val_selected = selector.transform(X_cv_val_scaled)

    selected_mask = selector.get_support()
    avg_pathway_cv_train_selected_features.append(selected_mask)

    # Stage 2: fresh model on the selected features only.
    train_model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)
    train_model.fit(X_cv_train_selected,  y_cv_train)

    y_cv_val_pred_bin = train_model.predict(X_cv_val_selected)
    y_cv_val_pred_cont = train_model.predict_proba(X_cv_val_selected)[:, 1]

    # Ranking metrics use the class-1 probability, threshold metrics the hard labels.
    avg_pathway_cv_train_auroc_list.append(roc_auc_score(y_cv_val, y_cv_val_pred_cont))
    avg_pathway_cv_train_auprc_list.append(average_precision_score(y_cv_val, y_cv_val_pred_cont))
    avg_pathway_cv_train_f1_list.append(f1_score(y_cv_val, y_cv_val_pred_bin))
    avg_pathway_cv_train_balanced_acc_list.append(balanced_accuracy_score(y_cv_val, y_cv_val_pred_bin))

# select features

In [None]:
# Stack the per-fold masks (one row per fold) and keep features chosen in at least 4 folds.
feature_matrix = np.vstack(avg_gene_cv_train_selected_features)
feature_counts = feature_matrix.sum(axis = 0)
avg_gene_final_features = feature_counts >= 4

# Apply the consensus mask to the scaled validation and test matrices.
X_avg_gene_val_selected = X_avg_gene_val_scaled[:, avg_gene_final_features]
X_avg_gene_test_selected = X_avg_gene_test_scaled[:, avg_gene_final_features]
print(X_avg_gene_val_selected.shape[1], flush = True)

In [None]:
# Stack the per-fold masks (one row per fold) and keep features chosen in at least 4 folds.
feature_matrix = np.vstack(avg_pathway_cv_train_selected_features)
feature_counts = feature_matrix.sum(axis = 0)
avg_pathway_final_features = feature_counts >= 4

# Apply the consensus mask to the scaled validation and test matrices.
X_avg_pathway_val_selected = X_avg_pathway_val_scaled[:, avg_pathway_final_features]
X_avg_pathway_test_selected = X_avg_pathway_test_scaled[:, avg_pathway_final_features]
print(X_avg_pathway_val_selected.shape[1], flush = True)

# compute mean train metrics

## avg gene

In [None]:
# Average each CV metric over the folds.
avg_gene_train_auroc, avg_gene_train_auprc, avg_gene_train_f1, avg_gene_train_balanced_acc = (
    np.mean(fold_scores)
    for fold_scores in (
        avg_gene_cv_train_auroc_list,
        avg_gene_cv_train_auprc_list,
        avg_gene_cv_train_f1_list,
        avg_gene_cv_train_balanced_acc_list,
    )
)

# One value per line: AUROC, AUPRC, F1, balanced accuracy.
print(
    avg_gene_train_auroc,
    avg_gene_train_auprc,
    avg_gene_train_f1,
    avg_gene_train_balanced_acc,
    sep = '\n',
)

In [None]:
# Saved train metrics for column 'ITER_20', for comparison with the values recomputed above.
print(
    avg_gene_all_train_auroc['ITER_20'],
    avg_gene_all_train_auprc['ITER_20'],
    avg_gene_all_train_f1['ITER_20'],
    avg_gene_all_train_balanced_acc['ITER_20'],
    sep = '\n',
)

## avg pathway

In [None]:
# Average each CV metric over the folds.
avg_pathway_train_auroc, avg_pathway_train_auprc, avg_pathway_train_f1, avg_pathway_train_balanced_acc = (
    np.mean(fold_scores)
    for fold_scores in (
        avg_pathway_cv_train_auroc_list,
        avg_pathway_cv_train_auprc_list,
        avg_pathway_cv_train_f1_list,
        avg_pathway_cv_train_balanced_acc_list,
    )
)

# One value per line: AUROC, AUPRC, F1, balanced accuracy.
print(
    avg_pathway_train_auroc,
    avg_pathway_train_auprc,
    avg_pathway_train_f1,
    avg_pathway_train_balanced_acc,
    sep = '\n',
)

In [None]:
# Saved train metrics for column 'ITER_96', for comparison with the values recomputed above.
print(
    avg_pathway_all_train_auroc['ITER_96'],
    avg_pathway_all_train_auprc['ITER_96'],
    avg_pathway_all_train_f1['ITER_96'],
    avg_pathway_all_train_balanced_acc['ITER_96'],
    sep = '\n',
)

# hyperparameter tuning

In [None]:
# Randomised hyperparameter search for the avg_gene model, fitted on the
# validation split restricted to the consensus features.
iter = 20

model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)

# Class counts in the avg_gene training split; their ratio weights the positive class.
n_pos = (avg_gene_train['AD'] == 1).sum()
n_neg = (avg_gene_train['AD'] == 0).sum()

param_grid = {
            'n_estimators' : [50, 100, 200, 300, 400],
            'max_depth' : [1, 2, 3],
            'learning_rate' : [0.02, 0.03],
            'subsample' : [0.5, 0.6, 0.7],
            'colsample_bytree' : [0.5, 0.6, 0.7, 0.8, 0.9],
            'gamma' : [0.1, 0.2, 0.3, 0.4, 0.5],
            'min_child_weight' : [10, 20, 30, 40],
            'scale_pos_weight' : [n_neg/n_pos]
}

avg_gene_random_search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 5, random_state = iter, scoring = 'balanced_accuracy', n_jobs = -1, n_iter = 10)

avg_gene_random_search.fit(X_avg_gene_val_selected, avg_gene_val['AD'])

avg_gene_best_model = avg_gene_random_search.best_estimator_

# NOTE(review): best_estimator_ is refit on this same validation data, so the
# metrics below are in-sample; the search's best_score_ is the cross-validated figure.
# Predictions are computed once (the original cell ran both calls twice).
y_val_pred_bin = avg_gene_best_model.predict(X_avg_gene_val_selected)
y_val_pred_cont = avg_gene_best_model.predict_proba(X_avg_gene_val_selected)[:, 1]

avg_gene_val_auroc = roc_auc_score(avg_gene_val['AD'], y_val_pred_cont)
avg_gene_val_auprc = average_precision_score(avg_gene_val['AD'], y_val_pred_cont)
avg_gene_val_f1 = f1_score(avg_gene_val['AD'], y_val_pred_bin)
avg_gene_val_balanced_acc = balanced_accuracy_score(avg_gene_val['AD'], y_val_pred_bin)

print(avg_gene_val_auroc)
print(avg_gene_val_auprc)
print(avg_gene_val_f1)
print(avg_gene_val_balanced_acc)

In [None]:
# Randomised hyperparameter search for the avg_pathway model, fitted on the
# validation split restricted to the consensus features.
iter = 96

model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = iter, n_jobs = -1)

# Class counts in the avg_pathway training split; their ratio weights the
# positive class. (The original cell counted avg_gene_train here, a copy-paste
# slip: the two tables are split with different seeds.)
n_pos = (avg_pathway_train['AD'] == 1).sum()
n_neg = (avg_pathway_train['AD'] == 0).sum()

param_grid = {
            'n_estimators' : [50, 100, 200, 300, 400],
            'max_depth' : [1, 2, 3],
            'learning_rate' : [0.02, 0.03],
            'subsample' : [0.5, 0.6, 0.7],
            'colsample_bytree' : [0.5, 0.6, 0.7, 0.8, 0.9],
            'gamma' : [0.1, 0.2, 0.3, 0.4, 0.5],
            'min_child_weight' : [10, 20, 30, 40],
            'scale_pos_weight' : [n_neg/n_pos]
}

avg_pathway_random_search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 5, random_state = iter, scoring = 'balanced_accuracy', n_jobs = -1, n_iter = 10)
avg_pathway_random_search.fit(X_avg_pathway_val_selected, avg_pathway_val['AD'])

avg_pathway_best_model = avg_pathway_random_search.best_estimator_

# NOTE(review): best_estimator_ is refit on this same validation data, so the
# metrics below are in-sample; the search's best_score_ is the cross-validated figure.
# Predictions are computed once (the original cell ran both calls twice).
y_val_pred_bin = avg_pathway_best_model.predict(X_avg_pathway_val_selected)
y_val_pred_cont = avg_pathway_best_model.predict_proba(X_avg_pathway_val_selected)[:, 1]

avg_pathway_val_auroc = roc_auc_score(avg_pathway_val['AD'], y_val_pred_cont)
avg_pathway_val_auprc = average_precision_score(avg_pathway_val['AD'], y_val_pred_cont)
avg_pathway_val_f1 = f1_score(avg_pathway_val['AD'], y_val_pred_bin)
avg_pathway_val_balanced_acc = balanced_accuracy_score(avg_pathway_val['AD'], y_val_pred_bin)

print(avg_pathway_val_auroc)
print(avg_pathway_val_auprc)
print(avg_pathway_val_f1)
print(avg_pathway_val_balanced_acc)

# test model

In [None]:
# Score the tuned avg_gene model on the test split (consensus features only).
y_test_pred_cont = avg_gene_best_model.predict_proba(X_avg_gene_test_selected)[:, 1]
y_test_pred_bin = avg_gene_best_model.predict(X_avg_gene_test_selected)

# Ranking metrics use the class-1 probability, threshold metrics the hard labels.
avg_gene_test_auroc = roc_auc_score(avg_gene_test['AD'], y_test_pred_cont)
avg_gene_test_auprc = average_precision_score(avg_gene_test['AD'], y_test_pred_cont)
avg_gene_test_f1 = f1_score(avg_gene_test['AD'], y_test_pred_bin)
avg_gene_test_balanced_acc = balanced_accuracy_score(avg_gene_test['AD'], y_test_pred_bin)

print(
    avg_gene_test_auroc,
    avg_gene_test_auprc,
    avg_gene_test_f1,
    avg_gene_test_balanced_acc,
    sep = '\n',
)

In [None]:
# Score the tuned avg_pathway model on the test split (consensus features only).
y_test_pred_cont = avg_pathway_best_model.predict_proba(X_avg_pathway_test_selected)[:, 1]
y_test_pred_bin = avg_pathway_best_model.predict(X_avg_pathway_test_selected)

# Ranking metrics use the class-1 probability, threshold metrics the hard labels.
avg_pathway_test_auroc = roc_auc_score(avg_pathway_test['AD'], y_test_pred_cont)
avg_pathway_test_auprc = average_precision_score(avg_pathway_test['AD'], y_test_pred_cont)
avg_pathway_test_f1 = f1_score(avg_pathway_test['AD'], y_test_pred_bin)
avg_pathway_test_balanced_acc = balanced_accuracy_score(avg_pathway_test['AD'], y_test_pred_bin)

print(
    avg_pathway_test_auroc,
    avg_pathway_test_auprc,
    avg_pathway_test_f1,
    avg_pathway_test_balanced_acc,
    sep = '\n',
)

# compute shap values

In [None]:
# Build a tree explainer for the tuned avg_gene model and explain the test
# split (consensus features only). The additivity check is switched off.
explainer = shap.TreeExplainer(avg_gene_best_model)
avg_gene_shap_values = explainer(X_avg_gene_test_selected, check_additivity = False)

In [None]:
# Build a tree explainer for the tuned avg_pathway model and explain the test
# split (consensus features only). The additivity check is switched off.
# NOTE: rebinds the shared `explainer` name.
explainer = shap.TreeExplainer(avg_pathway_best_model)
avg_pathway_shap_values = explainer(X_avg_pathway_test_selected, check_additivity = False)

# make plot

## create column map

In [None]:
# Distinct (ID, name) pairs from the parsed GO table; the two counts show how many rows were duplicates.
pathway_map_go_sub = go_source.loc[:, ['PATHWAY_ID', 'PATHWAY_NAME']].drop_duplicates()
print(pathway_map_go_sub.shape[0])
print(go_source.shape[0])
pathway_map_go_sub.head(5)

In [None]:
# GO terms that appear in the known AD pathway list.
is_known_pathway = pathway_map_go_sub['PATHWAY_ID'].isin(known_path['GO_ID'])
pathway_map_go_known = pathway_map_go_sub[is_known_pathway]

# Distinct IDs: all GO terms, the known subset, and the known list itself.
print(pathway_map_go_sub['PATHWAY_ID'].nunique(dropna = False))
print(pathway_map_go_known['PATHWAY_ID'].nunique(dropna = False))
print(known_path['GO_ID'].nunique(dropna = False))

In [None]:
# Tag the names of known AD pathways. The frame is rebuilt from
# pathway_map_go_sub instead of being modified in place: this avoids writing to
# a filtered slice (SettingWithCopyWarning) and keeps the cell idempotent, so
# re-running it does not append the suffix a second time.
pathway_map_go_known = pathway_map_go_sub[pathway_map_go_sub['PATHWAY_ID'].isin(known_path['GO_ID'])]
pathway_map_go_known = pathway_map_go_known.assign(
    PATHWAY_NAME = pathway_map_go_known['PATHWAY_NAME'] + ' (**KNOWN**)'
)
pathway_map_go_known.head()

In [None]:
# GO terms that do not appear in the known AD pathway list.
is_known_pathway = pathway_map_go_sub['PATHWAY_ID'].isin(known_path['GO_ID'])
pathway_map_go_no_known = pathway_map_go_sub[~is_known_pathway]

# Distinct IDs: all GO terms, the not-known subset, and the known list itself.
print(pathway_map_go_sub['PATHWAY_ID'].nunique(dropna = False))
print(pathway_map_go_no_known['PATHWAY_ID'].nunique(dropna = False))
print(known_path['GO_ID'].nunique(dropna = False))

In [None]:
# Recombine the known and not-known tables; the row count should match pathway_map_go_sub.
pathway_map_go_fixed = pd.concat(
    [pathway_map_go_known, pathway_map_go_no_known],
    axis = 'index',
)
print(pathway_map_go_fixed.shape[0])
print(pathway_map_go_sub.shape[0])

In [None]:
# Lookup from GO ID to display name, used to relabel feature columns in the plots.
column_map = {
    go_id: go_name
    for go_id, go_name in zip(pathway_map_go_fixed['PATHWAY_ID'], pathway_map_go_fixed['PATHWAY_NAME'])
}

## avg gene

### initial plot, no pathway names

In [None]:
# SHAP summary plot for the avg_gene model, labelled with the raw column names.
# If the explainer ever returns one slice per class, use the line below instead:
#avg_gene_shap_values_class1 = avg_gene_shap_values.values[:, :, 1]
avg_gene_shap_values_class1 = avg_gene_shap_values.values

# Take the feature names from avg_gene_train itself. The shared `predictors`
# list was last rebuilt from avg_pathway_train in the scaling cells, so indexing
# it with the avg_gene mask could mislabel features or fail on a length mismatch.
avg_gene_predictors = [col for col in avg_gene_train.columns if col not in ('ID', 'AD')]
avg_gene_selected_features = np.array(avg_gene_predictors)[avg_gene_final_features]
shap.summary_plot(avg_gene_shap_values_class1, avg_gene_test[avg_gene_selected_features])

### add pathway names

In [None]:
# Test-split feature table with GO IDs replaced by display names (columns absent from column_map keep their label).
avg_gene_pathway_names = avg_gene_test[avg_gene_selected_features].rename(columns = column_map)

### remake plots

In [None]:
# SHAP summary plot for the avg_gene model, labelled with pathway names.
# If the explainer ever returns one slice per class, use the line below instead:
#avg_gene_shap_values_class1 = avg_gene_shap_values.values[:, :, 1]
avg_gene_shap_values_class1 = avg_gene_shap_values.values

# Take the feature names from avg_gene_train itself. The shared `predictors`
# list was last rebuilt from avg_pathway_train in the scaling cells, so indexing
# it with the avg_gene mask could mislabel features or fail on a length mismatch.
avg_gene_predictors = [col for col in avg_gene_train.columns if col not in ('ID', 'AD')]
avg_gene_selected_features = np.array(avg_gene_predictors)[avg_gene_final_features]
shap.summary_plot(avg_gene_shap_values_class1, avg_gene_pathway_names, plot_size = (10, 6))

## avg pathway

### make initial plot

In [None]:
# SHAP summary plot for the avg_pathway model, labelled with the raw column names.
# If the explainer ever returns one slice per class, use the line below instead:
#avg_pathway_shap_values_class1 = avg_pathway_shap_values.values[:, :, 1]
avg_pathway_shap_values_class1 = avg_pathway_shap_values.values

# Take the feature names from avg_pathway_train itself rather than the shared
# `predictors` list, whose contents depend on which scaling cell ran last.
avg_pathway_predictors = [col for col in avg_pathway_train.columns if col not in ('ID', 'AD')]
avg_pathway_selected_features = np.array(avg_pathway_predictors)[avg_pathway_final_features]
shap.summary_plot(avg_pathway_shap_values_class1, avg_pathway_test[avg_pathway_selected_features])

### rename pathways

In [None]:
# Test-split feature table with GO IDs replaced by display names (columns absent from column_map keep their label).
avg_pathway_pathway_names = avg_pathway_test[avg_pathway_selected_features].rename(columns = column_map)

### remake plots

In [None]:
# SHAP summary plot for the avg_pathway model, labelled with pathway names.
# If the explainer ever returns one slice per class, use the line below instead:
#avg_pathway_shap_values_class1 = avg_pathway_shap_values.values[:, :, 1]
avg_pathway_shap_values_class1 = avg_pathway_shap_values.values

# Take the feature names from avg_pathway_train itself rather than the shared
# `predictors` list, whose contents depend on which scaling cell ran last.
avg_pathway_predictors = [col for col in avg_pathway_train.columns if col not in ('ID', 'AD')]
avg_pathway_selected_features = np.array(avg_pathway_predictors)[avg_pathway_final_features]
shap.summary_plot(avg_pathway_shap_values_class1, avg_pathway_pathway_names, plot_size = (10, 6))

# get top pathway sources

## extract top 20

In [None]:
# Rank features by mean absolute SHAP value and keep the 20 largest, highest first.
mean_abs_shap = np.abs(avg_gene_shap_values_class1).mean(axis = 0)
top_order = np.argsort(mean_abs_shap)[::-1][:20]
avg_gene_top20 = [avg_gene_selected_features[i] for i in top_order]
avg_gene_top20

In [None]:
# Rank features by mean absolute SHAP value and keep the 20 largest, highest first.
mean_abs_shap = np.abs(avg_pathway_shap_values_class1).mean(axis = 0)
top_order = np.argsort(mean_abs_shap)[::-1][:20]
avg_pathway_top20 = [avg_pathway_selected_features[i] for i in top_order]
avg_pathway_top20

## filter map to get source

In [None]:
# Rows of the GO mapping that belong to the top-20 avg_gene features.
in_gene_top20 = pathway_map_go['PATHWAY_ID'].isin(avg_gene_top20)
avg_gene_top20_source = pathway_map_go[in_gene_top20]
print(avg_gene_top20_source.shape[0])
print(avg_gene_top20_source['PATHWAY_ID'].nunique(dropna = False))
avg_gene_top20_source.head(5)

In [None]:
# Rows of the GO mapping that belong to the top-20 avg_pathway features.
in_pathway_top20 = pathway_map_go['PATHWAY_ID'].isin(avg_pathway_top20)
avg_pathway_top20_source = pathway_map_go[in_pathway_top20]
print(avg_pathway_top20_source.shape[0])
print(avg_pathway_top20_source['PATHWAY_ID'].nunique(dropna = False))
avg_pathway_top20_source.head(5)

## explode source column, merge with correct names, and subset

In [None]:
# Split the ';'-separated SOURCE field into one row per source.
gene_split_sources = avg_gene_top20_source['SOURCE'].str.split(';')
avg_gene_top20_source = avg_gene_top20_source.assign(SOURCE = gene_split_sources).explode('SOURCE')

# Keep (ID, source) pairs and attach the display names.
avg_gene_top20_source_sub = avg_gene_top20_source.loc[:, ['PATHWAY_ID', 'SOURCE']]
avg_gene_top20_source_merge = avg_gene_top20_source_sub.merge(pathway_map_go_fixed, on = 'PATHWAY_ID')
avg_gene_top20_source_merge = avg_gene_top20_source_merge.drop_duplicates()
print(avg_gene_top20_source_merge['PATHWAY_ID'].nunique(dropna = False))
avg_gene_top20_source_merge.head(5)

In [None]:
# Split the ';'-separated SOURCE field into one row per source.
pathway_split_sources = avg_pathway_top20_source['SOURCE'].str.split(';')
avg_pathway_top20_source = avg_pathway_top20_source.assign(SOURCE = pathway_split_sources).explode('SOURCE')

# Keep (ID, source) pairs and attach the display names.
avg_pathway_top20_source_sub = avg_pathway_top20_source.loc[:, ['PATHWAY_ID', 'SOURCE']]
avg_pathway_top20_source_merge = avg_pathway_top20_source_sub.merge(pathway_map_go_fixed, on = 'PATHWAY_ID')
avg_pathway_top20_source_merge = avg_pathway_top20_source_merge.drop_duplicates()
print(avg_pathway_top20_source_merge['PATHWAY_ID'].nunique(dropna = False))
avg_pathway_top20_source_merge.head(5)

## get summary numbers

In [None]:
# Rows per source among the top-20 avg_gene features, missing values included.
avg_gene_top20_source.loc[:, 'SOURCE'].value_counts(dropna = False)

In [None]:
# Rows per source among the top-20 avg_pathway features, missing values included.
avg_pathway_top20_source.loc[:, 'SOURCE'].value_counts(dropna = False)