In [156]:
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, recall_score, confusion_matrix
import sys 
sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
from RF_atomver import *
from VisUtils import *
import pandas as pd
import numpy as np

In [157]:
def average_metric(df, metric):
    """Print and collect the mean of ``metric`` for each distinct ``'NEK'`` label.

    Args:
        df: DataFrame holding an ``'NEK'`` column and a ``metric`` column.
        metric: Name of the column whose values are averaged.

    Returns:
        list: One mean per distinct ``'NEK'`` value, in the order returned by
        ``df['NEK'].unique()``.
    """
    group_means = []
    for label in df['NEK'].unique():
        in_group = df['NEK'] == label
        mean_value = df.loc[in_group, metric].mean()
        print(f'{label} average {metric} score: {mean_value:.2f}')
        group_means.append(mean_value)
    # Blank line separates this metric's report from whatever is printed next.
    print()
    return group_means

In [158]:
# Locations of the NEK datasets and of the saved GP prediction CSVs.
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
result_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/'
nek_nums = [2, 3, 5, 9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN']
features = ['moe', 'mfp']
# Inhibition result files are read only for these NEK numbers; every NEK has
# binding result files.
nek_with_inhibition = (2, 9)

# Not used in this cell; kept so any later cell that references them still runs.
NEK = 'NEK'
train_ys = []
test_ys = []

# One entry per (NEK, task, feature set, sampling) combination: the
# 'prediction_type' column of that combination's GP test-set CSV.
# Order: for each NEK, all binding files (feature outer, sampling inner),
# then all inhibition files when that NEK has them.
pred_types = []
for n in nek_nums:
    nek = str(n)
    tasks = ['binding', 'inhibition'] if n in nek_with_inhibition else ['binding']
    for task in tasks:
        for feat in features:
            for samp in samplings:
                file_root = f'NEK{nek}_{task}_{feat}_{samp}'
                # Only the test-set predictions are needed; the matching
                # *_train_GP.csv was previously read and discarded.
                test = pd.read_csv(f'{result_path}{file_root}_test_GP.csv')
                pred_types.append(test['prediction_type'])


In [159]:
# Load the aggregated GP test metrics. `result_path` (assigned with the other
# GP paths before the prediction-loading loop) is used instead of
# `GP_resultpath`, which is only assigned in a later cell and is therefore
# undefined here on a fresh kernel.
gp_df = pd.read_csv(result_path + 'GP_test_results_all_NEK.csv')

# NOTE(review): assumes `pred_types` holds one entry per row of gp_df, in the
# same row order as the CSV — confirm against how the results file was written.
gp_df['prediction_type'] = pred_types

# Rename 'scaled' -> 'raw' BEFORE deriving dataset_category so the category
# labels match the ones built for the RF results (which rename first).
gp_df['strategy'] = gp_df['strategy'].str.replace('scaled', 'raw')
gp_df['model'] = gp_df['model'].str.replace('scaled', 'raw')
gp_df['dataset_category'] = gp_df['feat_type'].astype(str) + '_' + gp_df['strategy']

# Constant labels identifying these rows as GP once frames are combined.
gp_df['modeling type'] = 'GP'
gp_df['model strategy'] = 'GP'


In [160]:
# Dataset size table and result locations (absolute paths).
ratio_df = pd.read_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/dataset_creation/all_NEK_dataset_sizes.csv')
RF_resultpath = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/'
RFGS_resultpath = RF_resultpath + 'RF_grid_search/rf_results/'
GP_resultpath = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/'

# Plain RF results: relabel 'scaled' as 'raw', then derive dataset_category
# from the feature type and the relabelled strategy.
rf_df = pd.read_csv(RF_resultpath + 'RF_results/RF_test_results_all_NEK.csv').assign(
    model=lambda d: d['model'].str.replace('scaled', 'raw'),
    strategy=lambda d: d['strategy'].str.replace('scaled', 'raw'),
    dataset_category=lambda d: d['feat_type'].astype(str) + '_' + d['strategy'],
)

# Grid-search RF results: same relabelling, and the 'y' column is dropped.
rfgs_df = (
    pd.read_csv(RFGS_resultpath + 'RF_gridsearch_test_results_all_NEK.csv')
    .assign(
        model=lambda d: d['model'].str.replace('scaled', 'raw'),
        strategy=lambda d: d['strategy'].str.replace('scaled', 'raw'),
    )
    .drop(columns='y')
)

# Restrict/reorder the grid-search frame to the plain RF frame's columns.
rf_cols = rf_df.columns.tolist()
rfgs_cols = rfgs_df.columns.tolist()
rfgs_df = rfgs_df.loc[:, rf_cols]

# 'modeling type' copies RF_type; 'model strategy' tags which run produced the row.
rf_df = rf_df.assign(**{'modeling type': rf_df['RF_type'], 'model strategy': 'RF_noGS'})
rfgs_df = rfgs_df.assign(**{'modeling type': rfgs_df['RF_type'], 'model strategy': 'RF_GS'})

col_order = ['NEK', 'strategy', 'feat_type', 'RF_type', 'dataset_category', 'modeling type','model',
            'prediction_type', 'recall', 'accuracy', 'precision','specificity','f1', 'ROC-AUC', 'MCC', 'Balanced Accuracy',
             'TN', 'FN', 'FP', 'TP']
col_order2 = ['NEK', 'strategy', 'feat_type', 'dataset_category', 'modeling type','model','model strategy','cm',
            'prediction_type', 'recall', 'accuracy', 'precision','specificity','f1', 'ROC-AUC', 'MCC', 'Balanced Accuracy',
             'TN', 'FN', 'FP', 'TP']

# Stack RF on top of grid-search RF, drop RF_type, select col_order2, then put
# the GP rows first. Row labels are not reset, so they repeat across sources.
df = pd.concat(
    [gp_df, pd.concat([rf_df, rfgs_df], axis=0).drop(columns=['RF_type'])[col_order2]],
    axis=0,
)

In [161]:
avg_dfs = []
neks = df['NEK'].unique()

# Metric columns, in the order their per-NEK averages are printed.
metric_columns = ['f1', 'MCC', 'Balanced Accuracy', 'ROC-AUC',
                  'precision', 'accuracy', 'specificity', 'recall']
# Summary-table column label -> source metric column, in table column order.
summary_columns = {'Recall': 'recall', 'Accuracy': 'accuracy', 'Precision': 'precision',
                   'Specificity': 'specificity', 'F1': 'f1', 'MCC': 'MCC',
                   'Balanced Accuracy': 'Balanced Accuracy', 'ROC-AUC': 'ROC-AUC'}

# Build one table of per-NEK metric averages for each model strategy,
# appended to avg_dfs in the order the strategies first appear in df.
for model in df['model strategy'].unique():
    this_df = df[df['model strategy'] == model]
    # Print the loop value directly: looking it up with .loc[0] only works
    # while every source frame contributes exactly one row labelled 0.
    print(model)
    averages = {metric: average_metric(this_df, metric) for metric in metric_columns}
    avg_df = pd.DataFrame({label: averages[source] for label, source in summary_columns.items()})

    avg_dfs.append(avg_df)
    

GP
NEK2_binding average f1 score: 0.21
NEK2_inhibition average f1 score: 0.43
NEK3_binding average f1 score: 0.15
NEK5_binding average f1 score: 0.34
NEK9_binding average f1 score: 0.07
NEK9_inhibition average f1 score: 0.48

NEK2_binding average MCC score: 0.24
NEK2_inhibition average MCC score: 0.43
NEK3_binding average MCC score: 0.13
NEK5_binding average MCC score: 0.33
NEK9_binding average MCC score: 0.07
NEK9_inhibition average MCC score: 0.44

NEK2_binding average Balanced Accuracy score: 0.59
NEK2_inhibition average Balanced Accuracy score: 0.69
NEK3_binding average Balanced Accuracy score: 0.56
NEK5_binding average Balanced Accuracy score: 0.64
NEK9_binding average Balanced Accuracy score: 0.52
NEK9_inhibition average Balanced Accuracy score: 0.70

NEK2_binding average ROC-AUC score: 0.59
NEK2_inhibition average ROC-AUC score: 0.69
NEK3_binding average ROC-AUC score: 0.56
NEK5_binding average ROC-AUC score: 0.64
NEK9_binding average ROC-AUC score: 0.52
NEK9_inhibition average 

In [162]:
# Display the distinct 'NEK' labels taken from the combined results frame.
neks

array(['NEK2_binding', 'NEK2_inhibition', 'NEK3_binding', 'NEK5_binding',
       'NEK9_binding', 'NEK9_inhibition'], dtype=object)

In [163]:
# Label the rows of every per-model averages table with the NEK names.
for avg_table in avg_dfs:
    avg_table.index = neks

In [164]:
# Pick the per-model tables out of avg_dfs by position; the positions follow
# the order in which the model strategies first appear in the combined frame.
avg_GP, avg_RF, avg_RFGS = avg_dfs[0], avg_dfs[1], avg_dfs[2]


In [165]:
# Write the avg_GP table to an Excel file; the relative path resolves against
# the current working directory.
avg_GP.to_excel('average_NEK_GP.xlsx')

In [166]:
# Write the avg_RF table to an Excel file; the relative path resolves against
# the current working directory.
avg_RF.to_excel('average_NEK_RF.xlsx')

In [167]:
# Write the avg_RFGS table to an Excel file; the relative path resolves against
# the current working directory.
avg_RFGS.to_excel('average_NEK_RFGS.xlsx')