In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import os
import sklearn
from sklearn.metrics import precision_score, recall_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix, f1_score

from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, ClusterCentroids
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
import sys

sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
from RF_atomver import *

# adding CM and prediction types to all GP dataframes 

In [4]:
metric_cols = ['model',  'accuracy', 'precision', 'recall', 'specificity',
       'TN', 'FN', 'FP', 'TP', 'cm','ROC-AUC','MCC','Balanced Accuracy','f1']
def all_GP_metrics(file_path, filename): 
    df = pd.read_csv(file_path+filename)
    
    df['ROC-AUC'] = roc_auc_score(df['y'], df['y_pred'])
    df['MCC'] = matthews_corrcoef(df['y'], df['y_pred'])
    df['Balanced Accuracy']=balanced_accuracy_score(df['y'], df['y_pred'])
    df['f1']=f1_score(df['y'], df['y_pred'])
    df.to_csv(file_path+ filename, index=False)
    results = df.iloc[[0]]
    results[metric_cols]
    results = results[metric_cols].values
    # model_names = []
    # acc =[] 
    # prec = [] 
    # recall = [] 
    # spec = [] 
    # rocauc = [] 
    # tn = []
    # fn = [] 
    # fp = []
    # train_tp = [] 
    return results
    
    
    

In [5]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
result_path  = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/'
nek_nums = [2,3,5,9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
features = ['moe', 'mfp']
NEK= 'NEK'
train_results = [] 
test_results = [] 
for i, n in enumerate(nek_nums):
    nek = str(n)
    # print(f'NEK{nek}')
    nek_path= f'{data_dir}NEK{nek}/bind/'
    for k, feat in enumerate(features): 
        print()
        for j, samp in enumerate(samplings): 
        
            print(f'NEK{nek} bind {feat} {samp}')
            file_root = f'NEK{nek}_binding_{feat}_{samp}'
       
            train = all_GP_metrics(result_path, file_root+'_train_GP.csv') 
            test = all_GP_metrics(result_path, file_root+'_test_GP.csv') 

            train_results.append(train)
            test_results.append(test) 
            
            
        
            print()
            
            
    # print()
    
    if n == 2 or n == 9:

        nek_path= f'{data_dir}NEK{nek}/inhib/'
        for k, feat in enumerate(features): 
            print()
            for j, samp in enumerate(samplings): 
                file_root = f'NEK{nek}_inhibition_{feat}_{samp}'
                print(f'NEK{nek} inbhib{feat} {samp}')
                train = all_GP_metrics(result_path, file_root+'_train_GP.csv') 
                test = all_GP_metrics(result_path, file_root+'_test_GP.csv') 
                train_results.append(train)
                test_results.append(test) 
    
        
            
    print()


NEK2 bind moe scaled

NEK2 bind moe UNDER

NEK2 bind moe SMOTE

NEK2 bind moe ADASYN


NEK2 bind mfp scaled

NEK2 bind mfp UNDER

NEK2 bind mfp SMOTE

NEK2 bind mfp ADASYN


NEK2 inbhibmoe scaled
NEK2 inbhibmoe UNDER
NEK2 inbhibmoe SMOTE
NEK2 inbhibmoe ADASYN

NEK2 inbhibmfp scaled
NEK2 inbhibmfp UNDER
NEK2 inbhibmfp SMOTE
NEK2 inbhibmfp ADASYN


NEK3 bind moe scaled

NEK3 bind moe UNDER

NEK3 bind moe SMOTE

NEK3 bind moe ADASYN


NEK3 bind mfp scaled

NEK3 bind mfp UNDER

NEK3 bind mfp SMOTE

NEK3 bind mfp ADASYN



NEK5 bind moe scaled

NEK5 bind moe UNDER

NEK5 bind moe SMOTE

NEK5 bind moe ADASYN


NEK5 bind mfp scaled

NEK5 bind mfp UNDER

NEK5 bind mfp SMOTE

NEK5 bind mfp ADASYN



NEK9 bind moe scaled

NEK9 bind moe UNDER

NEK9 bind moe SMOTE

NEK9 bind moe ADASYN


NEK9 bind mfp scaled

NEK9 bind mfp UNDER

NEK9 bind mfp SMOTE

NEK9 bind mfp ADASYN


NEK9 inbhibmoe scaled
NEK9 inbhibmoe UNDER
NEK9 inbhibmoe SMOTE
NEK9 inbhibmoe ADASYN

NEK9 inbhibmfp scaled
NEK9 inbhibmfp UN

In [6]:
train_results = np.squeeze(train_results, axis=1)
train_results

train_df =  pd.DataFrame(train_results,columns=metric_cols)
train_df

Unnamed: 0,model,accuracy,precision,recall,specificity,TN,FN,FP,TP,cm,ROC-AUC,MCC,Balanced Accuracy,f1
0,NEK2_binding_moe_scaled_GP,0.96,0.0,0.0,1.0,1080,45,0,0,"[1080, 0, 45, 0]",0.5,0.0,0.5,0.0
1,NEK2_binding_moe_UNDER_GP,1.0,1.0,1.0,1.0,45,0,0,45,"[45, 0, 0, 45]",1.0,1.0,1.0,1.0
2,NEK2_binding_moe_SMOTE_GP,0.999074,0.998152,1.0,0.998148,1078,0,2,1080,"[1078, 2, 0, 1080]",0.999074,0.99815,0.999074,0.999075
3,NEK2_binding_moe_ADASYN_GP,0.99861,0.997225,1.0,0.997222,1077,0,3,1078,"[1077, 3, 0, 1078]",0.998611,0.997224,0.998611,0.99861
4,NEK2_binding_mfp_scaled_GP,0.96,0.0,0.0,1.0,1080,45,0,0,"[1080, 0, 45, 0]",0.5,0.0,0.5,0.0
5,NEK2_binding_mfp_UNDER_GP,1.0,1.0,1.0,1.0,45,0,0,45,"[45, 0, 0, 45]",1.0,1.0,1.0,1.0
6,NEK2_binding_mfp_SMOTE_GP,0.993056,0.999063,0.987037,0.999074,1079,14,1,1066,"[1079, 1, 14, 1066]",0.993056,0.986183,0.993056,0.993014
7,NEK2_binding_mfp_ADASYN_GP,0.994004,0.999071,0.988971,0.999074,1079,12,1,1076,"[1079, 1, 12, 1076]",0.994022,0.988059,0.994022,0.993995
8,NEK2_inhibition_moe_scaled_GP,0.933333,1.0,0.026786,1.0,1523,109,0,3,"[1523, 0, 109, 3]",0.513393,0.158104,0.513393,0.052174
9,NEK2_inhibition_moe_UNDER_GP,1.0,1.0,1.0,1.0,112,0,0,112,"[112, 0, 0, 112]",1.0,1.0,1.0,1.0


In [7]:
test_results = np.squeeze(test_results, axis=1)
test_results

test_df =  pd.DataFrame(test_results,columns=metric_cols)
test_df

Unnamed: 0,model,accuracy,precision,recall,specificity,TN,FN,FP,TP,cm,ROC-AUC,MCC,Balanced Accuracy,f1
0,NEK2_binding_moe_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0
1,NEK2_binding_moe_UNDER_GP,0.738516,0.102564,0.666667,0.741697,201,4,70,8,"[201, 70, 4, 8]",0.704182,0.184161,0.704182,0.177778
2,NEK2_binding_moe_SMOTE_GP,0.968198,0.8,0.333333,0.99631,270,8,1,4,"[270, 1, 8, 4]",0.664822,0.504211,0.664822,0.470588
3,NEK2_binding_moe_ADASYN_GP,0.961131,0.571429,0.333333,0.98893,268,8,3,4,"[268, 3, 8, 4]",0.661132,0.418102,0.661132,0.421053
4,NEK2_binding_mfp_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0
5,NEK2_binding_mfp_UNDER_GP,0.954064,0.0,0.0,0.99631,270,12,1,0,"[270, 1, 12, 0]",0.498155,-0.012531,0.498155,0.0
6,NEK2_binding_mfp_SMOTE_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714
7,NEK2_binding_mfp_ADASYN_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714
8,NEK2_inhibition_moe_scaled_GP,0.93154,0.0,0.0,1.0,381,28,0,0,"[381, 0, 28, 0]",0.5,0.0,0.5,0.0
9,NEK2_inhibition_moe_UNDER_GP,0.865526,0.309859,0.785714,0.871391,332,6,49,22,"[332, 49, 6, 22]",0.828553,0.438116,0.828553,0.444444


In [8]:
train_df.to_csv('GP_train_results_all_NEK.csv',index=False)
test_df.to_csv('GP_test_results_all_NEK.csv',index=False)

# Try to select the max and min for each of the metrics and display that model
For all models together

In [9]:
# max_cols = ['accuracy', 'precision', 'recall', 'specificity',
#        'TN', 'FN', 'FP', 'TP', 'ROC-AUC']
max_cols = ['accuracy', 'precision', 'recall', 'specificity',
       'TN',  'TP', 'ROC-AUC', 'MCC', 'Balanced Accuracy','f1']
min_cols = ['FN', 'FP']
max_rows = test_df.groupby('model')[max_cols].max()

max_rows

Unnamed: 0_level_0,accuracy,precision,recall,specificity,TN,TP,ROC-AUC,MCC,Balanced Accuracy,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NEK2_binding_mfp_ADASYN_GP,0.964664,1.0,0.166667,1.0,271,2,0.583333,0.400918,0.583333,0.285714
NEK2_binding_mfp_SMOTE_GP,0.964664,1.0,0.166667,1.0,271,2,0.583333,0.400918,0.583333,0.285714
NEK2_binding_mfp_UNDER_GP,0.954064,0.0,0.0,0.99631,270,0,0.498155,-0.012531,0.498155,0.0
NEK2_binding_mfp_scaled_GP,0.957597,0.0,0.0,1.0,271,0,0.5,0.0,0.5,0.0
NEK2_binding_moe_ADASYN_GP,0.961131,0.571429,0.333333,0.98893,268,4,0.661132,0.418102,0.661132,0.421053
NEK2_binding_moe_SMOTE_GP,0.968198,0.8,0.333333,0.99631,270,4,0.664822,0.504211,0.664822,0.470588
NEK2_binding_moe_UNDER_GP,0.738516,0.102564,0.666667,0.741697,201,8,0.704182,0.184161,0.704182,0.177778
NEK2_binding_moe_scaled_GP,0.957597,0.0,0.0,1.0,271,0,0.5,0.0,0.5,0.0
NEK2_inhibition_mfp_ADASYN_GP,0.96577,0.888889,0.571429,0.994751,379,16,0.78309,0.697061,0.78309,0.695652
NEK2_inhibition_mfp_SMOTE_GP,0.968215,0.894737,0.607143,0.994751,379,17,0.800947,0.722191,0.800947,0.723404


In [10]:
min_cols = ['FN', 'FP']
min_rows = test_df.groupby('model')[min_cols].max()
min_rows

Unnamed: 0_level_0,FN,FP
model,Unnamed: 1_level_1,Unnamed: 2_level_1
NEK2_binding_mfp_ADASYN_GP,10,0
NEK2_binding_mfp_SMOTE_GP,10,0
NEK2_binding_mfp_UNDER_GP,12,1
NEK2_binding_mfp_scaled_GP,12,0
NEK2_binding_moe_ADASYN_GP,8,3
NEK2_binding_moe_SMOTE_GP,8,1
NEK2_binding_moe_UNDER_GP,4,70
NEK2_binding_moe_scaled_GP,12,0
NEK2_inhibition_mfp_ADASYN_GP,12,2
NEK2_inhibition_mfp_SMOTE_GP,11,2


# Displaying the max metric more clearly ? 
for all models together

In [11]:
for i, metric in enumerate(max_cols): 
    max_metric = test_df.loc[test_df[metric].idxmax()]
    print(f'{metric}:')
    display(max_metric)
    print()
print()
print('\nLowest FN and FP\n')
for i, metric in enumerate(min_cols): 
    min_metric = test_df.loc[test_df[metric].idxmax()]
    print(f'{metric}:')
    display(min_metric)
    print()

TypeError: reduction operation 'argmax' not allowed for this dtype

# Adding a 'NEK#' column 
so it's easier to compare across one type of nek

In [12]:
new_test = test_df
new_test['NEK'] = new_test['model'].str[0:4]
new_test

Unnamed: 0,model,accuracy,precision,recall,specificity,TN,FN,FP,TP,cm,ROC-AUC,MCC,Balanced Accuracy,f1,NEK
0,NEK2_binding_moe_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2
1,NEK2_binding_moe_UNDER_GP,0.738516,0.102564,0.666667,0.741697,201,4,70,8,"[201, 70, 4, 8]",0.704182,0.184161,0.704182,0.177778,NEK2
2,NEK2_binding_moe_SMOTE_GP,0.968198,0.8,0.333333,0.99631,270,8,1,4,"[270, 1, 8, 4]",0.664822,0.504211,0.664822,0.470588,NEK2
3,NEK2_binding_moe_ADASYN_GP,0.961131,0.571429,0.333333,0.98893,268,8,3,4,"[268, 3, 8, 4]",0.661132,0.418102,0.661132,0.421053,NEK2
4,NEK2_binding_mfp_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2
5,NEK2_binding_mfp_UNDER_GP,0.954064,0.0,0.0,0.99631,270,12,1,0,"[270, 1, 12, 0]",0.498155,-0.012531,0.498155,0.0,NEK2
6,NEK2_binding_mfp_SMOTE_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2
7,NEK2_binding_mfp_ADASYN_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2
8,NEK2_inhibition_moe_scaled_GP,0.93154,0.0,0.0,1.0,381,28,0,0,"[381, 0, 28, 0]",0.5,0.0,0.5,0.0,NEK2
9,NEK2_inhibition_moe_UNDER_GP,0.865526,0.309859,0.785714,0.871391,332,6,49,22,"[332, 49, 6, 22]",0.828553,0.438116,0.828553,0.444444,NEK2


In [13]:
neks = ['NEK2', 'NEK3', 'NEK5', 'NEK9']
for n, nek in enumerate(neks): 
    nek_df = new_test[new_test['NEK'] ==nek]
    print(nek)
    for i, metric in enumerate(max_cols): 
        max_metric = new_test.loc[new_test[metric].idxmax()]
        max_model_name = max_metric['model'] 
        print(f'{metric}:')
        # display(max_metric)
        print()
    print()
    print('\nLowest FN and FP\n')
    for j, metric in enumerate(min_cols): 
        min_metric = new_test.loc[new_test[metric].idxmax()]
        print(f'{metric}:')
        display(min_metric)
        print()


NEK2


TypeError: reduction operation 'argmax' not allowed for this dtype

In [21]:
df = pd.read_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/GP_test_results_all_NEK.csv')
df['NEK'] = df['model'].str[0:4]+'_'+train_df['model'].str.extract(r'_(.*?)_', expand=False)

df['strategy'] = df['model'].str[-10:-1].str.extract(r'_(.*?)_', expand=False)

df['feat_type'] =df['model'].str[-15:-1].str.extract(r'_(.*?)_', expand=False)
df['model_type'] = 'GP'
df.head(5)
df.to_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/GP_test_results_all_NEK.csv', index=False)



In [22]:
df.columns

Index(['model', 'accuracy', 'precision', 'recall', 'specificity', 'TN', 'FN',
       'FP', 'TP', 'cm', 'ROC-AUC', 'MCC', 'Balanced Accuracy', 'f1', 'NEK',
       'strategy', 'feat_type', 'model_type'],
      dtype='object')

In [16]:
list1 = list(df.columns)
list2 = ['recall', 'precision']
list3 = [m for m in list1 if m not in list2]
# list3

In [39]:
def rank(df, compare_type, metrics): 
    # metrics = ['recall', 'precision']
    # compare_type = 'NEK'

    if not isinstance(compare_type, list):
        compare_type = [compare_type]
    grouped = df.groupby(compare_type, group_keys=False).apply(lambda group: group.sort_values(by=metrics, ascending=[False]*len(metrics))) 
        # can do any lsit of metrics 
    cols = list(df.columns) 
    # keep_cols = ['model','model_type','NEK', 'feat_type','cm']
    keep_cols = ['model','cm']
    keep_cols.append(compare_type)
    keep_cols.extend(compare_type)
    keep_cols.append(metrics) 
    
    keep_cols.extend(metrics)
    # other_cols = [col for col in cols if col not in keep_cols]
    # print(cols)
    # print(other_cols)
    
    # ranked_df = grouped.reset_index(drop=True).drop(columns=other_cols)
    ranked_df = grouped.reset_index(drop=True)
    ranked_df
    return ranked_df

In [40]:
print(df['strategy'].dtype)

object


In [41]:
metrics = ['recall', 'precision','f1']
compare_types = ['NEK', 'strategy']

ranked_df = rank(df, compare_types, metrics)  
ranked_df 

Unnamed: 0,model,accuracy,precision,recall,specificity,TN,FN,FP,TP,cm,ROC-AUC,MCC,Balanced Accuracy,f1,NEK,strategy,feat_type,model_type
0,NEK2_binding_moe_ADASYN_GP,0.961131,0.571429,0.333333,0.98893,268,8,3,4,"[268, 3, 8, 4]",0.661132,0.418102,0.661132,0.421053,NEK2_binding,ADASYN,moe,GP
1,NEK2_binding_mfp_ADASYN_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2_binding,ADASYN,mfp,GP
2,NEK2_binding_moe_SMOTE_GP,0.968198,0.8,0.333333,0.99631,270,8,1,4,"[270, 1, 8, 4]",0.664822,0.504211,0.664822,0.470588,NEK2_binding,SMOTE,moe,GP
3,NEK2_binding_mfp_SMOTE_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2_binding,SMOTE,mfp,GP
4,NEK2_binding_moe_UNDER_GP,0.738516,0.102564,0.666667,0.741697,201,4,70,8,"[201, 70, 4, 8]",0.704182,0.184161,0.704182,0.177778,NEK2_binding,UNDER,moe,GP
5,NEK2_binding_mfp_UNDER_GP,0.954064,0.0,0.0,0.99631,270,12,1,0,"[270, 1, 12, 0]",0.498155,-0.012531,0.498155,0.0,NEK2_binding,UNDER,mfp,GP
6,NEK2_binding_moe_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2_binding,scaled,moe,GP
7,NEK2_binding_mfp_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2_binding,scaled,mfp,GP
8,NEK2_inhibition_mfp_ADASYN_GP,0.96577,0.888889,0.571429,0.994751,379,12,2,16,"[379, 2, 12, 16]",0.78309,0.697061,0.78309,0.695652,NEK2_inhibition,ADASYN,mfp,GP
9,NEK2_inhibition_moe_ADASYN_GP,0.9511,0.681818,0.535714,0.981627,374,13,7,15,"[374, 7, 13, 15]",0.758671,0.579098,0.758671,0.6,NEK2_inhibition,ADASYN,moe,GP


In [48]:
nek2_bind = df[df['NEK'] == 'NEK2_binding']

compare_type =['feat_type']
metrics=['recall', 'ROC-AUC', 'f1']

nek2_ranked = rank(nek2_bind, compare_type,metrics)  
nek2_ranked

Unnamed: 0,model,accuracy,precision,recall,specificity,TN,FN,FP,TP,cm,ROC-AUC,MCC,Balanced Accuracy,f1,NEK,strategy,feat_type,model_type
0,NEK2_binding_mfp_SMOTE_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2_binding,SMOTE,mfp,GP
1,NEK2_binding_mfp_ADASYN_GP,0.964664,1.0,0.166667,1.0,271,10,0,2,"[271, 0, 10, 2]",0.583333,0.400918,0.583333,0.285714,NEK2_binding,ADASYN,mfp,GP
2,NEK2_binding_mfp_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2_binding,scaled,mfp,GP
3,NEK2_binding_mfp_UNDER_GP,0.954064,0.0,0.0,0.99631,270,12,1,0,"[270, 1, 12, 0]",0.498155,-0.012531,0.498155,0.0,NEK2_binding,UNDER,mfp,GP
4,NEK2_binding_moe_UNDER_GP,0.738516,0.102564,0.666667,0.741697,201,4,70,8,"[201, 70, 4, 8]",0.704182,0.184161,0.704182,0.177778,NEK2_binding,UNDER,moe,GP
5,NEK2_binding_moe_SMOTE_GP,0.968198,0.8,0.333333,0.99631,270,8,1,4,"[270, 1, 8, 4]",0.664822,0.504211,0.664822,0.470588,NEK2_binding,SMOTE,moe,GP
6,NEK2_binding_moe_ADASYN_GP,0.961131,0.571429,0.333333,0.98893,268,8,3,4,"[268, 3, 8, 4]",0.661132,0.418102,0.661132,0.421053,NEK2_binding,ADASYN,moe,GP
7,NEK2_binding_moe_scaled_GP,0.957597,0.0,0.0,1.0,271,12,0,0,"[271, 0, 12, 0]",0.5,0.0,0.5,0.0,NEK2_binding,scaled,moe,GP


In [45]:
# df.columns

In [3]:
# def add_cm(filepath, filename): 
#     # print(filepath)
#     # print(filename)
#     df = pd.read_csv(filepath+filename)
    
#     true_labels = df['y'] 
#     predictions = df['y_pred']
#     cm = confusion_matrix(true_labels, predictions )
#     cm_flattened = cm.flatten().tolist()
#     df['cm'] = [cm_flattened]* len(df)
#     df['prediction_type'] = df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)
#     return df
        

In [10]:
# df = pd.read_csv(result_path+ 'NEK2_binding_moe_scaled_test_GP.csv')
# true_labels = df['y'] 
# predictions = df['y_pred']
# cm = confusion_matrix(true_labels, predictions )
# df_copy = df.copy()
# cm_flattened = cm.flatten().tolist()
# cm_flattened
# # df_copy['cm'] =[(x,) for x in list(cm)]
# df_copy['cm'] = [cm_flattened]* len(df_copy)
# df_copy['prediction_type'] = df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)

In [5]:
# data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
# gp_result_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/'
# # capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/NEK2_binding_mfp_ADASYN_test_GP.csv
# nek_nums = [2,3,5,9]
# samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
# features = ['moe', 'mfp']
# NEK= 'NEK'
# for i, n in enumerate(nek_nums):
#     nek = str(n)
#     print(f'NEK{nek}')
#     nek_path= f'{data_dir}NEK{nek}/bind/'
    
#     for k, feat in enumerate(features): 
#         print()
#         for j, samp in enumerate(samplings): 
        
#             print(f'NEK{nek} {feat} {samp}')
#             file_root = f'NEK{nek}_binding_{feat}_{samp}'
#             train_file = f'{file_root}_train_GP.csv'
#             test_file = f'{file_root}_test_GP.csv'

#             train_df = add_cm(gp_result_path,train_file)
#             test_df = add_cm(gp_result_path,test_file)
#             train_df.to_csv(f'{gp_result_path}{file_root}_train_GP.csv',index=False) 
#             test_df.to_csv(f'{gp_result_path}{file_root}_test_GP.csv',index=False) 
            
#             print()
            
            
#     print()
    
#     if n == 2 or n == 9:

#         nek_path= f'{data_dir}NEK{nek}/inhib/'
#         for k, feat in enumerate(features): 
#             print()
#             for j, samp in enumerate(samplings): 
#                 file_root = f'NEK{nek}_inhibition_{feat}_{samp}'
#                 print(f'NEK{nek} {feat} {samp}')
#                 print(f'NEK{nek} {feat} {samp}')
#                 file_root = f'NEK{nek}_inhibition_{feat}_{samp}'
#                 train_file = f'{file_root}_train_GP.csv'
#                 test_file = f'{file_root}_test_GP.csv'
    
#                 train_df = add_cm(gp_result_path,train_file)
#                 test_df = add_cm(gp_result_path,test_file)
#                 train_df.to_csv(f'{gp_result_path}{file_root}_train_GP.csv',index=False) 
#                 test_df.to_csv(f'{gp_result_path}{file_root}_test_GP.csv',index=False) 
#                 print()
        
            
#     print()

NEK2

NEK2 moe scaled
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_scaled_train_GP.csv
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_scaled_test_GP.csv

NEK2 moe UNDER
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_UNDER_train_GP.csv
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_UNDER_test_GP.csv

NEK2 moe SMOTE
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_SMOTE_train_GP.csv
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_SMOTE_test_GP.csv

NEK2 moe ADASYN
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_ADASYN_train_GP.csv
/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/
NEK2_binding_moe_ADASYN_test_GP.csv


NEK2 mf

In [8]:
# test = pd.read_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_results/NEK9_binding_mfp_ADASYN_test_GP.csv')