In [1]:
# Third-party numerical, data-frame and plotting libraries.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Classification metrics; add_cm2 below uses confusion_matrix, f1_score,
# roc_auc_score, matthews_corrcoef and balanced_accuracy_score.
from sklearn.metrics import accuracy_score, balanced_accuracy_score,precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, recall_score, confusion_matrix,matthews_corrcoef
import sys 
import pickle

from imblearn.under_sampling import RandomUnderSampler
# The module search path is extended before the two star-imports below.
# NOTE(review): presumably RF_atomver and VisUtils live in this directory --
# confirm. The path is absolute and machine-specific, so the notebook will not
# import these modules on another machine without editing it.
sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
# Star-imports: names used later in this notebook without a local definition
# (rf_models, gather_rf_results, prediction_type) are expected to come from
# one of these two modules -- TODO confirm which.
from RF_atomver import *
from VisUtils import *

In [2]:
def add_cm2(df):
    """Add confusion-matrix and summary-metric columns to a results frame.

    Reads the ``'y'`` (true label) and ``'prediction'`` columns of ``df`` and
    writes the following columns onto the same frame, in this order:

    * ``'cm'`` -- the flattened confusion matrix as a list, repeated on every
      row (for a two-class problem sklearn's layout flattens to
      ``[TN, FP, FN, TP]``).
    * ``'prediction_type'`` -- per-row result of ``prediction_type(y, prediction)``.
    * ``'f1'``, ``'ROC-AUC'``, ``'MCC'``, ``'Balanced Accuracy'`` -- one scalar
      each, broadcast to every row.

    The frame is modified in place and also returned.

    NOTE(review): ROC-AUC is computed from the ``'prediction'`` column. If that
    column holds hard class labels rather than scores/probabilities, the value
    is a single-threshold AUC -- confirm this is intended.
    """
    y_true = df['y']
    y_pred = df['prediction']

    # One shared list object is placed in every row, exactly like `[x] * n`.
    flat_cm = confusion_matrix(y_true, y_pred).flatten().tolist()
    df['cm'] = [flat_cm] * len(df)

    df['prediction_type'] = df.apply(
        lambda row: prediction_type(row['y'], row['prediction']),
        axis=1,
    )

    # Dict order fixes the order in which the metric columns are appended.
    scalar_metrics = {
        'f1': f1_score,
        'ROC-AUC': roc_auc_score,
        'MCC': matthews_corrcoef,
        'Balanced Accuracy': balanced_accuracy_score,
    }
    for column_name, metric_fn in scalar_metrics.items():
        df[column_name] = metric_fn(df['y'], df['prediction'])

    return df


In [7]:
# --- Configuration ---------------------------------------------------------
# Identifier / label columns that are not model features. Every one of them
# must be present in each input CSV (DataFrame.drop raises otherwise).
drop_cols = ['subset', 'compound_id', 'base_rdkit_smiles', 'fold', 'active']
# Extra non-feature columns that are removed only when a CSV actually has them.
# NOTE(review): these look like duplicated 'active' / 'subset' headers that
# pandas suffixed with '.1' on read -- confirm against the input files.
optional_drop_cols = ['active.1', 'subset.1']
# NOTE(review): absolute, machine-specific directory; inputs are read from it
# and models / result CSVs are written back into it.
this_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2_jp/'
RF_type = 'RF'
neks = ['2', '3', '5', '9']


def _label_results(results, subset_name, model_name, nek, bi, feat, rf_type):
    """Tag one gather_rf_results frame with its metadata and add_cm2 metrics.

    Column order matches the original cell: 'subset', 'model', the columns
    added by add_cm2, then 'NEK', 'feat_type', 'strategy', 'RF_type'.
    """
    results['subset'] = subset_name
    results['model'] = model_name
    results = add_cm2(results)
    results['NEK'] = f'NEK{nek}_{bi}'
    results['feat_type'] = feat
    results['strategy'] = 'UNDER'
    results['RF_type'] = rf_type
    return results


# --- Train one model per (NEK, assay, feature set) ---------------------------
for nek in neks:
    # NEK2 and NEK9 are processed for both assays, the others for binding only.
    bind_inhib = ['binding', 'inhibition'] if nek in ['2', '9'] else ['binding']
    for bi in bind_inhib:
        for feat in ['moe', 'mfp']:
            print(f'NEK{nek} {bi} {feat}')

            df = pd.read_csv(f'{this_dir}NEK{nek}_{bi}_{feat}_UNDER_batch2.csv')

            # Fix: the original tested `'active.1' in drop_cols` (the list
            # itself), which is never true, so these columns would have been
            # left in the feature matrix. Test the frame's columns instead and
            # build a per-file list so the shared `drop_cols` is never mutated.
            file_drop_cols = drop_cols + [
                col for col in optional_drop_cols if col in df.columns
            ]

            train = df[df['subset'] == 'train']
            test = df[df['subset'] == 'test']

            trainX = train.drop(columns=file_drop_cols).to_numpy()
            train_y = train['active'].to_numpy()

            testX = test.drop(columns=file_drop_cols).to_numpy()
            test_y = test['active'].to_numpy()

            # NOTE(review): an empty parameter dict is passed; confirm that
            # rf_models fixes a random seed so reruns are reproducible.
            model = rf_models(trainX, train_y, testX, test_y, RF_type, {})
            train_df = gather_rf_results(model, trainX, train_y)
            test_df = gather_rf_results(model, testX, test_y)

            model_name = f'NEK{nek}_{bi}_{feat}_UNDER_RF_batch2'
            train_df = _label_results(train_df, 'train', model_name, nek, bi, feat, RF_type)
            test_df = _label_results(test_df, 'test', model_name, nek, bi, feat, RF_type)

            # Persist the fitted model next to its per-sample result tables.
            with open(f'{this_dir}{model_name}.pkl', 'wb') as f:
                pickle.dump(model, f)

            train_df.to_csv(f'{this_dir}{model_name}_train_results.csv', index=False)
            test_df.to_csv(f'{this_dir}{model_name}_test_results.csv', index=False)
            
            

NEK2 binding moe
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.618, precision: 0.079, recall: 0.750, specificity: 0.613
NEK2 binding mfp
TRAIN: accuracy: 0.978, precision: 1.000, recall: 0.956, specificity: 1.000
TEST: accuracy: 0.403, precision: 0.056, recall: 0.833, specificity: 0.384
NEK2 inhibition moe
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.743, precision: 0.171, recall: 0.714, specificity: 0.745
NEK2 inhibition mfp
TRAIN: accuracy: 0.946, precision: 1.000, recall: 0.893, specificity: 1.000
TEST: accuracy: 0.451, precision: 0.062, recall: 0.500, specificity: 0.447
NEK3 binding moe
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.571, precision: 0.081, recall: 0.588, specificity: 0.570
NEK3 binding mfp
TRAIN: accuracy: 0.969, precision: 1.000, recall: 0.938, specificity: 1.000
TEST: accuracy: 0.489, precision: 0.050, recall: 0.412, speci

In [9]:
# Columns copied from each per-model test-results CSV into the summary table.
# NOTE(review): 'prediction_type' is computed per sample by add_cm2, so the
# value kept here is only that of the first test row -- confirm it is wanted.
metric_cols = ['accuracy', 'precision', 'recall', 'specificity','TN', 'FN', 'FP', 'TP','f1', 'ROC-AUC', 'MCC', 'Balanced Accuracy',
       'model', 'cm', 'prediction_type', 'NEK', 'feat_type', 'strategy','RF_type']

# One single-row frame per model. Only the first row of each file is read;
# add_cm2 writes its scalar metrics identically on every row.
# Keeping the rows as DataFrames (instead of `.values.flatten()`) preserves
# column dtypes; the flattened mixed-type arrays made every column `object`.
results = []
for nek in neks:
    # Same NEK / assay grid as the training cell.
    bind_inhib = ['binding', 'inhibition'] if nek in ['2', '9'] else ['binding']
    for bi in bind_inhib:
        for feat in ['moe', 'mfp']:
            result_df = pd.read_csv(f'{this_dir}NEK{nek}_{bi}_{feat}_UNDER_RF_batch2_test_results.csv')
            results.append(result_df.iloc[[0]][metric_cols])
results_df = pd.concat(results, ignore_index=True)

# Normalise the 'scaled' label to 'raw' (the original repeated the 'strategy'
# replacement twice; once is sufficient).
results_df['model'] = results_df['model'].str.replace('scaled', 'raw')
results_df['strategy'] = results_df['strategy'].str.replace('scaled', 'raw')

# NOTE(review): the file name spells 'UDNER'; it is kept unchanged so any
# existing reader of this file keeps working -- rename deliberately if wanted.
results_df.to_csv(this_dir+'RF_results_UDNER_batch2.csv', index=False)

# NOTE(review): these two columns are added after the CSV is written, so they
# exist only on the in-memory frame, not in the saved file -- confirm intended.
results_df['modeling_type'] = 'RF'
results_df['set'] = 'UNDER_batch2'

In [13]:
# Load the reference summary table and narrow it in two steps:
# first to the 'original' set of RF models, then to the UNDER strategy with
# the plain 'RF' variant.
original_results = pd.read_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/general_NEK/final_paper_models.csv')

is_original_rf = (
    original_results['set'].eq('original')
    & original_results['modeling_type'].eq('RF')
)
original_results = original_results.loc[is_original_rf]

is_under_rf = (
    original_results['strategy'].eq('UNDER')
    & original_results['RF_type'].eq('RF')
)
only_under = original_results.loc[is_under_rf]

# Stack the new batch-2 rows on top of the matching original rows; the source
# indices are kept, so index labels may repeat across the two sources.
all_under_rf = pd.concat([results_df, only_under])

# Side-by-side view of recall / specificity per NEK and feature type.
comparison = (
    all_under_rf[['NEK', 'feat_type','set','cm', 'recall', 'specificity']]
    .sort_values(['NEK', 'feat_type'])
)
comparison

Unnamed: 0,NEK,feat_type,set,cm,recall,specificity
1,NEK2_binding,mfp,UNDER_batch2,"[104, 167, 2, 10]",0.833333,0.383764
68,NEK2_binding,mfp,original,"[153, 118, 4, 8]",0.666667,0.564576
0,NEK2_binding,moe,UNDER_batch2,"[166, 105, 3, 9]",0.75,0.612546
52,NEK2_binding,moe,original,"[191, 80, 6, 6]",0.5,0.704797
3,NEK2_inhibition,mfp,UNDER_batch2,"[170, 210, 14, 14]",0.5,0.447368
100,NEK2_inhibition,mfp,original,"[171, 209, 14, 14]",0.5,0.45
2,NEK2_inhibition,moe,UNDER_batch2,"[283, 97, 8, 20]",0.714286,0.744737
84,NEK2_inhibition,moe,original,"[305, 75, 4, 24]",0.857143,0.802632
5,NEK3_binding,mfp,UNDER_batch2,"[131, 134, 10, 7]",0.411765,0.49434
132,NEK3_binding,mfp,original,"[127, 138, 5, 12]",0.705882,0.479245
