In [4]:
import math
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import os
import pickle
import sklearn
import imblearn as imb
# print("imblearn version: ",imblearn.__version__)
from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import itertools

from scipy.stats import randint

from rdkit import Chem
import sys
sys.path.append('/Users/radhi/Desktop/GitHub/atom2024/atom2024/notebooks/')
from split_data import *


import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, balanced_accuracy_score,precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, recall_score, confusion_matrix,matthews_corrcoef
import sys 
import pickle

from imblearn.under_sampling import RandomUnderSampler

from RF_atomver import *
from VisUtils import *


In [5]:
# def move(files, dest):

#     if not os.path.exists(dest):
#         os.makedirs(dest)
#         print(f'{dest} created')
#     else: 
#         print(f'moving to {dest}')
#     for f in files:
#     #     try:
#         shutil.move(f, dest)
#         # except Exception as e:
#         #     print(f'error moving {f} -- {e}')
    

In [6]:
def scaled_df(file_path, file_name, test_fold='fold2'):
    """Standardize the feature columns of one dataset and tag rows as train/test.

    Rows whose ``fold`` equals ``test_fold`` become the test subset; every
    other row is training data. The ``StandardScaler`` is fitted on the
    training rows only and then applied to both subsets, so no test-set
    statistics leak into the training features.

    The column layout is positional (assumed from the slicing below -- TODO
    confirm against the input files): the first two columns are identifier
    columns, columns ``3:-1`` are the features, and the frame also contains
    ``base_rdkit_smiles``, ``fold`` and ``active`` columns.

    file_path: directory where the file is located
    file_name: should be NEK#_1_uM_min_50_pct_(binding/inhibition)_5fold_random_imbalanced.csv
    test_fold: value of the ``fold`` column that marks the held-out test rows
    returns: dataframe with the scaled features followed by the columns
        ``subset`` ('train'/'test'), the two identifier columns, ``fold`` and
        ``active``; training rows come first
    raises: ValueError if either the train or the test subset is empty
    """
    df_original = pd.read_csv(os.path.join(file_path, file_name))
    og_cols = df_original.columns.to_list()
    df_original = df_original.drop_duplicates(subset='base_rdkit_smiles')

    feature_cols = og_cols[3:-1]
    id_col_names = og_cols[0:2]

    is_test = df_original['fold'] == test_fold
    train_df = df_original[~is_test]
    test_df = df_original[is_test]
    if train_df.empty or test_df.empty:
        raise ValueError(
            f'{file_name}: empty split for test_fold={test_fold!r} '
            f'(train rows: {len(train_df)}, test rows: {len(test_df)})'
        )

    # Fit on the training rows only; fitting on train+test would let the
    # test-fold mean/variance influence the training features.
    scaling = StandardScaler()
    scaling.fit(train_df[feature_cols])
    trainX_scaled = scaling.transform(train_df[feature_cols])
    testX_scaled = scaling.transform(test_df[feature_cols])
    print(f'train X: {trainX_scaled.shape}, testX: {testX_scaled.shape}')

    def _assemble(part_df, scaled_values, subset_name):
        # Rebuild a frame from the scaled array and re-attach the metadata
        # columns; indices are reset so rows line up positionally.
        meta = part_df.reset_index(drop=True)
        out = pd.DataFrame(scaled_values, columns=feature_cols)
        out['subset'] = subset_name
        out[id_col_names] = meta[id_col_names]
        out['fold'] = meta['fold']
        out['active'] = meta['active']
        return out

    final_df = pd.concat(
        [_assemble(train_df, trainX_scaled, 'train'),
         _assemble(test_df, testX_scaled, 'test')],
        ignore_index=True,
    )
    return final_df
    
        

In [7]:
# Scale every NEK dataset and write one '<NEK>_<assay>_moe_scaled_df.csv' per assay.
# NOTE(review): the CSVs are written to the current working directory, while the
# undersampling cell reads them from .../undersampler_validation/idea1_dir/ --
# run the notebook from that directory or move the files there.
data_dir = '/Users/radhi/Desktop/CAPSTONE_DATA/original_datasets/scaled_descriptors'
nek_nums = [2,3,5,9]
for n in nek_nums:
    nek = str(n)
    print(f'NEK{nek}')
    # Trailing '' makes os.path.join end the directory with a path separator.
    nek_path = os.path.join(data_dir, nek, '')

    # Only NEK2 and NEK9 have an inhibition dataset in addition to binding.
    assays = ['binding', 'inhibition'] if n in (2, 9) else ['binding']
    for assay in assays:
        raw_file = f'NEK{nek}_1_uM_min_50_pct_{assay}_5fold_random_imbalanced.csv'
        scaled_file = f'NEK{nek}_{assay}_moe_scaled_df.csv'
        scaled_df(nek_path, raw_file).to_csv(scaled_file, index=False)
    print()
    


NEK2
train X: (1126, 306), testX: (282, 306)
train X: (1628, 306), testX: (404, 306)

NEK3
train X: (1123, 306), testX: (281, 306)

NEK5
train X: (989, 306), testX: (248, 306)

NEK9
train X: (1127, 306), testX: (282, 306)
train X: (314, 306), testX: (79, 306)



In [8]:
def undersample(file_path, filename):
    """Undersample the training rows of a scaled dataset with RandomUnderSampler.

    Only rows with ``subset == 'train'`` are resampled (``random_state=42``);
    rows with ``subset == 'test'`` are passed through unchanged. The last five
    columns of the input are treated as non-feature columns: the first four of
    them are carried along as ID/metadata columns and the last one is expected
    to be ``active`` (layout assumed from the slicing below -- TODO confirm).

    file_path: directory containing the file
    filename: the scaled dataframe created earlier, e.g. 'NEK#_binding_moe_scaled_df.csv'
    returns: dataframe with ID/metadata columns, feature columns, ``active``
        and ``subset``; resampled training rows first, then all test rows
    """
    df = pd.read_csv(os.path.join(file_path, filename))
    original_cols = df.columns.to_list()
    feature_names = original_cols[0:-5]
    id_col_names = original_cols[-5:-1]

    train_df = df[df['subset'] == 'train']
    test_df = df[df['subset'] == 'test']

    sampler = RandomUnderSampler(random_state=42)
    trainX_temp, trainy_temp = sampler.fit_resample(
        train_df[feature_names].to_numpy(),
        train_df['active'].to_numpy().reshape(-1),
    )

    trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_names)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])

    # sample_indices_ holds the positions (into the arrays passed to
    # fit_resample) of the rows the sampler kept. Indexing with the new
    # frame's RangeIndex instead would just take the first N training rows
    # and attach the wrong IDs to the resampled features.
    train_ids_resamp = (
        train_df[id_col_names].iloc[sampler.sample_indices_].reset_index(drop=True)
    )
    train_resamp = pd.concat([train_ids_resamp, trainX_resamp, trainy_resamp], axis=1)
    train_resamp['subset'] = 'train'

    test_df_final = pd.concat(
        [test_df[id_col_names].reset_index(drop=True),
         test_df[feature_names].reset_index(drop=True),
         test_df['active'].reset_index(drop=True)],
        axis=1,
    )
    test_df_final['subset'] = 'test'

    final_df = pd.concat([train_resamp, test_df_final]).reset_index(drop=True)
    return final_df
        

In [9]:
#filepath = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
# Undersample every scaled dataset found in nek_path and write the result back
# into the same directory, which is where the array-extraction cell looks for
# the '<NEK>_<assay>_moe_UNDER_df.csv' files.
nek_path = '/Users/radhi/Desktop/GitHub/atom2024/atom2024/notebooks/NEK/undersampler_validation/idea1_dir/'
nek_nums = [2,3,5,9]
for n in nek_nums:
    nek = str(n)
    print(f'NEK{nek}')

    # Only NEK2 and NEK9 have an inhibition dataset in addition to binding.
    assays = ['binding', 'inhibition'] if n in (2, 9) else ['binding']
    for assay in assays:
        scaled_file = f'NEK{nek}_{assay}_moe_scaled_df.csv'
        under_file = f'NEK{nek}_{assay}_moe_UNDER_df.csv'
        undersample(nek_path, scaled_file).to_csv(
            os.path.join(nek_path, under_file), index=False
        )
   

NEK2
NEK3
NEK5
NEK9


In [10]:

def get_data_arrays(file_path, df_filename, filename_type=None, save=False):
    """Split a saved dataframe into train/test feature and label arrays.

    The file ``file_path + df_filename`` is read, rows are split on the
    ``subset`` column ('train' / 'test'), the ``active`` column becomes the
    label vector and every remaining numeric column becomes a feature.

    file_path: directory prefix (concatenated directly with the file names)
    df_filename: dataframe NEK#_binding_moe_{sampling}_df.csv (sampling: scaled, UNDER, SMOTE, ADASYN)
    filename_type: prefix for the saved array files; nothing is saved when None
    save: bool, when True (and filename_type is given) write the four arrays to
        ``<file_path><filename_type>_trainX.csv``, ``_train_y.csv``,
        ``_testX.csv`` and ``_test_y.csv``
    returns: numpy arrays train X, train y, test X, test y
    """
    frame = pd.read_csv(file_path + df_filename)

    split_arrays = {}
    for part in ('train', 'test'):
        part_rows = frame[frame['subset'] == part]
        labels = part_rows['active'].to_numpy().reshape(-1)
        # Non-numeric columns (IDs, fold, subset) are dropped by select_dtypes.
        numeric_features = part_rows.drop(columns='active').select_dtypes(include='number')
        split_arrays[part] = (numeric_features.to_numpy(), labels)

    trainX, train_y = split_arrays['train']
    testX, test_y = split_arrays['test']

    print(f'train X shape: {trainX.shape}, y: {train_y.shape}, test X: {testX.shape}, y:{test_y.shape}')

    if save and filename_type is not None:
        out_prefix = file_path + filename_type
        outputs = (
            ('_trainX.csv', trainX),
            ('_train_y.csv', train_y),
            ('_testX.csv', testX),
            ('_test_y.csv', test_y),
        )
        for suffix, values in outputs:
            pd.DataFrame(values).to_csv(out_prefix + suffix, index=False)

    return trainX, train_y, testX, test_y

In [11]:
# Export train/test arrays for every NEK dataset and sampling strategy.
samplings = ['scaled', 'UNDER']

nek_nums = [2,3,5,9]
NEK= 'NEK'
file_path = '/Users/radhi/Desktop/GitHub/atom2024/atom2024/notebooks/NEK/undersampler_validation/idea1_dir/'
for n in nek_nums:
    nek = str(n)
    # Only NEK2 and NEK9 have an inhibition dataset in addition to binding.
    assays = ('binding', 'inhibition') if n in (2, 9) else ('binding',)

    for samp in samplings:
        print(f'NEK{nek} {samp}')
        nek_path = file_path
        for assay in assays:
            dataset_type = f'NEK{nek}_{assay}_moe_{samp}'
            get_data_arrays(nek_path, f'{dataset_type}_df.csv', dataset_type, save=True)
        print()

    print()
        

NEK2 scaled
train X shape: (1126, 306), y: (1126,), test X: (282, 306), y:(282,)
train X shape: (1628, 306), y: (1628,), test X: (404, 306), y:(404,)

NEK2 UNDER
train X shape: (90, 306), y: (90,), test X: (282, 306), y:(282,)
train X shape: (224, 306), y: (224,), test X: (404, 306), y:(404,)


NEK3 scaled
train X shape: (1123, 306), y: (1123,), test X: (281, 306), y:(281,)

NEK3 UNDER
train X shape: (130, 306), y: (130,), test X: (281, 306), y:(281,)


NEK5 scaled
train X shape: (989, 306), y: (989,), test X: (248, 306), y:(248,)

NEK5 UNDER
train X shape: (154, 306), y: (154,), test X: (248, 306), y:(248,)


NEK9 scaled
train X shape: (1127, 306), y: (1127,), test X: (282, 306), y:(282,)
train X shape: (314, 306), y: (314,), test X: (79, 306), y:(79,)

NEK9 UNDER
train X shape: (98, 306), y: (98,), test X: (282, 306), y:(282,)
train X shape: (66, 306), y: (66,), test X: (79, 306), y:(79,)




In [12]:
# original_cols = nek2scaled.columns.to_list() 
# # true_labels = nek2scaled['active'] 
# feature_names = original_cols[0:-5]
# # features = nek2scaled[feature_names] 
# other_cols = original_cols[-5:]
# id_col_names = other_cols[0:4]
# id_cols = nek2scaled[id_col_names]

# train_df = nek2scaled[nek2scaled['subset'] == 'train']
# test_df = nek2scaled[nek2scaled['subset'] == 'test']

# trainX = train_df[feature_names]
# testX = test_df[feature_names]
# trainy = train_df['active']
# testy = test_df['active']

# train_ids = train_df[id_col_names]
# test_ids = test_df[id_col_names]

# undersample = RandomUnderSampler(random_state=42)
# trainX_temp, trainy_temp = undersample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))

# trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_names)
# trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])

# train_ids_resamp = train_ids.iloc[trainX_resamp.index].reset_index(drop=True)
# train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
# train_resamp['subset'] = 'train'


# test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
# test_df_final['subset'] = 'test'


# final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)

In [13]:
# Train a default RF on the UNDER (undersampled) data using the new training split (folds 1, 3, 4, 5)

In [14]:
# NEK kinase numbers (as strings) looped over by the model-training and results cells.
neks = ['2','3','5','9']

def add_cm2(df):
    """Annotate a results dataframe with its confusion matrix and summary metrics.

    Expects columns ``y`` (true labels) and ``prediction`` (predicted labels).
    The frame is modified in place and also returned. Added columns:

    - ``cm``: the flattened confusion matrix, repeated on every row
    - ``prediction_type``: per-row value of ``prediction_type(y, prediction)``
      (helper resolved from outside this block -- presumably one of the star
      imports; TODO confirm)
    - ``f1``, ``ROC-AUC``, ``MCC``, ``Balanced Accuracy``: dataset-level scores
      broadcast to every row

    Note: ROC-AUC is computed from the ``prediction`` column, i.e. from the
    same values fed to the confusion matrix, not from predicted probabilities.
    """
    y_true = df['y']
    y_pred = df['prediction']

    flat_cm = confusion_matrix(y_true, y_pred).flatten().tolist()
    df['cm'] = [flat_cm] * len(df)

    def _row_prediction_type(row):
        return prediction_type(row['y'], row['prediction'])

    df['prediction_type'] = df.apply(_row_prediction_type, axis=1)

    # Scalar scores are assigned column by column, in this fixed order.
    scorers = (
        ('f1', f1_score),
        ('ROC-AUC', roc_auc_score),
        ('MCC', matthews_corrcoef),
        ('Balanced Accuracy', balanced_accuracy_score),
    )
    for column, scorer in scorers:
        df[column] = scorer(df['y'], df['prediction'])
    return df

    

In [15]:
# Train a default random forest on each undersampled dataset, then save the
# fitted model and per-row train/test results next to the input arrays.
drop_cols = ['subset', 'compound_id', 'base_rdkit_smiles', 'fold', 'active'] 
save_path ='/Users/radhi/Desktop/GitHub/atom2024/atom2024/notebooks/NEK/undersampler_validation/idea1_dir/'
for nek in neks:
    # Only NEK2 and NEK9 have an inhibition dataset in addition to binding.
    bind_inhib = ['binding', 'inhibition'] if nek in ['2','9'] else ['binding']
    for bi in bind_inhib:
        print(f'NEK{nek} {bi}')
        RF_type = 'RF'

        # Read the arrays from save_path (the directory the array-export cell
        # wrote them to) rather than from the current working directory.
        data_prefix = f'{save_path}NEK{nek}_{bi}_moe_UNDER'
        trainX = pd.read_csv(f'{data_prefix}_trainX.csv').to_numpy()
        trainY = pd.read_csv(f'{data_prefix}_train_y.csv').to_numpy().reshape(-1)
        testX = pd.read_csv(f'{data_prefix}_testX.csv').to_numpy()
        testY = pd.read_csv(f'{data_prefix}_test_y.csv').to_numpy().reshape(-1)

        model = rf_models(trainX, trainY, testX, testY, RF_type, {})
        train_df = gather_rf_results(model, trainX, trainY)
        test_df = gather_rf_results(model, testX, testY)

        model_name = f'NEK{nek}_{bi}_moe_UNDER_RF_new_train_fold1345'
        for subset_name, res in (('train', train_df), ('test', test_df)):
            res['subset'] = subset_name
            res['model'] = model_name

        train_df = add_cm2(train_df)
        test_df = add_cm2(test_df)

        for res in (train_df, test_df):
            res['NEK'] = f'NEK{nek}_{bi}'
            res['feat_type'] = 'moe'
            res['strategy'] = 'UNDER'
            res['RF_type'] = RF_type

        with open(f'{save_path}{model_name}.pkl', 'wb') as f:
            pickle.dump(model, f)

        train_df.to_csv(f'{save_path}{model_name}_train_results.csv', index=False)
        test_df.to_csv(f'{save_path}{model_name}_test_results.csv', index=False)
        

NEK2 binding
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.571, precision: 0.071, recall: 0.750, specificity: 0.563
NEK2 inhibition
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.765, precision: 0.204, recall: 0.821, specificity: 0.761
NEK3 binding
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.484, precision: 0.067, recall: 0.625, specificity: 0.475
NEK5 binding
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.641, precision: 0.137, recall: 0.650, specificity: 0.640
NEK9 binding
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.535, precision: 0.072, recall: 0.833, specificity: 0.522
NEK9 inhibition
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.797, precision: 0.333, recall: 0.778, specificity: 0.800


In [16]:
# Collect one summary row (the first row of each test-results file) per model.
results = []
metric_cols = ['accuracy', 'precision', 'recall', 'specificity','TN', 'FN', 'FP', 'TP','f1', 'ROC-AUC', 'MCC', 'Balanced Accuracy',
       'model', 'cm', 'prediction_type', 'NEK', 'feat_type', 'strategy','RF_type']

# Only NEK2 and NEK9 have an inhibition dataset in addition to binding.
nek_assay_pairs = [
    (nek, bi)
    for nek in neks
    for bi in (['binding', 'inhibition'] if nek in ['2','9'] else ['binding'])
]
for nek, bi in nek_assay_pairs:
    result_df = pd.read_csv(f'{save_path}NEK{nek}_{bi}_moe_UNDER_RF_new_train_fold1345_test_results.csv')
    first_row = result_df.iloc[[0]][metric_cols]
    results.append(first_row.values.flatten())

results_df = pd.DataFrame(results, columns=metric_cols)
for text_col in ('model', 'strategy'):
    results_df[text_col] = results_df[text_col].str.replace('scaled', 'raw')

# NOTE(review): the CSV is written before 'modeling_type' and 'set' are added,
# so the saved file does not contain those two columns -- confirm this is intended.
results_df.to_csv(save_path+'RF_results_trained_1345.csv', index=False)
results_df['modeling_type'] = 'RF'
results_df['set'] = 'fold1345'

In [18]:
# Compare the newly trained UNDER random forests against the matching rows
# ('original' set, RF, moe features, UNDER strategy) of the earlier results workbook.
original_results = pd.read_excel('/Users/radhi/Desktop/CAPSTONE_DATA/GP_GPmatern_RF_RFGS_results_all_sets.xlsx')

is_original_rf_moe = (
    (original_results['set']=='original')
    & (original_results['modeling_type'] == 'RF')
    & (original_results['feat_type'] == 'moe')
)
original_results = original_results[is_original_rf_moe]

is_plain_rf_under = (original_results['strategy'] == 'UNDER') & (original_results['RF_type'] == 'RF')
only_under = original_results[is_plain_rf_under]

all_under_rf = pd.concat([results_df, only_under])
comparison = all_under_rf[['NEK', 'set','cm', 'recall', 'specificity']].sort_values('NEK')
comparison

Unnamed: 0,NEK,set,cm,recall,specificity
0,NEK2_binding,fold1345,"[152, 118, 3, 9]",0.75,0.562963
156,NEK2_binding,original,"[191, 80, 6, 6]",0.5,0.704797
1,NEK2_inhibition,fold1345,"[286, 90, 5, 23]",0.821429,0.760638
188,NEK2_inhibition,original,"[302, 78, 4, 24]",0.857143,0.794737
2,NEK3_binding,fold1345,"[126, 139, 6, 10]",0.625,0.475472
220,NEK3_binding,original,"[135, 130, 3, 14]",0.823529,0.509434
3,NEK5_binding,fold1345,"[146, 82, 7, 13]",0.65,0.640351
252,NEK5_binding,original,"[161, 67, 3, 17]",0.85,0.70614
4,NEK9_binding,fold1345,"[141, 129, 2, 10]",0.833333,0.522222
284,NEK9_binding,original,"[177, 93, 4, 9]",0.692308,0.655556


In [20]:
# Build the comparison workbook: earlier RF results (raw and UNDER strategies,
# moe features) alongside the newly trained fold1345 UNDER models.
all_us = pd.read_excel('/Users/radhi/Desktop/CAPSTONE_DATA/GP_GPmatern_RF_RFGS_results_all_sets.xlsx')
all_us = all_us[
    (all_us['set'] != 'literature')
    & (all_us['modeling_type'] != 'RF_GS')
    & ~all_us['strategy'].isin(['SMOTE', 'ADASYN'])
]

# Exclude BOTH Gaussian-process model types. Filtering `all_us` twice in a row
# would let the second assignment discard the first filter.
rf = all_us[~all_us['modeling_type'].isin(['GP', 'GP_matern'])]

rf_raw = rf[(rf['strategy'] == 'raw') & (rf['RF_type'].isin(['RF', 'RF_BCW', 'BRFC', 'BRFC_BCW']))]
rf_other = rf[(rf['RF_type'] == 'RF') & (rf['strategy'].isin(['UNDER']))]
rf_us = pd.concat([rf_raw, rf_other])
rf_us_moe = rf_us[rf_us['feat_type'] == 'moe']

make_comparison_us = pd.concat([rf_us_moe, results_df])
make_comparison_us = make_comparison_us.sort_values(['NEK', 'strategy'])

new_order = ['model', 'NEK', 'set', 'RF_type','modeling_type', 'feat_type', 'strategy', 'cm',
       'recall', 'specificity','accuracy', 'precision',  'f1', 'MCC',
       'Balanced Accuracy', 'ROC-AUC', 'TN', 'FN', 'FP', 'TP',
       'prediction_type']
make_comparison_us = make_comparison_us[new_order]
make_comparison_us.to_excel('compare_og_undersampler_to_new_fold1345_RF_UNDER.xlsx', index=False)