In [18]:
import math
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import os
import pickle
import sklearn
import imblearn as imb
# print("imblearn version: ",imblearn.__version__)
from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import itertools

from scipy.stats import randint

from rdkit import Chem
import sys
sys.path.append('../../')
import utils
from split_data import *
# from RF_GSCV import *
# from RF_Utils import *


In [23]:
def move(files, dest):

    if not os.path.exists(dest):
        os.makedirs(dest)
        print(f'{dest} created')
    else: 
        print(f'moving to {dest}')
    for f in files:
    #     try:
        shutil.move(f, dest)
        # except Exception as e:
        #     print(f'error moving {f} -- {e}')
    

In [24]:
def scaled_df(file_path, file_name):
    """Standard Scalar to normalize original dataset
    Scale data, put back feature names (including ID columns, add subset (train / test) column
    file_path: directory where file is located 
    file_name: should be NEK#_1_uM_min_50_pct_(binding/inhibition)_5fold_random_imbalanced.csv
    returns: final scaled dataframe
    """
    df_original = pd.read_csv(file_path+file_name)
    original_cols = df_original.columns 
    og_cols = original_cols.to_list() 
    feature_cols = og_cols[3:-1] 
    id_col_names = og_cols[0:2]
    id_cols = df_original[id_col_names]
    fold_col = df_original['fold']
    true_labels = df_original['active']
    # print(f'all cols: {original_cols.shape}, features: {len(feature_cols)}, id: {id_col_names}')

    train_df = df_original[df_original['fold']!='fold1']
    test_df = df_original[df_original['fold']=='fold1']
    trainX_df = train_df[feature_cols]
    testX_df = test_df[feature_cols]
    trainy_df = train_df['active']
    testy_df = test_df['active']
    
    train_id_df = train_df[id_col_names]
    test_id_df = test_df[id_col_names]
    train_fold = train_df['fold']
    test_fold = test_df['fold']
    
    x_df = pd.concat([trainX_df, testX_df])
    scaling=StandardScaler()
    scaling.fit(x_df)
    
    scaled_data=scaling.transform(x_df)
    trainX_scaled = scaling.transform(trainX_df)
    testX_scaled = scaling.transform(testX_df) 
    print(f'train X: {trainX_scaled.shape}, testX: {testX_scaled.shape}')
    trainX_final = pd.DataFrame(trainX_scaled, columns=feature_cols)
    trainX_final['subset'] = 'train'
    
    testX_final = pd.DataFrame(testX_scaled, columns=feature_cols)
    testX_final['subset']='test'
    trainX_final[id_col_names] = train_id_df.reset_index(drop=True)
    testX_final[id_col_names] = test_id_df.reset_index(drop=True)
    trainX_final['fold'] = train_fold.reset_index(drop=True)
    testX_final['fold'] = test_fold.reset_index(drop=True)
    
    trainX_final['active'] = trainy_df.reset_index(drop=True)
    testX_final['active'] = testy_df.reset_index(drop=True)

    
    
    final_df = pd.concat([trainX_final, testX_final], ignore_index=True)
    return final_df
    
        

In [25]:
data_dir = '/Users/jayceepang/msse/capstone/data/NEK_ATOM_data/NEK'
nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek}')
    nek_path= data_dir+nek+'/'
    
    bind_file = f'NEK{nek}_1_uM_min_50_pct_binding_5fold_random_imbalanced.csv'
    nek_bind = scaled_df(nek_path,bind_file)

    bind_final = f'NEK{nek}_binding_moe_scaled_df.csv'
    # dest = f'NEK{nek}'
    # if not os.path.exists(dest):
    #     os.makedirs(dest)
    # dest = dest+'/bind/'
    # if not os.path.exists(dest):
    #     os.makedirs(dest)
    dest = f'NEK{nek}/bind/'
    
    nek_bind.to_csv(bind_final, index=False)
    move([bind_final], dest)
    

    if n == 2 or n == 9:
        dest =  f'NEK{nek}/inhib/'
        # if not os.path.exists(dest):
        #     os.makedirs(dest)
        # dest = dest+'/inhib/'
        # if not os.path.exists(dest):
        #     os.makedirs(dest)
   
        inhib_file = f'NEK{nek}_1_uM_min_50_pct_inhibition_5fold_random_imbalanced.csv'
        inhib_final = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        
        nek_inhib=scaled_df(nek_path,inhib_file)
        nek_inhib.to_csv(inhib_final, index=False)
        move([inhib_final], dest)
    else:
        pass
    print()
    


NEK2
train X: (1125, 306), testX: (283, 306)
moving to NEK2/bind/


Error: Destination path 'NEK2/bind/NEK2_binding_moe_scaled_df.csv' already exists

In [None]:
def undersample(file_path, filename):
    """Undersample the datasetes using the RandomUndersampler
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_binding_moe_scaled_df.csv'
    returns: undersampled dataframe
    """
    df = pd.read_csv(file_path+filename)
    original_cols = df.columns.to_list() 
    feature_names = original_cols[0:-5]
    other_cols = original_cols[-5:]
    id_col_names = other_cols[0:4]
    id_cols = df[id_col_names]
    
    train_df = df[df['subset'] == 'train']
    test_df = df[df['subset'] == 'test']
    trainX = train_df[feature_names]
    testX = test_df[feature_names]
    trainy = train_df['active']
    testy = test_df['active']
    
    train_ids = train_df[id_col_names]
    test_ids = test_df[id_col_names]
    undersample = RandomUnderSampler(random_state=42)
    
    trainX_temp, trainy_temp = undersample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_names)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    
    train_ids_resamp = train_ids.iloc[trainX_resamp.index].reset_index(drop=True)
    train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
    train_resamp['subset'] = 'train'
    
    
    test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
    test_df_final['subset'] = 'test'
    
    
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df 
        

In [16]:
filepath = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek}')
    nek_path= f'{filepath}/NEK{nek}/bind/'
    
    bind_file = f'NEK{nek}_binding_moe_scaled_df.csv'
    nek_UNDER = undersample(nek_path,bind_file)
    
    
    bind_final_UNDER = f'NEK{nek}_binding_moe_UNDER_df.csv'
    
    nek_UNDER.to_csv(bind_final_UNDER, index=False)
    dest = f'NEK{nek}/bind/'
    move([bind_final_UNDER], dest)
    if n == 2 or n == 9:
        nek_path= f'{filepath}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        inhib_final_UNDER = f'NEK{nek}_inhibition_moe_UNDER_df.csv'
        
        nek_inhib_UNDER = undersample(nek_path,inhib_file)
        nek_inhib_UNDER.to_csv(inhib_final_UNDER, index=False)
        dest = f'NEK{nek}/inhib/'
        move([inhib_final_UNDER], dest)
   

NEK2


NameError: name 'undersample' is not defined

In [97]:
def oversample(file_path,filename, sampling):
    """Oversample the datasetes using the SMOTE or ADASYN
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_binding_moe_scaled_df.csv'
    sampling (str): 'SMOTE' or 'ADASYN'
    returns: oversampled dataframe
    """
    
    df = pd.read_csv(file_path+filename)
    original_cols = df.columns.to_list() 
    feature_names = original_cols[0:-5]
    other_cols = original_cols[-5:]
    id_col_names = other_cols[0:4]
    id_cols = df[id_col_names]
    
    train_df = df[df['subset'] == 'train']
    test_df = df[df['subset'] == 'test']
    trainX = train_df[feature_names]
    testX = test_df[feature_names]
    trainy = train_df['active']
    testy = test_df['active']
    
    # train_ids = train_df[id_col_names]
    test_ids = test_df[id_col_names]
    print(f'original train size: {train_df.shape}, original test size: {test_df.shape}')
    
    # oversample = SMOTE(random_state=42)
    if sampling == 'ADASYN':
        oversample = ADASYN(random_state=42)
    else: 
        oversample = SMOTE(random_state=42)
        
    trainX_temp, trainy_temp = oversample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    print(f'train after {sampling}: {trainX_temp.shape}')
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_names)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    placeholder='synthetic '+sampling
    syn_samples=pd.DataFrame({col:[placeholder]*len(trainX_resamp) for col in id_col_names})
    # train_ids_resamp = train_ids.iloc[trainX_resamp.index].reset_index(drop=True)
    # train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
    train_resamp= pd.concat([syn_samples,trainX_resamp,trainy_resamp], axis=1)
    train_resamp['subset'] = 'train'
    
    
    test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
    # test_df_final = pd.concat([testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
    test_df_final['subset'] = 'test'
    
    
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df 
        

In [98]:

nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek}')
    nek_path= f'{filepath}/NEK{nek}/bind/'
    bind_file = f'NEK{nek}_binding_moe_scaled_df.csv'
    nek_bind_SMOTE = oversample(nek_path,bind_file, 'SMOTE')

    bind_final_SMOTE = f'NEK{nek}_binding_moe_SMOTE_df.csv'
    
    nek_bind_SMOTE.to_csv(bind_final_SMOTE, index=False)
    dest = f'NEK{nek}/bind/'
    move([bind_final_SMOTE], dest)
    
    if n == 2 or n == 9:
        nek_path= f'{filepath}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        inhib_final_SMOTE = f'NEK{nek}_inhibition_moe_SMOTE_df.csv'
        nek_inhib_SMOTE = oversample(nek_path,inhib_file, 'SMOTE')
        nek_inhib_SMOTE.to_csv(inhib_final_SMOTE, index=False)
        dest = f'NEK{nek}/inhib/'
        move([inhib_final_SMOTE], dest)
        
    print()
   

NEK2
original train size: (1125, 311), original test size: (283, 311)
train after SMOTE: (2160, 306)
moving to NEK2/bind/
original train size: (1635, 311), original test size: (409, 311)
train after SMOTE: (3046, 306)
moving to NEK2/inhib/

NEK3
original train size: (1122, 311), original test size: (282, 311)
train after SMOTE: (2116, 306)
moving to NEK3/bind/

NEK5
original train size: (989, 311), original test size: (248, 311)
train after SMOTE: (1824, 306)
moving to NEK5/bind/

NEK9
original train size: (1126, 311), original test size: (283, 311)
train after SMOTE: (2156, 306)
moving to NEK9/bind/
original train size: (313, 311), original test size: (80, 311)
train after SMOTE: (560, 306)
moving to NEK9/inhib/



In [99]:

for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek}')
    nek_path= f'{filepath}/NEK{nek}/bind/'
    bind_file = f'NEK{nek}_binding_moe_scaled_df.csv'
    nek_bind_ADASYN = oversample(nek_path,bind_file, 'ADASYN')
    
    bind_final_ADASYN = f'NEK{nek}_binding_moe_ADASYN_df.csv'
    nek_bind_ADASYN.to_csv(bind_final_ADASYN, index=False)
    dest = f'NEK{nek}/bind/'
    move([bind_final_ADASYN], dest)
    
    if n == 2 or n == 9:
        nek_path= f'{filepath}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        inhib_final_ADASYN = f'NEK{nek}_inhibition_moe_ADASYN_df.csv'
        nek_inhib_ADASYN = oversample(nek_path,inhib_file, 'ADASYN')
        nek_inhib_ADASYN.to_csv(inhib_final_ADASYN, index=False)
        dest = f'NEK{nek}/inhib/'
        move([inhib_final_ADASYN], dest)
        
    print()
   

NEK2
original train size: (1125, 311), original test size: (283, 311)
train after ADASYN: (2158, 306)
moving to NEK2/bind/
original train size: (1635, 311), original test size: (409, 311)
train after ADASYN: (3037, 306)
moving to NEK2/inhib/

NEK3
original train size: (1122, 311), original test size: (282, 311)
train after ADASYN: (2113, 306)
moving to NEK3/bind/

NEK5
original train size: (989, 311), original test size: (248, 311)
train after ADASYN: (1831, 306)
moving to NEK5/bind/

NEK9
original train size: (1126, 311), original test size: (283, 311)
train after ADASYN: (2164, 306)
moving to NEK9/bind/
original train size: (313, 311), original test size: (80, 311)
train after ADASYN: (560, 306)
moving to NEK9/inhib/



In [101]:

def get_data_arrays(file_path, df_filename, filename_type=None, save=False):
    """use dataframes to get trainX, trainy, testX, testy out. Optional: save those files to csv
    file_path: directory
    df_filename: dataframe NEK#_binding_moe_{sampling}_df.csv (sampling: scaled, UNDER, SMOTE, ADASYN)
    split dataframe to train and test, and x and y
    save: bool, option to save splits to separate csv files (train X, train y, test X, test y) 
    returns: numpy arrays train X, train y, testX, test y"""
    df = pd.read_csv(file_path+df_filename)
    train_df= df[df['subset']=='train']
    test_df = df[df['subset']=='test']
    train_y = train_df['active'].to_numpy().reshape(-1)
    test_y=test_df['active'].to_numpy().reshape(-1)
    train_x_df = train_df.drop(columns='active')

  
    test_x_df = test_df.drop(columns='active')
    
    train_x_df = train_df.drop(columns='active')
    test_x_df = test_df.drop(columns='active')
    trainX = train_x_df.select_dtypes(include='number').to_numpy()
    testX = test_x_df.select_dtypes(include='number').to_numpy()
    
    print(f'train X shape: {trainX.shape}, y: {train_y.shape}, test X: {testX.shape}, y:{test_y.shape}')
    if (save and filename_type is not None): 
        trainxdf = pd.DataFrame(trainX)
        trainxdf.to_csv(file_path+filename_type+'_trainX.csv', index=False)
        # train_x_df.to_csv(filename_type+'_trainX.csv', index=False)
        trainy_df = pd.DataFrame(train_y)
        trainy_df.to_csv(file_path+filename_type+'_train_y.csv', index=False) 
        # test_x_df.to_csv(filename_type+'_testX.csv', index=False)
        testxdf = pd.DataFrame(testX)
        testxdf.to_csv(file_path+filename_type+'_testX.csv', index=False)
        testy_df = pd.DataFrame(test_y)
        testy_df.to_csv(file_path+filename_type+'_test_y.csv', index=False) 
        
    return trainX, train_y, testX, test_y

In [102]:
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN']

nek_nums = [2,3,5,9]
NEK= 'NEK'
file_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
for i, n in enumerate(nek_nums):
    nek = str(n)
    
    for j, samp in enumerate (samplings):
        print(f'NEK{nek} {samp}')
        nek_path= f'{file_path}/NEK{nek}/bind/'
        bind_df = f'NEK{nek}_binding_moe_{samp}_df.csv'
        bind_dataset_type = f'NEK{nek}_binding_moe_{samp}'
        get_data_arrays(nek_path, bind_df, bind_dataset_type, save=True)
        if n == 2 or n == 9:
            nek_path= f'{file_path}/NEK{nek}/inhib/'
            inhib_df = f'NEK{nek}_inhibition_moe_{samp}_df.csv'
            inhib_dataset_type = f'NEK{nek}_inhibition_moe_{samp}'
            get_data_arrays(nek_path, inhib_df, inhib_dataset_type, save=True)
        print()
        
    print()
        

NEK2 scaled
train X shape: (1125, 306), y: (1125,), test X: (283, 306), y:(283,)
train X shape: (1635, 306), y: (1635,), test X: (409, 306), y:(409,)

NEK2 UNDER
train X shape: (90, 306), y: (90,), test X: (283, 306), y:(283,)
train X shape: (224, 306), y: (224,), test X: (409, 306), y:(409,)

NEK2 SMOTE
train X shape: (2160, 306), y: (2160,), test X: (283, 306), y:(283,)
train X shape: (3046, 306), y: (3046,), test X: (409, 306), y:(409,)

NEK2 ADASYN
train X shape: (2158, 306), y: (2158,), test X: (283, 306), y:(283,)
train X shape: (3037, 306), y: (3037,), test X: (409, 306), y:(409,)


NEK3 scaled
train X shape: (1122, 306), y: (1122,), test X: (282, 306), y:(282,)

NEK3 UNDER
train X shape: (128, 306), y: (128,), test X: (282, 306), y:(282,)

NEK3 SMOTE
train X shape: (2116, 306), y: (2116,), test X: (282, 306), y:(282,)

NEK3 ADASYN
train X shape: (2113, 306), y: (2113,), test X: (282, 306), y:(282,)


NEK5 scaled
train X shape: (989, 306), y: (989,), test X: (248, 306), y:(248,)

In [None]:
# original_cols = nek2scaled.columns.to_list() 
# # true_labels = nek2scaled['active'] 
# feature_names = original_cols[0:-5]
# # features = nek2scaled[feature_names] 
# other_cols = original_cols[-5:]
# id_col_names = other_cols[0:4]
# id_cols = nek2scaled[id_col_names]

# train_df = nek2scaled[nek2scaled['subset'] == 'train']
# test_df = nek2scaled[nek2scaled['subset'] == 'test']

# trainX = train_df[feature_names]
# testX = test_df[feature_names]
# trainy = train_df['active']
# testy = test_df['active']

# train_ids = train_df[id_col_names]
# test_ids = test_df[id_col_names]

# undersample = RandomUnderSampler(random_state=42)
# trainX_temp, trainy_temp = undersample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))

# trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_names)
# trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])

# train_ids_resamp = train_ids.iloc[trainX_resamp.index].reset_index(drop=True)
# train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
# train_resamp['subset'] = 'train'


# test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
# test_df_final['subset'] = 'test'


# final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)

In [7]:
reading = pd.read_csv('NEK2/bind/NEK2_binding_moe_scaled_df.csv')

In [8]:
reading

Unnamed: 0,ASA+_per_atom,ASA-,ASA_H_per_atom,ASA_P,ASA_per_atom,BCUT_PEOE_0,BCUT_PEOE_1,BCUT_PEOE_2,BCUT_PEOE_3,BCUT_SLOGP_0_per_atom,...,vsurf_Wp7,vsurf_Wp8,weinerPath,weinerPol_per_atom,zagreb_per_atom,subset,compound_id,base_rdkit_smiles,fold,active
0,6.171916,-2.880666,-0.242820,-0.138540,3.174018,1.664387,2.956607,-2.776526,-1.873703,-6.030218,...,-0.183657,-0.206969,-1.235610,-0.816892,0.194164,train,kdb_2562,Cn1cnc2c(N)ncnc21,fold4,0
1,0.504051,0.725849,-0.489283,1.847024,0.319364,-0.163025,-0.395619,0.682256,0.269110,-0.059029,...,-0.291313,-0.206969,0.257966,-0.011573,-0.539664,train,kdb_3056,CNCc1ccc(-c2cc(-c3nc(-c4ccc(S(=O)(=O)C(C)C)cc4...,fold2,0
2,-1.005664,1.857896,-0.732021,1.562579,-0.222465,-0.258641,0.555600,-0.776457,0.794523,0.996309,...,-0.398969,-0.206969,0.769733,1.066145,0.502085,train,kdb_3510,CNC(=O)Nc1ccc2c(c1)CC[C@@]21OC(=O)N(CC(=O)N(Cc...,fold4,0
3,-0.272463,0.356568,-0.742466,1.119058,0.391476,0.992658,-0.464873,0.315869,0.916609,0.039975,...,-0.398969,-0.206969,-0.450504,0.540647,1.080537,train,kdb_2948,CC(=O)Nc1ccc2ccn(-c3cc(NC4CC4)n4ncc(C#N)c4n3)c2c1,fold3,0
4,1.507075,-0.643926,1.305948,-0.705561,1.121426,1.445349,0.459030,1.731704,-1.692399,-0.232629,...,0.031654,-0.206969,-0.736191,-0.070246,0.125982,train,kdb_2748,Cc1cccc(-c2nc(Nc3ccncc3)c3ccccc3n2)n1,fold4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,-0.829305,-0.606676,-0.711927,-0.580871,-1.457042,-1.209951,1.095581,-0.537771,1.347102,0.065689,...,-0.398969,-0.206969,-0.128627,3.562265,1.194907,test,kdb_1627,CN[C@H]1C[C@@H]2O[C@](C)([C@H]1OC)n1c3ccccc3c3...,fold1,1
1404,1.325649,-1.222164,2.374258,-1.635889,1.987426,1.693552,-0.893856,0.765541,-1.947318,-2.045763,...,-0.398969,-0.206969,-1.147903,-0.762591,0.125982,test,kdb_2758,c1ccc(-c2ccnc3[nH]ccc23)cc1,fold1,1
1405,1.130839,-0.968577,0.678757,-0.691671,0.130626,0.118424,0.656058,-1.451143,-0.473444,0.291278,...,-0.398969,-0.206969,-0.236771,0.214837,-0.364932,test,kdb_2288,COCCOCCOc1cc2ncc3c4ccc(C#N)cc4[nH]c3c2cc1OC,fold1,1
1406,-0.073978,-0.955461,-0.432754,-0.659996,-1.040959,-0.678455,-0.542517,0.329226,0.102786,0.353850,...,-0.291313,-0.206969,0.012301,-0.364379,-0.601298,test,kdb_2768,CNC(=O)c1ccccc1Nc1nc(Nc2ccc(N3CCOCC3)cc2)ncc1Cl,fold1,1
