In [1]:
import math
import torch
import numpy as np
import pandas as pd
import os
import pickle
import shutil
import sklearn
from sklearn.model_selection import KFold
import imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import itertools
from scipy.stats import randint
from rdkit import Chem
from rdkit.Chem import Draw
import sys
sys.path.append('../../')
import utils
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def smiles_to_fps(smiles_list, radius=2, nBits=2048):
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            arr = np.zeros((1,), dtype=np.int8)
            Chem.DataStructs.ConvertToNumpyArray(fp, arr)
            fps.append(arr)
    return np.array(fps)

In [3]:
def make_smiles(file_path, filename):
    """Create Morgan Fingerpint features from smiles srrings 
    file_path: directory of datafame
    filename: any dataframe with active, base_rdkit_smiles, fold, compound_id
    returns: final dataframe with morgan fingerprints based on the original dataframe's smiles strings 
    """
    
    df = pd.read_csv(file_path+filename)
    train_y = df[df['subset']=='train']['active']
    test_y=df[df['subset']=='test']['active']
    train_sm = df[df['subset']=='train']['base_rdkit_smiles']
    test_sm = df[df['subset']=='test']['base_rdkit_smiles']
    train_id = df[df['subset']=='train']['compound_id']
    test_id = df[df['subset']=='test']['compound_id']
    train_fold = df[df['subset']=='train']['fold']
    test_fold = df[df['subset']=='test']['fold']
    
    trainX_mfp = smiles_to_fps(train_sm, radius=2)
    testX_mfp = smiles_to_fps(test_sm, radius=2)
    
    trainX_mfp_df = pd.DataFrame(trainX_mfp)
    testX_mfp_df = pd.DataFrame(testX_mfp)
    
    trainX_mfp_df['subset'] = 'train'
    testX_mfp_df['subset'] = 'test'
    
    trainX_mfp_df['base_rdkit_smiles'] = train_sm.reset_index(drop=True)
    testX_mfp_df['base_rdkit_smiles'] = test_sm.reset_index(drop=True)
    trainX_mfp_df['compound_id'] = train_id.reset_index(drop=True)
    testX_mfp_df['compound_id'] = test_id.reset_index(drop=True)
    trainX_mfp_df['fold'] = train_fold.reset_index(drop=True)
    testX_mfp_df['fold'] = test_fold.reset_index(drop=True)
    
    trainX_mfp_df['active'] = train_y.reset_index(drop=True)
    testX_mfp_df['active'] = test_y.reset_index(drop=True)
    
    
    final_df = pd.concat([trainX_mfp_df,testX_mfp_df], ignore_index=True)
    return final_df
        

In [5]:
file_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    # nek_path= data_dir+nek+'/'
    nek_path = file_path
    
    bind_file = f'NEK{nek}_binding_moe_scaled_df.csv'
    nek_bind = make_smiles(nek_path,bind_file)

    bind_final = f'NEK{nek}_binding_mfp_scaled_df.csv'
    nek_bind.to_csv(bind_final, index=False)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        inhib_file = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        inhib_final = f'NEK{nek}_inhibition_mfp_scaled_df.csv'
        nek_inhib=make_smiles(nek_path,inhib_file)
        nek_inhib.to_csv(inhib_final, index=False)
    else:
        pass
    print()
    

NEK2 bind
NEK2 inhib

NEK3 bind

NEK5 bind

NEK9 bind
NEK9 inhib



In [6]:
# Undersample
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    # nek_path= data_dir+nek+'/'
    nek_path = file_path
    
    bind_file_UNDER = f'NEK{nek}_binding_moe_UNDER_df.csv'
    nek_bind_UNDER = make_smiles(nek_path,bind_file_UNDER)

    bind_final_UNDER = f'NEK{nek}_binding_mfp_UNDER_df.csv'
    nek_bind_UNDER.to_csv(bind_final_UNDER, index=False)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        inhib_file_UNDER = f'NEK{nek}_inhibition_moe_UNDER_df.csv'
        inhib_final_UNDER = f'NEK{nek}_inhibition_mfp_UNDER_df.csv'
        nek_inhib_UNDER=make_smiles(nek_path,inhib_file_UNDER)
        nek_inhib_UNDER.to_csv(inhib_final_UNDER, index=False)
    else:
        pass
    print()
    

NEK2 bind
NEK2 inhib

NEK3 bind

NEK5 bind

NEK9 bind
NEK9 inhib



In [7]:
def oversample(file_path, filename, sampling):
    """Oversample the datasetes using the SMOTE or ADASYN
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_binding_mfp_scaled_df.csv'
    sampling (str): 'SMOTE' or 'ADASYN'
    returns: oversampled dataframe
    don't have to call smiles_to_fps again
    """
    df = pd.read_csv(file_path+filename)
    original_cols = df.columns.to_list() 
    train_df = df[df['subset'] == 'train']
    test_df = df[df['subset'] == 'test']
    train_y = train_df['active']
    test_y = test_df['active']
    
    
    train_ids = train_df[['base_rdkit_smiles', 'compound_id', 'fold']]

    test_ids = test_df[['base_rdkit_smiles', 'compound_id', 'fold']]
    trainX = train_df.select_dtypes(include='number').drop(columns=['active'])
    testX = test_df.select_dtypes(include='number').drop(columns=['active'])
    feature_cols = trainX.columns.to_list()

    print(f'original train size: {train_df.shape}, original test size: {test_df.shape}')
    
    # oversample = SMOTE(random_state=42)
    if sampling == 'ADASYN':
        oversample = ADASYN(random_state=42)
    else: 
        oversample = SMOTE(random_state=42)
    
    trainX_temp, trainy_temp = oversample.fit_resample(trainX.to_numpy(), train_y.to_numpy().reshape(-1))
    print(f'train after {sampling}: {trainX_temp.shape}')
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    placeholder='synthetic '+sampling
    id_col_names = ['base_rdkit_smiles', 'compound_id', 'fold']
    syn_samples=pd.DataFrame({col:[placeholder]*len(trainX_resamp) for col in id_col_names})

    train_resamp= pd.concat([syn_samples,trainX_resamp,trainy_resamp], axis=1)
    train_resamp['subset'] = 'train'
    
    
    test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),test_y.reset_index(drop=True)],axis=1)

    test_df_final['subset'] = 'test'
    
    
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df 

In [8]:
# file_path= '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
# nek_nums = [2,3,5,9]
# NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    
    bind_file = f'NEK{nek}_binding_mfp_scaled_df.csv'
    nek_bind_SMOTE = oversample(file_path,bind_file, 'SMOTE')

    bind_final = f'NEK{nek}_binding_mfp_SMOTE_df.csv'
    nek_bind_SMOTE.to_csv(bind_final, index=False)
    
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        inhib_file = f'NEK{nek}_inhibition_mfp_scaled_df.csv'
        inhib_final = f'NEK{nek}_inhibition_mfp_SMOTE_df.csv'
        nek_inhib_SMOTE = oversample(file_path,inhib_file, 'SMOTE')
        nek_inhib_SMOTE.to_csv(inhib_final, index=False)
        
    print()
   

NEK2 bind
original train size: (1125, 2053), original test size: (283, 2053)
train after SMOTE: (2160, 2048)
NEK2 inhib
original train size: (1635, 2053), original test size: (409, 2053)
train after SMOTE: (3046, 2048)

NEK3 bind
original train size: (1122, 2053), original test size: (282, 2053)
train after SMOTE: (2116, 2048)

NEK5 bind
original train size: (989, 2053), original test size: (248, 2053)
train after SMOTE: (1824, 2048)

NEK9 bind
original train size: (1126, 2053), original test size: (283, 2053)
train after SMOTE: (2156, 2048)
NEK9 inhib
original train size: (313, 2053), original test size: (80, 2053)
train after SMOTE: (560, 2048)



In [9]:
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    
    bind_file = f'NEK{nek}_binding_mfp_scaled_df.csv'
    nek_bind_ADASYN = oversample(file_path,bind_file, 'ADASYN')

    bind_final_ADASYN = f'NEK{nek}_binding_mfp_ADASYN_df.csv'
    nek_bind_SMOTE.to_csv(bind_final_ADASYN, index=False)
    
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        inhib_file = f'NEK{nek}_inhibition_mfp_scaled_df.csv'
        inhib_final_ADASYN = f'NEK{nek}_inhibition_mfp_ADASYN_df.csv'
        nek_inhib_ADASYN = oversample(file_path,inhib_file, 'ADASYN')
        nek_inhib_ADASYN.to_csv(inhib_final_ADASYN, index=False)
        
    print()
   

NEK2 bind
original train size: (1125, 2053), original test size: (283, 2053)
train after ADASYN: (2168, 2048)
NEK2 inhib
original train size: (1635, 2053), original test size: (409, 2053)
train after ADASYN: (3027, 2048)

NEK3 bind
original train size: (1122, 2053), original test size: (282, 2053)
train after ADASYN: (2130, 2048)

NEK5 bind
original train size: (989, 2053), original test size: (248, 2053)
train after ADASYN: (1805, 2048)

NEK9 bind
original train size: (1126, 2053), original test size: (283, 2053)
train after ADASYN: (2166, 2048)
NEK9 inhib
original train size: (313, 2053), original test size: (80, 2053)
train after ADASYN: (555, 2048)



In [10]:
def get_arrays(file_path, df_filename, filename_type=None, save=False):
    """use dataframes to get trainX, trainy, testX, testy out. Optional: save those files to csv
    file_path: directory
    df_filename: dataframe NEK#_binding_moe_{sampling}_df.csv (sampling: scaled, UNDER, SMOTE, ADASYN)
    split dataframe to train and test, and x and y
    save: bool, option to save splits to separate csv files (train X, train y, test X, test y) 
    returns: numpy arrays train X, train y, testX, test y"""
    df = pd.read_csv(file_path+df_filename)
    train_df= df[df['subset']=='train']
    test_df = df[df['subset']=='test']
    train_y = train_df['active'].to_numpy().reshape(-1)
    test_y=test_df['active'].to_numpy().reshape(-1)
    train_x_df = train_df.drop(columns='active')

  
    test_x_df = test_df.drop(columns='active')
    
    train_x_df = train_df.drop(columns='active')
    test_x_df = test_df.drop(columns='active')
    trainX = train_x_df.select_dtypes(include='number').to_numpy()
    testX = test_x_df.select_dtypes(include='number').to_numpy()
    
    print(f'train X shape: {trainX.shape}, y: {train_y.shape}, test X: {testX.shape}, y:{test_y.shape}')
    if (save and filename_type is not None): 
        trainxdf = pd.DataFrame(trainX)
        trainxdf.to_csv(filename_type+'_trainX.csv', index=False)
        # train_x_df.to_csv(filename_type+'_trainX.csv', index=False)
        trainy_df = pd.DataFrame(train_y)
        trainy_df.to_csv(filename_type+'_train_y.csv', index=False) 
        # test_x_df.to_csv(filename_type+'_testX.csv', index=False)
        testxdf = pd.DataFrame(testX)
        testxdf.to_csv(filename_type+'_testX.csv', index=False)
        testy_df = pd.DataFrame(test_y)
        testy_df.to_csv(filename_type+'_test_y.csv', index=False) 
        
    return trainX, train_y, testX, test_y

In [11]:
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN']

nek_nums = [2,3,5,9]
NEK= 'NEK'
# file_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
for i, n in enumerate(nek_nums):
    nek = str(n)
    
    for j, samp in enumerate (samplings):
        print(f'NEK{nek} bind {samp}')
        bind_df = f'NEK{nek}_binding_mfp_{samp}_df.csv'
        bind_dataset_type = f'NEK{nek}_binding_mfp_{samp}'
        get_arrays(file_path, bind_df, bind_dataset_type, save=True)
        if n == 2 or n == 9:
            print(f'NEK{nek} inhib {samp}')
            inhib_df = f'NEK{nek}_inhibition_mfp_{samp}_df.csv'
            inhib_dataset_type = f'NEK{nek}_inhibition_mfp_{samp}'
            get_arrays(file_path, inhib_df, inhib_dataset_type, save=True)
        print()
        
    print()
        

NEK2 bind scaled
train X shape: (1125, 2048), y: (1125,), test X: (283, 2048), y:(283,)
NEK2 inhib scaled
train X shape: (1635, 2048), y: (1635,), test X: (409, 2048), y:(409,)

NEK2 bind UNDER
train X shape: (90, 2048), y: (90,), test X: (283, 2048), y:(283,)
NEK2 inhib UNDER
train X shape: (224, 2048), y: (224,), test X: (409, 2048), y:(409,)

NEK2 bind SMOTE
train X shape: (2160, 2048), y: (2160,), test X: (283, 2048), y:(283,)
NEK2 inhib SMOTE
train X shape: (3046, 2048), y: (3046,), test X: (409, 2048), y:(409,)

NEK2 bind ADASYN
train X shape: (2156, 2048), y: (2156,), test X: (283, 2048), y:(283,)
NEK2 inhib ADASYN
train X shape: (3027, 2048), y: (3027,), test X: (409, 2048), y:(409,)


NEK3 bind scaled
train X shape: (1122, 2048), y: (1122,), test X: (282, 2048), y:(282,)

NEK3 bind UNDER
train X shape: (128, 2048), y: (128,), test X: (282, 2048), y:(282,)

NEK3 bind SMOTE
train X shape: (2116, 2048), y: (2116,), test X: (282, 2048), y:(282,)

NEK3 bind ADASYN
train X shape: (2

In [5]:
# df = pd.read_csv('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2_binding_moe_UNDER_df.csv')

# train_y = df[df['subset']=='train']['active']
# test_y=df[df['subset']=='test']['active']
# train_sm = df[df['subset']=='train']['base_rdkit_smiles']
# test_sm = df[df['subset']=='test']['base_rdkit_smiles']
# train_id = df[df['subset']=='train']['compound_id']
# test_id = df[df['subset']=='test']['compound_id']
# train_fold = df[df['subset']=='train']['fold']
# test_fold = df[df['subset']=='test']['fold']
# trainX_mfp = smiles_to_fps(train_sm, radius=2)
# testX_mfp = smiles_to_fps(test_sm, radius=2)
# trainX_mfp_df = pd.DataFrame(trainX_mfp)
# testX_mfp_df = pd.DataFrame(testX_mfp)

# trainX_mfp_df['subset'] = 'train'
# testX_mfp_df['subset'] = 'test'
# trainX_mfp_df['base_rdkit_smiles'] = train_sm.reset_index(drop=True)
# testX_mfp_df['base_rdkit_smiles'] = test_sm.reset_index(drop=True)
# trainX_mfp_df['compound_id'] = train_id.reset_index(drop=True)
# testX_mfp_df['compound_id'] = test_id.reset_index(drop=True)
# trainX_mfp_df['fold'] = train_fold.reset_index(drop=True)
# testX_mfp_df['fold'] = test_fold.reset_index(drop=True)

# trainX_mfp_df['active'] = train_y.reset_index(drop=True)
# testX_mfp_df['active'] = test_y.reset_index(drop=True)


# final_df_under = pd.concat([trainX_mfp_df,testX_mfp_df], ignore_index=True)


In [65]:
# def undersample(file_path, filename):
#     df = pd.read_csv(file_path+filename)
#     original_cols = df.columns.to_list() 
#     train_df = df[df['subset'] == 'train']
#     test_df = df[df['subset'] == 'test']
#     train_y = train_df['active']
#     test_y = test_df['active']
    
    
#     train_ids = train_df[['base_rdkit_smiles', 'compound_id', 'fold']]
#     test_ids = test_df[['base_rdkit_smiles', 'compound_id', 'fold']]
#     trainX = train_df.select_dtypes(include='number').drop(columns=['active'])
#     testX = test_df.select_dtypes(include='number').drop(columns=['active'])
#     feature_cols = trainX.columns.to_list()
#     print(f'original train: {trainX.shape}, original test: {testX.shape}')
#     undersample = RandomUnderSampler(random_state=42)
#     trainX_resamp, trainy_resamp = undersample.fit_resample(trainX, train_y)
#     print(f'after train: {trainX_resamp.shape}')
#     trainX_resamp_df = pd.DataFrame(trainX_resamp,columns=feature_cols)
#     trainy_resamp_df = pd.DataFrame(trainy_resamp,columns=['active'])
    
#     train_resamp_df = pd.concat([train_ids.iloc[trainX_resamp_df.index].reset_index(drop=True),trainX_resamp_df.reset_index(drop=True), 
#                                  trainy_resamp_df.reset_index(drop=True)], axis=1)
#     train_resamp_df['subset'] = 'train'
#     test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True), 
#                                test_y.reset_index(drop=True)], axis=1)
#     test_df_final['subset'] = 'test'
    
#     final_df = pd.concat([train_resamp_df,test_df_final]).reset_index(drop=True)
#     return final_df

In [15]:
# file_path= '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
# nek_nums = [2,3,5,9]
# NEK= 'NEK'
# for i, n in enumerate(nek_nums):
#     nek = str(n)
#     print(f'NEK{nek}')
    
#     bind_file = f'NEK{nek}_binding_mfp_scaled_df.csv'
#     nek_UNDER = undersample(file_path, bind_file)

#     bind_final = f'NEK{nek}_binding_mfp_UNDER_df.csv'
#     nek_UNDER.to_csv(bind_final, index=False)
#     if n == 2 or n == 9:
#         inhib_file = f'NEK{nek}_inhibition_mfp_scaled_df.csv'
#         inhib_final = f'NEK{nek}_inhibition_mfp_UNDER_df.csv'
#         nek_inhib_UNDER = undersample(file_path, inhib_file)
#         nek_inhib_UNDER.to_csv(inhib_final, index=False)
#     print()
   