In [1]:
# run with atomsci environment 

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import math
import torch

import pickle
import shutil
import sklearn
from sklearn.model_selection import KFold
import imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
import sys
sys.path.append('../')
from split_data import *


In [2]:

def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        # add hydrogens to molecules
        # mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
        descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 




In [3]:
def make_rdkit(file_path, filename): 
    """Create RDKit features from smiles strings 
    file_path: directory of datafame
    filename: any dataframe with active, base_rdkit_smiles, fold, compound_id
    returns: final dataframe with RDkit feats based on the original dataframe's smiles strings 
    """
    df = pd.read_csv(file_path+filename)
    
    # df = pd.read_csv(datapath+ 'NEK2_binding_moe_scaled_df.csv')
    
    train_y = df[df['subset']=='train']['active']
    test_y = df[df['subset']=='test']['active']
    train_sm = df[df['subset']=='train']['base_rdkit_smiles']
    test_sm = df[df['subset']=='test']['base_rdkit_smiles']
    
    train_id = df[df['subset']=='train']['compound_id']
    test_id = df[df['subset']=='test']['compound_id']
    train_fold = df[df['subset']=='train']['fold']
    test_fold = df[df['subset']=='test']['fold']
    
    trainX_rdkit,descriptors = RDkit_descriptors(train_sm)
    testX_rdkit, descirptors = RDkit_descriptors(test_sm)
    # print(f'rdkit descriptors: {len(descriptors)}')
    
    trainX_rdkit_df = pd.DataFrame(trainX_rdkit, columns=descriptors)
    testX_rdkit_df = pd.DataFrame(testX_rdkit, columns=descriptors)
    
    
    print(f'rdkit X shape: {trainX_rdkit_df.shape}, test: {testX_rdkit_df.shape}')
    
    trainX_rdkit_df['subset'] = 'train'
    testX_rdkit_df['subset'] = 'test'
    
    trainX_rdkit_df['base_rdkit_smiles'] = train_sm.reset_index(drop=True)
    testX_rdkit_df['base_rdkit_smiles'] = test_sm.reset_index(drop=True)
    trainX_rdkit_df['compound_id'] = train_id.reset_index(drop=True)
    testX_rdkit_df['compound_id'] = test_id.reset_index(drop=True)
    trainX_rdkit_df['fold'] = train_fold.reset_index(drop=True)
    testX_rdkit_df['fold'] = test_fold.reset_index(drop=True)
    
    trainX_rdkit_df['active'] = train_y.reset_index(drop=True)
    testX_rdkit_df['active'] = test_y.reset_index(drop=True)
    
    final_df = pd.concat([trainX_rdkit_df.reset_index(drop=True), testX_rdkit_df.reset_index(drop=True)], ignore_index=True)
    return final_df
     

In [14]:
file_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK'
nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    nek_path=f'{file_path}/NEK{nek}/bind/'
    
    bind_file = f'NEK{nek}_binding_moe_scaled_df.csv'
    nek_bind = make_rdkit(nek_path,bind_file)

    bind_final = f'NEK{nek}_binding_rdkit_scaled_df.csv'
    nek_bind.to_csv(bind_final, index=False)
    dest = f'NEK{nek}/bind/'
    move([bind_final], nek_path)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        nek_path=f'{file_path}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_moe_scaled_df.csv'
        inhib_final = f'NEK{nek}_inhibition_rdkit_scaled_df.csv'
        nek_inhib=make_rdkit(nek_path,inhib_file)
        nek_inhib.to_csv(inhib_final, index=False)

        move([inhib_final], nek_path)
    else:
        pass
    print()
    

NEK2 bind
rdkit X shape: (1125, 208), test: (283, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/bind/
error moving NEK2_binding_rdkit_scaled_df.csv -- Destination path '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/bind/NEK2_binding_rdkit_scaled_df.csv' already exists
NEK2 inhib
rdkit X shape: (1635, 208), test: (409, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/inhib/
error moving NEK2_inhibition_rdkit_scaled_df.csv -- Destination path '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/inhib/NEK2_inhibition_rdkit_scaled_df.csv' already exists

NEK3 bind
rdkit X shape: (1122, 208), test: (282, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK3/bind/
error moving NEK3_binding_rdkit_scaled_df.csv -- Destination path '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK3/bind/NEK3_binding_rdkit_scaled_df.csv' already exists


In [6]:
# undersample (use chosen samples from UNDER dataset) 
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    # nek_path= data_dir+nek+'/'
    nek_path=f'{file_path}/NEK{nek}/bind/'
    
    bind_file_UNDER = f'NEK{nek}_binding_moe_UNDER_df.csv'
    nek_bind_UNDER = make_rdkit(nek_path,bind_file_UNDER)

    bind_final_UNDER = f'NEK{nek}_binding_rdkit_UNDER_df.csv'
    nek_bind_UNDER.to_csv(bind_final_UNDER, index=False)
    move([bind_final_UNDER], nek_path)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        nek_path=f'{file_path}/NEK{nek}/inhib/'
        inhib_file_UNDER = f'NEK{nek}_inhibition_moe_UNDER_df.csv'
        inhib_final_UNDER = f'NEK{nek}_inhibition_rdkit_UNDER_df.csv'
        nek_inhib_UNDER=make_rdkit(nek_path,inhib_file_UNDER)
        nek_inhib_UNDER.to_csv(inhib_final_UNDER, index=False)
        move([inhib_final_UNDER], nek_path)
    else:
        pass
    print()

NEK2 bind
rdkit X shape: (90, 208), test: (283, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/bind/
NEK2 inhib
rdkit X shape: (224, 208), test: (409, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/inhib/

NEK3 bind
rdkit X shape: (128, 208), test: (282, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK3/bind/

NEK5 bind
rdkit X shape: (154, 208), test: (248, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK5/bind/

NEK9 bind
rdkit X shape: (96, 208), test: (283, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK9/bind/
NEK9 inhib
rdkit X shape: (66, 208), test: (80, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK9/inhib/



In [7]:
def oversample_rdkit(file_path, filename, sampling):
    """Oversample the datasetes using the SMOTE or ADASYN
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_binding_rdkit_scaled_df.csv'
    sampling (str): 'SMOTE' or 'ADASYN'
    returns: oversampled dataframe
    don't have to call RDkit_descriptors again
    """
    df = pd.read_csv(file_path+filename)
    original_cols = df.columns.to_list() 
    train_df = df[df['subset'] == 'train']
    test_df = df[df['subset'] == 'test']
    train_y = train_df['active']
    test_y = test_df['active']
    train_ids = train_df[['base_rdkit_smiles', 'compound_id', 'fold']]
    test_ids = test_df[['base_rdkit_smiles', 'compound_id', 'fold']]
    trainX = train_df.select_dtypes(include='number').drop(columns=['active'])
    testX = test_df.select_dtypes(include='number').drop(columns=['active'])
    feature_cols = trainX.columns.to_list()

    print(f'original train size: {train_df.shape}, original test size: {test_df.shape}')
    
    if sampling == 'ADASYN':
        oversample = ADASYN(random_state=42)
    else: 
        oversample = SMOTE(random_state=42)
    
    trainX_temp, trainy_temp = oversample.fit_resample(trainX.to_numpy(), train_y.to_numpy().reshape(-1))
    print(f'train after {sampling}: {trainX_temp.shape}')
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feature_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    placeholder='synthetic '+sampling
    id_col_names = ['base_rdkit_smiles', 'compound_id', 'fold']
    syn_samples=pd.DataFrame({col:[placeholder]*len(trainX_resamp) for col in id_col_names})

    train_resamp= pd.concat([syn_samples,trainX_resamp,trainy_resamp], axis=1)
    train_resamp['subset'] = 'train'
    
    
    test_df_final = pd.concat([test_ids.reset_index(drop=True),testX.reset_index(drop=True),test_y.reset_index(drop=True)],axis=1)

    test_df_final['subset'] = 'test'
    
    
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df 

In [10]:
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    nek_path=f'{file_path}/NEK{nek}/bind/'
    bind_file = f'NEK{nek}_binding_rdkit_scaled_df.csv'
    nek_bind_SMOTE = oversample_rdkit(nek_path ,bind_file, 'SMOTE')

    bind_final_SMOTE = f'NEK{nek}_binding_rdkit_SMOTE_df.csv'
    nek_bind_SMOTE.to_csv(bind_final_SMOTE, index=False)
    move([bind_final_SMOTE], nek_path)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        nek_path=f'{file_path}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_rdkit_scaled_df.csv'
        inhib_final_SMOTE = f'NEK{nek}_inhibition_rdkit_SMOTE_df.csv'
        nek_inhib_SMOTE = oversample_rdkit(nek_path ,inhib_file, 'SMOTE')
        nek_inhib_SMOTE.to_csv(inhib_final_SMOTE, index=False)
        move([inhib_final_SMOTE], nek_path)
        
    print()
   

NEK2 bind
original train size: (1125, 213), original test size: (283, 213)
train after SMOTE: (2160, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/bind/
NEK2 inhib
original train size: (1635, 213), original test size: (409, 213)
train after SMOTE: (3046, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/inhib/

NEK3 bind
original train size: (1122, 213), original test size: (282, 213)
train after SMOTE: (2116, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK3/bind/

NEK5 bind
original train size: (989, 213), original test size: (248, 213)
train after SMOTE: (1824, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK5/bind/

NEK9 bind
original train size: (1126, 213), original test size: (283, 213)
train after SMOTE: (2156, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK9/bind/
NEK9 inhib
original train size: (313, 213

In [11]:
for i, n in enumerate(nek_nums):
    nek = str(n)
    print(f'NEK{nek} bind')
    nek_path=f'{file_path}/NEK{nek}/bind/'
    bind_file = f'NEK{nek}_binding_rdkit_scaled_df.csv'
    nek_bind_ADASYN = oversample_rdkit(nek_path,bind_file, 'ADASYN')

    bind_final_ADASYN = f'NEK{nek}_binding_rdkit_ADASYN_df.csv'
    nek_bind_ADASYN.to_csv(bind_final_ADASYN, index=False)
    move([bind_final_ADASYN], nek_path)
    if n == 2 or n == 9:
        print(f'NEK{nek} inhib')
        nek_path=f'{file_path}/NEK{nek}/inhib/'
        inhib_file = f'NEK{nek}_inhibition_rdkit_scaled_df.csv'
        inhib_final_ADASYN = f'NEK{nek}_inhibition_rdkit_ADASYN_df.csv'
        nek_inhib_ADASYN = oversample_rdkit(nek_path ,inhib_file, 'ADASYN')
        nek_inhib_ADASYN.to_csv(inhib_final_ADASYN, index=False)
        move([inhib_final_ADASYN], nek_path)
        
    print()
   

NEK2 bind
original train size: (1125, 213), original test size: (283, 213)
train after ADASYN: (2163, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/bind/
NEK2 inhib
original train size: (1635, 213), original test size: (409, 213)
train after ADASYN: (3050, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK2/inhib/

NEK3 bind
original train size: (1122, 213), original test size: (282, 213)
train after ADASYN: (2122, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK3/bind/

NEK5 bind
original train size: (989, 213), original test size: (248, 213)
train after ADASYN: (1846, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK5/bind/

NEK9 bind
original train size: (1126, 213), original test size: (283, 213)
train after ADASYN: (2158, 208)
moving to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/NEK9/bind/
NEK9 inhib
original train size: (313

In [12]:
def get_arrays(file_path, df_filename, filename_type=None, save=False):
    """use dataframes to get trainX, trainy, testX, testy out. Optional: save those files to csv
    file_path: directory
    df_filename: dataframe NEK#_binding_moe_{sampling}_df.csv (sampling: scaled, UNDER, SMOTE, ADASYN)
    split dataframe to train and test, and x and y
    save: bool, option to save splits to separate csv files (train X, train y, test X, test y) 
    returns: numpy arrays train X, train y, testX, test y"""
    df = pd.read_csv(file_path+df_filename)
    train_df= df[df['subset']=='train']
    test_df = df[df['subset']=='test']
    train_y = train_df['active'].to_numpy().reshape(-1)
    test_y=test_df['active'].to_numpy().reshape(-1)
    train_x_df = train_df.drop(columns='active')

  
    test_x_df = test_df.drop(columns='active')
    
    train_x_df = train_df.drop(columns='active')
    test_x_df = test_df.drop(columns='active')
    trainX = train_x_df.select_dtypes(include='number').to_numpy()
    testX = test_x_df.select_dtypes(include='number').to_numpy()
    
    print(f'train X shape: {trainX.shape}, y: {train_y.shape}, test X: {testX.shape}, y:{test_y.shape}')
    if (save and filename_type is not None): 
        trainxdf = pd.DataFrame(trainX)
        trainxdf.to_csv(file_path+filename_type+'_trainX.csv', index=False)
        # train_x_df.to_csv(filename_type+'_trainX.csv', index=False)
        trainy_df = pd.DataFrame(train_y)
        trainy_df.to_csv(file_path+filename_type+'_train_y.csv', index=False) 
        # test_x_df.to_csv(filename_type+'_testX.csv', index=False)
        testxdf = pd.DataFrame(testX)
        testxdf.to_csv(file_path+filename_type+'_testX.csv', index=False)
        testy_df = pd.DataFrame(test_y)
        testy_df.to_csv(file_path+filename_type+'_test_y.csv', index=False) 
        
    return trainX, train_y, testX, test_y

In [13]:
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN']

nek_nums = [2,3,5,9]
NEK= 'NEK'
for i, n in enumerate(nek_nums):
    nek = str(n)
    
    for j, samp in enumerate (samplings):
        print(f'NEK{nek} bind {samp}')
        nek_path=f'{file_path}/NEK{nek}/bind/'
        bind_df = f'NEK{nek}_binding_rdkit_{samp}_df.csv'
        bind_dataset_type = f'NEK{nek}_binding_rdkit_{samp}'
        get_arrays(nek_path, bind_df, bind_dataset_type, save=True)
        if n == 2 or n == 9:
            print(f'NEK{nek} inhib {samp}')
            nek_path=f'{file_path}/NEK{nek}/inhib/'
            inhib_df = f'NEK{nek}_inhibition_rdkit_{samp}_df.csv'
            inhib_dataset_type = f'NEK{nek}_inhibition_rdkit_{samp}'
            get_arrays(nek_path, inhib_df, inhib_dataset_type, save=True)
        print()
        
    print()
        

NEK2 bind scaled
train X shape: (1125, 208), y: (1125,), test X: (283, 208), y:(283,)
NEK2 inhib scaled
train X shape: (1635, 208), y: (1635,), test X: (409, 208), y:(409,)

NEK2 bind UNDER
train X shape: (90, 208), y: (90,), test X: (283, 208), y:(283,)
NEK2 inhib UNDER
train X shape: (224, 208), y: (224,), test X: (409, 208), y:(409,)

NEK2 bind SMOTE
train X shape: (2160, 208), y: (2160,), test X: (283, 208), y:(283,)
NEK2 inhib SMOTE
train X shape: (3046, 208), y: (3046,), test X: (409, 208), y:(409,)

NEK2 bind ADASYN
train X shape: (2163, 208), y: (2163,), test X: (283, 208), y:(283,)
NEK2 inhib ADASYN
train X shape: (3050, 208), y: (3050,), test X: (409, 208), y:(409,)


NEK3 bind scaled
train X shape: (1122, 208), y: (1122,), test X: (282, 208), y:(282,)

NEK3 bind UNDER
train X shape: (128, 208), y: (128,), test X: (282, 208), y:(282,)

NEK3 bind SMOTE
train X shape: (2116, 208), y: (2116,), test X: (282, 208), y:(282,)

NEK3 bind ADASYN
train X shape: (2122, 208), y: (2122,),