In [11]:
import math
import torch
import numpy as np
import gpytorch
import pandas as pd
import seaborn as sns
import os
import pickle
import matplotlib 
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import KFold

%matplotlib inline
# %load_ext autoreload
# %autoreload 2
%reload_ext autoreload
import imblearn
# print("imblearn version: ",imblearn.__version__)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix
import itertools

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, recall_score

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from rdkit import Chem
from rdkit.Chem import Draw
import sys
sys.path.append('../../')
import utils
from sklearn.model_selection import GridSearchCV
from VisUtils import *
from split_data import *
from RF_GSCV import *



In [12]:
datapath = '../../../../../data/datasets/'
# capstone/data/datasets/NEK2_1_uM_min_50_pct_binding_with_moe_descriptors.csv
binding_moe = pd.read_csv(datapath+'NEK2_1_uM_min_50_pct_inhibition_with_moe_descriptors.csv') 
all_folds = pd.read_csv(datapath+'NEK_data_4Berkeley/NEK2/NEK2_1_uM_min_50_pct_inhibition_5fold_random_imbalanced.csv')
all_folds['fold'].value_counts()


fold
fold4    409
fold1    409
fold2    409
fold3    409
fold5    408
Name: count, dtype: int64

In [5]:
smile_prop_df = pd.read_csv(datapath+'NEK_data_4Berkeley/NEK2/NEK2_1_uM_min_50_pct_inhibition_5fold_random_imbalanced.csv')
# smile prop train will be normalized feats
smile_prop_train = smile_prop_df[smile_prop_df['fold'] != 'fold1'].drop(columns=['fold'])
smile_prop_test = smile_prop_df[smile_prop_df['fold'] == 'fold1'].drop(columns=['fold'])
mol_train = []
for i in smile_prop_train.index:
    m = Chem.MolFromSmiles(smile_prop_train.loc[i, 'base_rdkit_smiles'])
    if m:
        m.SetProp("_Name", smile_prop_train.loc[i, 'compound_id'])
        m.SetProp("Active", str(smile_prop_train.loc[i, 'active']))
        mol_train.append(m)
        # mol_train.append(utils.normalize(m)) # this causes an erro

mol_test = []
for i in smile_prop_test.index:
    m = Chem.MolFromSmiles(smile_prop_test.loc[i, 'base_rdkit_smiles'])
    if m:
        m.SetProp("_Name", smile_prop_test.loc[i, 'compound_id'])
        m.SetProp("Active", str(smile_prop_test.loc[i, 'active']))
        # mol_test.append(utils.normalize(m)) # this causes an error with inhibition
        mol_test.append(m)
# normalized feats from smile_prop_train
X_existing_train = smile_prop_train.drop(columns=['compound_id', 'base_rdkit_smiles', 'active'])
X_existing_test = smile_prop_test.drop(columns=['compound_id', 'base_rdkit_smiles', 'active'])
y_train = smile_prop_train['active']
y_test = smile_prop_test['active']

# generate fingeprints: Morgan fingerprint with radius 2
fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mol_train]
# convert the RDKit explicit vectors into numpy arrays
X_morgan_train = []
for fp in fps:
  arr = np.zeros((1,))
  Chem.DataStructs.ConvertToNumpyArray(fp, arr)
  X_morgan_train.append(arr)

# generate fingeprints: Morgan fingerprint with radius 2
fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mol_test]
# convert the RDKit explicit vectors into numpy arrays
X_morgan_test = []
for fp in fps:
  arr = np.zeros((1,))
  Chem.DataStructs.ConvertToNumpyArray(fp, arr)
  X_morgan_test.append(arr)

In [6]:
bind_data_path = '../features/data/inhib/'
trainX_norm = X_existing_train.to_numpy()
# y_train = y_train.to_numpy().reshape(-1)
testX_norm = X_existing_test.to_numpy()
# y_test = y_test.to_numpy().reshape(-1)
# we also scale the data

# back to df for saving 

train_x_df = pd.DataFrame(trainX_norm) 
train_y_df = pd.DataFrame(y_train) 
test_y_df = pd.DataFrame(y_test)
test_x_df = pd.DataFrame(testX_norm)

# scale the data for consistency (this is what i did in the project)  using StandardScalar 
train_x_temp, train_y_temp, test_x_temp, test_y_temp = scale_data(train_x_df, train_y_df, test_x_df, test_y_df)

train_x_df = pd.DataFrame(train_x_temp) 
train_y_df = pd.DataFrame(train_y_temp) 
test_y_df = pd.DataFrame(test_y_temp)
test_x_df = pd.DataFrame(test_x_temp)

train_x_df.to_csv(bind_data_path+'inhib_train_x_normalized.csv', index=False, header=False)
train_y_df.to_csv(bind_data_path+'inhib_train_y_normalized.csv', index=False, header=False) 
test_y_df.to_csv(bind_data_path+'inhib_test_y_normalized.csv',index=False, header=False) 
test_x_df.to_csv(bind_data_path+'inhib_test_x_normalized.csv', index=False, header=False) 
print(train_x_df.shape, train_y_df.shape, test_x_df.shape, test_x_df.shape)

(1635, 306) (1635, 1) (409, 306) (409, 306)


In [7]:
train_x = train_x_df.to_numpy()
train_y = train_y_df.to_numpy().reshape(-1)
test_x = test_x_df.to_numpy()
test_y = test_y_df.to_numpy().reshape(-1)
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

(1635, 306) (1635,) (409, 306) (409,)


In [8]:
split_path ='../features/data/inhib/'
X_morgan_train 
X_morgan_test = X_morgan_test
y_trainmorgan = smile_prop_train['active']
y_testmorgan = smile_prop_test['active']

train_x_dfmorgan = pd.DataFrame(X_morgan_train) 
train_y_dfmorgan = pd.DataFrame(y_trainmorgan) 
test_y_dfmorgan = pd.DataFrame(y_testmorgan)
test_x_dfmorgan = pd.DataFrame(X_morgan_test)

train_x_dfmorgan.to_csv(split_path+'inhib_train_x_Morgan2.csv', index=False, header=False)
train_y_dfmorgan.to_csv(split_path+'inhib_train_y_Morgan2.csv', index=False, header=False) 
test_y_dfmorgan.to_csv(split_path+'inhib_test_y_Morgan2.csv', index=False, header=False) 
test_x_dfmorgan.to_csv(split_path+'inhib_test_x_Morgan2.csv', index=False, header=False) 
print(train_x_dfmorgan.shape, train_y_dfmorgan.shape, test_y_dfmorgan.shape, test_x_dfmorgan.shape)

(1635, 2048) (1635, 1) (409, 1) (409, 2048)


In [9]:
oversample = SMOTE()
trainX_Morgan_SMOTE, trainy_Morgan_SMOTE = oversample.fit_resample(X_morgan_train,train_y)
train_x_dfmorganSMOTE = pd.DataFrame(trainX_Morgan_SMOTE) 
train_y_dfmorganSMOTE = pd.DataFrame(trainy_Morgan_SMOTE) 
test_y_dfmorganSMOTE = pd.DataFrame(y_testmorgan)
test_x_dfmorganSMOTE = pd.DataFrame(X_morgan_test)

train_x_dfmorganSMOTE.to_csv(split_path+'inhib_train_x_Morgan2SMOTE.csv', index=False, header=False)
train_y_dfmorganSMOTE.to_csv(split_path+'inhib_train_y_Morgan2SMOTE.csv', index=False, header=False) 
test_y_dfmorganSMOTE.to_csv(split_path+'inhib_test_y_Morgan2SMOTE.csv', index=False, header=False) 
test_x_dfmorganSMOTE.to_csv(split_path+'inhib_test_x_Morgan2SMOTE.csv', index=False, header=False) 
print(train_x_dfmorganSMOTE.shape, train_y_dfmorganSMOTE.shape, test_x_dfmorganSMOTE.shape, test_x_dfmorganSMOTE.shape)

(3046, 2048) (3046, 1) (409, 2048) (409, 2048)


In [10]:
undersample = RandomUnderSampler()
train_x_Morgan_UNDER, train_y_Morgan_UNDER = undersample.fit_resample(X_morgan_train, train_y)

train_x_dfMorganUNDER = pd.DataFrame(train_x_Morgan_UNDER) 
train_y_dfMorganUNDER = pd.DataFrame(train_y_Morgan_UNDER) 
test_y_dfMorganUnder = pd.DataFrame(y_testmorgan)
test_x_dfMorganUnder = pd.DataFrame(X_morgan_test)

train_x_dfMorganUNDER.to_csv(split_path+'inhib_train_x_Morgan2UNDER.csv', index=False, header=False)
train_y_dfMorganUNDER.to_csv(split_path+'inhib_train_y_Morgan2UNDER.csv', index=False, header=False) 
test_y_dfMorganUnder.to_csv(split_path+'inhib_test_y_Morgan2UNDER.csv', index=False, header=False) 
test_x_dfMorganUnder.to_csv(split_path+'inhib_test_x_Morgan2UNDER.csv', index=False, header=False)
print(train_x_dfMorganUNDER.shape, train_y_dfMorganUNDER.shape, test_x_dfMorganUnder.shape, test_y_dfMorganUnder.shape)

(90, 2048) (90, 1) (283, 2048) (283, 1)


In [10]:
adasyn = ADASYN() 
train_x_MorganADASYN, train_y_MorganADASYN = adasyn.fit_resample(X_morgan_train, train_y)

train_x_dfMorganADASYN = pd.DataFrame(train_x_MorganADASYN) 
train_y_dfMorganADASYN = pd.DataFrame(train_y_MorganADASYN) 
test_y_dfMorganADASYN = pd.DataFrame(y_testmorgan)
test_x_dfMorganADASYN = pd.DataFrame(X_morgan_test)

train_x_dfMorganADASYN.to_csv(split_path+'inhib_train_x_Morgan2ADASYN.csv', index=False, header=False)
train_y_dfMorganADASYN.to_csv(split_path+'inhib_train_y_Morgan2ADASYN.csv', index=False, header=False) 
test_y_dfMorganADASYN.to_csv(split_path+'inhib_test_y_Morgan2ADASYN.csv', index=False, header=False) 
test_x_dfMorganADASYN.to_csv(split_path+'inhib_test_x_Morgan2ADASYN.csv', index=False, header=False) 

print(train_x_dfMorganADASYN.shape, train_y_dfMorganADASYN.shape, test_x_dfMorganADASYN.shape, test_y_dfMorganADASYN.shape)

(3050, 2048) (3050, 1) (409, 2048) (409, 1)
