In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
target_list = ['erbB4', 'egfr', 'met', 'alk', 'erbB2', 'ret', 'ros1']

def splitData(df, split_size, label_column):
    train, test = train_test_split(df, test_size=split_size, shuffle=True, stratify= df.loc[:,label_column])
    return train, test

def count_taskDataset(df, target_list):
    tk_df = pd.DataFrame()
    result = pd.DataFrame(index=[0], columns=target_list)
    result['total'] = df.shape[0]
    print('total data: '+str(df.shape[0]))
    for tar in target_list:
        tar_df = df.dropna(subset=['pIC50_'+tar])
        tar_df = tar_df[['SMILES_NS','pIC50_'+tar]]
        tar_df.rename(columns={'pIC50_'+tar:'pIC50'},inplace=True)
        tar_df['target'] = tar
        tk_df = pd.concat([tk_df, tar_df],axis=0)
        result[tar] = tar_df.shape[0]
        print(tar+' data: '+str(tar_df.shape[0]))
    tk_df.sort_index(inplace=True)
    return result, tk_df

def defineLabel(df, target_list):
    count_dict = {'erbB4':0, 'egfr':0, 'met':0, 'alk':0, 'erbB2':0, 'ret':0, 'ros1':0}
    co = [0]*7
    for i, tar in(enumerate(target_list)):
        for v in range(len(df['SMILES_NS'])):
            if np.isnan(df['pIC50_'+tar][v]) == False:
                count_dict[tar] += 1
    target_order = sorted(count_dict, key=count_dict.get)
    co = [0]*7
    for i, tar in(enumerate(target_order)):
        for v in range(len(df['SMILES_NS'])):
            if np.isnan(df['pIC50_'+tar][v]) == False:
                co[i] += 1
    Label = []
    for index, row in df.iterrows():
        for i, tar in(enumerate(target_order)):
            if np.isnan(row['pIC50_'+tar]) == False:
                Label.append(i+1)
                break
    return Label

In [3]:
raw_df = pd.read_csv('./7TKs_ic50_chembl+bindingdb.csv', index_col=0)
print(raw_df.shape)
_,tk = count_taskDataset(raw_df, target_list)
tk.reset_index(drop=True, inplace=True)

Label = defineLabel(raw_df, target_list)
raw_df['Label'] = Label

(16345, 8)
total data: 16345
erbB4 data: 196
egfr data: 7427
met data: 3618
alk data: 1871
erbB2 data: 2313
ret data: 2985
ros1 data: 151


In [70]:
pretint_bAC = pd.read_csv('./7TK_train-intest.csv',index_col=0)
for t,tar in enumerate(target_list):
    tar_tint = pretint_bAC[pretint_bAC.target == tar]
    tar_tint.rename(columns={'pIC50':'pIC50_'+tar},inplace=True)
    tar_tint.drop(columns=['target'], inplace=True)
    if t == 0:
        pretint = tar_tint
    else:
        pretint = pd.merge(pretint, tar_tint, on=['SMILES_NS'], how='outer')
print('Before AC analysis',pretint.shape)
print('After AC analysis')
AC_df = pd.read_csv('./train-intest_fromAC.csv', index_col=0)
_,AC_tk = count_taskDataset(AC_df, target_list)

Before AC analysis (14710, 8)
After AC analysis
total data: 11116
erbB4 data: 163
egfr data: 4426
met data: 2430
alk data: 1313
erbB2 data: 1717
ret data: 2332
ros1 data: 131


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [76]:
print('Train-Intest')
tint = pd.merge(AC_df, raw_df[['SMILES_NS','Label']])
Label = defineLabel(tint, target_list)
tint['Label'] = Label
print(tint.shape)
tint.to_csv('./train-intest.csv')

print('External-test')
extest = raw_df[~(raw_df.SMILES_NS.isin(pretint.SMILES_NS))].reset_index(drop=True)
extest.reset_index(drop=True, inplace=True)
print(extest.shape)
extest.to_csv('./external-test.csv')

intest, train = splitData(tint, 0.9, 'Label')
print('Internal-test')
intest.reset_index(drop=True, inplace=True)
print(intest.shape)
intest.to_csv('./internal-test.csv')

print('Training')
train.reset_index(drop=True, inplace=True)
print(train.shape)
train.to_csv('./train.csv')

Train-Intest
(11116, 9)
External-test
(1635, 9)
Internal-test
(1111, 9)
Training
(10005, 9)


In [6]:
n_round = 10
skf = StratifiedKFold(n_splits=n_round, random_state=42, shuffle=True)
LAB = train.loc[:,'Label']
for n, (train_index, valid_index) in zip(range(n_round), skf.split(train, LAB)):
    print('Dataset fold '+str(n))
    print('Train')
    cv_train = train.loc[train_index,:]
    cv_train.reset_index(drop=True, inplace=True)
    print(cv_train.shape)
    cv_train.to_csv('./cv/train_kfold-'+str(n)+'.csv')
    print('Valid')
    cv_valid = train.loc[valid_index,:]
    cv_valid.reset_index(drop=True, inplace=True)
    print(cv_valid.shape)
    cv_valid.to_csv('./cv/valid_kfold-'+str(n)+'.csv')

Dataset fold 0
Train
(9004, 9)
Valid
(1001, 9)
Dataset fold 1
Train
(9004, 9)
Valid
(1001, 9)
Dataset fold 2
Train
(9004, 9)
Valid
(1001, 9)
Dataset fold 3
Train
(9004, 9)
Valid
(1001, 9)
Dataset fold 4
Train
(9004, 9)
Valid
(1001, 9)
Dataset fold 5
Train
(9005, 9)
Valid
(1000, 9)
Dataset fold 6
Train
(9005, 9)
Valid
(1000, 9)
Dataset fold 7
Train
(9005, 9)
Valid
(1000, 9)
Dataset fold 8
Train
(9005, 9)
Valid
(1000, 9)
Dataset fold 9
Train
(9005, 9)
Valid
(1000, 9)
