In [1]:
import pandas as pd 
import numpy as np
import sklearn 
from sklearn.model_selection import StratifiedKFold
from rdkit import Chem
from rdkit.Chem import AllChem
from imblearn.over_sampling import SMOTEN, ADASYN, SMOTE 

In [2]:
neks = ['NEK2_binding', 'NEK2_inhibition', 'NEK3_binding', 'NEK5_binding','NEK9_binding','NEK9_inhibition']
samplings=['none_scaled', 'UNDER','SMOTE','ADASYN']
folds=['fold1','fold2','fold3','fold4','fold5'] 
feats=['MOE','MFP']

In [3]:
def create_folds(df, num): 
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num)
    for i, (train_index, test_index) in enumerate(skf.split(df, df['active'])):
        df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
    return df
     

In [4]:
def label_subsets(df, test_fold, label):
   """function to label 'train' or 'test' in the 'subset' column
   to be used to create train/test OR train/val
   fold_df: dataframe with column 'fold'
   test_fold (str): fold to make the test set (the remaining folds will be train)
   label (str): 'test' or 'valididation' 
   """ 
   df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
   return df[['NEK','compound_id', 'active', 'base_rdkit_smiles', 'subset']]


In [5]:
id_cols = ['NEK', 'compound_id', 'base_rdkit_smiles','subset', 'active'] 
def over_sampling(data_path=None,filename=None, df=None, sampling=None):
    """Oversample the datasetes using the SMOTE or ADASYN
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_(binding/inhibition)_(MOE/MFP)_none_scaled_df.csv'
    sampling (str): 'SMOTE' or 'ADASYN'
    returns: oversampled dataframe
    """
    if data_path is not None: 
        df = pd.read_csv(data_path+filename) # this is the already scaled ver
    
    feat_cols = list(set(list(df.columns))-set(id_cols))

    # train and test 
    train = df[df['subset']=='train'] 
    test =df[df['subset']=='test'] 
    nek = df['NEK'].iloc[0]
    # separate just id cols
    just_ids = ['NEK', 'compound_id', 'base_rdkit_smiles','subset']
    train_just_ids = train[just_ids]
    test_just_ids = test[just_ids]

    # just feats and 'active'
    trainX = train[feat_cols]
    testX = test[feat_cols]
    
    trainy = train['active']
    testy = test['active']
    
    if sampling == 'ADASYN':
        oversample = ADASYN(random_state=42)
    else: 
        oversample = SMOTE(random_state=42)

    
    trainX_temp, trainy_temp = oversample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    print(f'train after {sampling}: {trainX_temp.shape}')
    
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feat_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])

    num_real = len(train)
    num_synthetic = len(trainX_resamp)-num_real
    synthetic_ids = pd.DataFrame({'NEK': [nek] * num_synthetic,
        'compound_id': [f'synthetic_{sampling}_{i}' for i in range(num_synthetic)],
        'base_rdkit_smiles': [f'synthetic_{sampling}'] * num_synthetic,
        'subset': ['train']*num_synthetic}) # ,'active':[1]*num_synthetic}

    real_ids = train_just_ids.reset_index(drop=True)
    combined_ids = pd.concat([real_ids,synthetic_ids], ignore_index=True)
    
    train_resamp = pd.concat([combined_ids, trainX_resamp, trainy_resamp[['active']]], axis=1)

    print(train_resamp.columns[train_resamp.columns.duplicated()])
    test_df_final = pd.concat([test_just_ids.reset_index(drop=True),
                               testX.reset_index(drop=True), testy.reset_index(drop=True)],axis=1)
    
    final_df = pd.concat([train_resamp, test_df_final]).reset_index(drop=True)
    return final_df[list(df.columns)]

In [6]:

from imblearn.under_sampling import RandomUnderSampler
def under_sampling(data_path=None,filename=None, df=None): 
    if data_path is not None: 
        df = pd.read_csv(data_path+filename) # this is the already scaled ver
    feat_cols = list(set(list(df.columns))-set(id_cols))
    
    # train and test 
    train = df[df['subset']=='train'] 
    test =df[df['subset']=='test'] 

    # separate just id cols
    just_ids = ['NEK', 'compound_id', 'base_rdkit_smiles','subset']
    train_just_ids = train[just_ids]
    test_just_ids = test[just_ids]

    # just feats and 'active'
    trainX = train[feat_cols]
    testX = test[feat_cols]
    
    trainy = train['active']
    testy = test['active']
    
    undersample = RandomUnderSampler(random_state=42)
    
    trainX_temp, trainy_temp = undersample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feat_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    
    train_ids_resamp = train_just_ids.iloc[trainX_resamp.index].reset_index(drop=True)
    train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
    # train_resamp['subset'] = 'train'

    test_df_final = pd.concat([test_just_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
    # test_df_final['subset'] = 'test'
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df[list(df.columns)]
    

In [10]:
def featurize(feat_type,data_path=None, filename=None,moe_path=None, moe_file=None, moe_df=None, df=None,mfp_radius=2, nBits=2048): 
    if (feat_type == 'MOE') and (moe_path is not None) and (data_path is not None): 
        feat_df = create_moe(data_path, filename, moe_path, moe_file)
    elif (feat_type == 'MOE') and (df is not None) and (moe_df is not None): 
        feat_df = create_moe(df=df, moe_df=moe_df)
    elif (feat_type == 'MFP') and (data_path is not None): 
        feat_df = create_mfp(data_path, filename, mfp_radius, nBits)
    elif (feat_type == 'MFP') and (df is not None): 
        feat_df = create_mfp(df=df)
    
    return feat_df

def create_moe(data_path=None, filename=None, moe_path=None, moe_file=None, df=None, moe_df=None):
    """(intended use for already existing dataset)
    This function will use an existing dataframe with smiles column to
    get the features from an existing file (moe_path+moe_file) with the MOE features generated"""
    drop_cols = ['active', 'compound_id']
    id_cols = ['NEK', 'compound_id','base_rdkit_smiles','subset', 'active']
   
    if data_path is not None: 
        df = remove_duplicates(data_path, filename)
    df=df.drop(columns=drop_cols)
    if moe_path is not None: 
        moe_df=remove_duplicates(moe_path,moe_file)
    final_df=moe_df.merge(df, how='outer', on=['base_rdkit_smiles'], suffixes=('_moe_desc', '_og'))
    NEK_col = final_df['NEK_og'] 
    subset_col = final_df['subset_og']
    
    final_df = final_df.loc[:,~final_df.columns.str.endswith(('_moe_desc', '_og'))]
    final_df['NEK']=NEK_col
    final_df['subset']=subset_col
    
    feat_cols = set(list(final_df.columns))-set(id_cols)
    final_order_cols = list(id_cols)+list(feat_cols)
    final_df =final_df[final_order_cols] 
    if 'fold' in final_df.columns: 
        final_df=final_df.drop(columns=['fold']) 
    return final_df
def smiles_to_fps(smiles_list, radius=2, nBits=2048):
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            arr = np.zeros((1,), dtype=np.int8)
            Chem.DataStructs.ConvertToNumpyArray(fp, arr)
            fps.append(arr)
    return np.array(fps)

def create_mfp(file_path=None, filename=None, df=None,mfp_radius=2, nBits=2048):
    if file_path is not None: 
        df = pd.read_csv(file_path+filename)
    
    id_cols = ['NEK', 'compound_id','base_rdkit_smiles','subset', 'active'] 
    
   
    smiles = df['base_rdkit_smiles']
    mfp_feats = smiles_to_fps(smiles,mfp_radius,nBits)
    mfp_df = pd.DataFrame(mfp_feats)
    # if mfp_df['base_rdkit_smiles'].isnull().any():
    #     print("Warning: Missing values found in 'base_rdkit_smiles' column in df.")
    valid_smiles = smiles[smiles.apply(lambda x: Chem.MolFromSmiles(x) is not None)]
    
    if len(valid_smiles) != len(smiles):
        print(f"Warning: {len(smiles) - len(valid_smiles)} invalid SMILES strings found and excluded.")
    
    feat_cols = set(list(mfp_df.columns))-set(id_cols)
    final_order_cols = list(id_cols)+list(feat_cols)

    mfp_df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=True, inplace=True)
    final_df = pd.concat([df,mfp_df],axis=1)

    final_df = final_df[final_order_cols]
  

    return final_df

In [None]:
datapath='/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/paper/datasets/80train_20test/featurized/'
rng = np.random.default_rng(seed=42) # Create a Generator object with a seed 
numbers = rng.integers(low=0, high=1e6, size=10)  # Generate random numbers
print(numbers) # [ 89250 773956 654571 438878 433015 858597  85945 697368 201469  94177] 
for nek in neks: 
    for feat in ['MFP']: 
        for i, num in enumerate(numbers): # 5fold x10 
            split_df = pd.read_csv(f'{datapath}{nek}_{feat}_none_scaled.csv')
            train=split_df[split_df['subset']=='train'] 
            folded_train_df = create_folds(train,num) # 5 fold split (validation models) in this iteration 
            # 80% train data, split into 5 folds  
            for fold in folds: # then use these 5 folds for train/validation 
                kfold_df=label_subsets(folded_train_df, fold, 'test') 
                print(f'{nek} {feat} {fold} (it: {i})')
                if feat == 'MOE': 
                    featurized_df = featurize(feat_type='MOE',data_path=None, filename=None,moe_path=None, moe_file=None, moe_df=folded_train_df,df=kfold_df) 
         
                else: 
                    # feat_type,data_path=None,filename=None,moe_path=None,moe_file=None,moe_df=None,df=None,mfp_radius=2,nBits=2048
                    featurized_df = featurize(feat_type='MFP', df=kfold_df,mfp_radius=2, nBits=2048)
                    test=pd.DataFrame(featurized_df)
               
                smote_df = over_sampling(data_path=None,filename=None,df=featurized_df,sampling='SMOTE')
                adasyn_df = over_sampling(data_path=None,filename=None,df=featurized_df, sampling='ADASYN')
                under_df=under_sampling(data_path=None,filename=None,df=featurized_df)
                display(featurized_df) # none scaled 
                 
                

[ 89250 773956 654571 438878 433015 858597  85945 697368 201469  94177]
NEK2_binding MFP fold1 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1731, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1733, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1726, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1734, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1725, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold1 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1726, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1732, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1726, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1723, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1738, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold1 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1720, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1721, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1725, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1725, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1728, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP fold1 (it: 3)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1740, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1730, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1728, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1728, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1728, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP fold1 (it: 4)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1724, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1732, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1723, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1731, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1730, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP fold1 (it: 5)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1733, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1719, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1726, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1727, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1734, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP fold1 (it: 6)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1731, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1731, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1732, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1727, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1737, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold1 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1732, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1728, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1733, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1722, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1733, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP fold1 (it: 8)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1729, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1727, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1729, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1735, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1731, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold1 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1727, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold2 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1736, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold3 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1736, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold4 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1720, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_binding MFP fold5 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1728, 2048)
Index([], dtype='object')
train after ADASYN: (1724, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK2_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,NEK2_binding,kdb_29,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,NEK2_binding,kdb_2955,c1cncc(-c2ccc3nc(Nc4ccc(CN5CCOCC5)cc4)ncc3c2)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1122,NEK2_binding,kdb_3034,O=C(NC1CCNCC1)c1n[nH]cc1NC(=O)c1c(F)cccc1F,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1123,NEK2_binding,kdb_51,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 0)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2414, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2415, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2430, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 1)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2427, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2431, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2411, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2410, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 2)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2410, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2407, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2415, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 3)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2410, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2425, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2413, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 4)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2418, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2423, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2413, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 5)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2429, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2410, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2412, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2431, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2415, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 6)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2412, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2422, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2402, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2418, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2428, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 7)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2416, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2432, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2409, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2428, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 8)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2430, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2422, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2413, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2423, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2431, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_inhibition MFP fold1 (it: 9)
train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2420, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold2 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2422, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold3 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2420, 2048)
Index([], dtype='object')
train after ADASYN: (2427, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold4 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2421, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK2_inhibition MFP fold5 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (2422, 2048)
Index([], dtype='object')
train after ADASYN: (2421, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK2_inhibition,kdb_1000,Oc1ccc(-c2n[nH]c(Nc3cccc(Cl)c3)c2-c2ccc(O)cc2)cc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK2_inhibition,kdb_1001,Nc1nonc1-n1nnc(C(=O)NN=Cc2cccs2)c1-c1ccccc1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK2_inhibition,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK2_inhibition,kdb_1004,COc1ccc(C=C2C(=O)ON=C2c2ccc(Br)cc2)cc1OC,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK2_inhibition,kdb_1005,CN(C)c1cccc2c(S(=O)(=O)N(CCN)c3cncc(-c4ccc5cnc...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,NEK2_inhibition,kdb_902,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,NEK2_inhibition,kdb_911,CCN(CC)CCNC(=O)C=Cc1cnc(N)c2c(-c3ccc(NC(=O)Nc4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1622,NEK2_inhibition,kdb_927,Cc1cccc(NC(=O)Nc2ccc(-c3cnc4c(-c5cnn(C)c5)cnn4...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1623,NEK2_inhibition,kdb_944,O=C(NNC(=S)Nc1ccc(Cl)cc1)C(O)(c1ccccc1)c1ccccc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 0)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1681, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1685, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1682, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1688, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1691, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold1 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1677, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1698, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1692, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1694, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1696, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold1 (it: 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')



train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1677, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1691, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1686, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1688, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1703, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 3)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1673, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1689, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1700, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1680, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1702, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold1 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1678, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1696, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1686, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1702, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1682, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 5)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1697, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1673, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1690, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1698, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1684, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 6)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1685, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1699, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1685, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1706, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1704, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 7)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1688, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1681, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1706, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1696, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1691, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK3_binding MFP fold1 (it: 8)
train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1684, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1689, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1678, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1702, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1700, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold1 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1701, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold2 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1688, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold3 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1692, 2048)
Index([], dtype='object')
train after ADASYN: (1692, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold4 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1699, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK3_binding MFP fold5 (it: 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1694, 2048)
Index([], dtype='object')
train after ADASYN: (1704, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK3_binding,kdb_10,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK3_binding,kdb_100,C[C@@H](Oc1cc(C(=O)Nc2ccc(C(=O)N3CCN(C)CC3)cc2...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NEK3_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK3_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NEK3_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,NEK3_binding,kdb_3070,COc1cccc([C@@H](C)NC(=O)c2ccc(-c3ccnc(Nc4ccnn4...,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1118,NEK3_binding,kdb_3465,C[C@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccn1,test,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1119,NEK3_binding,kdb_3486,C[C@@H](c1cccc(C#N)c1)n1cc(Cl)cc(C(N)=O)c1=N,train,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1120,NEK3_binding,kdb_3507,N#Cc1ccc(Nc2ccc3[nH]c(=O)[nH]c3c2)c(Cl)c1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK5_binding MFP fold1 (it: 0)
train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1454, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1443, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1467, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1456, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1446, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold1 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1463, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1456, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1454, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1456, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1469, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold1 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1456, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1445, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1458, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1458, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1464, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold1 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1461, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1450, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1471, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1456, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1453, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK5_binding MFP fold1 (it: 4)
train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1451, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1469, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1452, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1465, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1454, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold1 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1449, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1462, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1460, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1451, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1465, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK5_binding MFP fold1 (it: 6)
train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1452, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,test,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold2 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1458, 2048)
Index([], dtype='object')
train after ADASYN: (1467, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold3 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1462, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,test,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold4 (it: 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


train after SMOTE: (1460, 2048)
Index([], dtype='object')
train after ADASYN: (1465, 2048)
Index([], dtype='object')


Unnamed: 0,NEK,compound_id,base_rdkit_smiles,subset,active,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,NEK5_binding,kdb_1003,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NEK5_binding,kdb_101,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,train,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NEK5_binding,kdb_102,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NEK5_binding,kdb_1048,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEK5_binding,kdb_105,OCCn1cc(-c2cnc3nnn(Cc4ccc5ncccc5c4)c3n2)cn1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,NEK5_binding,kdb_3054,O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1C(F...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,NEK5_binding,kdb_3422,CCCOc1ccc([C@H]2[C@H](C(=O)O)[C@@H](c3ccc4c(c3...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,NEK5_binding,kdb_3491,Cc1nc(Cc2cccc(C(F)(F)F)c2)sc1-c1ccc(C(N)=O)nc1,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,NEK5_binding,kdb_89,Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(...,train,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NEK5_binding MFP fold5 (it: 6)
