In [1]:
import pandas as pd 
import numpy as np
import sklearn 
from sklearn.model_selection import StratifiedKFold
from rdkit import Chem
from rdkit.Chem import AllChem
from imblearn.over_sampling import SMOTEN, ADASYN, SMOTE 

In [2]:
neks = ['NEK2_binding', 'NEK2_inhibition', 'NEK3_binding', 'NEK5_binding','NEK9_binding','NEK9_inhibition']
samplings=['none_scaled', 'UNDER','SMOTE','ADASYN']
folds=['fold1','fold2','fold3','fold4','fold5'] 
feats=['MOE','MFP']

In [3]:
def create_folds(df, num): 
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num)
    for i, (train_index, test_index) in enumerate(skf.split(df, df['active'])):
        df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
    return df
     

In [4]:
def label_subsets(df, test_fold, label):
   """function to label 'train' or 'test' in the 'subset' column
   to be used to create train/test OR train/val
   fold_df: dataframe with column 'fold'
   test_fold (str): fold to make the test set (the remaining folds will be train)
   label (str): 'test' or 'valididation' 
   """ 
   df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
   return df[['NEK','compound_id', 'active', 'base_rdkit_smiles', 'subset']]


In [26]:
id_cols = ['NEK', 'compound_id', 'base_rdkit_smiles','subset', 'active'] 
def over_sampling(data_path=None,filename=None, df=None, sampling=None, printOut=False):
    """Oversample the datasetes using the SMOTE or ADASYN
    Keeps the feature names and id cols
    file_name (full/absolute path): use the scaled dataframe we just created above 'NEK#_(binding/inhibition)_(MOE/MFP)_none_scaled_df.csv'
    sampling (str): 'SMOTE' or 'ADASYN'
    returns: oversampled dataframe
    """
    if data_path is not None: 
        df = pd.read_csv(data_path+filename) # this is the already scaled ver
    
    feat_cols = list(set(list(df.columns))-set(id_cols))

    # train and test 
    train = df[df['subset']=='train'] 
    test =df[df['subset']=='test'] 
    nek = df['NEK'].iloc[0]
    # separate just id cols
    just_ids = ['NEK', 'compound_id', 'base_rdkit_smiles','subset']
    train_just_ids = train[just_ids]
    test_just_ids = test[just_ids]

    # just feats and 'active'
    trainX = train[feat_cols]
    testX = test[feat_cols]
    
    trainy = train['active']
    testy = test['active']
    
    if sampling == 'ADASYN':
        oversample = ADASYN(random_state=42)
    else: 
        oversample = SMOTE(random_state=42)

    
    trainX_temp, trainy_temp = oversample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    if printOut: 
        print(f'train after {sampling}: {trainX_temp.shape}')
    
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feat_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])

    num_real = len(train)
    num_synthetic = len(trainX_resamp)-num_real
    synthetic_ids = pd.DataFrame({'NEK': [nek] * num_synthetic,
        'compound_id': [f'synthetic_{sampling}_{i}' for i in range(num_synthetic)],
        'base_rdkit_smiles': [f'synthetic_{sampling}'] * num_synthetic,
        'subset': ['train']*num_synthetic}) # ,'active':[1]*num_synthetic}

    real_ids = train_just_ids.reset_index(drop=True)
    combined_ids = pd.concat([real_ids,synthetic_ids], ignore_index=True)
    
    train_resamp = pd.concat([combined_ids, trainX_resamp, trainy_resamp[['active']]], axis=1)

    print(train_resamp.columns[train_resamp.columns.duplicated()])
    test_df_final = pd.concat([test_just_ids.reset_index(drop=True),
                               testX.reset_index(drop=True), testy.reset_index(drop=True)],axis=1)
    
    final_df = pd.concat([train_resamp, test_df_final]).reset_index(drop=True)
    return final_df[list(df.columns)]

In [6]:

from imblearn.under_sampling import RandomUnderSampler
def under_sampling(data_path=None,filename=None, df=None): 
    if data_path is not None: 
        df = pd.read_csv(data_path+filename) # this is the already scaled ver
    feat_cols = list(set(list(df.columns))-set(id_cols))
    
    # train and test 
    train = df[df['subset']=='train'] 
    test =df[df['subset']=='test'] 

    # separate just id cols
    just_ids = ['NEK', 'compound_id', 'base_rdkit_smiles','subset']
    train_just_ids = train[just_ids]
    test_just_ids = test[just_ids]

    # just feats and 'active'
    trainX = train[feat_cols]
    testX = test[feat_cols]
    
    trainy = train['active']
    testy = test['active']
    
    undersample = RandomUnderSampler(random_state=42)
    
    trainX_temp, trainy_temp = undersample.fit_resample(trainX.to_numpy(), trainy.to_numpy().reshape(-1))
    
    trainX_resamp = pd.DataFrame(trainX_temp, columns=feat_cols)
    trainy_resamp = pd.DataFrame(trainy_temp, columns=['active'])
    
    train_ids_resamp = train_just_ids.iloc[trainX_resamp.index].reset_index(drop=True)
    train_resamp= pd.concat([train_ids_resamp, trainX_resamp,trainy_resamp], axis=1)
    # train_resamp['subset'] = 'train'

    test_df_final = pd.concat([test_just_ids.reset_index(drop=True),testX.reset_index(drop=True),testy.reset_index(drop=True)],axis=1)
    # test_df_final['subset'] = 'test'
    final_df = pd.concat([train_resamp,test_df_final]).reset_index(drop=True)
    return final_df[list(df.columns)]
    

In [None]:
def featurize(feat_type,data_path=None, filename=None,moe_path=None, moe_file=None, moe_df=None, df=None,mfp_radius=2, nBits=2048): 
    if (feat_type == 'MOE') and (moe_path is not None) and (data_path is not None): 
        feat_df = create_moe(data_path, filename, moe_path, moe_file)
    elif (feat_type == 'MOE') and (df is not None) and (moe_df is not None): 
        feat_df = create_moe(df=df, moe_df=moe_df)
    elif (feat_type == 'MFP') and (data_path is not None): 
        feat_df = create_mfp(data_path, filename, mfp_radius, nBits)
    elif (feat_type == 'MFP') and (df is not None): 
        feat_df = create_mfp(df=df)
    
    return feat_df

def create_moe(data_path=None, filename=None, moe_path=None, moe_file=None, df=None, moe_df=None):
    """(intended use for already existing dataset)
    This function will use an existing dataframe with smiles column to
    get the features from an existing file (moe_path+moe_file) with the MOE features generated"""
    drop_cols = ['active', 'compound_id']
    id_cols = ['NEK', 'compound_id','base_rdkit_smiles','subset', 'active']
   
    if data_path is not None: 
        df = remove_duplicates(data_path, filename)
    df=df.drop(columns=drop_cols)
    if moe_path is not None: 
        moe_df=remove_duplicates(moe_path,moe_file)
    final_df=moe_df.merge(df, how='outer', on=['base_rdkit_smiles'], suffixes=('_moe_desc', '_og'))
    NEK_col = final_df['NEK_og'] 
    subset_col = final_df['subset_og']
    
    final_df = final_df.loc[:,~final_df.columns.str.endswith(('_moe_desc', '_og'))]
    final_df['NEK']=NEK_col
    final_df['subset']=subset_col
    
    feat_cols = set(list(final_df.columns))-set(id_cols)
    final_order_cols = list(id_cols)+list(feat_cols)
    final_df =final_df[final_order_cols] 
    if 'fold' in final_df.columns: 
        final_df=final_df.drop(columns=['fold']) 
    return final_df
def smiles_to_fps(smiles_list, radius=2, nBits=2048):
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            arr = np.zeros((1,), dtype=np.int8)
            Chem.DataStructs.ConvertToNumpyArray(fp, arr)
            fps.append(arr)
    return np.array(fps)

def create_mfp(file_path=None, filename=None, df=None,mfp_radius=2, nBits=2048):
    if file_path is not None: 
        df = pd.read_csv(file_path+filename)
    
    id_cols = ['NEK', 'compound_id','base_rdkit_smiles','subset', 'active'] 
    
   
    smiles = df['base_rdkit_smiles']
    mfp_feats = smiles_to_fps(smiles,mfp_radius,nBits)
    mfp_df = pd.DataFrame(mfp_feats)
    # if mfp_df['base_rdkit_smiles'].isnull().any():
    #     print("Warning: Missing values found in 'base_rdkit_smiles' column in df.")
    valid_smiles = smiles[smiles.apply(lambda x: Chem.MolFromSmiles(x) is not None)]
    
    feat_cols = set(list(mfp_df.columns))-set(id_cols)
    final_order_cols = list(id_cols)+list(feat_cols)

    mfp_df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=True, inplace=True)
    final_df = pd.concat([df,mfp_df],axis=1)

    final_df = final_df[final_order_cols]
  

    return final_df

In [None]:
def get_arrays(file_path=None, root_name=None, df=None,nonfeat_cols=None): 
    if file_path is not None: 
        df=pd.read_csv(f'{file_path}{root_name}.csv')
    train=df[df['subset']=='train']
    test=df[df['subset']=='test']
    train_y = train['active'].to_numpy().reshape(-1) 
    test_y =test['active'].to_numpy().reshape(-1) 
    trainX = train.drop(columns= nonfeat_cols) 
    testX = test.drop(columns= nonfeat_cols) 
    return trainX, train_y, testX, test_y

In [25]:
datapath='/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/paper/datasets/80train_20test/featurized/'
rng = np.random.default_rng(seed=42) # Create a Generator object with a seed 
numbers = rng.integers(low=0, high=1e6, size=10)  # Generate random numbers
# print(numbers) # [ 89250 773956 654571 438878 433015 858597  85945 697368 201469  94177] 
for nek in neks: 
    for feat in ['MOE','MFP']: 
        for i, num in enumerate(numbers): # 5fold x10 
            split_df = pd.read_csv(f'{datapath}{nek}_{feat}_none_scaled.csv')
            train=split_df[split_df['subset']=='train'] 
            folded_train_df = create_folds(train,num) # 5 fold split (validation models) in this iteration 
            # 80% train data, split into 5 folds  
            for fold in folds: # then use these 5 folds for train/validation 
                kfold_df=label_subsets(folded_train_df, fold, 'test') 
                
                if feat == 'MOE': 
                    featurized_df = featurize(feat_type='MOE',data_path=None, filename=None,moe_path=None, moe_file=None, moe_df=folded_train_df,df=kfold_df) 
         
                else: 
                    # feat_type,data_path=None,filename=None,moe_path=None,moe_file=None,moe_df=None,df=None,mfp_radius=2,nBits=2048
                    featurized_df = featurize(feat_type='MFP', df=kfold_df,mfp_radius=2, nBits=2048)

                for samp in ["none_scaled",'UNDER', 'SMOTE', 'ADASYN']:
                    if samp == 'UNDER': 
                        sampled_df = under_sampling(data_path=None,filename=None,df=featurized_df) 
                        # sampled_df.to_csv(f'{filepath}{nek}_{feat}_{samp}.csv',index=False)
                        
                    elif samp == "SMOTE" or samp == "ADASYN": 
                        sampled_df=over_sampling(data_path=None,filename=None,df=featurized_df, sampling=samp) 
                        # sampled_df.to_csv(f'{filepath}{nek}_{feat}_{samp}.csv',index=False) 
                    elif samp == 'none_scaled': 
                        sampled_df = featurized_df 
                        
                    root_name = f'{nek}_{feat}_{samp}'
                    print(f'{nek} {feat} {samp} {fold} (it: {i})')
                    id_cols = ['NEK', 'compound_id','base_rdkit_smiles','subset', 'active'] 
                    trainX, train_y, testX, test_y=get_arrays(file_path=None, root_name=None, df=sampled_df,nonfeat_cols=id_cols)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else '

NEK2_binding MOE none_scaled fold1 (it: 0)
NEK2_binding MOE UNDER fold1 (it: 0)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold1 (it: 0)
train after ADASYN: (1731, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold1 (it: 0)
NEK2_binding MOE none_scaled fold2 (it: 0)
NEK2_binding MOE UNDER fold2 (it: 0)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold2 (it: 0)
train after ADASYN: (1731, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold2 (it: 0)
NEK2_binding MOE none_scaled fold3 (it: 0)
NEK2_binding MOE UNDER fold3 (it: 0)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 0)
train after ADASYN: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 0)
NEK2_binding MOE none_scaled fold4 (it: 0)
NEK2_binding MOE UNDER fold4 (it: 0)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 0)
train after ADAS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: '

train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold2 (it: 2)
train after ADASYN: (1736, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold2 (it: 2)
NEK2_binding MOE none_scaled fold3 (it: 2)
NEK2_binding MOE UNDER fold3 (it: 2)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 2)
train after ADASYN: (1730, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 2)
NEK2_binding MOE none_scaled fold4 (it: 2)
NEK2_binding MOE UNDER fold4 (it: 2)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 2)
train after ADASYN: (1723, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 2)
NEK2_binding MOE none_scaled fold5 (it: 2)
NEK2_binding MOE UNDER fold5 (it: 2)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 2)
train after ADASYN: (1726, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: '

NEK2_binding MOE UNDER fold2 (it: 3)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold2 (it: 3)
train after ADASYN: (1732, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold2 (it: 3)
NEK2_binding MOE none_scaled fold3 (it: 3)
NEK2_binding MOE UNDER fold3 (it: 3)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 3)
train after ADASYN: (1725, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 3)
NEK2_binding MOE none_scaled fold4 (it: 3)
NEK2_binding MOE UNDER fold4 (it: 3)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 3)
train after ADASYN: (1736, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 3)
NEK2_binding MOE none_scaled fold5 (it: 3)
NEK2_binding MOE UNDER fold5 (it: 3)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 3)
train after ADASYN: (1728, 306)
Index([], dtype='object')
N

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: '

NEK2_binding MOE none_scaled fold3 (it: 4)
NEK2_binding MOE UNDER fold3 (it: 4)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 4)
train after ADASYN: (1725, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 4)
NEK2_binding MOE none_scaled fold4 (it: 4)
NEK2_binding MOE UNDER fold4 (it: 4)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 4)
train after ADASYN: (1733, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 4)
NEK2_binding MOE none_scaled fold5 (it: 4)
NEK2_binding MOE UNDER fold5 (it: 4)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 4)
train after ADASYN: (1722, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 4)
NEK2_binding MOE none_scaled fold1 (it: 5)
NEK2_binding MOE UNDER fold1 (it: 5)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold1 (it: 5)
train after ADAS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fol

NEK2_binding MOE ADASYN fold2 (it: 5)
NEK2_binding MOE none_scaled fold3 (it: 5)
NEK2_binding MOE UNDER fold3 (it: 5)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 5)
train after ADASYN: (1733, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 5)
NEK2_binding MOE none_scaled fold4 (it: 5)
NEK2_binding MOE UNDER fold4 (it: 5)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 5)
train after ADASYN: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 5)
NEK2_binding MOE none_scaled fold5 (it: 5)
NEK2_binding MOE UNDER fold5 (it: 5)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 5)
train after ADASYN: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 5)
NEK2_binding MOE none_scaled fold1 (it: 6)
NEK2_binding MOE UNDER fold1 (it: 6)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MO

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fol

train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 6)
train after ADASYN: (1733, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 6)
NEK2_binding MOE none_scaled fold4 (it: 6)
NEK2_binding MOE UNDER fold4 (it: 6)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 6)
train after ADASYN: (1721, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 6)
NEK2_binding MOE none_scaled fold5 (it: 6)
NEK2_binding MOE UNDER fold5 (it: 6)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 6)
train after ADASYN: (1724, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 6)
NEK2_binding MOE none_scaled fold1 (it: 7)
NEK2_binding MOE UNDER fold1 (it: 7)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold1 (it: 7)
train after ADASYN: (1722, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold1 (it: 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else '

train after ADASYN: (1732, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold3 (it: 7)
NEK2_binding MOE none_scaled fold4 (it: 7)
NEK2_binding MOE UNDER fold4 (it: 7)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold4 (it: 7)
train after ADASYN: (1723, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold4 (it: 7)
NEK2_binding MOE none_scaled fold5 (it: 7)
NEK2_binding MOE UNDER fold5 (it: 7)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 7)
train after ADASYN: (1737, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 7)
NEK2_binding MOE none_scaled fold1 (it: 8)
NEK2_binding MOE UNDER fold1 (it: 8)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold1 (it: 8)
train after ADASYN: (1732, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold1 (it: 8)
NEK2_binding MOE none_scaled fold2 (it: 8)
NEK2_binding MOE UNDER fold2 (it: 8)
train after SM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else '

NEK2_binding MOE none_scaled fold5 (it: 8)
NEK2_binding MOE UNDER fold5 (it: 8)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 8)
train after ADASYN: (1726, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 8)
NEK2_binding MOE none_scaled fold1 (it: 9)
NEK2_binding MOE UNDER fold1 (it: 9)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold1 (it: 9)
train after ADASYN: (1720, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold1 (it: 9)
NEK2_binding MOE none_scaled fold2 (it: 9)
NEK2_binding MOE UNDER fold2 (it: 9)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold2 (it: 9)
train after ADASYN: (1724, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold2 (it: 9)
NEK2_binding MOE none_scaled fold3 (it: 9)
NEK2_binding MOE UNDER fold3 (it: 9)
train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold3 (it: 9)
train after ADAS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: '

train after SMOTE: (1728, 306)
Index([], dtype='object')
NEK2_binding MOE SMOTE fold5 (it: 9)
train after ADASYN: (1723, 306)
Index([], dtype='object')
NEK2_binding MOE ADASYN fold5 (it: 9)
NEK2_binding MFP none_scaled fold1 (it: 0)
NEK2_binding MFP UNDER fold1 (it: 0)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold1 (it: 0)
train after ADASYN: (1731, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold1 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold2 (it: 0)
NEK2_binding MFP UNDER fold2 (it: 0)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold2 (it: 0)
train after ADASYN: (1733, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold2 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold3 (it: 0)
NEK2_binding MFP UNDER fold3 (it: 0)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold3 (it: 0)
train after ADASYN: (1726, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold3 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold4 (it: 0)
NEK2_binding MFP UNDER fold4 (it: 0)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold4 (it: 0)
train after ADASYN: (1734, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold4 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold5 (it: 0)
NEK2_binding MFP UNDER fold5 (it: 0)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold5 (it: 0)
train after ADASYN: (1725, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold5 (it: 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold1 (it: 1)
NEK2_binding MFP UNDER fold1 (it: 1)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold1 (it: 1)
train after ADASYN: (1726, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold1 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold2 (it: 1)
NEK2_binding MFP UNDER fold2 (it: 1)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold2 (it: 1)
train after ADASYN: (1732, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold2 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold3 (it: 1)
NEK2_binding MFP UNDER fold3 (it: 1)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold3 (it: 1)
train after ADASYN: (1726, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold3 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold4 (it: 1)
NEK2_binding MFP UNDER fold4 (it: 1)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold4 (it: 1)
train after ADASYN: (1723, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold4 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold5 (it: 1)
NEK2_binding MFP UNDER fold5 (it: 1)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold5 (it: 1)
train after ADASYN: (1738, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold5 (it: 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold1 (it: 2)
NEK2_binding MFP UNDER fold1 (it: 2)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold1 (it: 2)
train after ADASYN: (1720, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold1 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold2 (it: 2)
NEK2_binding MFP UNDER fold2 (it: 2)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold2 (it: 2)
train after ADASYN: (1721, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold2 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold3 (it: 2)
NEK2_binding MFP UNDER fold3 (it: 2)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold3 (it: 2)
train after ADASYN: (1725, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold3 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold4 (it: 2)
NEK2_binding MFP UNDER fold4 (it: 2)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold4 (it: 2)
train after ADASYN: (1725, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold4 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold5 (it: 2)
NEK2_binding MFP UNDER fold5 (it: 2)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold5 (it: 2)
train after ADASYN: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold5 (it: 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.index[test_index],'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


NEK2_binding MFP none_scaled fold1 (it: 3)
NEK2_binding MFP UNDER fold1 (it: 3)
train after SMOTE: (1728, 2048)
Index([], dtype='object')
NEK2_binding MFP SMOTE fold1 (it: 3)
train after ADASYN: (1740, 2048)
Index([], dtype='object')
NEK2_binding MFP ADASYN fold1 (it: 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subset'] = df['fold'].apply(lambda x: 'test' if x == test_fold else 'train')


KeyboardInterrupt: 