## Dealing with imbalanced data
Use the NEK2 Binding data set as an example

In [4]:
import math
import torch
import numpy as np
import gpytorch
import pandas as pd
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import KFold

%matplotlib inline
%load_ext autoreload
%autoreload 2


In [6]:
# Binding
nekAll = ["2","3","5","9"]
#nekAll = ["3","5","9"]

for nek in nekAll:
    # Get training data
    data_path = "/p/lustre2/fan4/NEK_data/NEK"+nek+"/scaled_descriptors/"

    binding_df = pd.read_csv(data_path+"NEK"+nek+"_1_uM_min_50_pct_binding_with_moe_descriptors.csv") 
    print(binding_df.shape)

    print(binding_df.active.value_counts())
    num_gap = (binding_df.loc[binding_df['active']==0].shape[0]) - (binding_df.loc[binding_df['active']==1].shape[0])
    print(num_gap)
    num_minority = binding_df.loc[binding_df['active']==1].shape[0]
    print(num_minority)

    # Separate majority and minority classes
    df_majority = binding_df[binding_df['active']==0]
    df_minority = binding_df[binding_df['active']==1]

    #=======================
    # Create 5-fold splits
    #=======================
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    # majority
    for i, (_, v_ind) in enumerate(kf.split(df_majority)):
        df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"

    # minority
    for i, (_, v_ind) in enumerate(kf.split(df_minority)):
        df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


    print(df_majority['fold'].value_counts())
    print(df_minority['fold'].value_counts())


    # Concat
    all_fold_df = pd.concat([df_majority,df_minority])
    print(all_fold_df.shape)
    print(all_fold_df.active.value_counts())


    # Save to file
    split_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
    
    if not os.path.exists(split_path):
        os.makedirs(split_path)

    all_fold_df.to_csv(split_path+"/NEK"+nek+"_1_uM_min_50_pct_binding_5fold_random_imbalanced.csv", index=False)


(1408, 309)
active
0    1351
1      57
Name: count, dtype: int64
1294
57
fold
fold1    271
fold4    270
fold2    270
fold3    270
fold5    270
Name: count, dtype: int64
fold
fold1    12
fold2    12
fold5    11
fold4    11
fold3    11
Name: count, dtype: int64
(1408, 310)
active
0    1351
1      57
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


(1404, 309)
active
0    1323
1      81
Name: count, dtype: int64
1242
81
fold
fold1    265
fold2    265
fold3    265
fold4    264
fold5    264
Name: count, dtype: int64
fold
fold1    17
fold4    16
fold3    16
fold2    16
fold5    16
Name: count, dtype: int64
(1404, 310)
active
0    1323
1      81
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


(1237, 309)
active
0    1140
1      97
Name: count, dtype: int64
1043
97
fold
fold3    228
fold1    228
fold2    228
fold4    228
fold5    228
Name: count, dtype: int64
fold
fold1    20
fold2    20
fold3    19
fold5    19
fold4    19
Name: count, dtype: int64
(1237, 310)
active
0    1140
1      97
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


(1409, 309)
active
0    1348
1      61
Name: count, dtype: int64
1287
61
fold
fold1    270
fold2    270
fold3    270
fold4    269
fold5    269
Name: count, dtype: int64
fold
fold1    13
fold5    12
fold4    12
fold3    12
fold2    12
Name: count, dtype: int64
(1409, 310)
active
0    1348
1      61
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


In [7]:
# inhibition
source = "inhibition"
nekAll = ["2","9"]

for nek in nekAll:
    # Get training data
    data_path = "/p/lustre2/fan4/NEK_data/NEK"+nek+"/scaled_descriptors/"

    binding_df = pd.read_csv(data_path+"NEK"+nek+"_1_uM_min_50_pct_"+source+"_with_moe_descriptors.csv") 
    print(binding_df.shape)

    print(binding_df.active.value_counts())
    num_gap = (binding_df.loc[binding_df['active']==0].shape[0]) - (binding_df.loc[binding_df['active']==1].shape[0])
    print(num_gap)
    num_minority = binding_df.loc[binding_df['active']==1].shape[0]
    print(num_minority)

    # Separate majority and minority classes
    df_majority = binding_df[binding_df['active']==0]
    df_minority = binding_df[binding_df['active']==1]

    #=======================
    # Create 5-fold splits
    #=======================
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    # majority
    for i, (_, v_ind) in enumerate(kf.split(df_majority)):
        df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"

    # minority
    for i, (_, v_ind) in enumerate(kf.split(df_minority)):
        df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


    print(df_majority['fold'].value_counts())
    print(df_minority['fold'].value_counts())


    # Concat
    all_fold_df = pd.concat([df_majority,df_minority])
    print(all_fold_df.shape)
    print(all_fold_df.active.value_counts())


    # Save to file
    split_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
    all_fold_df.to_csv(split_path+"/NEK"+nek+"_1_uM_min_50_pct_"+source+"_5fold_random_imbalanced.csv", index=False)


(2044, 309)
active
0    1904
1     140
Name: count, dtype: int64
1764
140
fold
fold4    381
fold1    381
fold2    381
fold3    381
fold5    380
Name: count, dtype: int64
fold
fold4    28
fold3    28
fold1    28
fold5    28
fold2    28
Name: count, dtype: int64
(2044, 310)
active
0    1904
1     140
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


(393, 309)
active
0    351
1     42
Name: count, dtype: int64
309
42
fold
fold1    71
fold4    70
fold2    70
fold3    70
fold5    70
Name: count, dtype: int64
fold
fold2    9
fold1    9
fold5    8
fold4    8
fold3    8
Name: count, dtype: int64
(393, 310)
active
0    351
1     42
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[df_majority.index[v_ind], 'fold'] = f"fold{i+1}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_minority.loc[df_minority.index[v_ind], 'fold'] = f"fold{i+1}"


## 5-fold Split

In [8]:
# Binding
method = "binding"
nekAll = ["2","3","5","9"]
foldAll = ["fold1","fold2","fold3","fold4","fold5"]

for nek in nekAll:
    # Get training data
    split_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
    random_df = pd.read_csv(split_path+"/NEK"+nek+"_1_uM_min_50_pct_binding_5fold_random_imbalanced.csv")
    random_df.head

    moe_columns = random_df.columns[3:]
    moe_columns = moe_columns[:-1]
    moe_columns

    # Use the fold 0 as the test set, fold 1,2,3,4 as the train set
    for fold in foldAll:
        test_moe_df = random_df.loc[random_df['fold'] == fold]
        train_moe_df = random_df.loc[random_df['fold'] != fold]
        print(test_moe_df.shape)
        print(train_moe_df.shape)

        test_x_df = test_moe_df[moe_columns]
        test_y_df = test_moe_df['active']
        print(test_x_df)
        print(test_y_df.value_counts())

        train_x_df = train_moe_df[moe_columns]
        train_y_df = train_moe_df['active']
        print(train_x_df.shape)
        print(train_y_df.value_counts())


        # Save to file
        uq_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
        #uq_path = "/p/lustre2/fan4/myGPyTorch/classification_NEK/data/"

        train_x_df.to_csv(uq_path+"/NEK"+nek+"_binding_random_"+fold+"_trainX.csv", index=False)
        train_y_df.to_csv(uq_path+"/NEK"+nek+"_binding_random_"+fold+"_trainY.csv", index=False)
        test_x_df.to_csv(uq_path+"/NEK"+nek+"_binding_random_"+fold+"_testX.csv", index=False)
        test_y_df.to_csv(uq_path+"/NEK"+nek+"_binding_random_"+fold+"_testY.csv", index=False)





(283, 310)
(1125, 310)
      ASA+_per_atom       ASA-  ASA_H_per_atom       ASA_P  ASA_per_atom  \
1          6.511466  299.14301        8.385662  196.062190     11.950429   
4          9.173611  133.46625        9.677048  114.335650     12.685881   
5          8.296552  237.38503        9.749090  192.898510     14.825367   
8          9.887776  210.51236       13.607906   81.717201     15.877828   
9          7.022085  194.07881        7.802077  156.386930     10.452703   
...             ...        ...             ...         ...           ...   
1385       6.990230  193.16020        8.167764  135.191250     10.348269   
1386       9.424202  155.31209       13.011127   73.114189     15.935694   
1392       9.204168  170.90587       10.350260  128.671800     12.923696   
1406       7.843355  171.71243        8.605889  130.535580     11.023215   
1407       8.132017  268.25793        9.818810  181.742920     13.184420   

      BCUT_PEOE_0  BCUT_PEOE_1  BCUT_PEOE_2  BCUT_PEOE_3  \
1   

(283, 310)
(1126, 310)
      ASA+_per_atom       ASA-  ASA_H_per_atom      ASA_P  ASA_per_atom  \
1          8.863331  200.55731        8.744996  211.27695     13.146599   
5          5.710754  308.49966        8.336615  103.68244      9.665877   
8          7.641164  234.83014       10.412645  114.83887     13.022620   
9          7.728748  250.53065        8.216459  234.36145     13.424491   
14         7.777926  360.22958        9.405464  269.08743     14.210598   
...             ...        ...             ...        ...           ...   
1382       8.375013  138.32080        8.325641  141.62868     10.439502   
1383       7.279829  393.08694        9.612618  271.78192     14.839193   
1388       7.053889  166.09090        6.883618  178.69096      9.298361   
1407       8.538722  185.80338        8.776142  171.08333     11.535550   
1408       9.692602  179.73247        9.938689  168.41252     13.599830   

      BCUT_PEOE_0  BCUT_PEOE_1  BCUT_PEOE_2  BCUT_PEOE_3  \
1       -2.57943

In [10]:
# inhibition
method = "inhibition"
nekAll = ["2","9"]
foldAll = ["fold1","fold2","fold3","fold4","fold5"]

for nek in nekAll:
    # Get training data
    split_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
    random_df = pd.read_csv(split_path+"/NEK"+nek+"_1_uM_min_50_pct_"+method+"_5fold_random_imbalanced.csv")
    random_df.head

    moe_columns = random_df.columns[3:]
    moe_columns = moe_columns[:-1]
    moe_columns

    # Use the fold 0 as the test set, fold 1,2,3,4 as the train set
    for fold in foldAll:
        test_moe_df = random_df.loc[random_df['fold'] == fold]
        train_moe_df = random_df.loc[random_df['fold'] != fold]
        print(test_moe_df.shape)
        print(train_moe_df.shape)

        test_x_df = test_moe_df[moe_columns]
        test_y_df = test_moe_df['active']
        print(test_x_df)
        print(test_y_df.value_counts())

        train_x_df = train_moe_df[moe_columns]
        train_y_df = train_moe_df['active']
        print(train_x_df.shape)
        print(train_y_df.value_counts())


        # Save to file
        uq_path = "/p/lustre2/fan4/NEK_data/NEK_data_4Berkeley/NEK"+nek
        #uq_path = "/p/lustre2/fan4/myGPyTorch/classification_NEK/data/"

        train_x_df.to_csv(uq_path+"/NEK"+nek+"_"+method+"_random_"+fold+"_trainX.csv", index=False)
        train_y_df.to_csv(uq_path+"/NEK"+nek+"_"+method+"_random_"+fold+"_trainY.csv", index=False)
        test_x_df.to_csv(uq_path+"/NEK"+nek+"_"+method+"_random_"+fold+"_testX.csv", index=False)
        test_y_df.to_csv(uq_path+"/NEK"+nek+"_"+method+"_random_"+fold+"_testY.csv", index=False)





(409, 310)
(1635, 310)
      ASA+_per_atom       ASA-  ASA_H_per_atom      ASA_P  ASA_per_atom  \
1          7.934616  158.87631        7.420585  189.45982     10.865310   
4          7.463830  237.68829       10.271510  108.22626     12.574197   
5          8.182241  281.36035       11.570551  134.44797     14.558284   
6          9.252768  169.27931       10.768620  118.69076     13.735889   
9          8.561838  226.01562       10.099210  161.08264     13.601007   
...             ...        ...             ...        ...           ...   
2015       7.525463  138.65553        7.565813  140.20396      9.340547   
2026       8.021498  323.03540        8.850160  277.65530     13.477749   
2027       7.554388  255.61058        7.652859  252.85341     16.683338   
2035       7.552055  179.63304        8.890672  104.67045     10.759787   
2041       9.273098  136.72115        9.210555  140.78642     11.376500   

      BCUT_PEOE_0  BCUT_PEOE_1  BCUT_PEOE_2  BCUT_PEOE_3  \
1       -2.74318