In [1]:
import math
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import os
import pickle
import matplotlib 
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import KFold

%matplotlib inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import imblearn as imb

from sklearn.metrics import confusion_matrix
import itertools

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, recall_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

import sys
sys.path.append('../../')
# import utils
from sklearn.model_selection import GridSearchCV

from split_data import *
# from RF_Utils import *
# from RF_GSCV import *
from RF_atomver import *

In [2]:
def get_arrays(file_path, df_filename, filename_type=None, save=False):
    """use dataframes to get trainX, trainy, testX, testy out. Optional: save those files to csv
    file_path: directory
    df_filename: dataframe NEK#_binding_moe_{sampling}_df.csv (sampling: scaled, UNDER, SMOTE, ADASYN)
    split dataframe to train and test, and x and y
    save: bool, option to save splits to separate csv files (train X, train y, test X, test y) 
    returns: numpy arrays train X, train y, testX, test y"""
    df = pd.read_csv(file_path+df_filename)
    train_df= df[df['subset']=='train']
    test_df = df[df['subset']=='test']
    train_y = train_df['active'].to_numpy().reshape(-1)
    test_y=test_df['active'].to_numpy().reshape(-1)
    train_x_df = train_df.drop(columns='active')

  
    test_x_df = test_df.drop(columns='active')
    
    train_x_df = train_df.drop(columns='active')
    test_x_df = test_df.drop(columns='active')
    trainX = train_x_df.select_dtypes(include='number').to_numpy()
    testX = test_x_df.select_dtypes(include='number').to_numpy()
    
    print(f'train X shape: {trainX.shape}, y: {train_y.shape}, test X: {testX.shape}, y:{test_y.shape}')
    if (save and filename_type is not None): 
        trainxdf = pd.DataFrame(trainX)
        trainxdf.to_csv(file_path+filename_type+'_trainX.csv', index=False)
        # train_x_df.to_csv(filename_type+'_trainX.csv', index=False)
        trainy_df = pd.DataFrame(train_y)
        trainy_df.to_csv(file_path+filename_type+'_train_y.csv', index=False) 
        # test_x_df.to_csv(filename_type+'_testX.csv', index=False)
        testxdf = pd.DataFrame(testX)
        testxdf.to_csv(file_path+filename_type+'_testX.csv', index=False)
        testy_df = pd.DataFrame(test_y)
        testy_df.to_csv(file_path+filename_type+'_test_y.csv', index=False) 
        
    return trainX, train_y, testX, test_y

In [3]:
def add_cm(df): 
    # print(filepath)
    # print(filename)
    # df = pd.read_csv(filepath+filename)
    
    true_labels = df['y'] 
    predictions = df['prediction']
    cm = confusion_matrix(true_labels, predictions )
    cm_flattened = cm.flatten().tolist()
    df['cm'] = [cm_flattened]* len(df)
    df['prediction_type'] = df.apply(lambda x: prediction_type(x['y'], x['prediction']), axis=1)
    return df
        

# Default RF 

In [4]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
features = ['moe', 'mfp']
NEK= 'NEK'

for i, n in enumerate(nek_nums):
    nek = str(n)
    model_type = 'RF'
    nek_path= f'{data_dir}NEK{nek}/bind/'
 
    for k, feat in enumerate(features): 
        print()
        for j, samp in enumerate(samplings): 
            
            print(f'NEK{nek} {feat} {samp}')
            file_root = f'NEK{nek}_binding_{feat}_{samp}_df.csv'
 
            trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
            model = rf_models(trainX, trainy, testX, testy, model_type, {})
            train_df = gather_rf_results(model, trainX, trainy)
            test_df = gather_rf_results(model, testX, testy)
            train_df['subset'] = 'train' 
            test_df['subset'] = 'test' 
            model_name = f'NEK{nek}_binding_{feat}_{samp}_{model_type}' 
            train_df['model'] = model_name
            
            test_df['model'] = model_name
            train_df = add_cm(train_df)
            test_df = add_cm(test_df)
            with open(f'{model_name}.pkl', 'wb') as f: 
                pickle.dump(model,f)
            train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
            test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 
            
            
            print()    
    # print()
    if n == 2 or n == 9:

        nek_path= f'{data_dir}NEK{nek}/inhib/'
        for k, feat in enumerate(features): 
            print()
            for j, samp in enumerate(samplings): 
                file_root = f'NEK{nek}_inhibition_{feat}_{samp}_df.csv'
                print(f'NEK{nek} {feat} {samp}')
                trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
                model = rf_models(trainX, trainy, testX, testy, {model_type}, {})
                train_df = gather_rf_results(model, trainX, trainy)
                test_df = gather_rf_results(model, testX, testy)
                train_df['subset'] = 'train' 
                test_df['subset'] = 'test' 
                model_name = f'NEK{nek}_inhibition_{feat}_{samp}_{model_type}' 
                train_df['model'] = model_name
                test_df['model'] = model_name
                train_df = add_cm(train_df)
                test_df = add_cm(test_df)
                with open(f'{model_name}.pkl', 'wb') as f: 
                    pickle.dump(model, f) 
                train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
                test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 

                print()
      
    print()


NEK2 moe scaled
train X shape: (1125, 306), y: (1125,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.901, precision: 0.100, recall: 0.167, specificity: 0.934

NEK2 moe UNDER
train X shape: (90, 306), y: (90,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.633, precision: 0.058, recall: 0.500, specificity: 0.638

NEK2 moe SMOTE
train X shape: (2160, 306), y: (2160,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.940, precision: 0.273, recall: 0.250, specificity: 0.970

NEK2 moe ADASYN
train X shape: (2158, 306), y: (2158,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.915, precision: 0.125, recall: 0.167, specificity: 0.948


NEK2 mfp scaled
train X shape: (1125, 2048), y: (1125,),

# RF + BCW 

In [5]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
features = ['moe', 'mfp']
NEK= 'NEK'

for i, n in enumerate(nek_nums):
    nek = str(n)
    model_type = 'RF_BCW'
    nek_path= f'{data_dir}NEK{nek}/bind/'
 
    for k, feat in enumerate(features): 
        print()
        for j, samp in enumerate(samplings): 
            
            print(f'NEK{nek} {feat} {samp}')
            file_root = f'NEK{nek}_binding_{feat}_{samp}_df.csv'
 
            trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
            model = rf_models(trainX, trainy, testX, testy, model_type, {})
            train_df = gather_rf_results(model, trainX, trainy)
            test_df = gather_rf_results(model, testX, testy)
            train_df['subset'] = 'train' 
            test_df['subset'] = 'test' 
            model_name = f'NEK{nek}_binding_{feat}_{samp}_{model_type}' 
            train_df['model'] = model_name
            
            test_df['model'] = model_name
            train_df = add_cm(train_df)
            test_df = add_cm(test_df)
            with open(f'{model_name}.pkl', 'wb') as f: 
                pickle.dump(model,f)
            train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
            test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 
            
            
            print()    
    # print()
    if n == 2 or n == 9:

        nek_path= f'{data_dir}NEK{nek}/inhib/'
        for k, feat in enumerate(features): 
            print()
            for j, samp in enumerate(samplings): 
                file_root = f'NEK{nek}_inhibition_{feat}_{samp}_df.csv'
                print(f'NEK{nek} {feat} {samp}')
                trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
                model = rf_models(trainX, trainy, testX, testy, {model_type}, {})
                train_df = gather_rf_results(model, trainX, trainy)
                test_df = gather_rf_results(model, testX, testy)
                train_df['subset'] = 'train' 
                test_df['subset'] = 'test' 
                model_name = f'NEK{nek}_inhibition_{feat}_{samp}_{model_type}' 
                train_df['model'] = model_name
                test_df['model'] = model_name
                train_df = add_cm(train_df)
                test_df = add_cm(test_df)
                with open(f'{model_name}.pkl', 'wb') as f: 
                    pickle.dump(model, f) 
                train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
                test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 

                print()
      
    print()


NEK2 moe scaled
train X shape: (1125, 306), y: (1125,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.940, precision: 0.143, recall: 0.083, specificity: 0.978

NEK2 moe UNDER
train X shape: (90, 306), y: (90,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.562, precision: 0.069, recall: 0.750, specificity: 0.554

NEK2 moe SMOTE
train X shape: (2160, 306), y: (2160,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.936, precision: 0.250, recall: 0.250, specificity: 0.967

NEK2 moe ADASYN
train X shape: (2158, 306), y: (2158,), test X: (283, 306), y:(283,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.912, precision: 0.158, recall: 0.250, specificity: 0.941


NEK2 mfp scaled
train X shape: (1125, 2048), y: (1125,),

# BRFC

In [6]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
features = ['moe', 'mfp']
NEK= 'NEK'

for i, n in enumerate(nek_nums):
    nek = str(n)
    model_type = 'BRFC'
    nek_path= f'{data_dir}NEK{nek}/bind/'
 
    for k, feat in enumerate(features): 
        print()
        for j, samp in enumerate(samplings): 
            
            print(f'NEK{nek} {feat} {samp}')
            file_root = f'NEK{nek}_binding_{feat}_{samp}_df.csv'
 
            trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
            model = rf_models(trainX, trainy, testX, testy, model_type, {})
            train_df = gather_rf_results(model, trainX, trainy)
            test_df = gather_rf_results(model, testX, testy)
            train_df['subset'] = 'train' 
            test_df['subset'] = 'test' 
            model_name = f'NEK{nek}_binding_{feat}_{samp}_{model_type}' 
            train_df['model'] = model_name
            
            test_df['model'] = model_name
            train_df = add_cm(train_df)
            test_df = add_cm(test_df)
            with open(f'{model_name}.pkl', 'wb') as f: 
                pickle.dump(model,f)
            train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
            test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 
            
            
            print()    
    # print()
    if n == 2 or n == 9:

        nek_path= f'{data_dir}NEK{nek}/inhib/'
        for k, feat in enumerate(features): 
            print()
            for j, samp in enumerate(samplings): 
                file_root = f'NEK{nek}_inhibition_{feat}_{samp}_df.csv'
                print(f'NEK{nek} {feat} {samp}')
                trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
                model = rf_models(trainX, trainy, testX, testy, {model_type}, {})
                train_df = gather_rf_results(model, trainX, trainy)
                test_df = gather_rf_results(model, testX, testy)
                train_df['subset'] = 'train' 
                test_df['subset'] = 'test' 
                model_name = f'NEK{nek}_inhibition_{feat}_{samp}_{model_type}' 
                train_df['model'] = model_name
                test_df['model'] = model_name
                train_df = add_cm(train_df)
                test_df = add_cm(test_df)
                with open(f'{model_name}.pkl', 'wb') as f: 
                    pickle.dump(model, f) 
                train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
                test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 

                print()
      
    print()


NEK2 moe scaled
train X shape: (1125, 306), y: (1125,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.751, precision: 0.138, recall: 1.000, specificity: 0.741
TEST: accuracy: 0.721, precision: 0.076, recall: 0.500, specificity: 0.731

NEK2 moe UNDER
train X shape: (90, 306), y: (90,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.551, precision: 0.068, recall: 0.750, specificity: 0.542

NEK2 moe SMOTE
train X shape: (2160, 306), y: (2160,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.933, precision: 0.231, recall: 0.250, specificity: 0.963

NEK2 moe ADASYN
train X shape: (2158, 306), y: (2158,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.912, precision: 0.118, recall: 0.167, specificity: 0.945


NEK2 mfp scaled
train X shape: (1125, 2048), y: (1125,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.821, precision: 0.183, recall: 1.000, specificity: 0.814
TEST: accuracy: 0.820, precision: 0.145, recall: 0.667, specificity: 0.827

NEK2 mfp UNDER
train X shape: (90, 2048), y: (90,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.594, precision: 0.052, recall: 0.500, specificity: 0.598

NEK2 mfp SMOTE
train X shape: (2160, 2048), y: (2160,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 0.999, specificity: 1.000
TEST: accuracy: 0.926, precision: 0.235, recall: 0.333, specificity: 0.952

NEK2 mfp ADASYN
train X shape: (2168, 2048), y: (2168,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 0.999, specificity: 1.000
TEST: accuracy: 0.940, precision: 0.333, recall: 0.417, specificity: 0.963


NEK2 moe scaled
train X shape: (1635, 306), y: (1635,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.949, precision: 0.667, recall: 0.500, specificity: 0.982

NEK2 moe UNDER
train X shape: (224, 306), y: (224,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.760, precision: 0.188, recall: 0.750, specificity: 0.761

NEK2 moe SMOTE
train X shape: (3046, 306), y: (3046,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.939, precision: 0.548, recall: 0.607, specificity: 0.963

NEK2 moe ADASYN
train X shape: (3037, 306), y: (3037,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000

  warn(
  warn(


TRAIN: accuracy: 0.749, precision: 0.185, recall: 1.000, specificity: 0.733
TEST: accuracy: 0.642, precision: 0.104, recall: 0.647, specificity: 0.642

NEK3 moe UNDER
train X shape: (128, 306), y: (128,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.553, precision: 0.084, recall: 0.647, specificity: 0.547

NEK3 moe SMOTE
train X shape: (2116, 306), y: (2116,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.851, precision: 0.143, recall: 0.294, specificity: 0.887

NEK3 moe ADASYN
train X shape: (2113, 306), y: (2113,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.833, precision: 0.125, recall: 0.294, specificity: 0.868


NEK3 mfp scaled
train X shape: (1122, 2048), y: (1122,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.809, precision: 0.230, recall: 1.000, specificity: 0.798
TEST: accuracy: 0.770, precision: 0.136, recall: 0.529, specificity: 0.785

NEK3 mfp UNDER
train X shape: (128, 2048), y: (128,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.514, precision: 0.065, recall: 0.529, specificity: 0.513

NEK3 mfp SMOTE
train X shape: (2116, 2048), y: (2116,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.901, precision: 0.280, recall: 0.412, specificity: 0.932

NEK3 mfp ADASYN
train X shape: (2130, 2048), y: (2130,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.894, precision: 0.259, recall: 0.412, specificity: 0.925



NEK5 moe scaled
train X shape: (989, 306), y: (989,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.833, precision: 0.318, recall: 1.000, specificity: 0.819
TEST: accuracy: 0.774, precision: 0.235, recall: 0.800, specificity: 0.772

NEK5 moe UNDER
train X shape: (154, 306), y: (154,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.669, precision: 0.140, recall: 0.600, specificity: 0.675

NEK5 moe SMOTE
train X shape: (1824, 306), y: (1824,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.915, precision: 0.471, recall: 0.400, specificity: 0.961

NEK5 moe ADASYN
train X shape: (1831, 306), y: (1831,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.899, precision: 0.400, recall: 0.500, specificity: 0.934


NEK5 mfp scaled
train X shape: (989, 2048), y: (989,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.854, precision: 0.348, recall: 1.000, specificity: 0.842
TEST: accuracy: 0.827, precision: 0.283, recall: 0.750, specificity: 0.833

NEK5 mfp UNDER
train X shape: (154, 2048), y: (154,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.504, precision: 0.088, recall: 0.550, specificity: 0.500

NEK5 mfp SMOTE
train X shape: (1824, 2048), y: (1824,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.915, precision: 0.476, recall: 0.500, specificity: 0.952

NEK5 mfp ADASYN
train X shape: (1805, 2048), y: (1805,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 0.998, recall: 1.000, specificity: 0.998
TEST: accuracy: 0.927, precision: 0.562, recall: 0.450, specificity: 0.969



NEK9 moe scaled
train X shape: (1126, 306), y: (1126,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.764, precision: 0.153, recall: 1.000, specificity: 0.753
TEST: accuracy: 0.721, precision: 0.087, recall: 0.538, specificity: 0.730

NEK9 moe UNDER
train X shape: (96, 306), y: (96,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.572, precision: 0.050, recall: 0.462, specificity: 0.578

NEK9 moe SMOTE
train X shape: (2156, 306), y: (2156,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.922, precision: 0.091, recall: 0.077, specificity: 0.963

NEK9 moe ADASYN
train X shape: (2164, 306), y: (2164,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.905, precision: 0.062, recall: 0.077, specificity: 0.944


NEK9 mfp scaled
train X shape: (1126, 2048), y: (1126,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.829, precision: 0.200, recall: 1.000, specificity: 0.822
TEST: accuracy: 0.820, precision: 0.172, recall: 0.769, specificity: 0.822

NEK9 mfp UNDER
train X shape: (96, 2048), y: (96,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.456, precision: 0.062, recall: 0.769, specificity: 0.441

NEK9 mfp SMOTE
train X shape: (2156, 2048), y: (2156,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.936, precision: 0.333, recall: 0.385, specificity: 0.963

NEK9 mfp ADASYN
train X shape: (2166, 2048), y: (2166,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.933, precision: 0.286, recall: 0.308, specificity: 0.963


NEK9 moe scaled
train X shape: (313, 306), y: (313,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.950, precision: 0.857, recall: 0.667, specificity: 0.986

NEK9 moe UNDER
train X shape: (66, 306), y: (66,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.812, precision: 0.350, recall: 0.778, specificity: 0.817

NEK9 moe SMOTE
train X shape: (560, 306), y: (560,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.887, precision: 0.500, recall: 0.556, specificity: 0.930

NEK9 moe ADASYN
train X shape: (560, 306), y: (560,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1

# BRFC+BCW

In [7]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
nek_nums = [2,3,5,9]
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
features = ['moe', 'mfp']
NEK= 'NEK'

for i, n in enumerate(nek_nums):
    nek = str(n)
    model_type = 'BRFC_BCW'
    nek_path= f'{data_dir}NEK{nek}/bind/'
 
    for k, feat in enumerate(features): 
        print()
        for j, samp in enumerate(samplings): 
            
            print(f'NEK{nek} {feat} {samp}')
            file_root = f'NEK{nek}_binding_{feat}_{samp}_df.csv'
 
            trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
            model = rf_models(trainX, trainy, testX, testy, model_type, {})
            train_df = gather_rf_results(model, trainX, trainy)
            test_df = gather_rf_results(model, testX, testy)
            train_df['subset'] = 'train' 
            test_df['subset'] = 'test' 
            model_name = f'NEK{nek}_binding_{feat}_{samp}_{model_type}' 
            train_df['model'] = model_name
            
            test_df['model'] = model_name
            train_df = add_cm(train_df)
            test_df = add_cm(test_df)
            with open(f'{model_name}.pkl', 'wb') as f: 
                pickle.dump(model,f)
            train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
            test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 
            
            
            print()    
    # print()
    if n == 2 or n == 9:

        nek_path= f'{data_dir}NEK{nek}/inhib/'
        for k, feat in enumerate(features): 
            print()
            for j, samp in enumerate(samplings): 
                file_root = f'NEK{nek}_inhibition_{feat}_{samp}_df.csv'
                print(f'NEK{nek} {feat} {samp}')
                trainX, trainy, testX, testy = get_arrays(nek_path,file_root)
                model = rf_models(trainX, trainy, testX, testy, {model_type}, {})
                train_df = gather_rf_results(model, trainX, trainy)
                test_df = gather_rf_results(model, testX, testy)
                train_df['subset'] = 'train' 
                test_df['subset'] = 'test' 
                model_name = f'NEK{nek}_inhibition_{feat}_{samp}_{model_type}' 
                train_df['model'] = model_name
                test_df['model'] = model_name
                train_df = add_cm(train_df)
                test_df = add_cm(test_df)
                with open(f'{model_name}.pkl', 'wb') as f: 
                    pickle.dump(model, f) 
                train_df.to_csv(f'{file_root}_train_{model_type}.csv',index=False) 
                test_df.to_csv(f'{file_root}_test_{model_type}.csv',index=False) 

                print()
      
    print()


NEK2 moe scaled
train X shape: (1125, 306), y: (1125,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.703, precision: 0.119, recall: 1.000, specificity: 0.691
TEST: accuracy: 0.707, precision: 0.092, recall: 0.667, specificity: 0.708

NEK2 moe UNDER
train X shape: (90, 306), y: (90,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.671, precision: 0.055, recall: 0.417, specificity: 0.683

NEK2 moe SMOTE
train X shape: (2160, 306), y: (2160,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.940, precision: 0.273, recall: 0.250, specificity: 0.970

NEK2 moe ADASYN
train X shape: (2158, 306), y: (2158,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.912, precision: 0.158, recall: 0.250, specificity: 0.941


NEK2 mfp scaled
train X shape: (1125, 2048), y: (1125,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.508, precision: 0.075, recall: 1.000, specificity: 0.488
TEST: accuracy: 0.505, precision: 0.079, recall: 1.000, specificity: 0.483

NEK2 mfp UNDER
train X shape: (90, 2048), y: (90,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.594, precision: 0.052, recall: 0.500, specificity: 0.598

NEK2 mfp SMOTE
train X shape: (2160, 2048), y: (2160,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 0.999, specificity: 1.000
TEST: accuracy: 0.926, precision: 0.235, recall: 0.333, specificity: 0.952

NEK2 mfp ADASYN
train X shape: (2168, 2048), y: (2168,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 0.999, specificity: 1.000
TEST: accuracy: 0.940, precision: 0.333, recall: 0.417, specificity: 0.963


NEK2 moe scaled
train X shape: (1635, 306), y: (1635,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.944, precision: 0.609, recall: 0.500, specificity: 0.976

NEK2 moe UNDER
train X shape: (224, 306), y: (224,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.763, precision: 0.189, recall: 0.750, specificity: 0.764

NEK2 moe SMOTE
train X shape: (3046, 306), y: (3046,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.941, precision: 0.562, recall: 0.643, specificity: 0.963

NEK2 moe ADASYN
train X shape: (3037, 306), y: (3037,), test X: (409, 306), y:(409,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000

  warn(
  warn(


TRAIN: accuracy: 0.636, precision: 0.136, recall: 1.000, specificity: 0.614
TEST: accuracy: 0.613, precision: 0.117, recall: 0.824, specificity: 0.600

NEK3 moe UNDER
train X shape: (128, 306), y: (128,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.553, precision: 0.084, recall: 0.647, specificity: 0.547

NEK3 moe SMOTE
train X shape: (2116, 306), y: (2116,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.851, precision: 0.143, recall: 0.294, specificity: 0.887

NEK3 moe ADASYN
train X shape: (2113, 306), y: (2113,), test X: (282, 306), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.840, precision: 0.132, recall: 0.294, specificity: 0.875


NEK3 mfp scaled
train X shape: (1122, 2048), y: (1122,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.643, precision: 0.138, recall: 1.000, specificity: 0.621
TEST: accuracy: 0.638, precision: 0.087, recall: 0.529, specificity: 0.645

NEK3 mfp UNDER
train X shape: (128, 2048), y: (128,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.521, precision: 0.066, recall: 0.529, specificity: 0.521

NEK3 mfp SMOTE
train X shape: (2116, 2048), y: (2116,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.894, precision: 0.259, recall: 0.412, specificity: 0.925

NEK3 mfp ADASYN
train X shape: (2130, 2048), y: (2130,), test X: (282, 2048), y:(282,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.890, precision: 0.250, recall: 0.412, specificity: 0.921



NEK5 moe scaled
train X shape: (989, 306), y: (989,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.750, precision: 0.238, recall: 1.000, specificity: 0.729
TEST: accuracy: 0.690, precision: 0.157, recall: 0.650, specificity: 0.693

NEK5 moe UNDER
train X shape: (154, 306), y: (154,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.649, precision: 0.140, recall: 0.650, specificity: 0.649

NEK5 moe SMOTE
train X shape: (1824, 306), y: (1824,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.919, precision: 0.500, recall: 0.400, specificity: 0.965

NEK5 moe ADASYN
train X shape: (1831, 306), y: (1831,), test X: (248, 306), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.875, precision: 0.323, recall: 0.500, specificity: 0.908


NEK5 mfp scaled
train X shape: (989, 2048), y: (989,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.738, precision: 0.229, recall: 1.000, specificity: 0.716
TEST: accuracy: 0.681, precision: 0.183, recall: 0.850, specificity: 0.667

NEK5 mfp UNDER
train X shape: (154, 2048), y: (154,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.468, precision: 0.082, recall: 0.550, specificity: 0.461

NEK5 mfp SMOTE
train X shape: (1824, 2048), y: (1824,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.923, precision: 0.526, recall: 0.500, specificity: 0.961

NEK5 mfp ADASYN
train X shape: (1805, 2048), y: (1805,), test X: (248, 2048), y:(248,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 0.998, recall: 1.000, specificity: 0.998
TEST: accuracy: 0.927, precision: 0.571, recall: 0.400, specificity: 0.974



NEK9 moe scaled
train X shape: (1126, 306), y: (1126,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.631, precision: 0.104, recall: 1.000, specificity: 0.615
TEST: accuracy: 0.636, precision: 0.083, recall: 0.692, specificity: 0.633

NEK9 moe UNDER
train X shape: (96, 306), y: (96,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.572, precision: 0.065, recall: 0.615, specificity: 0.570

NEK9 moe SMOTE
train X shape: (2156, 306), y: (2156,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.926, precision: 0.100, recall: 0.077, specificity: 0.967

NEK9 moe ADASYN
train X shape: (2164, 306), y: (2164,), test X: (283, 306), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.901, precision: 0.105, recall: 0.154, specificity: 0.937


NEK9 mfp scaled
train X shape: (1126, 2048), y: (1126,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.569, precision: 0.090, recall: 1.000, specificity: 0.550
TEST: accuracy: 0.534, precision: 0.072, recall: 0.769, specificity: 0.522

NEK9 mfp UNDER
train X shape: (96, 2048), y: (96,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.463, precision: 0.063, recall: 0.769, specificity: 0.448

NEK9 mfp SMOTE
train X shape: (2156, 2048), y: (2156,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.933, precision: 0.312, recall: 0.385, specificity: 0.959

NEK9 mfp ADASYN
train X shape: (2166, 2048), y: (2166,), test X: (283, 2048), y:(283,)


  warn(
  warn(


TRAIN: accuracy: 0.999, precision: 1.000, recall: 0.998, specificity: 1.000
TEST: accuracy: 0.929, precision: 0.267, recall: 0.308, specificity: 0.959


NEK9 moe scaled
train X shape: (313, 306), y: (313,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.950, precision: 0.857, recall: 0.667, specificity: 0.986

NEK9 moe UNDER
train X shape: (66, 306), y: (66,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.863, precision: 0.438, recall: 0.778, specificity: 0.873

NEK9 moe SMOTE
train X shape: (560, 306), y: (560,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1.000
TEST: accuracy: 0.887, precision: 0.500, recall: 0.556, specificity: 0.930

NEK9 moe ADASYN
train X shape: (560, 306), y: (560,), test X: (80, 306), y:(80,)
TRAIN: accuracy: 1.000, precision: 1.000, recall: 1.000, specificity: 1

In [8]:

source = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/'
dest = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_pickles/'

if not os.path.exists(source):
    os.makedirs(dest)

for root, dirs, files in os.walk(source):
    for file in files:
        if file.endswith('RF.pkl'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('RF_BCW.pkl'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('BRFC.pkl'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('BRFC_BCW.pkl'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")



Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK5_binding_mfp_scaled_RF.pkl to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_pickles/NEK5_binding_mfp_scaled_RF.pkl
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK5_binding_moe_scaled_BRFC.pkl to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_pickles/NEK5_binding_moe_scaled_BRFC.pkl
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK9_inhibition_mfp_UNDER_RF.pkl to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_pickles/NEK9_inhibition_mfp_UNDER_RF.pkl
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK9_binding_moe_UNDER_RF.pkl to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_pickles/NEK9_binding_moe_UNDER_RF.pkl
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK9_binding_moe_SMOTE_BRFC_BCW.pkl to /Users/jaycee

In [9]:

source = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/'
dest = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_results/'

if not os.path.exists(source):
    os.makedirs(dest)

for root, dirs, files in os.walk(source):
    for file in files:
        if file.endswith('RF.csv'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('RF_BCW.csv'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('BRFC.csv'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")
        if file.endswith('BRFC_BCW.csv'):
            source_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.move(source_file,dest_file)
            print(f"Moved: {source_file} to {dest_file}")



Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK2_inhibition_mfp_UNDER_df.csv_test_RF_BCW.csv to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_results/NEK2_inhibition_mfp_UNDER_df.csv_test_RF_BCW.csv
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK9_inhibition_mfp_ADASYN_df.csv_train_BRFC.csv to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_results/NEK9_inhibition_mfp_ADASYN_df.csv_train_BRFC.csv
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK2_inhibition_moe_UNDER_df.csv_test_BRFC_BCW.csv to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_results/NEK2_inhibition_moe_UNDER_df.csv_test_BRFC_BCW.csv
Moved: /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/NEK3_binding_mfp_ADASYN_df.csv_train_RF_BCW.csv to /Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/RF/RF_results/NEK3_binding_mfp_ADASYN_df.csv_tra