In [1]:
import itertools
import math
import os
import pickle
import shutil
import sys

import gpytorch
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from gpytorch.kernels import ScaleKernel, RBFKernel, MaternKernel
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.models import ExactGP
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix, f1_score, roc_curve,precision_recall_curve, auc
from sklearn.model_selection import KFold

sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
from RF_GSCV import * # RF_GSCV contains the calculate metrics function to get the TP, TN, FP, FN scores 
from RF_atomver import prediction_type

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

class DirichletGPModel(ExactGP):
    """
    Exact GP used for classification with a Dirichlet classification likelihood.

    One independent GP is kept per class by giving both the mean and the
    covariance module a batch dimension of size ``num_classes``.

    Attributes:
        mean_module (gpytorch.means.ConstantMean): Constant mean, one per class.
        covar_module (gpytorch.kernels.ScaleKernel): Scaled Matern kernel (nu=0.5), one per class.

    Args:
        train_x (torch.Tensor): Training data features.
        train_y (torch.Tensor): Training targets handed to ``ExactGP``.
        likelihood (gpytorch.likelihoods.Likelihood): The likelihood function.
        num_classes (int): The number of classes for the classification task.
    """
    def __init__(self, train_x, train_y, likelihood, num_classes):
        super().__init__(train_x, train_y, likelihood)
        # Shared batch shape: one mean / kernel hyperparameter set per class.
        per_class = torch.Size((num_classes,))
        self.mean_module = ConstantMean(batch_shape=per_class)
        matern = MaternKernel(nu=0.5, batch_shape=per_class)
        self.covar_module = ScaleKernel(matern, batch_shape=per_class)

    def forward(self, x):
        """
        Evaluate the mean and covariance modules at ``x``.

        Args:
            x (torch.Tensor): Input data features.
        Returns:
            gpytorch.distributions.MultivariateNormal: Distribution built from
            the mean and covariance evaluated at ``x``.
        """
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )


In [3]:
class Trainer:
    """
    Fits a GP classification model and evaluates its class predictions.

    Args:
        model: The GP model to optimise; its parameters are handed to Adam.
        likelihood: The likelihood paired with ``model``; it must expose
            ``transformed_targets`` (read during training).
        iterations (int): Number of optimisation steps. Reduced to 2 when the
            ``CI`` environment variable is set (smoke test).
    """
    def __init__(self,model, likelihood, iterations):
        self.model = model
        self.likelihood = likelihood
        # Keep automated runs short: any value of CI in the environment triggers it.
        smoke_test = ('CI' in os.environ)
        self.n_iterations = 2 if smoke_test else iterations
        self.optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
        self.loss_fn = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)

    def train(self, train_x, train_y):
        """
        Optimise the model hyperparameters by minimising the negative marginal log likelihood.

        Args:
            train_x (torch.Tensor): Training features passed through the model.
            train_y (torch.Tensor): Unused; the loss is computed against
                ``self.likelihood.transformed_targets``. Kept for interface compatibility.
        """
        self.model.train()
        self.likelihood.train()
        for i in range(self.n_iterations):
            self.optimizer.zero_grad()
            output = self.model(train_x)
            # The loss has one entry per class batch; sum them into a scalar.
            loss = -self.loss_fn(output, self.likelihood.transformed_targets).sum()
            loss.backward()
            if i % 10 == 0:
                print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
                    i + 1, self.n_iterations, loss.item(),
                    self.model.covar_module.base_kernel.lengthscale.mean().item(),
                    self.model.likelihood.second_noise_covar.noise.mean().item()
                ))
            self.optimizer.step()

    def predict(self, input):
        """
        Make predictions using the GP model.

        Args:
            input (torch.Tensor): The input data for making predictions.

        Returns:
            dist (gpytorch.distributions.MultivariateNormal): The model output distribution at ``input``.
            observed_pred (gpytorch.distributions.MultivariateNormal): ``dist`` passed through the likelihood.
            pred_means (torch.Tensor): The means of ``dist``.
            class_pred (torch.Tensor): Index of the class with the largest mean for each input.
        """
        self.model.eval()
        self.likelihood.eval()

        with gpytorch.settings.fast_pred_var(), torch.no_grad():
            # One forward pass is enough; every returned quantity derives from it.
            dist = self.model(input)
            pred_means = dist.loc
            observed_pred = self.likelihood(dist)
            class_pred = pred_means.max(0)[1]

        return dist, observed_pred, pred_means, class_pred

    def evaluate(self, x_input, y_true):
        """
        Predict class labels for ``x_input``.

        The model and likelihood are switched to eval mode and autograd is
        disabled, so the result does not depend on ``predict`` having been
        called beforehand.

        Args:
            x_input (torch.Tensor): The input data features.
            y_true (torch.Tensor): Unused; kept for interface compatibility.

        Returns:
            y_pred (numpy.ndarray): The predicted class labels.
        """
        self.model.eval()
        self.likelihood.eval()
        with torch.no_grad():
            y_pred = self.model(x_input).loc.max(0)[1].numpy()

        return y_pred

    def gp_results(self, x_input, y_true, plot_title=None):
        """
        Calculate evaluation metrics and print results.

        Args:
            x_input (torch.Tensor): The input data features.
            y_true (torch.Tensor or numpy.ndarray): The true labels for the input data.
            plot_title (str, optional): Currently unused; kept for interface compatibility.

        Returns:
            dict: accuracy, precision, recall, specificity and the TN/FN/FP/TP counts.
            ``specificity`` is NaN when there are no negatives (TN + FP == 0).
        """
        y_pred = self.evaluate(x_input, y_true)
        if isinstance(y_true, torch.Tensor):
            y_true = y_true.detach().cpu().numpy().reshape(-1)

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        tp, tn, fp, fn = calculate_metrics(y_true, y_pred)
        # Guard the ratio: a subset without negatives has no defined specificity.
        negatives = tn + fp
        specificity = tn / negatives if negatives else float('nan')
        cm = confusion_matrix(y_true, y_pred)
        print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, specificity: {specificity:.4f}, cm: {cm}')
        return {'accuracy': accuracy, 'precision': precision,  'recall':recall, 'specificity':specificity, 'TN': tn, 'FN': fn, 'FP': fp, 'TP': tp }

       

In [4]:
def make_torch_tens_float(filepath, filename):
    """
    Load ``<filepath><filename>.csv`` and return train/test tensors as float32.

    Rows are split on the ``subset`` column ('train' / 'test'). The label is
    the ``active`` column; the features are every column except the
    bookkeeping columns listed in ``non_feature_cols``.

    Args:
        filepath (str): Directory prefix, concatenated as-is (include the trailing separator).
        filename (str): File name without the ``.csv`` extension.

    Returns:
        tuple: ``(trainX, trainy, testX, testy)`` as ``torch.float32`` tensors.
    """
    frame = pd.read_csv(filepath + filename + '.csv')
    non_feature_cols = ['subset', 'base_rdkit_smiles', 'compound_id', 'fold', 'active']

    split_tensors = []
    for split_name in ('train', 'test'):
        part = frame[frame['subset'] == split_name]
        features = part.drop(columns=non_feature_cols).to_numpy().astype("double")
        labels = part['active'].to_numpy().flatten().astype("double")
        split_tensors.append((
            torch.as_tensor(features, dtype=torch.float32),
            torch.as_tensor(labels, dtype=torch.float32),
        ))

    (trainX, trainy), (testX, testy) = split_tensors
    return trainX, trainy, testX, testy

In [5]:
GP_path ='/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2_jp/'


def _class_probability_stats(latent_dist, n_samples):
    """Draw ``n_samples`` latent samples and return the mean and std of the normalised class probabilities."""
    samples = latent_dist.sample(torch.Size((n_samples,))).exp()
    # Normalise over dim -2, then summarise over the sample dimension.
    normalised = samples / samples.sum(-2, keepdim=True)
    return normalised.mean(0), normalised.std(0)


def _build_perf_df(y_true, latent_dist, observed_pred, pred_means, summary_metrics, model_name, subset, n_samples):
    """Assemble the per-compound prediction table plus subset-level metrics for one data subset."""
    y_np = y_true.numpy() if isinstance(y_true, torch.Tensor) else np.asarray(y_true)
    y_pred = pred_means.max(0)[1].numpy()
    pred_mean = observed_pred.mean.numpy()
    pred_var = observed_pred.variance.numpy()
    prob_mean, prob_std = (t.numpy() for t in _class_probability_stats(latent_dist, n_samples))

    perf_df = pd.DataFrame({
        'mean_pred_class0': pred_mean[0,],
        'mean_pred_class1': pred_mean[1,],
        'y': y_np,
        'y_pred': y_pred,
        'var_pred_class0': pred_var[0,],
        'var_pred_class1': pred_var[1,],
        'pred_prob_class0': prob_mean[0,],
        'pred_prob_class1': prob_mean[1,],
        'pred_prob_std_class0': prob_std[0,],
        'pred_prob_std_class1': prob_std[1,],
        'model': model_name,
        'subset': subset,
    })

    # Subset-level values are repeated on every row so each CSV is self-contained.
    cm_flattened = confusion_matrix(y_np, y_pred).flatten().tolist()
    perf_df['cm'] = [cm_flattened] * len(perf_df)
    perf_df['prediction_type'] = perf_df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)
    # NOTE(review): ROC-AUC is computed from hard class labels, not from
    # pred_prob_class1. Kept as-is so results stay comparable with previously
    # saved runs; confirm whether a score-based AUC is wanted.
    perf_df['ROC-AUC'] = roc_auc_score(y_np, y_pred)
    perf_df['MCC'] = matthews_corrcoef(y_np, y_pred)
    perf_df['Balanced Accuracy'] = balanced_accuracy_score(y_np, y_pred)
    perf_df['f1'] = f1_score(y_np, y_pred)

    for k, val in summary_metrics.items():
        perf_df[k] = val
    return perf_df


def save_results(trainX, trainy, testX, testy, root_name, n_iterations=300, n_samples=100):
    """
    Train a Dirichlet Gaussian Process model and save the training and test performance results.

    This function trains a Dirichlet GP model on the given training data, evaluates it on both the training
    and test data, and collects performance metrics and predictions into pandas DataFrames. The fitted
    model and likelihood are pickled under the module-level ``GP_path`` as
    ``{root_name}_GP_Dirichlet_matern_model.pkl`` and ``{root_name}_GP_Dirichlet_matern_likelihood.pkl``.

    Args:
        trainX (torch.Tensor): The training data features.
        trainy (torch.Tensor): The training data labels.
        testX (torch.Tensor): The test data features.
        testy (torch.Tensor): The test data labels.
        root_name (str): The root name used for labeling the model in the results.
        n_iterations (int, optional): The number of training iterations. Default is 300.
        n_samples (int, optional): The number of latent samples drawn per subset to estimate
            class probabilities. Default is 100. Applied to both train and test.

    Returns:
        train_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the training data.
        test_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the test data.
    """
    likelihood = DirichletClassificationLikelihood(trainy.long(), learn_additional_noise=True)
    model = DirichletGPModel(trainX, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)
    trainer = Trainer(model, likelihood, n_iterations)
    trainer.train(trainX, trainy)

    model_name = f'{root_name}_GP_Dirichlet_matern'
    perf_dfs = []
    for subset, features, labels in (('train', trainX, trainy), ('test', testX, testy)):
        latent_dist, observed_pred, pred_means, _ = trainer.predict(features)
        summary_metrics = trainer.gp_results(features, labels)
        perf_dfs.append(
            _build_perf_df(labels, latent_dist, observed_pred, pred_means,
                           summary_metrics, model_name, subset, n_samples)
        )
    train_perf_df, test_perf_df = perf_dfs

    # NOTE(review): pickling whole module objects ties the files to this code and
    # library version; only unpickle files you produced yourself.
    with open(f'{GP_path}{root_name}_GP_Dirichlet_matern_model.pkl', 'wb') as f:
        pickle.dump(model,f)
    with open(f'{GP_path}{root_name}_GP_Dirichlet_matern_likelihood.pkl', 'wb') as f:
        pickle.dump(likelihood,f)

    return train_perf_df, test_perf_df


In [6]:
# Driver: fit and save one GP per (NEK, assay, feature type, sampling) combination.
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2_jp/'
GP_path= '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2_jp/'
samplings = ['UNDER']
feat_types = ['moe', 'mfp']
neks = ['2', '3', '5', '9']

for nek in neks:
    print(f'NEK{nek}')
    # Only the binding assay is processed for NEK3 and NEK5.
    bind_inhib = ['binding'] if nek in ['3','5'] else ['binding', 'inhibition']
    for bi in bind_inhib:
        this_bi = 'bind' if bi == 'binding' else 'inhib'

        for feat, samp in itertools.product(feat_types, samplings):
            print(f'NEK{nek} {bi} {feat} {samp}')
            file_root = f'NEK{nek}_{bi}_{feat}_{samp}_batch2'
            trainX, trainy, testX, testy = make_torch_tens_float(GP_path,file_root)
            print(f'trainX:{trainX.shape}, train y: {trainy.shape}, testX: {testX.shape}, test y: {testy.shape}')
            train_perf_df, test_perf_df = save_results(trainX, trainy, testX, testy,file_root, n_iterations=300)

            # Tag both tables with the run identity before writing them out.
            for perf_df in (train_perf_df, test_perf_df):
                perf_df['NEK'] = f'NEK{nek}_{bi}'
                perf_df['strategy'] = f'{samp}'
                perf_df['feat_type'] = f'{feat}'

            train_perf_df.to_csv(f'{GP_path}{file_root}_train_GP_matern.csv',index=False)
            test_perf_df.to_csv(f'{GP_path}{file_root}_test_GP_matern.csv',index=False)
            print()


NEK2
NEK2 binding moe UNDER
trainX:torch.Size([90, 306]), train y: torch.Size([90]), testX: torch.Size([283, 306]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.265   noise: 1.297
Iter 21/300 - Loss: 5.437   lengthscale: 2.065   noise: 1.982
Iter 31/300 - Loss: 5.254   lengthscale: 3.169   noise: 2.569
Iter 41/300 - Loss: 5.184   lengthscale: 4.482   noise: 2.987
Iter 51/300 - Loss: 5.150   lengthscale: 5.798   noise: 3.235
Iter 61/300 - Loss: 5.130   lengthscale: 6.980   noise: 3.339
Iter 71/300 - Loss: 5.116   lengthscale: 7.999   noise: 3.330
Iter 81/300 - Loss: 5.105   lengthscale: 8.871   noise: 3.242
Iter 91/300 - Loss: 5.096   lengthscale: 9.626   noise: 3.101
Iter 101/300 - Loss: 5.087   lengthscale: 10.293   noise: 2.927
Iter 111/300 - Loss: 5.079   lengthscale: 10.890   noise: 2.731
Iter 121/300 - Loss: 5.072   lengthscale: 11.433   noise: 2.523
Iter 131/300 - Loss: 5.065   lengthscale: 11.93



Iter 11/300 - Loss: 5.937   lengthscale: 1.284   noise: 1.299
Iter 21/300 - Loss: 5.437   lengthscale: 2.144   noise: 1.998
Iter 31/300 - Loss: 5.300   lengthscale: 2.950   noise: 2.648
Iter 41/300 - Loss: 5.261   lengthscale: 3.226   noise: 3.189
Iter 51/300 - Loss: 5.245   lengthscale: 3.148   noise: 3.604
Iter 61/300 - Loss: 5.237   lengthscale: 2.919   noise: 3.906
Iter 71/300 - Loss: 5.232   lengthscale: 2.634   noise: 4.122
Iter 81/300 - Loss: 5.228   lengthscale: 2.339   noise: 4.278
Iter 91/300 - Loss: 5.226   lengthscale: 2.072   noise: 4.394
Iter 101/300 - Loss: 5.224   lengthscale: 1.855   noise: 4.489
Iter 111/300 - Loss: 5.222   lengthscale: 1.685   noise: 4.572
Iter 121/300 - Loss: 5.221   lengthscale: 1.548   noise: 4.649
Iter 131/300 - Loss: 5.220   lengthscale: 1.436   noise: 4.724
Iter 141/300 - Loss: 5.219   lengthscale: 1.345   noise: 4.799
Iter 151/300 - Loss: 5.218   lengthscale: 1.272   noise: 4.875
Iter 161/300 - Loss: 5.218   lengthscale: 1.214   noise: 4.952
I



Iter 11/300 - Loss: 5.930   lengthscale: 1.288   noise: 1.297
Iter 21/300 - Loss: 5.429   lengthscale: 2.097   noise: 1.980
Iter 31/300 - Loss: 5.212   lengthscale: 3.203   noise: 2.553
Iter 41/300 - Loss: 5.091   lengthscale: 4.512   noise: 2.901
Iter 51/300 - Loss: 5.019   lengthscale: 5.786   noise: 2.989
Iter 61/300 - Loss: 4.973   lengthscale: 6.873   noise: 2.860
Iter 71/300 - Loss: 4.939   lengthscale: 7.765   noise: 2.586
Iter 81/300 - Loss: 4.911   lengthscale: 8.506   noise: 2.232
Iter 91/300 - Loss: 4.886   lengthscale: 9.142   noise: 1.850
Iter 101/300 - Loss: 4.863   lengthscale: 9.705   noise: 1.480
Iter 111/300 - Loss: 4.844   lengthscale: 10.215   noise: 1.152
Iter 121/300 - Loss: 4.827   lengthscale: 10.684   noise: 0.884
Iter 131/300 - Loss: 4.814   lengthscale: 11.120   noise: 0.676
Iter 141/300 - Loss: 4.803   lengthscale: 11.529   noise: 0.522
Iter 151/300 - Loss: 4.794   lengthscale: 11.916   noise: 0.411
Iter 161/300 - Loss: 4.787   lengthscale: 12.285   noise: 0




NEK2 inhibition mfp UNDER
trainX:torch.Size([224, 2048]), train y: torch.Size([224]), testX: torch.Size([408, 2048]), test y: torch.Size([408])
Iter 1/300 - Loss: 7.106   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.936   lengthscale: 1.291   noise: 1.301
Iter 21/300 - Loss: 5.451   lengthscale: 2.137   noise: 2.023
Iter 31/300 - Loss: 5.332   lengthscale: 2.798   noise: 2.738
Iter 41/300 - Loss: 5.293   lengthscale: 3.071   noise: 3.356
Iter 51/300 - Loss: 5.275   lengthscale: 3.140   noise: 3.844
Iter 61/300 - Loss: 5.266   lengthscale: 3.130   noise: 4.219
Iter 71/300 - Loss: 5.260   lengthscale: 3.094   noise: 4.511
Iter 81/300 - Loss: 5.256   lengthscale: 3.051   noise: 4.745
Iter 91/300 - Loss: 5.252   lengthscale: 3.004   noise: 4.942
Iter 101/300 - Loss: 5.248   lengthscale: 2.952   noise: 5.115
Iter 111/300 - Loss: 5.244   lengthscale: 2.895   noise: 5.272
Iter 121/300 - Loss: 5.241   lengthscale: 2.829   noise: 5.420
Iter 131/300 - Loss: 5.237   lengthscale: 2.755



accuracy: 0.9464, precision: 0.9630, recall: 0.9286, specificity: 0.9643, cm: [[108   4]
 [  8 104]]
accuracy: 0.5049, precision: 0.0323, recall: 0.2143, specificity: 0.5263, cm: [[200 180]
 [ 22   6]]

NEK3
NEK3 binding moe UNDER
trainX:torch.Size([128, 306]), train y: torch.Size([128]), testX: torch.Size([282, 306]), test y: torch.Size([282])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.285   noise: 1.297
Iter 21/300 - Loss: 5.437   lengthscale: 2.122   noise: 1.981
Iter 31/300 - Loss: 5.262   lengthscale: 3.230   noise: 2.571
Iter 41/300 - Loss: 5.206   lengthscale: 4.438   noise: 3.003
Iter 51/300 - Loss: 5.189   lengthscale: 5.448   noise: 3.294
Iter 61/300 - Loss: 5.183   lengthscale: 6.169   noise: 3.475
Iter 71/300 - Loss: 5.180   lengthscale: 6.686   noise: 3.576
Iter 81/300 - Loss: 5.179   lengthscale: 7.087   noise: 3.620
Iter 91/300 - Loss: 5.178   lengthscale: 7.423   noise: 3.626
Iter 101/300 - Loss: 5.177   lengt



Iter 21/300 - Loss: 5.429   lengthscale: 2.161   noise: 2.002
Iter 31/300 - Loss: 5.295   lengthscale: 2.963   noise: 2.659
Iter 41/300 - Loss: 5.257   lengthscale: 3.334   noise: 3.205
Iter 51/300 - Loss: 5.243   lengthscale: 3.435   noise: 3.622
Iter 61/300 - Loss: 5.237   lengthscale: 3.423   noise: 3.928
Iter 71/300 - Loss: 5.234   lengthscale: 3.370   noise: 4.153
Iter 81/300 - Loss: 5.232   lengthscale: 3.305   noise: 4.323
Iter 91/300 - Loss: 5.230   lengthscale: 3.234   noise: 4.456
Iter 101/300 - Loss: 5.229   lengthscale: 3.158   noise: 4.567
Iter 111/300 - Loss: 5.227   lengthscale: 3.076   noise: 4.665
Iter 121/300 - Loss: 5.226   lengthscale: 2.986   noise: 4.754
Iter 131/300 - Loss: 5.225   lengthscale: 2.888   noise: 4.841
Iter 141/300 - Loss: 5.224   lengthscale: 2.784   noise: 4.926
Iter 151/300 - Loss: 5.222   lengthscale: 2.675   noise: 5.011
Iter 161/300 - Loss: 5.221   lengthscale: 2.564   noise: 5.097
Iter 171/300 - Loss: 5.220   lengthscale: 2.453   noise: 5.185




accuracy: 0.9688, precision: 0.9688, recall: 0.9688, specificity: 0.9688, cm: [[62  2]
 [ 2 62]]
accuracy: 0.3865, precision: 0.0618, recall: 0.6471, specificity: 0.3698, cm: [[ 98 167]
 [  6  11]]

NEK5
NEK5 binding moe UNDER
trainX:torch.Size([154, 306]), train y: torch.Size([154]), testX: torch.Size([248, 306]), test y: torch.Size([248])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.257   noise: 1.297
Iter 21/300 - Loss: 5.437   lengthscale: 2.050   noise: 1.982
Iter 31/300 - Loss: 5.252   lengthscale: 3.150   noise: 2.569
Iter 41/300 - Loss: 5.175   lengthscale: 4.473   noise: 2.983
Iter 51/300 - Loss: 5.137   lengthscale: 5.795   noise: 3.218
Iter 61/300 - Loss: 5.116   lengthscale: 6.938   noise: 3.304
Iter 71/300 - Loss: 5.104   lengthscale: 7.863   noise: 3.283
Iter 81/300 - Loss: 5.096   lengthscale: 8.607   noise: 3.195
Iter 91/300 - Loss: 5.089   lengthscale: 9.219   noise: 3.068
Iter 101/300 - Loss: 5.084   lengthsca




NEK5 binding mfp UNDER
trainX:torch.Size([154, 2048]), train y: torch.Size([154]), testX: torch.Size([248, 2048]), test y: torch.Size([248])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.949   lengthscale: 1.287   noise: 1.301
Iter 21/300 - Loss: 5.451   lengthscale: 2.143   noise: 2.019
Iter 31/300 - Loss: 5.320   lengthscale: 2.915   noise: 2.713
Iter 41/300 - Loss: 5.279   lengthscale: 3.292   noise: 3.305
Iter 51/300 - Loss: 5.263   lengthscale: 3.433   noise: 3.769
Iter 61/300 - Loss: 5.255   lengthscale: 3.478   noise: 4.122
Iter 71/300 - Loss: 5.251   lengthscale: 3.490   noise: 4.395
Iter 81/300 - Loss: 5.247   lengthscale: 3.494   noise: 4.613
Iter 91/300 - Loss: 5.244   lengthscale: 3.493   noise: 4.793
Iter 101/300 - Loss: 5.242   lengthscale: 3.490   noise: 4.951
Iter 111/300 - Loss: 5.239   lengthscale: 3.482   noise: 5.095
Iter 121/300 - Loss: 5.237   lengthscale: 3.470   noise: 5.231
Iter 131/300 - Loss: 5.235   lengthscale: 3.450   



accuracy: 0.9481, precision: 0.9481, recall: 0.9481, specificity: 0.9481, cm: [[73  4]
 [ 4 73]]
accuracy: 0.4798, precision: 0.0709, recall: 0.4500, specificity: 0.4825, cm: [[110 118]
 [ 11   9]]

NEK9
NEK9 binding moe UNDER
trainX:torch.Size([96, 306]), train y: torch.Size([96]), testX: torch.Size([283, 306]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.254   noise: 1.297
Iter 21/300 - Loss: 5.438   lengthscale: 2.055   noise: 1.982
Iter 31/300 - Loss: 5.260   lengthscale: 3.165   noise: 2.571
Iter 41/300 - Loss: 5.197   lengthscale: 4.479   noise: 2.999
Iter 51/300 - Loss: 5.170   lengthscale: 5.795   noise: 3.272
Iter 61/300 - Loss: 5.156   lengthscale: 6.963   noise: 3.418
Iter 71/300 - Loss: 5.148   lengthscale: 7.938   noise: 3.470
Iter 81/300 - Loss: 5.143   lengthscale: 8.737   noise: 3.460
Iter 91/300 - Loss: 5.140   lengthscale: 9.398   noise: 3.412
Iter 101/300 - Loss: 5.137   lengthscale



Iter 21/300 - Loss: 5.436   lengthscale: 2.143   noise: 1.998
Iter 31/300 - Loss: 5.303   lengthscale: 2.911   noise: 2.650
Iter 41/300 - Loss: 5.264   lengthscale: 3.140   noise: 3.196
Iter 51/300 - Loss: 5.248   lengthscale: 3.028   noise: 3.614
Iter 61/300 - Loss: 5.239   lengthscale: 2.771   noise: 3.918
Iter 71/300 - Loss: 5.233   lengthscale: 2.458   noise: 4.133
Iter 81/300 - Loss: 5.229   lengthscale: 2.126   noise: 4.285
Iter 91/300 - Loss: 5.225   lengthscale: 1.808   noise: 4.396
Iter 101/300 - Loss: 5.222   lengthscale: 1.538   noise: 4.483
Iter 111/300 - Loss: 5.220   lengthscale: 1.338   noise: 4.558
Iter 121/300 - Loss: 5.219   lengthscale: 1.205   noise: 4.631
Iter 131/300 - Loss: 5.218   lengthscale: 1.118   noise: 4.704
Iter 141/300 - Loss: 5.218   lengthscale: 1.059   noise: 4.778
Iter 151/300 - Loss: 5.217   lengthscale: 1.018   noise: 4.852
Iter 161/300 - Loss: 5.217   lengthscale: 0.989   noise: 4.926
Iter 171/300 - Loss: 5.216   lengthscale: 0.965   noise: 4.999




Iter 51/300 - Loss: 5.076   lengthscale: 5.830   noise: 3.126
Iter 61/300 - Loss: 5.031   lengthscale: 7.063   noise: 3.089
Iter 71/300 - Loss: 4.996   lengthscale: 8.116   noise: 2.894
Iter 81/300 - Loss: 4.966   lengthscale: 9.013   noise: 2.590
Iter 91/300 - Loss: 4.938   lengthscale: 9.792   noise: 2.221
Iter 101/300 - Loss: 4.912   lengthscale: 10.488   noise: 1.827
Iter 111/300 - Loss: 4.888   lengthscale: 11.122   noise: 1.443
Iter 121/300 - Loss: 4.866   lengthscale: 11.709   noise: 1.102
Iter 131/300 - Loss: 4.848   lengthscale: 12.257   noise: 0.823
Iter 141/300 - Loss: 4.832   lengthscale: 12.772   noise: 0.612
Iter 151/300 - Loss: 4.820   lengthscale: 13.259   noise: 0.461
Iter 161/300 - Loss: 4.811   lengthscale: 13.722   noise: 0.356
Iter 171/300 - Loss: 4.803   lengthscale: 14.163   noise: 0.281
Iter 181/300 - Loss: 4.797   lengthscale: 14.586   noise: 0.229
Iter 191/300 - Loss: 4.791   lengthscale: 14.992   noise: 0.190
Iter 201/300 - Loss: 4.787   lengthscale: 15.383  



Iter 11/300 - Loss: 6.000   lengthscale: 1.301   noise: 1.305
Iter 21/300 - Loss: 5.529   lengthscale: 2.173   noise: 2.053
Iter 31/300 - Loss: 5.381   lengthscale: 3.007   noise: 2.804
Iter 41/300 - Loss: 5.329   lengthscale: 3.381   noise: 3.473
Iter 51/300 - Loss: 5.303   lengthscale: 3.437   noise: 4.026
Iter 61/300 - Loss: 5.287   lengthscale: 3.356   noise: 4.473
Iter 71/300 - Loss: 5.275   lengthscale: 3.220   noise: 4.836
Iter 81/300 - Loss: 5.265   lengthscale: 3.062   noise: 5.140
Iter 91/300 - Loss: 5.255   lengthscale: 2.893   noise: 5.404
Iter 101/300 - Loss: 5.247   lengthscale: 2.724   noise: 5.642
Iter 111/300 - Loss: 5.240   lengthscale: 2.563   noise: 5.862
Iter 121/300 - Loss: 5.233   lengthscale: 2.420   noise: 6.068
Iter 131/300 - Loss: 5.227   lengthscale: 2.300   noise: 6.264
Iter 141/300 - Loss: 5.222   lengthscale: 2.205   noise: 6.447
Iter 151/300 - Loss: 5.218   lengthscale: 2.133   noise: 6.618
Iter 161/300 - Loss: 5.215   lengthscale: 2.079   noise: 6.774
I



In [11]:
# Collect one summary row per saved test-set CSV (the subset-level metrics are
# repeated on every row of those files, so the first row is representative).
metric_cols = ['accuracy', 'precision', 'recall', 'specificity','TN', 'FN', 'FP', 'TP','f1', 'ROC-AUC', 'MCC', 'Balanced Accuracy',
       'model', 'cm', 'prediction_type', 'NEK', 'feat_type', 'strategy']
results = []
for nek in neks:
    bind_inhib = ['binding', 'inhibition'] if nek in ['2','9'] else ['binding']
    for bi in bind_inhib:
        this_bi = 'bind' if bi == 'binding' else 'inhib'
        for feat in ['moe', 'mfp']:
            result_df = pd.read_csv(f'{GP_path}NEK{nek}_{bi}_{feat}_UNDER_batch2_test_GP_matern.csv')
            first_row = result_df.iloc[[0]][metric_cols]
            results.append(first_row.values.flatten())

results_df =  pd.DataFrame(results,columns=metric_cols)
# Relabel 'scaled' as 'raw' in the identifying columns (the replacement is idempotent).
for label_col in ('model', 'strategy'):
    results_df[label_col] = results_df[label_col].str.replace('scaled', 'raw')

# NOTE(review): the file name contains 'UDNER' (sic), and the CSV is written
# before 'modeling_type' and 'set' are added, so it does not contain them.
results_df.to_csv(GP_path+'GP_matern_results_UDNER_batch2.csv', index=False)
results_df['modeling_type'] = 'GP_matern'
results_df['set'] = 'UNDER_batch2'

In [21]:
# Put the UNDER_batch2 GP results next to the original UNDER GP_matern results.
original_results = pd.read_csv(
    '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/general_NEK/final_paper_models.csv'
).loc[lambda d: (d['set'] == 'original') & (d['modeling_type'] == 'GP_matern')]
only_under = original_results.loc[original_results['strategy'] == 'UNDER']

# Name kept from the original cell; the frame holds GP results.
all_under_rf = pd.concat([results_df, only_under])
comparison = (
    all_under_rf[['NEK', 'modeling_type','feat_type','set','cm', 'recall', 'specificity']]
    .sort_values(['NEK', 'feat_type'])
)
comparison

Unnamed: 0,NEK,modeling_type,feat_type,set,cm,recall,specificity
1,NEK2_binding,GP_matern,mfp,UNDER_batch2,"[89, 182, 3, 9]",0.75,0.328413
5,NEK2_binding,GP_matern,mfp,original,"[4, 267, 0, 12]",1.0,0.01476
0,NEK2_binding,GP_matern,moe,UNDER_batch2,"[185, 86, 4, 8]",0.666667,0.682657
1,NEK2_binding,GP_matern,moe,original,"[212, 59, 5, 7]",0.583333,0.782288
3,NEK2_inhibition,GP_matern,mfp,UNDER_batch2,"[200, 180, 22, 6]",0.214286,0.526316
13,NEK2_inhibition,GP_matern,mfp,original,"[100, 280, 5, 23]",0.821429,0.263158
2,NEK2_inhibition,GP_matern,moe,UNDER_batch2,"[342, 38, 7, 21]",0.75,0.9
9,NEK2_inhibition,GP_matern,moe,original,"[333, 47, 7, 21]",0.75,0.876316
5,NEK3_binding,GP_matern,mfp,UNDER_batch2,"[98, 167, 6, 11]",0.647059,0.369811
21,NEK3_binding,GP_matern,mfp,original,"[125, 140, 3, 14]",0.823529,0.471698
