In [1]:
import itertools
import math
import os
import pickle
import shutil
import sys

import gpytorch
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from gpytorch.kernels import ScaleKernel, RBFKernel, MaternKernel
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.models import ExactGP
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix, f1_score, roc_curve,precision_recall_curve, auc
from sklearn.model_selection import KFold

sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
from RF_GSCV import * # RF_GSCV contains the calculate metrics function to get the TP, TN, FP, FN scores 
from RF_atomver import prediction_type

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

class DirichletGPModel(ExactGP):
    """
    Exact GP with one independent latent function per class, for use with a
    Dirichlet classification likelihood.

    Attributes:
        mean_module (gpytorch.means.ConstantMean): Constant mean, batched so that
            each class gets its own constant.
        covar_module (gpytorch.kernels.ScaleKernel): Scaled Matern kernel with
            ``nu=0.5``, batched so that each class gets its own hyperparameters.

    Args:
        train_x (torch.Tensor): Training data features.
        train_y (torch.Tensor): Training targets handed to ``ExactGP``.
        likelihood (gpytorch.likelihoods.Likelihood): The likelihood function.
        num_classes (int): Number of classes; used as the batch size of the
            mean and kernel modules.
    """
    def __init__(self, train_x, train_y, likelihood, num_classes):
        super().__init__(train_x, train_y, likelihood)
        # One batch entry per class, shared by the mean, base kernel and scale kernel.
        per_class = torch.Size([num_classes])
        self.mean_module = ConstantMean(batch_shape=per_class)
        matern = MaternKernel(nu=0.5, batch_shape=per_class)
        self.covar_module = ScaleKernel(matern, batch_shape=per_class)

    def forward(self, x):
        """
        Evaluate the mean and covariance modules at ``x``.

        Args:
            x (torch.Tensor): Input data features.
        Returns:
            gpytorch.distributions.MultivariateNormal: Distribution built from the
            mean and covariance evaluated at ``x``.
        """
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )


In [3]:
class Trainer:
    """
    Small helper that fits a GP classifier with Adam and scores its predictions.

    Args:
        model: The GP model to optimise (its ``parameters()`` are given to Adam).
        likelihood: The likelihood paired with ``model``; it must expose
            ``transformed_targets`` (read during training).
        iterations (int): Number of optimisation steps. Forced to 2 when the
            ``CI`` environment variable is set (smoke-test mode).
    """
    def __init__(self,model, likelihood, iterations):
        self.model = model
        self.likelihood = likelihood
        smoke_test = ('CI' in os.environ)
        self.n_iterations = 2 if smoke_test else iterations
        self.optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
        self.loss_fn = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)

    def train(self, train_x, train_y):
        """
        Run ``n_iterations`` Adam steps on the negative marginal log likelihood.

        Args:
            train_x (torch.Tensor): Training features passed to the model.
            train_y (torch.Tensor): Accepted for interface compatibility but not
                read; the loss uses ``self.likelihood.transformed_targets``.
        """
        self.model.train()
        self.likelihood.train()
        for i in range(self.n_iterations):
            self.optimizer.zero_grad()
            output = self.model(train_x)
            # The loss is summed before backpropagation so it is a scalar.
            loss = -self.loss_fn(output, self.likelihood.transformed_targets).sum()
            loss.backward()
            if i % 10 == 0:
                # NOTE(review): `second_noise_covar` is presumably only present when the
                # likelihood was built with learn_additional_noise=True -- confirm.
                print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
                    i + 1, self.n_iterations, loss.item(),
                    self.model.covar_module.base_kernel.lengthscale.mean().item(),
                    self.model.likelihood.second_noise_covar.noise.mean().item()
                ))
            self.optimizer.step()

    def predict(self, input):
        """
        Make predictions using the GP model.

        Args:
            input (torch.Tensor): The input data for making predictions.

        Returns:
            dist: The model output distribution for ``input``.
            observed_pred: ``dist`` passed through the likelihood.
            pred_means (torch.Tensor): ``dist.loc``, the per-class means.
            class_pred (torch.Tensor): Index of the largest mean along dim 0.
        """
        self.model.eval()
        self.likelihood.eval()

        with gpytorch.settings.fast_pred_var(), torch.no_grad():
            # Run the model once and reuse the result; the original called the
            # model three times to obtain the same quantities.
            dist = self.model(input)
            pred_means = dist.loc
            observed_pred = self.likelihood(dist)
            class_pred = pred_means.max(0)[1]

        return dist, observed_pred, pred_means, class_pred

    def evaluate(self, x_input, y_true):
        """
        Predict class labels for ``x_input``.

        The model and likelihood are switched to eval mode and autograd is
        disabled, so the method gives posterior predictions even when it is
        called directly after ``train`` (without a prior ``predict`` call).

        Args:
            x_input (torch.Tensor): The input data features.
            y_true: Unused; kept so existing callers keep working.

        Returns:
            y_pred (numpy.ndarray): The predicted class labels.
        """
        self.model.eval()
        self.likelihood.eval()
        with torch.no_grad():
            y_pred = self.model(x_input).loc.max(0)[1].cpu().numpy()

        return y_pred

    def gp_results(self, x_input, y_true, plot_title=None):
        """
        Calculate evaluation metrics and print results.

        Args:
            x_input (torch.Tensor): The input data features.
            y_true (torch.Tensor or numpy.ndarray): The true labels for the input data.
            plot_title (str, optional): Currently unused (the confusion-matrix
                plot that consumed it is disabled).

        Returns:
            dict: accuracy, precision, recall, specificity and the TN/FN/FP/TP
            counts. ``specificity`` is NaN when there are no negative samples.
        """
        y_pred = self.evaluate(x_input, y_true)
        if isinstance(y_true, torch.Tensor):
            y_true = y_true.detach().cpu().numpy().reshape(-1)
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        tp, tn, fp, fn = calculate_metrics(y_true, y_pred)
        # Guard the denominator so a split without negatives does not crash.
        negatives = tn + fp
        specificity = tn / negatives if negatives else float('nan')
        cm = confusion_matrix(y_true, y_pred)
        print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, specificity: {specificity:.4f}, cm: {cm}')
        return {'accuracy': accuracy, 'precision': precision,  'recall':recall, 'specificity':specificity, 'TN': tn, 'FN': fn, 'FP': fp, 'TP': tp }

       

In [15]:
def make_torch_tens_float_mod(filepath, filename):
    """
    Load ``filepath + filename + '.csv'`` and split it into float32 tensors.

    Rows are split on the ``subset`` column (``'train'`` / ``'test'``). The
    bookkeeping columns (``subset``, ``compound_id``, ``base_rdkit_smiles``,
    ``fold``, ``active``) are dropped to form the feature matrix, and ``active``
    is used as the label. The number of feature columns is printed.

    Args:
        filepath (str): Prefix that is concatenated directly in front of
            ``filename`` (no separator is inserted).
        filename (str): File name without the ``.csv`` extension.

    Returns:
        tuple: ``(trainX, trainy, testX, testy)`` as ``torch.float32`` tensors.
    """
    table = pd.read_csv(filepath + filename + '.csv')
    non_feature_cols = ['subset', 'compound_id', 'base_rdkit_smiles', 'fold', 'active']

    train_rows = table[table['subset'] == 'train']
    test_rows = table[table['subset'] == 'test']

    train_features = train_rows.drop(columns=non_feature_cols)
    print(len(train_features.columns))
    test_features = test_rows.drop(columns=non_feature_cols)

    def _to_float32(array):
        # Go through float64 first (as the original did), then down to float32.
        return torch.as_tensor(array.astype("double"), dtype=torch.float32)

    trainX = _to_float32(train_features.to_numpy())
    testX = _to_float32(test_features.to_numpy())
    trainy = _to_float32(train_rows['active'].to_numpy().reshape(-1))
    testy = _to_float32(test_rows['active'].to_numpy().reshape(-1))
    return trainX, trainy, testX, testy

In [16]:

GP_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2/'


def _sample_class_probabilities(latent_dist, n_samples):
    """Return the Monte Carlo mean and std of class probabilities sampled from ``latent_dist``."""
    with torch.no_grad():
        # Exponentiate the latent samples and normalise over dim -2, which is the
        # per-class batch dimension of the DirichletGPModel output.
        samples = latent_dist.sample(torch.Size((n_samples,))).exp()
        probabilities = samples / samples.sum(-2, keepdim=True)
    return probabilities.mean(0), probabilities.std(0)


def _build_perf_df(y_true, latent_dist, observed_pred, pred_means, model_name, subset, n_samples):
    """Build the per-sample prediction table plus split-level metrics for one subset."""
    if isinstance(y_true, torch.Tensor):
        y_true_np = y_true.detach().cpu().numpy()
    else:
        y_true_np = np.asarray(y_true)
    y_pred_np = pred_means.max(0)[1].cpu().numpy()
    pred_mean = observed_pred.mean.numpy()
    pred_var = observed_pred.variance.numpy()
    probs, prob_stds = _sample_class_probabilities(latent_dist, n_samples)
    probs = probs.numpy()
    prob_stds = prob_stds.numpy()

    # Column order matches the files written by earlier runs of this notebook.
    perf_df = pd.DataFrame()
    perf_df['mean_pred_class0'] = pred_mean[0,]
    perf_df['mean_pred_class1'] = pred_mean[1,]
    perf_df['y'] = y_true_np
    perf_df['y_pred'] = y_pred_np
    perf_df['var_pred_class0'] = pred_var[0,]
    perf_df['var_pred_class1'] = pred_var[1,]
    perf_df['pred_prob_class0'] = probs[0,]
    perf_df['pred_prob_class1'] = probs[1,]
    perf_df['pred_prob_std_class0'] = prob_stds[0,]
    perf_df['pred_prob_std_class1'] = prob_stds[1,]
    perf_df['model'] = model_name
    perf_df['subset'] = subset
    # The flattened confusion matrix is repeated on every row so each row is self-describing.
    cm_flattened = confusion_matrix(y_true_np, y_pred_np).flatten().tolist()
    perf_df['cm'] = [cm_flattened] * len(perf_df)
    perf_df['prediction_type'] = perf_df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)
    # ROC-AUC needs a continuous score; hard 0/1 labels collapse the curve to a
    # single operating point, so the sampled class-1 probability is used.
    perf_df['ROC-AUC'] = roc_auc_score(y_true_np, probs[1,])
    perf_df['MCC'] = matthews_corrcoef(y_true_np, y_pred_np)
    perf_df['Balanced Accuracy'] = balanced_accuracy_score(y_true_np, y_pred_np)
    perf_df['f1'] = f1_score(y_true_np, y_pred_np)
    return perf_df


def save_results(trainX, trainy, testX, testy, root_name,GP_path, n_iterations=300, n_samples=100):
    """
    Train a Dirichlet Gaussian Process model and save the training and test performance results.

    This function trains a Dirichlet GP model on the given training data, evaluates it on both the
    training and test data, pickles the fitted model and likelihood, and returns per-sample
    prediction tables. The tables hard-code ``class0`` / ``class1`` columns, so this is a
    binary-classification helper.

    Args:
        trainX (torch.Tensor): The training data features.
        trainy (torch.Tensor): The training data labels.
        testX (torch.Tensor): The test data features.
        testy (torch.Tensor): The test data labels.
        root_name (str): The root name used for labeling the model in the results and file names.
        GP_path (str): Directory in which the pickled model and likelihood are written.
        n_iterations (int, optional): The number of training iterations. Default is 300.
        n_samples (int, optional): Number of posterior samples used to estimate class
            probabilities for both subsets. Default is 100. (Previously this argument was
            ignored and 256 / 100 samples were hard-coded for train / test.)

    Returns:
        train_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the training data.
        test_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the test data.
    """
    likelihood = DirichletClassificationLikelihood(trainy.long(), learn_additional_noise=True)
    model = DirichletGPModel(trainX, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)
    trainer = Trainer(model, likelihood, n_iterations)
    trainer.train(trainX, trainy)

    train_dist, train_observed_pred, train_pred_means, _ = trainer.predict(trainX)
    train_results = trainer.gp_results(trainX, trainy)
    test_dist, test_observed_pred, test_pred_means, _ = trainer.predict(testX)
    test_results = trainer.gp_results(testX, testy)

    model_name = f'{root_name}_GP_Dirichlet_matern'
    train_perf_df = _build_perf_df(trainy, train_dist, train_observed_pred, train_pred_means,
                                   model_name, 'train', n_samples)
    test_perf_df = _build_perf_df(testy, test_dist, test_observed_pred, test_pred_means,
                                  model_name, 'test', n_samples)

    # os.path.join gives the same path as plain concatenation when GP_path ends
    # with a separator, and still works when it does not.
    with open(os.path.join(GP_path, f'{model_name}_model.pkl'), 'wb') as f:
        pickle.dump(model, f)
    with open(os.path.join(GP_path, f'{model_name}_likelihood.pkl'), 'wb') as f:
        pickle.dump(likelihood, f)

    # Broadcast the scalar split-level metrics from Trainer.gp_results onto every row.
    for k, val in train_results.items():
        train_perf_df[k] = val
    for k, val in test_results.items():
        test_perf_df[k] = val
    return train_perf_df, test_perf_df


In [17]:
# Train one Matern Dirichlet GP per (NEK, assay type, feature type, sampling strategy)
# combination and write the per-sample train/test prediction tables next to the input CSVs.
GP_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2/'
samplings = ['UNDER']
feat_types = ['moe', 'mfp']
neks = ['2', '3', '5', '9']
for nek in neks:
    print(f'NEK{nek}')

    # Inhibition data sets are only processed for NEK2 and NEK9.
    bind_inhib = ['binding', 'inhibition'] if nek in ['2','9'] else ['binding']
    for bi, feat, samp in itertools.product(bind_inhib, feat_types, samplings):
        this_bi = {'binding': 'bind', 'inhibition': 'inhib'}[bi]
        this_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2/'
        file_root = f'NEK{nek}_{bi}_{feat}_{samp}_batch2'
        print(file_root)
        trainX, trainy, testX, testy = make_torch_tens_float_mod(this_dir,file_root)
        print(f'trainX:{trainX.shape}, train y: {trainy.shape}, testX: {testX.shape}, test y: {testy.shape}')
        train_perf_df, test_perf_df = save_results(trainX, trainy, testX, testy,file_root,GP_path)
        # Tag both tables with the experiment identifiers before writing them out.
        for perf_df in (train_perf_df, test_perf_df):
            perf_df['NEK'] = f'NEK{nek}_{bi}'
            perf_df['strategy'] = f'{samp}'
            perf_df['feat_type'] = f'{feat}'
        train_perf_df.to_csv(f'{this_dir}{file_root}_train_GP_matern.csv',index=False)
        test_perf_df.to_csv(f'{this_dir}{file_root}_test_GP_matern.csv',index=False)
        print()


NEK2
NEK2_binding_moe_UNDER_batch2
306
trainX:torch.Size([90, 306]), train y: torch.Size([90]), testX: torch.Size([283, 306]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.112   noise: 1.297
Iter 21/300 - Loss: 5.438   lengthscale: 1.703   noise: 1.982
Iter 31/300 - Loss: 5.262   lengthscale: 2.701   noise: 2.572
Iter 41/300 - Loss: 5.206   lengthscale: 3.933   noise: 3.007
Iter 51/300 - Loss: 5.188   lengthscale: 5.093   noise: 3.302
Iter 61/300 - Loss: 5.181   lengthscale: 5.985   noise: 3.484
Iter 71/300 - Loss: 5.178   lengthscale: 6.620   noise: 3.582
Iter 81/300 - Loss: 5.175   lengthscale: 7.082   noise: 3.622
Iter 91/300 - Loss: 5.174   lengthscale: 7.441   noise: 3.624
Iter 101/300 - Loss: 5.173   lengthscale: 7.739   noise: 3.600
Iter 111/300 - Loss: 5.172   lengthscale: 7.997   noise: 3.560
Iter 121/300 - Loss: 5.171   lengthscale: 8.227   noise: 3.510
Iter 131/300 - Loss: 5.170   lengthscal




NEK2_binding_mfp_UNDER_batch2
2048
trainX:torch.Size([90, 2048]), train y: torch.Size([90]), testX: torch.Size([283, 2048]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.114   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 6.265   lengthscale: 0.443   noise: 1.318
Iter 21/300 - Loss: 5.732   lengthscale: 0.225   noise: 2.137
Iter 31/300 - Loss: 5.472   lengthscale: 0.107   noise: 2.987
Iter 41/300 - Loss: 5.351   lengthscale: 0.054   noise: 3.752
Iter 51/300 - Loss: 5.294   lengthscale: 0.027   noise: 4.391
Iter 61/300 - Loss: 5.263   lengthscale: 0.014   noise: 4.901
Iter 71/300 - Loss: 5.241   lengthscale: 0.009   noise: 5.302
Iter 81/300 - Loss: 5.232   lengthscale: 0.006   noise: 5.623
Iter 91/300 - Loss: 5.224   lengthscale: 0.005   noise: 5.888
Iter 101/300 - Loss: 5.219   lengthscale: 0.004   noise: 6.114
Iter 111/300 - Loss: 5.215   lengthscale: 0.003   noise: 6.311
Iter 121/300 - Loss: 5.211   lengthscale: 0.003   noise: 6.485
Iter 131/300 - Loss: 5.208   lengthscale

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


accuracy: 0.5000, precision: 0.0000, recall: 0.0000, specificity: 1.0000, cm: [[45  0]
 [45  0]]
accuracy: 0.9576, precision: 0.0000, recall: 0.0000, specificity: 1.0000, cm: [[271   0]
 [ 12   0]]

NEK2_inhibition_moe_UNDER_batch2
306
trainX:torch.Size([224, 306]), train y: torch.Size([224]), testX: torch.Size([408, 306]), test y: torch.Size([408])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.930   lengthscale: 1.290   noise: 1.297
Iter 21/300 - Loss: 5.430   lengthscale: 2.091   noise: 1.981
Iter 31/300 - Loss: 5.214   lengthscale: 3.189   noise: 2.555
Iter 41/300 - Loss: 5.086   lengthscale: 4.507   noise: 2.904
Iter 51/300 - Loss: 5.001   lengthscale: 5.821   noise: 2.984
Iter 61/300 - Loss: 4.946   lengthscale: 6.963   noise: 2.831
Iter 71/300 - Loss: 4.905   lengthscale: 7.904   noise: 2.523
Iter 81/300 - Loss: 4.871   lengthscale: 8.683   noise: 2.133
Iter 91/300 - Loss: 4.842   lengthscale: 9.349   noise: 1.722
Iter 101/300 - Loss: 4.816   



accuracy: 1.0000, precision: 1.0000, recall: 1.0000, specificity: 1.0000, cm: [[112   0]
 [  0 112]]
accuracy: 0.8431, precision: 0.2692, recall: 0.7500, specificity: 0.8500, cm: [[323  57]
 [  7  21]]

NEK2_inhibition_mfp_UNDER_batch2
2048
trainX:torch.Size([224, 2048]), train y: torch.Size([224]), testX: torch.Size([408, 2048]), test y: torch.Size([408])
Iter 1/300 - Loss: 7.111   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 6.250   lengthscale: 1.276   noise: 1.319
Iter 21/300 - Loss: 5.729   lengthscale: 2.123   noise: 2.143
Iter 31/300 - Loss: 5.473   lengthscale: 2.863   noise: 3.003
Iter 41/300 - Loss: 5.355   lengthscale: 3.249   noise: 3.762
Iter 51/300 - Loss: 5.297   lengthscale: 3.414   noise: 4.382
Iter 61/300 - Loss: 5.266   lengthscale: 3.482   noise: 4.878
Iter 71/300 - Loss: 5.247   lengthscale: 3.509   noise: 5.276
Iter 81/300 - Loss: 5.235   lengthscale: 3.521   noise: 5.600
Iter 91/300 - Loss: 5.227   lengthscale: 3.526   noise: 5.871
Iter 101/300 - Loss: 5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


accuracy: 0.9314, precision: 0.0000, recall: 0.0000, specificity: 1.0000, cm: [[380   0]
 [ 28   0]]

NEK3
NEK3_binding_moe_UNDER_batch2
306
trainX:torch.Size([128, 306]), train y: torch.Size([128]), testX: torch.Size([282, 306]), test y: torch.Size([282])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.247   noise: 1.297
Iter 21/300 - Loss: 5.439   lengthscale: 2.029   noise: 1.982
Iter 31/300 - Loss: 5.264   lengthscale: 3.122   noise: 2.574
Iter 41/300 - Loss: 5.207   lengthscale: 4.412   noise: 3.011
Iter 51/300 - Loss: 5.190   lengthscale: 5.545   noise: 3.307
Iter 61/300 - Loss: 5.185   lengthscale: 6.317   noise: 3.494
Iter 71/300 - Loss: 5.183   lengthscale: 6.792   noise: 3.601
Iter 81/300 - Loss: 5.182   lengthscale: 7.087   noise: 3.655
Iter 91/300 - Loss: 5.181   lengthscale: 7.286   noise: 3.672
Iter 101/300 - Loss: 5.181   lengthscale: 7.435   noise: 3.664
Iter 111/300 - Loss: 5.180   lengthscale: 7.556   noise: 3.64



accuracy: 1.0000, precision: 1.0000, recall: 1.0000, specificity: 1.0000, cm: [[64  0]
 [ 0 64]]
accuracy: 0.6667, precision: 0.0947, recall: 0.5294, specificity: 0.6755, cm: [[179  86]
 [  8   9]]

NEK3_binding_mfp_UNDER_batch2
2048
trainX:torch.Size([128, 2048]), train y: torch.Size([128]), testX: torch.Size([282, 2048]), test y: torch.Size([282])
Iter 1/300 - Loss: 7.113   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 6.261   lengthscale: 1.193   noise: 1.319
Iter 21/300 - Loss: 5.734   lengthscale: 2.024   noise: 2.139
Iter 31/300 - Loss: 5.471   lengthscale: 2.855   noise: 2.996
Iter 41/300 - Loss: 5.354   lengthscale: 3.295   noise: 3.759
Iter 51/300 - Loss: 5.297   lengthscale: 3.480   noise: 4.390
Iter 61/300 - Loss: 5.265   lengthscale: 3.555   noise: 4.895
Iter 71/300 - Loss: 5.246   lengthscale: 3.586   noise: 5.297
Iter 81/300 - Loss: 5.234   lengthscale: 3.599   noise: 5.621
Iter 91/300 - Loss: 5.226   lengthscale: 3.605   noise: 5.889
Iter 101/300 - Loss: 5.220   



Iter 11/300 - Loss: 5.931   lengthscale: 1.281   noise: 1.297
Iter 21/300 - Loss: 5.436   lengthscale: 2.112   noise: 1.981
Iter 31/300 - Loss: 5.251   lengthscale: 3.228   noise: 2.568
Iter 41/300 - Loss: 5.176   lengthscale: 4.549   noise: 2.981
Iter 51/300 - Loss: 5.140   lengthscale: 5.854   noise: 3.219
Iter 61/300 - Loss: 5.121   lengthscale: 6.981   noise: 3.310
Iter 71/300 - Loss: 5.109   lengthscale: 7.899   noise: 3.296
Iter 81/300 - Loss: 5.101   lengthscale: 8.646   noise: 3.214
Iter 91/300 - Loss: 5.094   lengthscale: 9.266   noise: 3.092
Iter 101/300 - Loss: 5.088   lengthscale: 9.795   noise: 2.948
Iter 111/300 - Loss: 5.084   lengthscale: 10.257   noise: 2.792
Iter 121/300 - Loss: 5.079   lengthscale: 10.666   noise: 2.632
Iter 131/300 - Loss: 5.075   lengthscale: 11.033   noise: 2.472
Iter 141/300 - Loss: 5.071   lengthscale: 11.365   noise: 2.313
Iter 151/300 - Loss: 5.067   lengthscale: 11.667   noise: 2.157
Iter 161/300 - Loss: 5.064   lengthscale: 11.943   noise: 2



Iter 11/300 - Loss: 6.257   lengthscale: 1.259   noise: 1.319
Iter 21/300 - Loss: 5.732   lengthscale: 2.110   noise: 2.140
Iter 31/300 - Loss: 5.471   lengthscale: 2.898   noise: 2.998
Iter 41/300 - Loss: 5.354   lengthscale: 3.306   noise: 3.759
Iter 51/300 - Loss: 5.297   lengthscale: 3.478   noise: 4.386
Iter 61/300 - Loss: 5.265   lengthscale: 3.548   noise: 4.889
Iter 71/300 - Loss: 5.246   lengthscale: 3.576   noise: 5.289
Iter 81/300 - Loss: 5.234   lengthscale: 3.588   noise: 5.614
Iter 91/300 - Loss: 5.226   lengthscale: 3.593   noise: 5.883
Iter 101/300 - Loss: 5.221   lengthscale: 3.596   noise: 6.112
Iter 111/300 - Loss: 5.216   lengthscale: 3.598   noise: 6.311
Iter 121/300 - Loss: 5.213   lengthscale: 3.599   noise: 6.485
Iter 131/300 - Loss: 5.211   lengthscale: 3.600   noise: 6.640
Iter 141/300 - Loss: 5.209   lengthscale: 3.600   noise: 6.777
Iter 151/300 - Loss: 5.208   lengthscale: 3.601   noise: 6.898
Iter 161/300 - Loss: 5.207   lengthscale: 3.601   noise: 7.006
I



accuracy: 0.0806, precision: 0.0806, recall: 1.0000, specificity: 0.0000, cm: [[  0 228]
 [  0  20]]

NEK9
NEK9_binding_moe_UNDER_batch2
306
trainX:torch.Size([96, 306]), train y: torch.Size([96]), testX: torch.Size([283, 306]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.247   noise: 1.297
Iter 21/300 - Loss: 5.438   lengthscale: 2.031   noise: 1.982
Iter 31/300 - Loss: 5.260   lengthscale: 3.128   noise: 2.572
Iter 41/300 - Loss: 5.195   lengthscale: 4.450   noise: 3.000
Iter 51/300 - Loss: 5.164   lengthscale: 5.813   noise: 3.269
Iter 61/300 - Loss: 5.147   lengthscale: 7.054   noise: 3.404
Iter 71/300 - Loss: 5.137   lengthscale: 8.106   noise: 3.437
Iter 81/300 - Loss: 5.130   lengthscale: 8.975   noise: 3.403
Iter 91/300 - Loss: 5.124   lengthscale: 9.698   noise: 3.327
Iter 101/300 - Loss: 5.120   lengthscale: 10.308   noise: 3.229
Iter 111/300 - Loss: 5.117   lengthscale: 10.833   noise: 3.11



accuracy: 1.0000, precision: 1.0000, recall: 1.0000, specificity: 1.0000, cm: [[48  0]
 [ 0 48]]
accuracy: 0.6678, precision: 0.0990, recall: 0.7692, specificity: 0.6630, cm: [[179  91]
 [  3  10]]

NEK9_binding_mfp_UNDER_batch2
2048
trainX:torch.Size([96, 2048]), train y: torch.Size([96]), testX: torch.Size([283, 2048]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.114   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 6.258   lengthscale: 1.157   noise: 1.319
Iter 21/300 - Loss: 5.736   lengthscale: 1.961   noise: 2.139
Iter 31/300 - Loss: 5.471   lengthscale: 2.806   noise: 2.995
Iter 41/300 - Loss: 5.354   lengthscale: 3.259   noise: 3.759
Iter 51/300 - Loss: 5.297   lengthscale: 3.450   noise: 4.393
Iter 61/300 - Loss: 5.265   lengthscale: 3.529   noise: 4.899
Iter 71/300 - Loss: 5.246   lengthscale: 3.561   noise: 5.301
Iter 81/300 - Loss: 5.234   lengthscale: 3.574   noise: 5.625
Iter 91/300 - Loss: 5.226   lengthscale: 3.581   noise: 5.893
Iter 101/300 - Loss: 5.220   le

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



NEK9_inhibition_moe_UNDER_batch2
306
trainX:torch.Size([66, 306]), train y: torch.Size([66]), testX: torch.Size([80, 306]), test y: torch.Size([80])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.931   lengthscale: 1.239   noise: 1.297
Iter 21/300 - Loss: 5.436   lengthscale: 1.990   noise: 1.982
Iter 31/300 - Loss: 5.242   lengthscale: 3.056   noise: 2.566
Iter 41/300 - Loss: 5.141   lengthscale: 4.377   noise: 2.957
Iter 51/300 - Loss: 5.077   lengthscale: 5.742   noise: 3.126
Iter 61/300 - Loss: 5.031   lengthscale: 6.965   noise: 3.090
Iter 71/300 - Loss: 4.994   lengthscale: 8.003   noise: 2.892
Iter 81/300 - Loss: 4.961   lengthscale: 8.887   noise: 2.580
Iter 91/300 - Loss: 4.930   lengthscale: 9.655   noise: 2.196
Iter 101/300 - Loss: 4.900   lengthscale: 10.343   noise: 1.781
Iter 111/300 - Loss: 4.872   lengthscale: 10.972   noise: 1.375
Iter 121/300 - Loss: 4.847   lengthscale: 11.557   noise: 1.014
Iter 131/300 - Loss: 4.824   lengthscal



2048
trainX:torch.Size([66, 2048]), train y: torch.Size([66]), testX: torch.Size([80, 2048]), test y: torch.Size([80])
Iter 1/300 - Loss: 7.114   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 6.262   lengthscale: 0.641   noise: 1.319
Iter 21/300 - Loss: 5.741   lengthscale: 0.367   noise: 2.138
Iter 31/300 - Loss: 5.468   lengthscale: 0.188   noise: 2.989
Iter 41/300 - Loss: 5.353   lengthscale: 0.089   noise: 3.755
Iter 51/300 - Loss: 5.297   lengthscale: 0.047   noise: 4.394
Iter 61/300 - Loss: 5.263   lengthscale: 0.028   noise: 4.905
Iter 71/300 - Loss: 5.246   lengthscale: 0.018   noise: 5.308
Iter 81/300 - Loss: 5.233   lengthscale: 0.012   noise: 5.630
Iter 91/300 - Loss: 5.224   lengthscale: 0.008   noise: 5.895
Iter 101/300 - Loss: 5.220   lengthscale: 0.005   noise: 6.121
Iter 111/300 - Loss: 5.215   lengthscale: 0.004   noise: 6.317
Iter 121/300 - Loss: 5.212   lengthscale: 0.003   noise: 6.489
Iter 131/300 - Loss: 5.210   lengthscale: 0.003   noise: 6.642
Iter 141/3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:

# Collect the first row of every per-model results CSV written by the training cell.
# Every row of such a file carries the same split-level metrics, so row 0 is enough.
GP_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/under_batch2/'
train_results = []
test_results = []
neks = ['2', '3', '5', '9']
feat_types = ['moe', 'mfp']
for nek in neks:
    # Inhibition data sets are only present for NEK2 and NEK9.
    bind_inhib = ['binding', 'inhibition']
    if nek in ['3','5']:
        bind_inhib = ['binding']
    for bi in bind_inhib:
        if bi == 'binding':
            this_bi = 'bind'
        if bi == 'inhibition':
            this_bi = 'inhib'
        for feat in feat_types:
            for samp in ['UNDER']:

                print(f'NEK{nek} {this_bi} {feat} {samp}')
                file_root = f'NEK{nek}_{bi}_{feat}_{samp}_batch2'
                # Read from GP_path with the `_train_GP_matern.csv` / `_test_GP_matern.csv`
                # names used by the training cell. The previous code referenced `fold_dir`,
                # which is never assigned in this notebook, and a `_batch2.csv` suffix that
                # the training cell does not write.
                train = pd.read_csv(f'{GP_path}{file_root}_train_GP_matern.csv').iloc[0]
                test = pd.read_csv(f'{GP_path}{file_root}_test_GP_matern.csv').iloc[0]
                train_results.append(train)
                test_results.append(test)

In [None]:
# Every element of train_results / test_results is a pandas Series (one CSV row).
# pd.concat on a list of Series stacks them end to end into one long Series, so
# build the frames with the DataFrame constructor to get one row per model.
train_df = pd.DataFrame(train_results).reset_index(drop=True)
test_df = pd.DataFrame(test_results).reset_index(drop=True)

In [None]:
# Split-level columns kept in the summary tables; per-sample columns are dropped.
metric_cols = [
    'model', 'NEK', 'strategy', 'feat_type', 'cm', 'prediction_type',
    'recall', 'ROC-AUC', 'MCC', 'Balanced Accuracy', 'f1',
    'accuracy', 'precision', 'specificity',
    'TN', 'FN', 'FP', 'TP',
]

# NOTE(review): the destination is the `idea5_dir` folder and the file names say
# "foldval", while the rows collected above come from the batch2 runs -- confirm
# this is the intended output location before running.
gp_path = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/idea5_dir/'

train_df = pd.DataFrame(train_results, columns=metric_cols)
train_df.to_csv(f'{gp_path}GP_maternkern_train_results_foldval.csv', index=False)

test_df = pd.DataFrame(test_results, columns=metric_cols)
test_df.to_csv(f'{gp_path}GP_maternkern_test_results_foldval.csv', index=False)


In [None]:
# import os

# ends = [] 
# dests = []
# for fold in folds: 
#     # train_end = f'df_{fold}_train_GP_matern.csv'
#     # test_end = f'df_{fold}_test_GP_matern.csv'
#     # ends.append(train_end)
#     # ends.append(test_end)
#     # now do with pickle files
#     # NEK2_binding_mfp_ADASYN_df_fold1_GP_Dirichlet_matern_likelihood.pkl
#     train_end = f'df_{fold}_GP_Dirichlet_matern_likelihood.pkl'
#     test_end = f'df_{fold}_GP_Dirichlet_matern_model.pkl'
#     ends.append(train_end)
#     ends.append(test_end)
#     fold_dir = f'/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/undersampler_validation/idea5_dir/{fold}/'
#     dests.append(fold_dir)
#     dests.append(fold_dir) 
                 
# directory= f'/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_matern_kernel/' 
                                       
# def search_and_move_files(source_directory, destination_directory, filename_endings):
#     moved_files = []
#     not_found = filename_endings.copy()  # Start with all endings in `not_found`

#     for root, _, files in os.walk(source_directory):
#         for file in files:
#             # Check if the file ends with any of the filename endings
#             for ending in filename_endings:
#                 if file.endswith(ending):
#                     source_path = os.path.join(root, file)
#                     destination_path = os.path.join(destination_directory, file)
                    
#                     # Ensure destination folder exists
#                     os.makedirs(destination_directory, exist_ok=True)
                    
#                     # Move the file
#                     try:
#                         shutil.move(source_path, destination_path)
#                         moved_files.append(destination_path)
#                         not_found.remove(ending)  # Remove from `not_found` if found and moved
#                         print(f"Moved {file} to {destination_directory}")
#                     except Exception as e:
#                         print(f"Error moving {file}: {e}")
#                     break  # Move to the next file once a match is found
                    
#     # Report missing files
#     if not_found:
#         print("Files not found:", not_found)
    
#     return moved_files, not_found

# # Run the combined search and move function for each ending and its destination
# for end, dest in zip(ends, dests): 
#     moved_files, not_found_files = search_and_move_files(directory, dest, [end])
#     print("Moved files:", moved_files)
#     if not_found_files:
#         print("Files not found:", not_found_files)

In [None]:
# Scratch arithmetic left over from exploration.
# NOTE(review): the meaning of the operands is not explained in any visible cell -- confirm or delete.
1512+112

In [None]:
# Display the test-set summary rows whose 'NEK' label is 'NEK2_inhibition'.
test_df[test_df['NEK'] == 'NEK2_inhibition']

In [None]:

# Collect the first row of every NEK9 inhibition results CSV (all sampling
# strategies and feature types) and preview each training table.
# NOTE(review): `GP_path_matern` is not assigned in any visible cell of this
# notebook -- it must be defined before this cell can run on a fresh kernel.
train_results2 = []
test_results2 = []
neks = ['9']
count=0
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN']
feat_types = ['moe', 'mfp']
for nek in neks:

    bi ='inhibition'
    this_bi = 'inhib'
    for feat in feat_types:
        for samp in samplings:
            count+=1
            print(f'{count}. NEK{nek} {this_bi} {feat} {samp}')
            file_root = f'NEK{nek}_{bi}_{feat}_{samp}'
            # Parse each CSV once; the training table is reused for the first-row
            # record, the preview and the column listing (it was re-read three times).
            train_table = pd.read_csv(f'{GP_path_matern}{file_root}_train_GP_matern.csv')
            train = train_table.iloc[0]
            test = pd.read_csv(f'{GP_path_matern}{file_root}_test_GP_matern.csv').iloc[0]

            display(train_table.head(2))
            # Overwritten every iteration: after the loop this holds the columns
            # of the last training table that was read.
            nek9_inhib_cols = train_table.columns
            train_results2.append(train)
            test_results2.append(test)

In [None]:
# NOTE(review): `nek2_inhib_cols` is not assigned in any visible cell -- confirm where it is defined.
nek2_inhib_cols

In [None]:
# Column index of the last NEK9 inhibition training table read in the collection loop.
nek9_inhib_cols

In [None]:
# Compare the two column listings.
# NOTE(review): `nek2_inhib_cols` is not assigned in any visible cell; if both are pandas
# Index objects this is presumably an element-wise comparison that requires equal lengths -- confirm.
nek2_inhib_cols == nek9_inhib_cols