In [1]:
import math
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import os

import shutil
import sklearn
from sklearn.model_selection import KFold
import gpytorch
from gpytorch.models import ExactGP
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel

from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import precision_score, recall_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix, f1_score, roc_curve,precision_recall_curve, auc

import sys

sys.path.append('/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/')
from RF_GSCV import * # RF_GSCV contains the calculate metrics function to get the TP, TN, FP, FN scores 
from RF_atomver import prediction_type

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [10]:

class ExactGPModel(gpytorch.models.ExactGP):
    # def __init__(self, train_x, train_y, likelihood):
    def __init__(self, train_x, train_y, likelihood, num_classes):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


In [18]:
class Trainer: 
    def __init__(self,model, likelihood, iterations): 
        self.model = model
        self.likelihood = likelihood 
        smoke_test = ('CI' in os.environ)
        self.n_iterations = 2 if smoke_test else iterations
        self.optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
        self.loss_fn = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)
        
    def train(self, train_x, train_y): 
        self.model.train()
        self.likelihood.train()
        predictions = [] 
        for i in range(self.n_iterations): 
            self.optimizer.zero_grad()
            output = self.model(train_x)
            loss = -self.loss_fn(output, self.likelihood.transformed_targets).sum()
            # loss = -self.loss_fn(output, train_y).sum()
            loss.backward()
            if (i%10==0): 
                print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
                    i + 1, self.n_iterations, loss.item(),
                    self.model.covar_module.base_kernel.lengthscale.mean().item(),
                    self.model.likelihood.second_noise_covar.noise.mean().item()
                ))
             
            self.optimizer.step() 

    def predict(self, input): 
        """
        Make predictions using the GP model.
        Args:
            input (torch.Tensor): The input data for making predictions.
        
        Returns:
            dist (gpytorch.distributions.MultivariateNormal): The distribution representing the GP posterior.
            observed_pred (gpytorch.distributions.MultivariateNormal): The predicted distribution considering the likelihood.
            pred_means (torch.Tensor): The means of the predicted distributions.
            class_pred (torch.Tensor): The predicted class labels.
        """
        self.model.eval()
        self.likelihood.eval()

        with gpytorch.settings.fast_pred_var(), torch.no_grad():
            dist = self.model(input)     # output distribution
            pred_means = dist.loc          # means of distributino 
            observed_pred = self.likelihood(self.model(input))    # likelihood predictions mean and var  

            class_pred = self.model(input).loc.max(0)[1]
            
        return dist, observed_pred, pred_means, class_pred

    def evaluate(self, x_input, y_true): 
        """
        Evaluate the GP model.
        Args:
            x_input (torch.Tensor): The input data features.
            y_true (torch.Tensor): The true labels for the input data.
        Returns:
            y_pred (numpy.ndarray): The predicted class labels.
        """
        y_pred = self.model(x_input).loc.max(0)[1].numpy()
        return y_pred

    def gp_results(self, x_input, y_true, plot_title=None): 
        """
        Calculate evaluation metrics and print results.
        Args:
            x_input (torch.Tensor): The input data features.
            y_true (torch.Tensor or numpy.ndarray): The true labels for the input data.
            plot_title (str, optional): The title for the confusion matrix plot.
        Returns:
            dict: A dictionary containing evaluation metrics and confusion matrix components.
        """
        y_pred = self.evaluate(x_input, y_true) 
        if isinstance(y_true, torch.Tensor):
            y_true = y_true.numpy().reshape(-1)
        # plot_confusion_matrix(y_true, y_pred, ['0','1'], title=plot_title)
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        dist = self.model(x_input)     # get predicted distributions 
        pred_means = dist.loc          # means for predicted dist  

        recall = recall_score(y_true, y_pred)
        tp, tn, fp, fn = calculate_metrics(y_true, y_pred) 
        sensitivity = tp / (tp + fn) 
        specificity = tn / (tn + fp) 
        cm = confusion_matrix(y_true, y_pred)
        print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, specificity: {specificity:.4f}, cm: {cm}')
        return {'accuracy': accuracy, 'precision': precision,  'recall':recall, 'specificity':specificity, 'TN': tn, 'FN': fn, 'FP': fp, 'TP': tp }

       

In [12]:
def make_torch_tens_float(filepath, filename): 
    trainX_df = pd.read_csv(filepath+filename+'_trainX.csv')
    trainy_df = pd.read_csv(filepath+filename+'_train_y.csv')
    testX_df = pd.read_csv(filepath+filename+'_testX.csv')
    testy_df = pd.read_csv(filepath+filename+'_test_y.csv')

    train_x_temp = trainX_df.to_numpy().astype("float32")
    test_x_temp = testX_df.to_numpy().astype("float32")
    
    train_y_temp = trainy_df.to_numpy().flatten().astype("long")
    test_y_temp = testy_df.to_numpy().flatten().astype("long")
    
    trainX = torch.from_numpy(train_x_temp)
    trainy = torch.from_numpy(train_y_temp)
    testX = torch.from_numpy(test_x_temp)
    testy = torch.from_numpy(test_y_temp)
    print(f'train X: {trainX.shape}, train y: {trainy.shape}, test X: {testX.shape}, test y: {testy.shape}')
    return trainX, trainy, testX, testy
    


In [19]:

def save_results(trainX, trainy, testX, testy, root_name, n_iterations=300, n_samples=100):
    """
    Train a Dirichlet Gaussian Process model and save the training and test performance results.
    EXAct ExactGPModel
    This function trains a Dirichlet GP model on the given training data, evaluates it on both the training
    and test data, and saves various performance metrics and predictions to pandas DataFrames.

    Args:
        trainX (torch.Tensor): The training data features.
        trainy (torch.Tensor): The training data labels.
        testX (torch.Tensor): The test data features.
        testy (torch.Tensor): The test data labels.
        root_name (str): The root name used for labeling the model in the results.
        n_iterations (int, optional): The number of training iterations. Default is 300.
        n_samples (int, optional): The number of samples for prediction. Default is 100.

    Returns:
        train_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the training data.
        test_perf_df (pd.DataFrame): DataFrame containing performance metrics and predictions for the test data.
    """
    likelihood = DirichletClassificationLikelihood(trainy, learn_additional_noise=True)
    # likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(trainX, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)
    # model = ExactGPModel(trainX, trainy, likelihood)
    # n_iterations = 300
    trainer = Trainer(model, likelihood, n_iterations)
    trainer.train(trainX, trainy) 
  
    train_dist, train_observed_pred, train_pred_means, train_pred  = trainer.predict(trainX)
    train_results = trainer.gp_results(trainX, trainy)
    test_dist, test_observed_pred, test_pred_means, test_pred  = trainer.predict(testX)
    test_results = trainer.gp_results(testX, testy)
    
    train_observed_pred.mean.numpy()
    train_pred_variance2D = train_observed_pred.variance.numpy()
    test_observed_pred.mean.numpy()
    test_pred_variance2D=test_observed_pred.variance.numpy()
    
    train_pred_samples = train_dist.sample(torch.Size((256,))).exp()
    train_probabilities = (train_pred_samples / train_pred_samples.sum(-2, keepdim=True)).mean(0)

    train_prob_stds = (train_pred_samples / train_pred_samples.sum(-2, keepdim=True)).std(0)

    test_pred_samples = test_dist.sample(torch.Size((100,))).exp()

    test_probabilities = (test_pred_samples / test_pred_samples.sum(-2, keepdim=True)).mean(0)
    test_prob_stds = (test_pred_samples / test_pred_samples.sum(-2, keepdim=True)).std(0)

 
    train_perf_df = pd.DataFrame()
    test_perf_df = pd.DataFrame()
    train_perf_df['mean_pred_class0'] = train_observed_pred.mean.numpy()[0,]
    train_perf_df['mean_pred_class1'] = train_observed_pred.mean.numpy()[1,]
    train_perf_df['y'] = trainy
    train_perf_df['y_pred'] = train_pred_means.max(0)[1]
    train_perf_df['var_pred_class0']=train_observed_pred.variance.numpy()[0,]
    train_perf_df['var_pred_class1']=train_observed_pred.variance.numpy()[1,]
    train_perf_df['pred_prob_class0'] = train_probabilities.numpy()[0,]
    train_perf_df['pred_prob_class1'] = train_probabilities.numpy()[1,]
    train_perf_df['pred_prob_std_class0'] = train_prob_stds.numpy()[0,]
    train_perf_df['pred_prob_std_class1'] = train_prob_stds.numpy()[1,]
    train_perf_df['model'] = f'{root_name}_GP'
    train_perf_df['subset'] = 'train' 
    train_cm = confusion_matrix(trainy, train_perf_df['y_pred'])
    cm_flattened = train_cm.flatten().tolist()
    train_perf_df['cm']= [cm_flattened]* len(train_perf_df)
    train_perf_df['prediction_type'] = train_perf_df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)
    train_perf_df['ROC-AUC'] = roc_auc_score(trainy, train_perf_df['y_pred'])
    train_perf_df['MCC'] = matthews_corrcoef(trainy, train_perf_df['y_pred'])
    train_perf_df['Balanced Accuracy'] = balanced_accuracy_score(trainy, train_perf_df['y_pred'])
    train_perf_df['f1'] = f1_score(trainy, train_perf_df['y_pred'])
    
    test_perf_df['mean_pred_class0'] = test_observed_pred.mean.numpy()[0,]
    test_perf_df['mean_pred_class1'] = test_observed_pred.mean.numpy()[1,]
    test_perf_df['y'] = testy
    test_perf_df['y_pred'] = test_pred_means.max(0)[1]
    test_perf_df['var_pred_class0']=test_observed_pred.variance.numpy()[0,]
    test_perf_df['var_pred_class1']=test_observed_pred.variance.numpy()[1,]
    test_perf_df['pred_prob_class0'] = test_probabilities.numpy()[0,]
    test_perf_df['pred_prob_class1'] = test_probabilities.numpy()[1,]
    test_perf_df['pred_prob_std_class0'] =test_prob_stds.numpy()[0,]
    test_perf_df['pred_prob_std_class1'] = test_prob_stds.numpy()[1,]
    test_perf_df['model'] = f'{root_name}_GP_exactGP'
    test_perf_df['subset'] = 'test' 
    test_cm = confusion_matrix(testy, test_perf_df['y_pred'])
    test_cm_flattened = test_cm.flatten().tolist()
    test_perf_df['cm']= [test_cm_flattened]* len(test_perf_df)
    test_perf_df['prediction_type'] = test_perf_df.apply(lambda x: prediction_type(x['y'], x['y_pred']), axis=1)
    test_perf_df['ROC-AUC'] = roc_auc_score(testy, test_perf_df['y_pred'])
    test_perf_df['MCC'] = matthews_corrcoef(testy, test_perf_df['y_pred'])
    test_perf_df['Balanced Accuracy'] = balanced_accuracy_score(testy, test_perf_df['y_pred'])
    test_perf_df['f1'] = f1_score(testy, test_perf_df['y_pred'])
    with open(f'{GP_holdout}{root_name}_ExactGP_model.pkl', 'wb') as f: 
        pickle.dump(model,f)
    with open(f'{GP_holdout}{root_name}_GP_same_Dirichlet_likelihood.pkl', 'wb') as f: 
        pickle.dump(likelihood,f)
    for k, val in train_results.items(): 
        train_perf_df[k] = val
    for k, val in test_results.items():
        test_perf_df[k] = val
    return train_perf_df, test_perf_df


In [20]:
data_dir = '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/'
samplings = ['scaled', 'UNDER', 'SMOTE', 'ADASYN'] 
feat_types = ['moe', 'mfp']
neks = ['2', '3', '5', '9']
GP_path= '/Users/jayceepang/msse/capstone/atom2024/atom2024/notebooks/NEK/GP/GP_matern_kernel/' 
for nek in neks:
    print(f'NEK{nek}')
    bind_inhib = ['binding', 'inhibition']
    if nek in ['3','5']: 
        bind_inhib = ['binding']
    else: 
        bind_inhib = ['binding', 'inhibition']
    for bi in bind_inhib: 
        if bi == 'binding': 
            this_bi = 'bind' 
        if bi == 'inhibition': 
            this_bi = 'inhib'
        nek_path= f'{data_dir}NEK{nek}/{this_bi}/'
        for feat in feat_types: 
            for samp in samplings: 
                print(f'NEK{nek} {bi} {feat} {samp}')
                file_root = f'NEK{nek}_{bi}_{feat}_{samp}'
                trainX, trainy, testX, testy = make_torch_tens_float(nek_path,file_root)
                print(f'trainX:{trainX.shape}, train y: {trainy.shape}, testX: {testX.shape}, test y: {testy.shape}')
                train_perf_df, test_perf_df = save_results(trainX, trainy, testX, testy,file_root, n_iterations=300)
                train_perf_df['NEK'] = f'NEK{nek}_{bi}'
                train_perf_df['strategy'] = f'{samp}'
                train_perf_df['feat_type'] = f'{feat}'
                test_perf_df['NEK'] = f'NEK{nek}_{bi}'
                test_perf_df['strategy'] = f'{samp}'
                test_perf_df['feat_type'] = f'{feat}'
                train_perf_df.to_csv(f'{GP_path}{file_root}_train_GP_matern.csv',index=False) 
                test_perf_df.to_csv(f'{GP_path}{file_root}_test_GP_matern.csv',index=False) 
                print()
                

NEK2
NEK2 binding moe scaled
train X: torch.Size([1125, 306]), train y: torch.Size([1125]), test X: torch.Size([283, 306]), test y: torch.Size([283])
trainX:torch.Size([1125, 306]), train y: torch.Size([1125]), testX: torch.Size([283, 306]), test y: torch.Size([283])
Iter 1/300 - Loss: 7.107   lengthscale: 0.693   noise: 0.693
Iter 11/300 - Loss: 5.796   lengthscale: 1.301   noise: 0.812
Iter 21/300 - Loss: 5.307   lengthscale: 2.096   noise: 1.092
Iter 31/300 - Loss: 4.786   lengthscale: 3.160   noise: 1.364
Iter 41/300 - Loss: 4.316   lengthscale: 4.420   noise: 1.505
Iter 51/300 - Loss: 4.112   lengthscale: 5.463   noise: 1.517
Iter 61/300 - Loss: 3.964   lengthscale: 6.214   noise: 1.457
Iter 71/300 - Loss: 3.826   lengthscale: 6.801   noise: 1.366
Iter 81/300 - Loss: 3.741   lengthscale: 7.334   noise: 1.275
Iter 91/300 - Loss: 3.684   lengthscale: 7.866   noise: 1.201
Iter 101/300 - Loss: 3.635   lengthscale: 8.377   noise: 1.146
Iter 111/300 - Loss: 3.607   lengthscale: 8.836   



RuntimeError: shape '[1125]' is invalid for input of size 2250