In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import preprocessing
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

In [25]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
import torch
from torch.nn.functional import normalize

class HQC_gpu(BaseEstimator, ClassifierMixin):
    """The Helstrom Quantum Centroid (HQC) classifier is a quantum-inspired supervised
    classification approach for data with binary classes (ie. data with 2 classes only).

    Tensors are placed on the GPU when CUDA is available and on the CPU otherwise.

    Parameters
    ----------
    rescale : int or float, default = 1
        The dataset rescaling factor. A parameter used for rescaling the dataset.
    encoding : str, default = 'amplit'
        The encoding method used to encode vectors into quantum densities. Possible values:
        'amplit', 'stereo'. 'amplit' means using the amplitude encoding method. 'stereo' means
        using the inverse of the standard stereographic projection encoding method. Default set
        to 'amplit'.
    n_copies : int, default = 1
        The number of copies to take for each quantum density. This is equivalent to taking
        the n-fold Kronecker tensor product for each quantum density.
    class_wgt : str, default = 'equi'
        The class weights assigned to the Quantum Helstrom observable terms. Possible values:
        'equi', 'weighted'. 'equi' means assigning equal weights of 1/2 (equiprobable) to the
        two classes in the Quantum Helstrom observable. 'weighted' means assigning weights equal
        to the proportion of the number of rows in each class to the two classes in the Quantum
        Helstrom observable. Default set to 'equi'.
    n_splits : int, default = 1
        The number of subset splits performed on the input dataset row-wise and on the number
        of eigenvalues/eigenvectors of the Quantum Helstrom observable for optimal speed
        performance. If 1 is given, no splits are performed. For optimal speed, recommend
        using small values as close to 1 as possible. If memory blow-out occurs, increase
        n_splits.
    dtype : torch.float32 or torch.float64, default = torch.float64
        The float datatype used for the elements in the Pytorch tensor dataset. Datatype has to
        be of float to ensure calculations are done in float rather than integer. To achieve
        higher n_copies without memory blow-out issues, reduce float precision, which may or may
        not affect accuracy.

    Attributes
    ----------
    classes_ : ndarray, shape (2,)
        Sorted binary classes.
    centroids_ : tensor, size (2, (n_features + 1)**n_copies, (n_features + 1)**n_copies)
        Quantum Centroids for class with index 0 and 1 respectively. Stored on the device
        used during fit.
    hels_obs_ : tensor, size ((n_features + 1)**n_copies, (n_features + 1)**n_copies)
        Quantum Helstrom observable. Stored on the device used during fit.
    proj_sums_ : tensor, size (2, (n_features + 1)**n_copies, (n_features + 1)**n_copies)
        Sum of the projectors of the Quantum Helstrom observable's eigenvectors, which has
        corresponding positive and negative eigenvalues respectively. Stored on the device
        used during fit.
    hels_bound_ : float
        Helstrom bound is the upper bound of the probability that one can correctly
        discriminate whether a quantum density is of which of the two binary quantum density
        pattern. Stored in CPU.
    """
    # Added binary_only tag as required by sklearn check_estimator
    def _more_tags(self):
        return {'binary_only': True}


    # Initialize model hyperparameters
    def __init__(self,
                 rescale = 1,
                 encoding = 'amplit',
                 n_copies = 1,
                 class_wgt = 'equi',
                 n_splits = 1,
                 dtype = torch.float64):
        self.rescale = rescale
        self.encoding = encoding
        self.n_copies = n_copies
        self.class_wgt = class_wgt
        self.n_splits = n_splits
        self.dtype = dtype

        # Raise error if dtype is not torch.float32 or torch.float64
        if self.dtype not in [torch.float32, torch.float64]:
            raise ValueError('dtype should be torch.float32 or torch.float64 only')


    # Function for kronecker tensor product of PyTorch tensors, set as global function
    # (kept module-global so that any existing code calling kronecker() keeps working)
    global kronecker
    def kronecker(A, B):
        # reshape() instead of view(): the einsum result is not guaranteed to be contiguous
        return torch.einsum('nab,ncd->nacbd', A, B).reshape(A.size(0),
                                                            A.size(1)*B.size(1),
                                                            A.size(2)*B.size(2))


    # Select the compute device: GPU when CUDA is available, otherwise CPU
    @staticmethod
    def _get_device():
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    # Split a tensor row-wise into at most n_splits non-empty chunks, using the same chunk
    # boundaries as np.array_split() but without leaving the tensor's device
    @staticmethod
    def _split_rows(tensor, n_splits):
        n_splits = int(n_splits)
        if n_splits < 1:
            raise ValueError('n_splits should be an integer greater than or equal to 1')
        base, extra = divmod(tensor.shape[0], n_splits)
        chunks = []
        start = 0
        for k in range(n_splits):
            # The first `extra` chunks get one additional row, as in np.array_split()
            stop = start + base + (1 if k < extra else 0)
            if stop > start:
                chunks.append(tensor[start:stop])
            start = stop
        return chunks


    # Rescale X and encode it into X' using the amplitude or the inverse of the standard
    # stereographic projection encoding method
    def _encode(self, X):
        X = self.rescale*X
        if self.encoding == 'amplit':
            ones = torch.ones((X.shape[0], 1), dtype = self.dtype, device = X.device)
            return normalize(torch.cat([X, ones], dim = 1), p = 2, dim = 1)
        elif self.encoding == 'stereo':
            # Sum of squares of each row (sample) in X, as a column vector
            X_sq_sum = (X**2).sum(dim = 1).reshape(-1, 1)
            return (1 / (X_sq_sum + 1))*torch.cat((2*X, X_sq_sum - 1), dim = 1)
        else:
            raise ValueError('encoding should be "amplit" or "stereo"')


    # Encode each row of X' into a quantum density (outer product of the row with itself)
    # and take its n_copies-fold Kronecker tensor product
    def _densities(self, X_prime_chunk):
        density_chunk = torch.matmul(X_prime_chunk.unsqueeze(2), X_prime_chunk.unsqueeze(1))
        density_chunk_copy = density_chunk
        for _ in range(self.n_copies - 1):
            density_chunk = kronecker(density_chunk, density_chunk_copy)
        return density_chunk


    # Function for fit
    def fit(self, X, y):
        """Fit the HQC classifier: encode the (rescaled) samples into quantum densities using
        the configured encoding, then compute the Quantum Centroids, the quantum Helstrom
        observable, the sums of its eigen-projectors and the Helstrom bound.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples. An array of int or float.
        y : array-like, shape (n_samples,)
            The training input binary target values. An array of str, int or float.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If y has more than 2 classes, or encoding, class_wgt or n_splits is invalid.
        """
        # Check that arrays X and y have correct shape
        X, y = check_X_y(X, y)

        # Ensure target array y is of non-regression type
        # Added as required by sklearn check_estimator
        check_classification_targets(y)

        # Store binary classes and encode y into binary class indexes 0 and 1
        self.classes_, y_class_index = np.unique(y, return_inverse = True)

        # Raise error if there are more than 2 classes
        if len(self.classes_) > 2:
            raise ValueError('only 2 classes are supported')

        # Cast array X into a floating point tensor to ensure all following calculations
        # are done in float rather than integer, and place it on the compute device
        device = self._get_device()
        X = torch.tensor(X, dtype = self.dtype, device = device)

        # Number of rows in X
        m = X.shape[0]

        # Rescale X and calculate X'
        X_prime = self._encode(X)

        # Number of rows/columns in each density matrix. Kept local (not global) so that
        # several fitted estimators cannot overwrite each other's dimensions
        density_nrow_ncol = X_prime.shape[1]**self.n_copies

        # Class indexes as a tensor on the same device as X'
        y_index = torch.as_tensor(y_class_index, device = device)

        class_counts = []
        centroids = []
        hels_obs_terms = []
        for i in (0, 1):
            # Rows (samples) in X' belonging to class i
            X_prime_class = X_prime[y_index == i]
            m_class = X_prime_class.shape[0]

            # Sum of the quantum densities belonging to class i, accumulated per subset split
            density_class_sum = torch.zeros([density_nrow_ncol, density_nrow_ncol],
                                            dtype = self.dtype, device = device)
            for chunk in self._split_rows(X_prime_class, self.n_splits):
                density_class_sum = density_class_sum + self._densities(chunk).sum(dim = 0)

            # Quantum Centroid of class i. An empty class keeps an all-zero centroid tensor
            # (the accumulated sum is already zero) so it can still be stacked below
            if m_class > 0:
                centroid_class = (1 / m_class)*density_class_sum
            else:
                centroid_class = density_class_sum

            # Term of the quantum Helstrom observable belonging to class i
            if self.class_wgt == 'equi':
                hels_obs_term = 0.5*centroid_class
            elif self.class_wgt == 'weighted':
                hels_obs_term = (m_class / m)*centroid_class
            else:
                raise ValueError('class_wgt should be "equi" or "weighted"')

            class_counts.append(m_class)
            centroids.append(centroid_class)
            hels_obs_terms.append(hels_obs_term)

        # Determine Quantum Centroids
        self.centroids_ = torch.stack(centroids, dim = 0)

        # Calculate quantum Helstrom observable
        self.hels_obs_ = hels_obs_terms[0] - hels_obs_terms[1]

        # Calculate eigenvalues w and eigenvectors v of the quantum Helstrom observable.
        # torch.symeig was removed from recent PyTorch releases; prefer torch.linalg.eigh
        # and only fall back to symeig on old versions that lack it
        if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'eigh'):
            w, v = torch.linalg.eigh(self.hels_obs_)
        else:
            w, v = torch.symeig(self.hels_obs_, eigenvectors = True)

        # Transpose matrix v containing eigenvectors to row-wise
        eigvec = v.t()

        # Index 0: eigenvectors with positive eigenvalues, index 1: all remaining eigenvectors
        positive = w > 0
        proj_sums = []
        for mask in (positive, ~positive):
            # Sum of the projectors of the selected eigenvectors, accumulated per subset split
            proj_class_sum = torch.zeros([density_nrow_ncol, density_nrow_ncol],
                                         dtype = self.dtype, device = device)
            for chunk in self._split_rows(eigvec[mask], self.n_splits):
                proj_split = torch.matmul(chunk.unsqueeze(2), chunk.unsqueeze(1))
                proj_class_sum = proj_class_sum + proj_split.sum(dim = 0)
            proj_sums.append(proj_class_sum)
        self.proj_sums_ = torch.stack(proj_sums, dim = 0)

        # Calculate Helstrom bound
        self.hels_bound_ = (class_counts[0] / m)*torch.einsum('ij,ji->', self.centroids_[0],
                                                              self.proj_sums_[0]).item() \
                           + (class_counts[1] / m)*torch.einsum('ij,ji->', self.centroids_[1],
                                                                self.proj_sums_[1]).item()
        return self


    # Function for predict_proba
    def predict_proba(self, X):
        """Performs HQC classification on X and returns the trace of the dot product of the densities
        and the sum of the projectors with corresponding positive and negative eigenvalues respectively.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples. An array of int or float.

        Returns
        -------
        trace_matrix : array-like, shape (n_samples, 2)
            Column index 0 corresponds to the trace of the dot product of the densities and the sum
            of projectors with positive eigenvalues. Column index 1 corresponds to the trace of the
            dot product of the densities and the sum of projectors with negative eigenvalues. An array
            of float.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If fit has not been called.
        ValueError
            If the number of features in X does not match the fitted model.
        """
        # Check if fit had been called, before any fitted attribute is accessed
        check_is_fitted(self, ['proj_sums_'])

        # CPU array copy of self.proj_sums_, kept for backward compatibility
        self.proj_sums_arr_ = self.proj_sums_.cpu().numpy()

        # Input validation of array X
        X = check_array(X)

        # Cast array X into a floating point tensor to ensure all following calculations
        # are done in float rather than integer, on the same device as the fitted projectors
        X = torch.tensor(X, dtype = self.dtype, device = self.proj_sums_.device)

        # Rescale X and calculate X'
        X_prime = self._encode(X)

        # The density dimension must match the fitted projectors
        if X_prime.shape[1]**self.n_copies != self.proj_sums_.shape[-1]:
            raise ValueError('X has %d features, which does not match the number of features '
                             'seen during fit' % (X_prime.shape[1] - 1))

        # Calculate trace of the dot product of density of each row and sum of projectors with
        # corresponding positive and negative eigenvalues, per subset split. Both classes are
        # evaluated from the same densities so each density is only built once
        trace_chunks = [torch.einsum('bij,cji->bc', self._densities(chunk), self.proj_sums_)
                        for chunk in self._split_rows(X_prime, self.n_splits)]
        if trace_chunks:
            trace_matrix = torch.cat(trace_chunks, dim = 0)
        else:
            trace_matrix = torch.empty([0, 2], dtype = self.dtype)

        # Send from the compute device to CPU and cast into an array
        return trace_matrix.cpu().numpy()


    # Function for predict
    def predict(self, X):
        """Performs HQC classification on X and returns the binary classes.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples. An array of int or float.

        Returns
        -------
        self.classes_[predict_trace_index] : array-like, shape (n_samples,)
            The predicted binary classes. An array of str, int or float.
        """
        trace_matrix = self.predict_proba(X)

        # A model fitted on a single class can only predict that class
        if len(self.classes_) == 1:
            return np.repeat(self.classes_, trace_matrix.shape[0])

        # Determine column index with the higher trace value in trace_matrix
        # If both columns have the same trace value, returns column index 1, which is different
        # to np.argmax() which returns column index 0. The comparison is written explicitly so
        # that tie-breaking does not depend on the installed PyTorch version
        predict_trace_index = (trace_matrix[:, 1] >= trace_matrix[:, 0]).astype(int)

        # Returns the predicted binary classes
        return self.classes_[predict_trace_index]

In [3]:
# Load the sample dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='sample_data.csv')

In [4]:
# Preview the first 5 rows of the raw dataset
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Hemoglobin,Absolute Lymphocyte Count,Absolute Neutrophil Count,Platelet Count,C-Reactive Protein,Ferritin,D-DIMER,Absolute Basophil Count,Absolute Eosinophil Count,Absolute Monocyte Count,Lactate Dehydrogenase,Red Blood Cell Count,Lymp/Neut,Age,positive,Male,TestResult,Sex,Race
0,0,7.9,,7.27,170.0,,,,0.06,0.04,,,4.36,0.433942,52,0,1,negative,Male,Black
1,1,13.3,1.75,10.18,415.0,15.2,,,0.01,0.04,0.85,,4.14,0.119211,64,0,0,negative,Female,Black
2,2,6.8,0.45,8.8,319.0,,,,0.04,0.38,0.63,,5.34,0.062818,45,0,1,negative,Male,Black
3,3,15.1,1.16,11.01,262.0,1.2,,,0.01,0.0,0.61,,5.08,,48,0,0,negative,Female,White
4,4,11.1,,10.12,111.0,25.6,95.0,1244.0,0.01,0.03,0.5,340.0,4.39,0.053373,76,0,1,negative,Male,White


In [5]:
# Check the number of rows and columns of the raw dataset, as a (rows, columns) tuple
df.shape

(1072, 20)

## Data Preprocessing

In [6]:
# Check the datatype of every column to tell continuous features from categorical ones
df.dtypes

Unnamed: 0                     int64
Hemoglobin                   float64
Absolute Lymphocyte Count    float64
Absolute Neutrophil Count    float64
Platelet Count               float64
C-Reactive Protein           float64
Ferritin                     float64
D-DIMER                      float64
Absolute Basophil Count      float64
Absolute Eosinophil Count    float64
Absolute Monocyte Count      float64
Lactate Dehydrogenase        float64
Red Blood Cell Count         float64
Lymp/Neut                    float64
Age                            int64
positive                       int64
Male                           int64
TestResult                    object
Sex                           object
Race                          object
dtype: object

In [7]:
# Column datatype overrides:
#   "Age"  -> float, for floating point calculations later on
#   "Male" -> object, because it is a categorical feature
for column_name, target_dtype in (('Age', float), ('Male', object)):
    df[column_name] = df[column_name].astype(target_dtype)

In [8]:
# Fraction of missing values per column, largest first
df.isna().sum(axis=0).div(len(df)).sort_values(ascending=False)

D-DIMER                      0.566231
Lactate Dehydrogenase        0.491604
Ferritin                     0.474813
C-Reactive Protein           0.333955
Absolute Lymphocyte Count    0.222015
Lymp/Neut                    0.222015
Race                         0.159515
TestResult                   0.159515
Sex                          0.159515
Absolute Basophil Count      0.138060
Absolute Neutrophil Count    0.138060
Absolute Eosinophil Count    0.138060
Absolute Monocyte Count      0.138060
Platelet Count               0.014925
Hemoglobin                   0.013060
Red Blood Cell Count         0.012127
Age                          0.000000
positive                     0.000000
Male                         0.000000
Unnamed: 0                   0.000000
dtype: float64

In [9]:
# Drop rows that are missing a value in any feature whose name ends with "Count"
count_features = [feature_name for feature_name in df.columns if feature_name.endswith('Count')]
df_drop_count = df.dropna(axis=0, how='any', subset=count_features)
df_drop_count.shape

(457, 20)

In [10]:
# Drop rows where all of the inflammatory markers are missing
inflammatory_markers = ['C-Reactive Protein', 'Ferritin', 'Lactate Dehydrogenase']
df_drop_count_marker = df_drop_count.dropna(axis=0, how='all', subset=inflammatory_markers)
df_drop_count_marker.shape

(414, 20)

## Feature Selection

In [11]:
# Remove the columns that are not used as features in the model from the paper
df_drop_count_marker_feat = df_drop_count_marker.drop(
    columns=['Unnamed: 0', 'D-DIMER', 'TestResult', 'Sex', 'Race']
)
df_drop_count_marker_feat.shape

(414, 15)

## Feature Engineering

In [12]:
# Features: every remaining column except the target
df_X = df_drop_count_marker_feat.drop(columns=['positive'])

# Target variable, kept as a single-column DataFrame
df_y = df_drop_count_marker_feat.loc[:, ['positive']]

In [36]:
# Train/test split (70% train / 30% test, fixed seed)
##### NO STRATIFIED SAMPLING IN TRAIN/TEST SPLIT #####
df_X_train, df_X_test, df_y_train, df_y_test = model_selection.train_test_split(
    df_X,
    df_y,
    test_size=0.3,
    random_state=0,
)

##### WITH STRATIFIED SAMPLING IN TRAIN/TEST SPLIT #####
# df_X_train, df_X_test, df_y_train, df_y_test = model_selection.train_test_split(df_X, df_y, test_size=0.3, random_state=0, stratify=df_y)

In [37]:
# Continuous (non-object) features of the training set
df_X_train_con = df_X_train.select_dtypes(exclude=['object'])

# Learn the mean and std dev of the continuous features from the training set only
transformer = preprocessing.StandardScaler()
transformer.fit(df_X_train_con.values)

# Standardize the continuous features of the training set
X_train_con_norm = transformer.transform(df_X_train_con.values)

In [38]:
# Continuous (non-object) features of the test set
df_X_test_con = df_X_test.select_dtypes(exclude=['object'])

# Standardize them with the mean and std dev learned from the training set
X_test_con_values = df_X_test_con.values
X_test_con_norm = transformer.transform(X_test_con_values)

In [39]:
# Categorical (object) features of the training set, ie. just the "Male" feature
df_X_train_cat = df_X_train.select_dtypes(include=['object'])

# One-hot encode the categorical features of the training set
df_X_train_cat_onehot = pd.get_dummies(df_X_train_cat)

# Stack the normalized continuous features and the one-hot encoded categorical features
# side by side for the training set
X_train_fe = np.hstack((X_train_con_norm, df_X_train_cat_onehot.values))

# Impute missing values with zeros in the training set
X_train_fe_zero = np.nan_to_num(X_train_fe, nan=0.0)

In [40]:
# Extract categorical features from test set
df_X_test_cat = df_X_test.select_dtypes(include=['object'])

# Perform one-hot encoding on categorical features in test set.
# Align the dummy columns with those produced for the training set: encoding the test split
# on its own would drop (or reorder) columns whenever a category is absent from the test
# rows, silently misaligning the feature matrix with the one the models were trained on.
# Categories unseen in the test split become all-zero columns; categories unseen in
# training are discarded.
df_X_test_cat_onehot = pd.get_dummies(df_X_test_cat).reindex(columns=df_X_train_cat_onehot.columns,
                                                             fill_value=0)

# Concatenate normalized continuous features and one-hot encoded categorical features for test set
X_test_fe = np.concatenate([X_test_con_norm, df_X_test_cat_onehot.values], axis=1)

# Impute missing values with zeros in test set
X_test_fe_zero = np.nan_to_num(X_test_fe, nan=0.0)

In [41]:
# Check no. of rows and columns of the engineered train/test features and targets;
# train and test features must have the same number of columns
X_train_fe_zero.shape, X_test_fe_zero.shape, df_y_train.shape, df_y_test.shape

((289, 15), (125, 15), (289, 1), (125, 1))

In [42]:
# Fraction of missing values per column in X_train_fe_zero (expected to be 0 after imputation)
pd.DataFrame(X_train_fe_zero).isna().sum(axis=0).div(X_train_fe_zero.shape[0]).sort_values(ascending=False)

14    0.0
13    0.0
12    0.0
11    0.0
10    0.0
9     0.0
8     0.0
7     0.0
6     0.0
5     0.0
4     0.0
3     0.0
2     0.0
1     0.0
0     0.0
dtype: float64

In [43]:
# Fraction of missing values per column in X_test_fe_zero (expected to be 0 after imputation)
pd.DataFrame(X_test_fe_zero).isna().sum(axis=0).div(X_test_fe_zero.shape[0]).sort_values(ascending=False)

14    0.0
13    0.0
12    0.0
11    0.0
10    0.0
9     0.0
8     0.0
7     0.0
6     0.0
5     0.0
4     0.0
3     0.0
2     0.0
1     0.0
0     0.0
dtype: float64

In [44]:
# Fraction of missing values in the training target df_y_train
df_y_train.isna().sum(axis=0).div(len(df_y_train)).sort_values(ascending=False)

positive    0.0
dtype: float64

In [45]:
# Fraction of missing values in the test target df_y_test
df_y_test.isna().sum(axis=0).div(len(df_y_test)).sort_values(ascending=False)

positive    0.0
dtype: float64

In [46]:
# Class distribution of the training target, in percent, to check for class imbalance
df_y_train['positive'].value_counts(normalize=True).mul(100)

0    74.048443
1    25.951557
Name: positive, dtype: float64

## Model Development

##### NO STRATIFIED SAMPLING IN TRAIN/TEST SPLIT #####

In [26]:
# Create rescale hyperparameter list [0.1, 0.5, 1, 1.5,...,10.0]
rescale_list2 = np.linspace(0.5, 10, 20).tolist()
rescale_list1 = [0.1, *rescale_list2]

# Using scikit-learn's GridSearchCV (with 7-folds following the paper)
# Did not try n_copies=4 because it took too much memory or too much time
param_grid = {
    'rescale': rescale_list1,
    'encoding': ['amplit', 'stereo'],
    'n_copies': [1, 2, 3],
    'class_wgt': ['equi', 'weighted'],
}
models = model_selection.GridSearchCV(
    HQC_gpu(n_splits=100, dtype=torch.float64),
    param_grid,
    scoring='roc_auc',
    cv=7,
)
models.fit(X_train_fe_zero, df_y_train.values.ravel())

In [27]:
# AUROC on the test set of the best estimator found by the grid search
best_model = models.best_estimator_
y_score = best_model.predict_proba(X_test_fe_zero)
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.47397891963109356

In [28]:
# Best hyperparameter combination found by the grid search
models.best_params_

{'class_wgt': 'equi', 'encoding': 'amplit', 'n_copies': 1, 'rescale': 1.0}

In [29]:
# Random forest model (following the paper's hyperparameter selection), scored by test AUROC
rand_forest_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=0,
)
rand_forest_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = rand_forest_model.predict_proba(X_test_fe_zero)
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5484189723320159

In [30]:
# Logistic regression model (following the paper's hyperparameter selection), scored by test AUROC
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = log_reg_model.predict_proba(X_test_fe_zero)
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5602766798418973

In [31]:
# SVM model (following the paper's hyperparameter selection).
# probability=True makes SVC fit its probability estimates with an internal
# cross-validation that shuffles the data, so random_state is pinned (as for the
# other seeded models in this notebook) to make the AUROC reproducible across runs.
# Re-run this cell to refresh the recorded score.
svm_model = svm.SVC(probability=True, random_state=0).fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = svm_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(df_y_test.values.ravel(), y_score[:,1])

0.566205533596838

In [32]:
# Neural net model (following the paper's hyperparameter selection, except that
# max_iter was added because the model was not converging)
mlp_model = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=0,
    max_iter=250,
)
mlp_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = mlp_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5205862977602109

In [33]:
# SGD model (following the paper's hyperparameter selection)
sgd_model = SGDClassifier(
    loss="modified_huber",
    penalty="l2",
    max_iter=500,
    random_state=0,
)
sgd_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = sgd_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.48221343873517786

In [34]:
# XGBoost model (following the paper's hyperparameter selection)
xgboost_model = XGBClassifier(random_state=0)
xgboost_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = xgboost_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5658761528326746

In [35]:
# AdaBoost model (following the paper's hyperparameter selection)
ada_model = AdaBoostClassifier(n_estimators=100, random_state=0)
ada_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = ada_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.6320816864295125

##### WITH STRATIFIED SAMPLING IN TRAIN/TEST SPLIT #####

In [47]:
# Build the rescale hyperparameter list [0.1, 0.5, 1, 1.5,...,10.0]
rescale_list2 = np.linspace(0.5, 10, 20).tolist()
rescale_list1 = [0.1, *rescale_list2]

# Grid search with scikit-learn's GridSearchCV (7 folds, following the paper), scored by AUROC.
# n_copies=4 was not tried because it took too much memory or too much time.
param_grid = {
    'rescale': rescale_list1,
    'encoding': ['amplit', 'stereo'],
    'n_copies': [1, 2, 3],
    'class_wgt': ['equi', 'weighted'],
}
models = model_selection.GridSearchCV(
    estimator=HQC_gpu(n_splits=100, dtype=torch.float64),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=7,
).fit(X_train_fe_zero, df_y_train.values.ravel())

In [57]:
# Test-set AUROC of the best estimator found by the grid search.
# Column 1 of predict_proba is used as the score.
best_model = models.best_estimator_
y_score = best_model.predict_proba(X_test_fe_zero)
metrics.roc_auc_score(
    y_true=df_y_test.values.ravel(),
    y_score=y_score[:, 1],
)

0.4801747311827957

In [48]:
# Best hyperparameter combination selected by the grid search stored in `models`
models.best_params_

{'class_wgt': 'equi', 'encoding': 'stereo', 'n_copies': 1, 'rescale': 8.0}

In [50]:
# Random forest model (following the paper's hyperparameter selection)
rand_forest_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=0,
)
rand_forest_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = rand_forest_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5336021505376344

In [51]:
# Logistic regression model (following the paper's hyperparameter selection)
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = log_reg_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5766129032258065

In [52]:
# SVM model (following the paper's hyperparameter selection).
# probability=True makes SVC fit its probability estimates with an internal
# cross-validation that shuffles the data, so random_state is pinned (as for the
# other seeded models in this notebook) to make the AUROC reproducible across runs.
# Re-run this cell to refresh the recorded score.
svm_model = svm.SVC(probability=True, random_state=0).fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = svm_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(df_y_test.values.ravel(), y_score[:,1])

0.48588709677419356

In [53]:
# Neural net model (following the paper's hyperparameter selection)
# NOTE(review): the non-stratified run of this model passes max_iter=250, while this
# cell leaves max_iter at its default - confirm the difference is intended.
mlp_model = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=0,
)
mlp_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = mlp_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.5047043010752688

In [54]:
# SGD model (following the paper's hyperparameter selection)
sgd_model = SGDClassifier(
    loss="modified_huber",
    penalty="l2",
    max_iter=500,
    random_state=0,
)
sgd_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = sgd_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.4512768817204301

In [55]:
# XGBoost model (following the paper's hyperparameter selection)
xgboost_model = XGBClassifier(random_state=0)
xgboost_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = xgboost_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.6129032258064516

In [56]:
# AdaBoost model (following the paper's hyperparameter selection)
ada_model = AdaBoostClassifier(n_estimators=100, random_state=0)
ada_model.fit(X_train_fe_zero, df_y_train.values.ravel())
y_score = ada_model.predict_proba(X_test_fe_zero)
# Test-set AUROC, using column 1 of predict_proba as the score
metrics.roc_auc_score(y_true=df_y_test.values.ravel(), y_score=y_score[:, 1])

0.6320564516129032