# Pairwise Naive Bayes models

## Setup

Import libraries

In [7]:
library(naivebayes) #For the naive bayes model
library(caret) #For performing cross-validation
library(yardstick) #For calculating the MCC
library(utils)

Define the data path

In [8]:
data_path <- '/Users/joefarrington/Documents/MSc Machine Learning/Project/August/Data for import/'

Define a function for running repeated training and evaluation

In [9]:
repeated_training <- function(data, n_repeats, n_folds, laplace_grid, prior) {

    #Get a count of the number of classes
    n_classes <- length(unique(data[["Label"]]))
    
    #Instantiate some object to hold the results
    results_list = list()
    mcc_array = vector()
    acc_array = vector()
    overall_conf = matrix(rep( 0, len=(n_classes*n_classes)), nrow = n_classes, ncol=n_classes)
    allpreds = data.frame(matrix(ncol = 2, nrow = 0))
    colnames(allpreds) <- c('ytrue', 'ypred')
    
    #Get laplace values as string to use as index during grid search
    lrange_index <- lapply(lrange, toString)
    
    #
    for (i in 1:n_repeats){
        
        #Create a train/test split
        tr <- (createDataPartition(data$Label, times = 1, p = 0.8, list = TRUE))$Resample1
        train <- data[tr,]
        test <- data[-tr,]
        
        #Run a grid search to find the best value of the hyperparameter
        
        ##Create the folds
        folds <- createFolds(train$Label, k = n_folds)
        
        ##Create a dataframe to hold the results of the CV
        
        ###Create  a matrix of zeros with the right dimensions
        matz <- matrix(0, length(folds), length(lrange) )

        ###Create the dataframe
        CVdf <- data.frame(matz)

        ###Name the rows and columns
        rownames(CVdf) <- names(folds)
        names(CVdf) <- lrange_index
        
        #Loop over the hyperparameter value
        for(lp in lrange){ 
    
            #Get the value of laplace as a string to use as an index
            ind <- toString(lp)
    
            #Loop over the folds
            for(fname in names(folds)){

                #Get the name of the fold to use as a column index
                f <- folds[[fname]]

                #Get the training rows for this fold
                cvtrain <- train[f,]

                #Get the validation rows for this fold
                cvalid <- train[-f,]

                #Train the model for this fold
                nb <- naive_bayes(Label ~ ., cvtrain, laplace=lp, prior=prior)

                #Perform predictions on the validation set
                preds <- predict(nb, within(cvalid, rm(Label)) , type = "class")

                #Store the MCC on the validation set in the dataframe
                CVdf[fname,ind] <- (mcc_vec(cvalid$Label, preds))}}
        
        #Find the best hyperparameter value
        CV_Mean <- colMeans(CVdf)
        CVsummary <- data.frame(CV_Mean)
        laplace_opt <- lrange[which.max(CVsummary$CV_Mean)]
            
        #Train a model using the best value of Laplace
        nb_opt <- naive_bayes(Label ~ ., train, laplace=lp, prior=prior)
            
        #Get predictions on the test set
        ypred <- predict(nb_opt, within(test, rm(Label)) , type = "class")
        
        #Get the accuracy and MCC on the test set and store them in the relevant arrays
        
        acc_test <- accuracy_vec(test$Label, ypred)
        acc_array <- c(acc_array, acc_test)
        
        mcc_test <- mcc_vec(test$Label, ypred)
        mcc_array <- c(mcc_array, mcc_test)
        
        #Get the confusion matrix
        conf = table(test$Label, ypred)
        
        #Normalize the confusion matrix
        nconf <- conf/(rowSums(conf))
        
        #Add 1/n_repeats of the confusion matrix to a running total
        overall_conf <- overall_conf + (nconf/n_repeats)
        
        #Add the labels and predictions from this run to a dataframe
        predstemp <- data.frame(ytrue = test$Label, ypred = ypred)
        allpreds <- rbind(allpreds, predstemp)
        
        #Store the information from this repeat
        results_list[[paste("Run",i, sep="")]] <- list(train = train, test = test, ypred=ypred, model=nb_opt, 
                                                      mcc_test = mcc_test, acc_test = acc_test, laplace = laplace_opt, conf=conf)
    
        }
    
    #Calculate and print the mean and std dev of accuracy
    print(paste('Mean test Acc:', mean(acc_array), sep = " "))
    print(paste('Std dev test Acc:', sd(acc_array), sep= " "))
    
    #Calculate and print the mean and std dev of MCC
    print(paste('Mean test MCC:', mean(mcc_array), sep = " "))
    print(paste('Std dev test MCC:', sd(mcc_array), sep= " "))
    
    #Store the statistics from each model
    results_list[["mean_acc"]] = mean(acc_array)
    results_list[["std_acc"]] = sd(acc_array)
    results_list[["mean_mcc"]] = mean(mcc_array)
    results_list[["std_mcc"]] = sd(mcc_array)
    
    #Store the overall confusion matrix
    results_list[["conf"]] <- overall_conf
    
    #Store the dataframe of cumulative labels and predictions
    results_list[["allpreds"]] <- allpreds
    
    return(results_list)
}

Define a function to get pairwise data

In [10]:
get_pairwise_data <- function(class1, class2, data) {
    
    subset <- data[data$Label %in% c(class1, class2), ]
    subset <- droplevels(subset)
    
    return(subset)
}

Set the range for the hyperparameter laplace

In [11]:
lrange <- 0
for (pow in seq(-20, 1, by=1)){
    lrange<- c(lrange, 2**pow)
}

Define the number of repeats and the number of folds

In [12]:
n_repeats = 10
n_folds = 10

Define a path to save the results

In [13]:
results_path <- '/Users/joefarrington/Documents/MSc Machine Learning/Project/August/Results/Ch4Allpairwise/'

### Import and process the data

Import the data

In [16]:
X_all_Rev = read.csv(paste(data_path,"X_all_Rev.csv", sep = "", collapse = NULL),
               colClasses=c("integer",rep("factor",34)),header = TRUE)
y_all_Rev = read.csv(paste(data_path,"y_all_Rev.csv", sep = "", collapse = NULL),
               header = FALSE)

Combine data and labels, get rid of the index column and rename the label column

In [17]:
data_Rev <- cbind(X_all_Rev,y_all_Rev)
data_Rev <- within(data_Rev, rm(X, V1))
colnames(data_Rev)[colnames(data_Rev)=="V2"] <- "Label"

In [18]:
comparisons<-combn(classes, 2,simplify = FALSE)

Get the number of classes, and a vector of the classes and a matrix to use as a template for the dataframes that will hold the results

In [19]:
classes = c('22q11DS', 'Down', 'FXS-F', 'FXS-M', 'NF1',  'PW-UPD', 'PW-del', 'SMC15', 'TSC', 'XXY')
n_classes <- length(unique(data_Rev[["Label"]]))
temp_map <- matrix(0, n_classes, n_classes)

## Pairwise, no prior

 Define a subfolder to save the results

In [20]:
subfolder = 'NB_tonoerprior/'

Set up dataframes to store the results

In [21]:
###Create dataframes to store the results ofthe pairwise analysis
df_mean_mcc_noprior <- data.frame(temp_map)
df_std_mcc_noprior <- data.frame(temp_map)
df_mean_acc_noprior <- data.frame(temp_map)
df_std_acc_noprior <- data.frame(temp_map)

###Name the rows and columns
rownames(df_mean_mcc_noprior) <- classes
names(df_mean_mcc_noprior) <- classes

rownames(df_std_mcc_noprior) <- classes
names(df_std_mcc_noprior) <- classes

rownames(df_mean_acc_noprior) <- classes
names(df_mean_acc_noprior) <- classes

rownames(df_std_acc_noprior) <- classes
names(df_std_acc_noprior) <- classes

Define the prior

In [22]:
prior_noprior = NULL

Train the pairwise models and store the results

In [None]:
for (comp in comparisons){
    
    class1 = as.character(comp[[1]])
    class2 = as.character(comp[[2]])
    
    data <- get_pairwise_data(class1,class2, data_Rev)
    
    res_list = repeated_training(data, n_repeats, n_folds, lrange, prior_noprior) 
    
    df_mean_acc_noprior[[class1, class2]]<- res_list[["mean_acc"]] 
    df_std_acc_noprior[[class1, class2]]<- res_list[["std_acc"]]
    df_mean_mcc_noprior[[class1, class2]]<- res_list[["mean_mcc"]]
    df_std_mcc_noprior[[class1, class2]]<- res_list[["std_mcc"]]
    
}

Save the results

In [28]:
write.csv(df_mean_acc_noprior, file = paste(results_path,subfolder,"df_mean_acc_noprior.csv", sep = "", collapse = NULL))
write.csv(df_std_acc_noprior, file = paste(results_path,subfolder,"df_std_acc_noprior.csv", sep = "", collapse = NULL))
write.csv(df_mean_mcc_noprior, file = paste(results_path,subfolder,"df_mean_mcc_noprior.csv", sep = "", collapse = NULL))
write.csv(df_std_mcc_noprior, file = paste(results_path,subfolder,"df_std_mcc_noprior.csv", sep = "", collapse = NULL))

Print out the dataframes

In [24]:
df_mean_acc_noprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.8695652,0.8,0.7911765,0.7259259,0.9,0.8296296,0.936,0.6863636,0.8357143
Down,0,0.0,0.8214286,0.8428571,0.7714286,0.6428571,0.6,0.7833333,0.7677419,0.82
FXS-F,0,0.0,0.0,0.74,0.6944444,0.9333333,0.8611111,0.8375,0.6542857,0.7947368
FXS-M,0,0.0,0.0,0.0,0.76,0.888,0.888,0.9478261,0.6952381,0.8269231
NF1,0,0.0,0.0,0.0,0.0,0.8555556,0.8333333,0.85625,0.6657143,0.7842105
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.5555556,0.8125,0.7942857,0.8631579
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.86875,0.7885714,0.8263158
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.869697,0.8647059
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6805556
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
df_std_acc_noprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.07389883,0.06098316,0.06269913,0.06579216,0.03923518,0.09105723,0.02065591,0.06320198,0.06563833
Down,0,0.0,0.11785113,0.08708487,0.11566626,0.08247861,0.07678341,0.13146844,0.04511006,0.07062333
FXS-F,0,0.0,0.0,0.08485281,0.06547285,0.05105203,0.09166199,0.05270463,0.09566571,0.10364249
FXS-M,0,0.0,0.0,0.0,0.06531973,0.06746192,0.05593647,0.03429611,0.07999748,0.07954919
NF1,0,0.0,0.0,0.0,0.0,0.07499428,0.08281733,0.0782291,0.07133327,0.08754368
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.11712139,0.08333333,0.04426267,0.07104722
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.09524275,0.0764282,0.0499307
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07149632,0.06820599
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05438583
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
df_mean_mcc_noprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.7111601,0.5323684,0.5842164,0.3737677,0.8007881,0.6672725,0.8422161,0.4092508,0.6426711
Down,0,0.0,0.6724746,0.6591486,0.5776153,0.2698063,0.1104505,0.5826048,0.4651472,0.6839917
FXS-F,0,0.0,0.0,0.4579591,0.4119995,0.8734798,0.7304455,0.6871722,0.3392854,0.6000226
FXS-M,0,0.0,0.0,0.0,0.4637268,0.7811097,0.7786025,0.878752,0.3938398,0.6491618
NF1,0,0.0,0.0,0.0,0.0,0.7278067,0.7000507,0.7329771,0.1517188,0.575292
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.1274421,0.622709,0.566041,0.7547571
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.7377573,0.5835816,0.6673723
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6844883,0.7289544
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4045097
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
df_std_mcc_noprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.1549653,0.1591321,0.1273586,0.1522174,0.07061786,0.1336772,0.05245931,0.12479664,0.14570045
Down,0,0.0,0.2081011,0.1380285,0.2301131,0.16569959,0.1632742,0.26298517,0.08919455,0.09393338
FXS-F,0,0.0,0.0,0.1684248,0.1406751,0.09440246,0.1838018,0.11094055,0.17763476,0.21084643
FXS-M,0,0.0,0.0,0.0,0.1609273,0.12825981,0.1165361,0.08035546,0.1535923,0.15394931
NF1,0,0.0,0.0,0.0,0.0,0.14384974,0.1478149,0.14671242,0.1560065,0.1777621
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.2461355,0.16956611,0.07964314,0.11826751
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.19247172,0.13068947,0.10340001
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13404497,0.13981589
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10601852
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Pairwise, uniform prior

Define a subfolder to save the results

In [29]:
subfolder = 'NB_uniprior/'

Set up dataframes to store the results

In [30]:
###Create dataframes to store the results ofthe pairwise analysis
df_mean_mcc_uniprior <- data.frame(temp_map)
df_std_mcc_uniprior <- data.frame(temp_map)
df_mean_acc_uniprior <- data.frame(temp_map)
df_std_acc_uniprior <- data.frame(temp_map)

###Name the rows and columns
rownames(df_mean_mcc_uniprior) <- classes
names(df_mean_mcc_uniprior) <- classes

rownames(df_std_mcc_uniprior) <- classes
names(df_std_mcc_uniprior) <- classes

rownames(df_mean_acc_uniprior) <- classes
names(df_mean_acc_uniprior) <- classes

rownames(df_std_acc_uniprior) <- classes
names(df_std_acc_uniprior) <- classes

Define the prior

In [31]:
prior_uniprior = rep(1/2, times=2)

Train the pairwise models and store the results

In [None]:
for (comp in comparisons){
    
    class1 = as.character(comp[[1]])
    class2 = as.character(comp[[2]])
    
    data <- get_pairwise_data(class1,class2, data_Rev)
    
    res_list = repeated_training(data, n_repeats, n_folds, lrange, prior_uniprior) 
    
    df_mean_acc_uniprior[[class1, class2]]<- res_list[["mean_acc"]] 
    df_std_acc_uniprior[[class1, class2]]<- res_list[["std_acc"]]
    df_mean_mcc_uniprior[[class1, class2]]<- res_list[["mean_mcc"]]
    df_std_mcc_uniprior[[class1, class2]]<- res_list[["std_mcc"]]
    
}

Save the results

In [33]:
write.csv(df_mean_acc_uniprior, file = paste(results_path,subfolder,"df_mean_acc_uniprior.csv", sep = "", collapse = NULL))
write.csv(df_std_acc_uniprior, file = paste(results_path,subfolder,"df_std_acc_uniprior.csv", sep = "", collapse = NULL))
write.csv(df_mean_mcc_uniprior, file = paste(results_path,subfolder,"df_mean_mcc_uniprior.csv", sep = "", collapse = NULL))
write.csv(df_std_mcc_uniprior, file = paste(results_path,subfolder,"df_std_mcc_uniprior.csv", sep = "", collapse = NULL))

Print out the dataframes

In [34]:
df_mean_acc_uniprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.8173913,0.8074074,0.8352941,0.7518519,0.8777778,0.8111111,0.912,0.6636364,0.8071429
Down,0,0.0,0.8785714,0.8333333,0.7071429,0.6714286,0.6571429,0.8666667,0.6741935,0.7333333
FXS-F,0,0.0,0.0,0.768,0.6833333,0.9277778,0.8166667,0.8875,0.5971429,0.7894737
FXS-M,0,0.0,0.0,0.0,0.76,0.844,0.888,0.9565217,0.6809524,0.8192308
NF1,0,0.0,0.0,0.0,0.0,0.8388889,0.8777778,0.8375,0.6314286,0.7894737
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.5166667,0.8125,0.7914286,0.8894737
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.85625,0.7228571,0.8105263
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8882353
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6583333
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df_std_acc_uniprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.05724179,0.07570227,0.06383851,0.0630718,0.0495368,0.05907894,0.04131182,0.05335399,0.07751819
Down,0,0.0,0.10674529,0.06827242,0.1406577,0.13552619,0.10540926,0.05826716,0.07967154,0.07027284
FXS-F,0,0.0,0.0,0.07728734,0.1482638,0.07878536,0.11126533,0.05743354,0.08979695,0.07845853
FXS-M,0,0.0,0.0,0.0,0.08,0.04788876,0.0491709,0.0409917,0.09336788,0.07702984
NF1,0,0.0,0.0,0.0,0.0,0.09240722,0.08996265,0.12219065,0.04140393,0.09283338
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.05885277,0.07216878,0.08416591,0.0762704
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.07246886,0.09336248,0.11423734
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07979159,0.09383126
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08182797
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
df_mean_mcc_uniprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.6405912,0.5838005,0.6706299,0.4260457,0.7590521,0.6273706,0.7897862,0.3808178,0.5694701
Down,0,0.0,0.7934959,0.6400197,0.501789,0.3586404,0.248698,0.7614891,0.3857166,0.5499838
FXS-F,0,0.0,0.0,0.5173494,0.3672573,0.8711583,0.6466648,0.7748572,0.29089,0.5913408
FXS-M,0,0.0,0.0,0.0,0.4999119,0.6930867,0.7824012,0.9015448,0.3673748,0.6313791
NF1,0,0.0,0.0,0.0,0.0,0.7091546,0.7836563,0.6886851,0.1743192,0.5840964
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.0346257,0.6297577,0.6226124,0.7886172
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.7212619,0.5037121,0.6438729
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561732,0.7781012
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3875652
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
df_std_mcc_uniprior

Unnamed: 0_level_0,22q11DS,Down,FXS-F,FXS-M,NF1,PW-UPD,PW-del,SMC15,TSC,XXY
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
22q11DS,0,0.09964519,0.174693,0.1288155,0.165108,0.07807705,0.10618178,0.10232353,0.09529416,0.1898214
Down,0,0.0,0.1603541,0.1409318,0.2695277,0.24111319,0.23610671,0.09314127,0.12645086,0.1055033
FXS-F,0,0.0,0.0,0.1645911,0.3032293,0.13669714,0.22388173,0.11841962,0.14738394,0.1541945
FXS-M,0,0.0,0.0,0.0,0.1735969,0.0771806,0.09211531,0.09327821,0.19413124,0.1619132
NF1,0,0.0,0.0,0.0,0.0,0.15240741,0.15159289,0.23337937,0.15795507,0.1883515
PW-UPD,0,0.0,0.0,0.0,0.0,0.0,0.11929729,0.14518663,0.11140848,0.1436586
PW-del,0,0.0,0.0,0.0,0.0,0.0,0.0,0.1445635,0.14701948,0.2220914
SMC15,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13147375,0.1935264
TSC,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1229115
XXY,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
