# Code for the Experiments in Open Category Detection with Guarantees

In [1]:
library(devtools)

In [23]:
library(histogram)

In [2]:
install_github("liusi2019/btloda")

Downloading GitHub repo liusi2019/btloda@master
from URL https://api.github.com/repos/liusi2019/btloda/zipball/master
Installing btloda
'/Users/si/anaconda2/lib/R/bin/R' --no-site-file --no-environ --no-save  \
  --no-restore --quiet CMD INSTALL  \
  '/private/var/folders/0d/ty9fnz017m734hdf0m7ntgcr0000gn/T/RtmpAwfHgo/devtools19158e28fff/liusi2019-btloda-a0793c0'  \
  --library='/Users/si/anaconda2/lib/R/library' --install-tests 



In [24]:
library(btloda)

In [25]:
library(MASS)

In [28]:
## Quantile estimate tau^hat_q obtained with the raw ECDF method.
##   datab : anomaly scores of the nominal data set
##   datan : anomaly scores of the mixture data set
##   alpha : anomaly proportion in the mixture data set
##   q     : target quantile level
## The alien CDF is estimated on the pooled, sorted scores as
## (F_mixture - (1 - alpha) * F_nominal) / alpha. The returned value is the
## largest pooled score at which that estimate is still <= q, or the smallest
## pooled score when the estimate never drops to q.
raw_cdf <- function(datab, datan, alpha, q) {
  grid <- sort(c(datan, datab))
  cdf_mixture <- ecdf(datan)(grid)
  cdf_nominal <- ecdf(datab)(grid)
  cdf_alien <- (cdf_mixture - (1 - alpha) * cdf_nominal) / alpha

  ## which() returns ascending positions, so the last one is the largest
  below <- which(cdf_alien <= q)
  if (length(below) > 0) {
    return(grid[below[length(below)]])
  }
  grid[1]
}

## Quantile estimate tau^hat_q obtained with the isotonized ECDF method.
##   datab : anomaly scores of the nominal data set
##   datan : anomaly scores of the mixture data set
##   alpha : anomaly proportion in the mixture data set
##   q     : target quantile level
## The raw alien CDF estimate (F_mixture - (1 - alpha) * F_nominal) / alpha is
## made non-decreasing with isotonic regression and clipped to [0, 1]. The
## returned value is the largest pooled score at which the isotonized estimate
## is still <= q, or the smallest pooled score when no point qualifies.
iso_cdf <- function(datab, datan, alpha, q) {
  grid <- sort(c(datan, datab))
  cdf_mixture <- ecdf(datan)(grid)
  cdf_nominal <- ecdf(datab)(grid)
  cdf_alien <- (cdf_mixture - (1 - alpha) * cdf_nominal) / alpha

  ## isotonic fit of the alien CDF, then clip to the valid range of a CDF
  cdf_iso <- pmin(pmax(isoreg(cdf_alien)$yf, 0), 1)

  below <- which(cdf_iso <= q)
  if (length(below) > 0) {
    return(grid[below[length(below)]])
  }
  grid[1]
}

In [29]:
## Evaluate both threshold estimates against large reference score sets.
## Input:
##   datab, datan  : anomaly scores of the nominal and the mixture data sets
##   alpha         : anomaly proportion in the mixture data set
##   q             : target quantile level
##   score_nominal : anomaly scores of the big nominal data set
##   score_anomaly : anomaly scores of the big anomaly data set
## Output: numeric vector of length 5
##   [1] recall, raw ECDF threshold
##   [2] FPR (proportion of nominals misclassified as alien), raw ECDF threshold
##   [3] recall, isotonized ECDF threshold
##   [4] FPR, isotonized ECDF threshold
##   [5] ground-truth FPR, i.e. the FPR at the empirical q-quantile of
##       score_anomaly
total_result <- function(datab, datan, alpha, q, score_nominal, score_anomaly) {
  tau_raw <- raw_cdf(datab, datan, alpha, q)
  tau_iso <- iso_cdf(datab, datan, alpha, q)
  tau_true <- quantile(score_anomaly, q)

  ## a point is flagged as alien when its score is at or above the threshold
  c(
    mean(score_anomaly >= tau_raw),
    mean(score_nominal >= tau_raw),
    mean(score_anomaly >= tau_iso),
    mean(score_nominal >= tau_iso),
    mean(score_nominal >= tau_true)
  )
}

### The following cells are for the experiments on synthetic data sets

In [30]:
## create two big datasets, big_nominal and big_anomaly (20000 x 9 each), for
## calculating the ground truth about FPR
big_nominal <- base::matrix(rnorm(20000 * 9, 0, 1), nrow = 20000, ncol = 9)
big_anomaly <- matrix(ncol = 9, nrow = 20000)
vmat <- diag(9) ## identity covariance matrix
for (i in seq_len(20000)) {
  center <- rep(0, 9)
  ## with probability 0.4 move 3 randomly chosen coordinates to mean 3,
  ## otherwise move 4 of them
  center[sample(9, if (rbinom(1, 1, 0.4) == 1) 3 else 4, replace = FALSE)] <- 3
  big_anomaly[i, ] <- mvrnorm(1, center, vmat)
}

## write them into .csv files for running iforest in linux
write.csv(big_nominal, file = "big_nominal.csv", row.names = FALSE)
write.csv(big_anomaly, file = "big_anomaly.csv", row.names = FALSE)

In [3]:
## experiment settings: sample size n, alien proportion alpha, and quantile q
n <- 1000
vpro <- 10 ## alien proportion expressed in percent
alpha <- vpro / 100 ## anomaly proportion in the mixture data set
q <- 0.05 ## targeting the q quantile

In [32]:
## generate nominal and mixture datasets, each of size n
## (uses the settings n and alpha defined earlier in the notebook)
n_alien <- round(n * alpha) ## number of alien rows in the mixture data set
dat <- matrix(ncol = 9, nrow = 2 * n)
vmat <- diag(9) ## identity covariance matrix

## Loop over exactly the alien rows 1..n_alien. Note that `1:n*alpha` would
## parse as `(1:n) * alpha` (`:` binds tighter than `*`), i.e. n fractional
## indices rather than the intended n * alpha integer ones.
for (i in seq_len(n_alien)) {
  center <- rep(0, 9)
  ## with probability 0.4 move 3 randomly chosen coordinates to mean 3,
  ## otherwise move 4 of them
  if (rbinom(1, 1, 0.4) == 1) {
    center[sample(9, 3, replace = FALSE)] <- 3
  } else {
    center[sample(9, 4, replace = FALSE)] <- 3
  }
  dat[i, ] <- mvrnorm(1, center, vmat)
}

## the top n_alien rows are alien points; every remaining row is nominal.
## The count is derived from n_alien so it always matches the row range
## being filled below.
nnrow <- 2 * n - n_alien
rvec <- rnorm(9 * nnrow, 0, 1)
dat[(n_alien + 1):(2 * n), ] <- base::matrix(rvec, nrow = nnrow, ncol = 9)

data1 <- dat[(n + 1):(2 * n), ] # nominal data set
data2 <- dat[1:n, ] # mixture data set (alien rows first, then nominal rows)

## write them into .csv files for running iforest using linux
write.csv(data1, file = "data1.csv", row.names = FALSE)
write.csv(data2, file = "data2.csv", row.names = FALSE)


### The isolation forest implementation used here is from https://github.com/tadeze/osu_iforest (note that this link will provide the correct version after July 1st).
### The code in the following cell sends the commands that run iforest in Linux

In [None]:
## Run the external ./iforest binary with the given command-line pieces
## (joined by single spaces) and warn when it reports a non-zero exit status,
## so that a failed run is not silently followed by reading a stale or
## missing output file.
## NOTE(review): the meaning of the individual iforest flags is not visible
## here; they are passed through exactly as before.
run_iforest <- function(...) {
  cmd <- paste("./iforest", ...)
  status <- system(cmd, wait = TRUE)
  if (!identical(as.integer(status), 0L)) {
    warning("iforest exited with status ", status, ": ", cmd, call. = FALSE)
  }
  invisible(status)
}

## grow iforest using the nominal data set, get out-of-bag anomaly scores for
## each nominal point, and save the iforest grown
run_iforest(
  "-i", "./data1.csv", "-o", "./depth1.csv",
  "-s", round(0.2 * n), "-t 1000 -g -k -b", "./forest1.bin"
)
score1 <- read.csv("./depth1.csv", header = TRUE)
datab <- as.numeric(unlist(score1))

## run the mixture data set through the saved iforest
run_iforest("-i", "./data2.csv", "-o", "./depth2.csv", "-g -f", "./forest1.bin")
score2 <- read.csv("./depth2.csv", header = TRUE)
datan <- as.numeric(unlist(score2))

### run the big nominal and anomaly datasets through the forest
## The inputs are the files written earlier in this notebook as
## big_anomaly.csv and big_nominal.csv.
run_iforest(
  "-i", "./big_anomaly.csv", "-o", "./depth_anomaly.csv",
  "-g -f", "./forest1.bin"
)
depth_anomaly <- read.csv("./depth_anomaly.csv", header = TRUE)
score_anomaly <- as.numeric(unlist(depth_anomaly))

run_iforest(
  "-i", "./big_nominal.csv", "-o", "./depth_nominal.csv",
  "-g -f", "./forest1.bin"
)
depth_nominal <- read.csv("./depth_nominal.csv", header = TRUE)
score_nominal <- as.numeric(unlist(depth_nominal))

In [None]:
## evaluate both threshold estimates on the iforest scores; the five entries
## are: recall (raw), FPR (raw), recall (isotonized), FPR (isotonized),
## ground-truth FPR
output_iforest <- total_result(
  datab = datab,
  datan = datan,
  alpha = alpha,
  q = q,
  score_nominal = score_nominal,
  score_anomaly = score_anomaly
)
output_iforest

### The cell below uses the R package btloda, which implements the bootstrap and out-of-bag version of LODA

In [33]:
## build projections and create histograms using bootstrap of the nominal
## data set; get out-of-bag anomaly scores for each nominal point
bt_out <- btloda(
  data1,
  sparsity = NA,
  maxk = 1000,
  keep = NULL,
  exclude = NULL,
  debug = FALSE,
  inf_replace = log(1e-09)
)

## out-of-bag scores of the nominal set, and scores of the mixture set under
## the fitted projections and histograms
bt_datab <- bt_out$oob_nll
bt_datan <- get_neg_ll_all_hist(
  data2,
  bt_out$pvh$w,
  bt_out$pvh$hists,
  inf_replace = log(1e-09)
)

## get anomaly scores for the big nominal and anomaly data sets using the
## same projections and histograms
bt_score_nominal <- get_neg_ll_all_hist(
  big_nominal,
  bt_out$pvh$w,
  bt_out$pvh$hists,
  inf_replace = log(1e-09)
)
bt_score_anomaly <- get_neg_ll_all_hist(
  big_anomaly,
  bt_out$pvh$w,
  bt_out$pvh$hists,
  inf_replace = log(1e-09)
)

In [34]:
## evaluate both threshold estimates on the btloda scores; the five entries
## are: recall (raw), FPR (raw), recall (isotonized), FPR (isotonized),
## ground-truth FPR
output_btloda <- total_result(
  datab = bt_datab,
  datan = bt_datan,
  alpha = alpha,
  q = q,
  score_nominal = bt_score_nominal,
  score_anomaly = bt_score_anomaly
)
output_btloda