In [2]:
# Parameters ----
# Directory holding the original dataset CSVs (used to count configurations).
datasetPath <- "../datasets/"
# Directory holding the per-threshold result CSVs (<dataPath>/<threshold>/).
dataPath <- "./data_clustered/"
# Directory the generated PDF plots are written to.
resultPath <- "./results/plots/"

# Name of the system under study; prefix of the result and dataset files.
file <- "Apache"
# Version used as the reference when plotting differences to a base version.
baseVersion <- "0001"
# Versions to plot; when empty, the result directory is scanned instead.
versions <- c("0001", "0002", "0003", "0004")
# Oracle thresholds; each one names a sub-directory of dataPath.
oracle <- c("0.2", "0.5", "0.8")

In [3]:
library(ggplot2)
library(plyr)
library(reshape2)
library(tools)

In [4]:
# Plot without a base version
# Derive classification metrics for one result file and reshape them into
# long format (one row per original row and metric) for plotting.
#
# Args:
#   data: data frame that must contain the columns FN, FP, TN, TP, sr, t and
#     dataset (they are used as id variables when melting).
#     NOTE(review): sr looks like the number of configurations in the training
#     set (it is turned into a percentage of nconfigurations) -- confirm.
#   nconfigurations: total number of configurations; numeric or a string that
#     as.numeric() can parse.
#
# Returns:
#   The melted data frame: the id columns plus `variable` (metric name) and
#   `value` (metric value).
processData <- function(data, nconfigurations) {
  # Coerce once so every formula below accepts a character count; previously
  # only FPR/FNR did and configpercentage failed on a string.
  nconfigurations <- as.numeric(nconfigurations)

  # Helper columns: `index` numbers the rows inside each run (a new run starts
  # whenever sr increases), `freq` is the number of rows sharing the same sr,
  # and the index is finally rescaled to a percentage of freq.
  data$index <- 1
  data$index <- ave(
    data$index,
    cumsum(c(FALSE, diff(data$sr) > 0)),
    FUN = seq_along
  )
  data$freq <- ave(as.numeric(data[[1]]), data[["sr"]], FUN = length)
  data$index <- data$index * 100 / data$freq

  # Metrics derived from the confusion-matrix counts.
  data$Accuracy <- (data$TP + data$TN) /
    (data$TP + data$TN + data$FN + data$FP)
  data$Specificity <- data$TN / (data$TN + data$FP)
  # FPR and FNR are normalised by the total number of configurations here,
  # not by the number of actual negatives / positives.
  data$FPR <- data$FP / nconfigurations
  data$FNR <- data$FN / nconfigurations
  data$Precision <- data$TP / (data$TP + data$FP)
  data$Recall <- data$TP / (data$TP + data$FN)
  data$NPV <- data$TN / (data$TN + data$FN)

  data$configpercentage <- data$sr * 100 / nconfigurations

  # Every column that is not an id variable (i.e. the metrics) is stacked
  # into variable/value pairs.
  melt(
    data,
    id.vars = c(
      "FN", "FP", "TN", "TP", "sr", "t",
      "index", "freq", "dataset", "configpercentage"
    )
  )
}

# Write one line plot per metric to a PDF file.
#
# Args:
#   datasets: long-format data frame with the columns variable, value,
#     configpercentage and dataset (as produced by processData).
#   c: threshold label; used in the x-axis title and in the file name.
#
# Reads the globals `resultPath` (output directory prefix) and `file`
# (system name). Each metric is saved as
# <resultPath><c>-<metric>-<file>.pdf.
draw <- function(datasets, c) {
  for (v in as.character(unique(datasets$variable))) {
    metricData <- datasets[which(datasets$variable == v), ]

    # One line per dataset; the group belongs inside aes() -- passed directly
    # to ggplot() it was silently ignored.
    metricPlot <- ggplot(
      metricData,
      aes(configpercentage, value, group = dataset)
    ) +
      geom_line(aes(linetype = dataset, colour = dataset), na.rm = TRUE) +
      theme_bw() +
      xlab(paste0(
        "Percentage of configurations in the training set for a threshold of ",
        c
      )) +
      ylab(paste0(v, " Value")) +
      theme(
        axis.text = element_text(size = rel(2)),
        axis.title = element_text(size = rel(2))
      )

    # Name the arguments explicitly instead of relying on `file` partially
    # matching ggsave()'s `filename`.
    outFile <- paste0(resultPath, c, "-", v, "-", file, ".pdf")
    ggsave(filename = outFile, plot = metricPlot, width = 11, height = 11)
  }
}

In [5]:
# Driver: for every oracle threshold, load the per-version result CSVs,
# derive the metrics and write one PDF per metric.

# Count newline characters in a file, i.e. what `wc -l` reports, without
# spawning a shell (portable, and safe for paths containing spaces).
# NOTE(review): if the dataset CSV has a header line it is included in the
# count, exactly as with `wc -l` -- confirm this is intended.
countLines <- function(path) {
  if (!file.exists(path)) {
    stop("Dataset file not found: ", path, call. = FALSE)
  }
  bytes <- readBin(path, what = "raw", n = file.size(path))
  as.numeric(sum(bytes == as.raw(10L)))
}

# resultPath is nested, so create missing parent directories as well.
dir.create(resultPath, showWarnings = FALSE, recursive = TRUE)

for (threshold in oracle) {
  thresholdDir <- paste0(dataPath, threshold, "/")

  # Explicit versions take precedence; otherwise use every file in the
  # threshold directory whose name starts with the system name.
  if (length(versions) > 0) {
    resultFiles <- paste0(file, "-", versions, ".csv")
  } else {
    available <- list.files(path = thresholdDir)
    resultFiles <- available[startsWith(available, file)]
  }

  if (length(resultFiles) == 0) {
    warning("No result files found in ", thresholdDir, call. = FALSE)
    next
  }

  # Collect the processed frames in a list and bind them once at the end.
  perFile <- vector("list", length(resultFiles))
  for (i in seq_along(resultFiles)) {
    filename <- paste0(thresholdDir, resultFiles[i])
    # The dataset name is the part of the result file name before the first
    # "-".
    filenameDataset <- paste0(
      datasetPath, strsplit(resultFiles[i], "-")[[1]][1], ".csv"
    )
    tmp <- read.csv(filename)
    tmp$dataset <- resultFiles[i]
    perFile[[i]] <- processData(tmp, countLines(filenameDataset))
  }
  datasets <- do.call(rbind, perFile)

  draw(datasets, threshold)
}

In [21]:
# Plot with a base version
# Derive classification metrics for one result file as differences to a base
# version, and reshape them into long format for plotting.
#
# Args:
#   dataBase: data frame of the base version with the columns FN, FP, TN, TP.
#     Its rows are matched to the rows of `data` by position.
#   data: data frame that must contain the columns FN, FP, TN, TP, sr, t and
#     dataset (they are used as id variables when melting).
#     NOTE(review): sr looks like the number of configurations in the training
#     set (it is turned into a percentage of nconfigurations) -- confirm.
#   nconfigurations: total number of configurations; numeric or a string that
#     as.numeric() can parse.
#
# Returns:
#   The melted data frame: the id columns plus `variable` (metric name) and
#   `value` (metric of `data` minus the same metric of `dataBase`).
processData <- function(dataBase, data, nconfigurations) {
  # Coerce once so every formula below accepts a character count; previously
  # only FPR/FNR did and configpercentage failed on a string.
  nconfigurations <- as.numeric(nconfigurations)

  # The subtraction below pairs rows by position; a shorter base would be
  # recycled silently, so at least flag the mismatch.
  if (nrow(dataBase) != nrow(data)) {
    warning(
      "dataBase has ", nrow(dataBase), " rows but data has ", nrow(data),
      "; differences are computed by row position",
      call. = FALSE
    )
  }

  # Metrics derived from the confusion-matrix counts of one data frame.
  # FPR and FNR are normalised by the total number of configurations here,
  # not by the number of actual negatives / positives.
  confusionMetrics <- function(d) {
    list(
      Accuracy = (d$TP + d$TN) / (d$TP + d$TN + d$FN + d$FP),
      Specificity = d$TN / (d$TN + d$FP),
      FPR = d$FP / nconfigurations,
      FNR = d$FN / nconfigurations,
      Precision = d$TP / (d$TP + d$FP),
      Recall = d$TP / (d$TP + d$FN),
      NPV = d$TN / (d$TN + d$FN)
    )
  }

  # Helper columns: `index` numbers the rows inside each run (a new run starts
  # whenever sr increases), `freq` is the number of rows sharing the same sr,
  # and the index is finally rescaled to a percentage of freq.
  data$index <- 1
  data$index <- ave(
    data$index,
    cumsum(c(FALSE, diff(data$sr) > 0)),
    FUN = seq_along
  )
  data$freq <- ave(as.numeric(data[[1]]), data[["sr"]], FUN = length)
  data$index <- data$index * 100 / data$freq

  # Store each metric as the difference to the base version.
  current <- confusionMetrics(data)
  baseline <- confusionMetrics(dataBase)
  for (metric in names(current)) {
    data[[metric]] <- current[[metric]] - baseline[[metric]]
  }

  data$configpercentage <- data$sr * 100 / nconfigurations

  # Every column that is not an id variable (i.e. the metrics) is stacked
  # into variable/value pairs.
  melt(
    data,
    id.vars = c(
      "FN", "FP", "TN", "TP", "sr", "t",
      "index", "freq", "dataset", "configpercentage"
    )
  )
}

# Write one line plot per metric to a PDF file.
#
# Args:
#   datasets: long-format data frame with the columns variable, value,
#     configpercentage and dataset (as produced by processData).
#   c: threshold label; used in the x-axis title and in the file name.
#
# Reads the globals `resultPath` (output directory prefix) and `file`
# (system name). Each metric is saved as
# <resultPath><c>-<metric>-<file>.pdf.
draw <- function(datasets, c) {
  for (v in as.character(unique(datasets$variable))) {
    metricData <- datasets[which(datasets$variable == v), ]

    # One line per dataset; the group belongs inside aes() -- passed directly
    # to ggplot() it was silently ignored.
    metricPlot <- ggplot(
      metricData,
      aes(configpercentage, value, group = dataset)
    ) +
      geom_line(aes(linetype = dataset, colour = dataset), na.rm = TRUE) +
      theme_bw() +
      xlab(paste0(
        "Percentage of configurations in the training set for a threshold of ",
        c
      )) +
      ylab(paste0(v, " Value")) +
      theme(
        axis.text = element_text(size = rel(2)),
        axis.title = element_text(size = rel(2))
      )

    # Name the arguments explicitly instead of relying on `file` partially
    # matching ggsave()'s `filename`.
    outFile <- paste0(resultPath, c, "-", v, "-", file, ".pdf")
    ggsave(filename = outFile, plot = metricPlot, width = 11, height = 11)
  }
}

In [22]:
# Driver: for every oracle threshold, load the base-version results and the
# per-version result CSVs, derive the metric differences to the base version
# and write one PDF per metric.

# Count newline characters in a file, i.e. what `wc -l` reports, without
# spawning a shell (portable, and safe for paths containing spaces).
# NOTE(review): if the dataset CSV has a header line it is included in the
# count, exactly as with `wc -l` -- confirm this is intended.
countLines <- function(path) {
  if (!file.exists(path)) {
    stop("Dataset file not found: ", path, call. = FALSE)
  }
  bytes <- readBin(path, what = "raw", n = file.size(path))
  as.numeric(sum(bytes == as.raw(10L)))
}

# resultPath is nested, so create missing parent directories as well.
dir.create(resultPath, showWarnings = FALSE, recursive = TRUE)

for (threshold in oracle) {
  thresholdDir <- paste0(dataPath, threshold, "/")

  # Results of the base version every other version is compared against.
  data <- read.table(
    paste0(thresholdDir, file, "-", baseVersion, ".csv"),
    header = TRUE,
    sep = ","
  )

  # Explicit versions take precedence; otherwise use every file in the
  # threshold directory whose name starts with the system name.
  if (length(versions) > 0) {
    resultFiles <- paste0(file, "-", versions, ".csv")
  } else {
    available <- list.files(path = thresholdDir)
    resultFiles <- available[startsWith(available, file)]
  }

  if (length(resultFiles) == 0) {
    warning("No result files found in ", thresholdDir, call. = FALSE)
    next
  }

  # Collect the processed frames in a list and bind them once at the end.
  perFile <- vector("list", length(resultFiles))
  for (i in seq_along(resultFiles)) {
    filename <- paste0(thresholdDir, resultFiles[i])
    # The dataset name is the part of the result file name before the first
    # "-".
    filenameDataset <- paste0(
      datasetPath, strsplit(resultFiles[i], "-")[[1]][1], ".csv"
    )
    tmp <- read.csv(filename)
    tmp$dataset <- resultFiles[i]
    perFile[[i]] <- processData(data, tmp, countLines(filenameDataset))
  }
  datasets <- do.call(rbind, perFile)

  draw(datasets, threshold)
}