In [5]:
library(parallel)
source('PTrees.R')
source('Datasets.R')

In [253]:
# Split a dataset into a leading training part and a trailing test part.
#   D      - dataset list; only its `data` element (a data frame) is replaced,
#            every other element is copied unchanged into both parts.
#   cutoff - fraction of the rows, taken in their existing order, that go to
#            the training part.
# Returns list(train=..., test=...).
split_data <- function(D, cutoff=0.8){
    n <- nrow(D$data)
    # as.integer() truncates, so train never receives more than cutoff*n rows;
    # clamp so that fractions outside [0, 1] cannot index past the data.
    n_train <- min(max(as.integer(n*cutoff), 0L), n)
    # seq_len() yields an empty index at the boundaries, where 1:0 or (n+1):n
    # would count backwards and pick up wrong or NA rows.
    train_idx <- seq_len(n_train)
    test_idx <- n_train + seq_len(n - n_train)
    D1 <- D
    D2 <- D
    # drop=FALSE keeps a data frame even when there is only one column.
    D1$data <- D$data[train_idx, , drop=FALSE]
    D2$data <- D$data[test_idx, , drop=FALSE]
    return(list(train=D1, test=D2))
}
# Train a tree on D$train with `dt_gen` and report its accuracy on D$test.
#   D      - list with `train` and `test` datasets; the test part must carry
#            `data` and `y_names`.
#   B, dep - forwarded to dt_gen as its second and third arguments.
#   dt_gen - function(train, B, dep) returning a tree accepted by predict().
# Returns the fraction of test rows whose prediction equals the true label.
test <- function(D, B, dep, dt_gen){
    tree <- dt_gen(D$train, B, dep)
    held_out <- D$test$data
    truth <- get(D$test$y_names, held_out)
    # The first test label doubles as the fallback prediction.
    fallback <- truth[1]
    predicted <- predict(tree, held_out, fallback)
    n_correct <- sum(predicted == truth)
    n_correct / length(truth)
}

In [254]:
# Turn `node` into a leaf: store noised label counts on it and record the
# label with the largest noised count as its prediction.
#   ys   - labels of the rows that reached this node.
#   node - tree node updated in place (fields counts, guess, name).
#   eps  - second argument handed to hist_noiser.
# The chosen label is also appended to the node name after a ';'.
get_leaf_node <- function(ys, node, eps){
    label_freq <- data.frame(table(ys))
    node$counts <- hist_noiser(label_freq$Freq, eps)
    winner <- label_freq$ys[which.max(node$counts)]
    node$guess <- winner
    node$name <- paste(node$name, winner, sep=';')
}

In [255]:
# Build the partitions induced by a chosen attribute.
# NOTE: this definition masks base::split for code run after it.
#   attr - list with `type` and `split`; for type 'numeric', `split` is the
#          threshold, otherwise it holds the values to branch on.
# Returns a list with one entry per branch, each list(name=, fn=) where `fn`
# maps a column to a logical mask of the rows belonging to that branch.
split <- function(attr){
    if(attr$type == 'numeric'){
        s <- attr$split
        # list(), not c(): c() would flatten the two branches into a single
        # 4-element list (name, fn, name, fn), while callers iterate over
        # branches and read $name / $fn from each one.
        return(list(
            list(name=paste('<=', s), fn=function(d){return(d <= s)}),
            list(name=paste('>', s), fn=function(d){return(d > s)})
        ))
    }
    return(lapply(attr$split, function(a) list(name=a, fn=equality(a))))
}

In [198]:
# Compute C * t * sqrt(2), where t is the largest branching factor among
# `attrs` and C is the size computed for the label column.
#   data   - data frame holding the attribute columns and the label column.
#   attrs  - names of the candidate attribute columns.
#   y_name - name of the label column.
# A numeric column counts as 2 (split() makes two branches for numeric
# attributes); any other column counts its factor levels.
get_sigma <- function(data, attrs, y_name){
    if(length(attrs) == 0)
        stop("get_sigma needs at least one attribute", call.=FALSE)
    sizes <- lapply(data, function(col){
        # is.numeric() is a scalar test: class(col) == 'numeric' has length 2
        # for ordered factors and is FALSE for integer columns, which would
        # then be sized by their (absent) levels, i.e. 0.
        if(is.numeric(col))
            return(2)
        return(length(levels(col)))
    })
    if(is.null(sizes[[y_name]]))
        stop("label column '", y_name, "' not found in data", call.=FALSE)
    max_branches <- Reduce(max, sizes[attrs])
    C <- sizes[[y_name]]
    return(C*max_branches*sqrt(2))
}

In [258]:
# Predict a label for every row of D by walking the tree rooted at `t`.
# NOTE: this definition masks stats::predict for code run after it.
#   t       - tree node. A leaf carries `guess`; an internal node carries
#             `attr` (column name), `pars` (list of list(name, fn)) and
#             `children`, with pars[[i]] selecting the rows for children[[i]].
#   D       - data frame of rows to classify.
#   default - label kept for rows that no branch claims.
# Returns a vector with one prediction per row of D.
predict <- function(t, D, default){
    preds <- rep(default, nrow(D))
    if(nrow(D) == 0)
        return(preds)
    if(!is.null(t$guess)){
        return(rep(t$guess, nrow(D)))
    }
    # seq_along() skips the loop when there are no partitions, where
    # 1:length(...) would iterate over 1 and 0.
    for(i in seq_along(t$pars)){
        mask <- t$pars[[i]]$fn(get(t$attr, D))
        # A missing attribute value matches no branch; such rows keep `default`
        # instead of breaking the subscripted assignment below.
        mask <- !is.na(mask) & mask
        # drop=FALSE keeps a data frame even when D has a single column.
        preds[mask] <- predict(t$children[[i]], D[mask, , drop=FALSE], default)
    }
    return(preds)
}

In [300]:
# Global branch-point records read by the tree builder.
#   dir - switch tested in if() conditions (non-zero takes the branch)
#   pts - data frame of logged observations, initially empty
# L1 and L2 are consulted in dtree_outer; L3 and L4 are consulted in
# dtree_helper, which also appends to their `pts` via NoisyBranch.
L1 <- list(dir = 1, pts = data.frame())
L2 <- list(dir = 0, pts = data.frame())
L3 <- list(dir = 0, pts = data.frame())
L4 <- list(dir = 1, pts = data.frame())

# Stub: append the current observation to a branch point's logged points.
#   learner - branch-point record; its `pts` element is the existing log.
#   params  - list whose `attr_left` element is the value to append.
# Returns the extended log; the caller is responsible for storing it.
NoisyBranch <- function(learner, params){
    extended <- rbind(learner$pts, params$attr_left)
    extended
} #Stub

# Recursively grow the subtree below node `t`, which is modified in place.
#   t        - node object; fields are set on it directly and children are
#              created with t$AddChild (presumably a data.tree Node - confirm).
#   data     - data frame of the rows that reached this node.
#   attrs    - names of the attributes still available for splitting.
#   y_name   - name of the label column in `data`.
#   D_ranges - range information passed to cond_eval and to recursive calls.
#   dep      - remaining depth; 0 forces a leaf.
#   epsilon  - budget left on this path; lowered by params$query_budget after
#              each noisy query issued here, the remainder goes to the leaf.
#   params   - list with collect_size, sigma and query_budget.
# Side effects: appends to the global L3$pts and L4$pts through NoisyBranch.
# Returns `t` invisibly so callers may assign the result.
dtree_helper <- function(t, data, attrs, y_name, D_ranges, dep, epsilon, params){
    ys <- get(y_name, data)
    t$nrows <- Inf
    #Decides whether to collect the size info
    L3$pts <<- NoisyBranch(L3, list(attr_left=length(attrs)))
    if(params$collect_size || L3$dir){
        t$nrows <- hist_noiser(nrow(data), params$query_budget)
        epsilon <- epsilon-params$query_budget
    }
    # Stop at the depth limit, when no attribute is left to split on, or when
    # the (noised) row count falls below the sigma threshold.
    if(dep == 0 || length(attrs) == 0 || t$nrows < params$sigma){
        get_leaf_node(ys, t, epsilon)
        return(invisible(t))
    }
    #Decides whether to pick the attribute by utility or at random
    L4$pts <<- NoisyBranch(L4, list(attr_left=length(attrs)))
    if(L4$dir){
        # Uses the D_ranges argument; the name `range_bounds` used here before
        # is not defined in this function.
        atp <- lapply(attrs, function(name) cond_eval(name, data, D_ranges, ys))
        utilities <- sapply(atp, function(x) -x$ent)
        attr_best <- exp_mech(atp, utilities, params$query_budget, ent_util$sens)
        epsilon <- epsilon-params$query_budget
    }else{
        attr_best <- random(attrs)
    }
    t$name <- paste(t$name, attr_best$name, sep=';')
    t$pars <- split(attr_best)
    t$attr <- attr_best$name
    col <- get(attr_best$name, data)
    # An attribute is used at most once along a path.
    attr_new <- attrs[attrs != attr_best$name]
    for(branch in t$pars){
        rows <- (branch$fn)(col)
        # drop=FALSE keeps a data frame even if only one column is present.
        dtree_helper(t$AddChild(branch$name), data[rows, , drop=FALSE], attr_new, y_name, D_ranges, dep-1, epsilon, params)
    }
    invisible(t)
}

# Build a decision tree for `data` and return its root node.
#   data     - data frame with the attribute columns and the label column.
#   attrs    - names of the attribute columns.
#   y_name   - name of the label column.
#   D_ranges - range information forwarded to dtree_helper.
#   dep      - maximum depth of the tree.
#   epsilon  - total budget; each query gets epsilon/(2*(dep+1)), doubled when
#              node sizes are not collected.
# The global switch L1$dir selects the variant that collects node sizes and
# uses a sigma threshold; L2$dir additionally enables pruning in that variant.
dtree_outer <- function(data, attrs, y_name, D_ranges, dep, epsilon){
    dt <- Node$new('')
    params <- list(collect_size=TRUE, sigma=0, query_budget=epsilon/(2*(dep+1)))
    if(L1$dir){
        params$sigma <- get_sigma(data, attrs, y_name) / params$query_budget
        dtree_helper(dt, data, attrs, y_name, D_ranges, dep, epsilon, params)
        if(L2$dir){
            prune_tree(dt, levels(get(y_name, data)))
        }
    }else{
        params$collect_size <- FALSE
        params$query_budget <- params$query_budget*2
        # dtree_helper fills `dt` in place. Its return value must not replace
        # `dt`: assigning it here made this branch return NULL.
        dtree_helper(dt, data, attrs, y_name, D_ranges, dep, epsilon, params)
    }
    return(dt)
}
# Adapter with the (D, B, dep) signature expected by test(): unpacks the
# dataset list and passes B as the epsilon budget of dtree_outer.
dtree_general <- function(D, B, dep){
    dtree_outer(data=D$data, attrs=D$x_names, y_name=D$y_names,
                D_ranges=D$rng, dep=dep, epsilon=B)
}

In [301]:
# Build a depth-4 tree on the `ttt` dataset with a total budget of 10.
# `ttt` is not defined in this file (presumably loaded by Datasets.R - confirm).
# NOTE: the global `t` shadows base::t for non-call uses of the name.
t <- dtree_outer(data=ttt$data, attrs=ttt$x_names, y_name=ttt$y_names,
                 D_ranges=ttt$rng, dep=4, epsilon=10)

In [278]:
# Hold out the last 30% of `ttt` and print the accuracy of a depth-4 tree
# trained with budget 5 on the first 70%.
# NOTE(review): assigning to `T` masks the `TRUE` shorthand for everything run
# afterwards; consider renaming once no other cell depends on this name.
T <- split_data(ttt, cutoff=0.7)
test(T, B=5, dep=4, dt_gen=dtree_general)

In [296]:
# Interactive help lookup for `mutate`; has no effect on the objects above.
?mutate