Skip to content

Commit

Permalink
Always fit g on full dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
lendle committed Feb 6, 2016
1 parent 91d53ea commit cce735f
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 97 deletions.
68 changes: 41 additions & 27 deletions src/CTMLEs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,16 @@ using ..LReg, ..Common, ..Parameters, ..Qmodels

import ..Qmodels: fluctuate!, defluctuate!, Fluctuation

include("pcor.jl")
include("strategies.jl")

# Restrict `param` to the observations in `idx`: subset each of its regimens
# and rebuild a parameter of the same concrete type `P`.
subset{P<:Parameter}(param::P, idx::AbstractVector{Int}) =
    P(map(d -> subset(d, idx), regimens(param))...)
# Holds the full treatment vector and covariate matrix used to fit g
# (the treatment model). Because a GMachine is built from the complete
# `A` and `W` before any row-subsetting (see make_chunk_subset), g is
# always fit on the full dataset regardless of which observation subset
# a Qchunk works on.
type GMachine{T<:AbstractFloat}
    A::Vector{T}  # treatment assignment for every observation
    W::Matrix{T}  # covariates, one row per observation, one column per covariate
end

"""
    give_me_a_g(gmachine; subset=1:size(gmachine.W, 2))

Fit g on the machine's full dataset, using only the covariate columns of
`gmachine.W` selected by `subset` (all columns by default). Delegates to
`lreg` with `gmachine.A` as the response.
"""
give_me_a_g(gmachine::GMachine; subset=1:size(gmachine.W, 2)) =
    lreg(gmachine.W, gmachine.A, subset=subset)

# A static regimen is identical for every observation, so row-subsetting is a no-op.
subset(d::StaticRegimen, idx) = d
# A dynamic regimen carries a per-observation assignment vector `a`; keep rows `idx`.
subset(d::DynamicRegimen, idx) = DynamicRegimen(d.a[idx])

"""
A `Qchunk` holds (a subset of) the data set and corresponding indexes along with a `Qmodel` which can compute predictions for
Expand All @@ -46,28 +48,42 @@ type Qchunk{T<:AbstractFloat}
W::Matrix{T}
A::Vector{T}
Y::Vector{T}
gmachine::GMachine
param::Parameter{T}
idx::AbstractVector{Int}
gbounds::Vector{T}
penalize::Bool
risk::T
used_covars::IntSet
unused_covars::IntSet
end

include("pcor.jl")
include("strategies.jl")

# Subset a parameter to observations `idx` by subsetting each regimen,
# reconstructing the same concrete parameter type `P`.
subset{P<:Parameter}(param::P, idx::AbstractVector{Int}) =
    P(map(d -> subset(d, idx), regimens(param))...)

# Static regimens do not vary across observations: return unchanged.
subset(d::StaticRegimen, idx) = d
# Dynamic regimens vary by observation: keep only assignments for rows `idx`.
subset(d::DynamicRegimen, idx) = DynamicRegimen(d.a[idx])


function make_chunk_subset{T<:AbstractFloat}(logitQnA1::Vector{T}, logitQnA0::Vector{T},
W::Matrix{T}, A::Vector{T}, Y::Vector{T},
param::Parameter{T}, idx::AbstractVector{Int}, gbounds::Vector{T},
penalize::Bool)
penalize::Bool, used_covars::IntSet, unused_covars::IntSet)
length(logitQnA1) == length(logitQnA0) == size(W, 1) == length(A) == length(Y) ||
error(ArgumentError("Input sizes do not match"))
q = Qmodel(logitQnA1[idx], logitQnA0[idx])
param = subset(param, idx)
gmachine=GMachine(A, W)
W = W[idx, :]
A = A[idx]
Y = Y[idx]
#Can't compute penalized risk if q isn't fluctuated yet,
#so set to Inf
q_risk = penalize? convert(T, Inf) : risk(q, A, Y, param, false)
Qchunk(q, W, A, Y, param, idx, gbounds, penalize, q_risk)
Qchunk(q, W, A, Y, gmachine, param, idx, gbounds, penalize, q_risk, used_covars, unused_covars)
end

StatsBase.nobs(chunk::Qchunk) = nobs(chunk.q)
Expand Down Expand Up @@ -152,10 +168,10 @@ function makeQCV{T<:AbstractFloat}(logitQnA1::Vector{T}, logitQnA0::Vector{T},
n, p = size(W)
idx_train = sort(idx_train)
idx_val = setdiff(1:n, idx_train)
chunk_train = make_chunk_subset(logitQnA1, logitQnA0, W, A, Y, param, idx_train, gbounds, penalize)
chunk_val= make_chunk_subset(logitQnA1, logitQnA0, W, A, Y, param, idx_val, gbounds, false)
used_covars = IntSet()
unused_covars = setdiff(IntSet(1:p), used_covars)
unused_covars = IntSet(1:p)
chunk_train = make_chunk_subset(logitQnA1, logitQnA0, W, A, Y, param, idx_train, gbounds, penalize, used_covars, unused_covars)
chunk_val= make_chunk_subset(logitQnA1, logitQnA0, W, A, Y, param, idx_val, gbounds, false, used_covars, unused_covars)
gseq = LR{T}[]
QCV(chunk_train, chunk_val, gseq, deepcopy(searchstrategy), used_covars, unused_covars)
end
Expand Down Expand Up @@ -267,17 +283,17 @@ function ctmle{T<:AbstractFloat}(logitQnA1::Vector{T}, logitQnA0::Vector{T},
best_steps = find_steps(cvqs, patience=patience)

#build a chunk with the full data set
fullchunk = Qchunk(Qmodel(logitQnA1, logitQnA0), W, A, Y, param, 1:n, gbounds, penalize_risk, convert(T, Inf))
gseq = LR{T}[]
new_flucseq = Bool[]
used_covars = IntSet()
unused_covars = IntSet(1:p)
fullchunk = Qchunk(Qmodel(logitQnA1, logitQnA0), W, A, Y, GMachine(A, W), param, 1:n, gbounds, penalize_risk, convert(T, Inf), used_covars, unused_covars)
gseq = LR{T}[]
new_flucseq = Bool[]
#and add covariates steps # of times, saving g fits
@info("Fitting final estimate of Q")
for step in 1:best_steps
# @debug("used_covars: $used_covars")
# @debug("unused_covars: $unused_covars")
gfit, new_fluc = add_covars!(fullchunk, searchstrategy, used_covars, unused_covars)
gfit, new_fluc = add_covar_chunk!(fullchunk, searchstrategy)
push!(gseq, gfit)
push!(new_flucseq, new_fluc)
@info(if length(gseq) == 1
Expand Down Expand Up @@ -332,7 +348,7 @@ function find_steps{T<:AbstractFloat}(qcvs::Vector{QCV{T}}; patience::Int=typema
@info("Choosing number of steps via cross validation")
while notdone
steps += 1
map(add_covars!, qcvs)
map(add_covar_qcv!, qcvs)
val_risk = mean(map(qcv -> qcv.chunk_val.risk, qcvs))
if val_risk < best_risk
best_risk = val_risk
Expand Down Expand Up @@ -363,29 +379,27 @@ Adds next covariate(s) to a Qchunk based on a search strategy
** Details **
If `used_covars` is empty (because this is the first time `add_covars!` has been called on this chunk,
If `used_covars` is empty (because this is the first time `add_covar_chunk!` has been called on this chunk,
a fluctuation based on an intercept only model is added to Q.
Otherwise, next covariates are added where the order is determined by `searchstrategy`
Returns most recent `gfit` object and a boolean which is `true` if the most recent covariates were added to a new fluctuation
or `false` if the previous fluctuation was updated with new covariates.
"""
function add_covars!(chunk::Qchunk, searchstrategy::SearchStrategy, used_covars, unused_covars)
if isempty(used_covars)
function add_covar_chunk!(chunk::Qchunk, searchstrategy::SearchStrategy)
if isempty(chunk.used_covars)
# @debug("Initial fluctuation")
push!(used_covars, 1)
delete!(unused_covars, 1)
gfit = lreg(chunk.W, chunk.A, subset=1:1)
push!(chunk.used_covars, 1)
delete!(chunk.unused_covars, 1)
gfit = give_me_a_g(chunk.gmachine, subset=1:1)
fluc = computefluc(chunk.q, chunk.param, bound!(predict(gfit, chunk.W), chunk.gbounds), chunk.A, chunk.Y)
fluctuate!(chunk.q, fluc)
chunk.risk = risk(chunk.q,chunk.A, chunk.Y, chunk.param, chunk.penalize)
return gfit, true
else
# @debug("Adding covariate(s)")
abc = add_covars!(searchstrategy, chunk.q, chunk.param, chunk.W, chunk.A, chunk.Y,
used_covars, unused_covars, chunk.risk, chunk.gbounds, chunk.penalize)
q_risk, gfit, new_fluc = abc
q_risk, gfit, new_fluc = add_covar!(searchstrategy, chunk)
chunk.risk=q_risk
return gfit, new_fluc
end
Expand All @@ -404,7 +418,7 @@ If `qcv` has no unused covariates, none are added. New covariates are first add
Returns updated `qcv`.
"""
function add_covars!(qcv::QCV)
function add_covar_qcv!(qcv::QCV)
if isempty(qcv.unused_covars)
@debug("No covariates to add")
return qcv
Expand All @@ -415,7 +429,7 @@ function add_covars!(qcv::QCV)
# update q_val, which depends on g fit on q_train, and fluc

#add next covar to training chunk and store gfit
gfit, new_fluc = add_covars!(train, qcv.searchstrategy, qcv.used_covars, qcv.unused_covars)
gfit, new_fluc = add_covar_chunk!(train, qcv.searchstrategy)
push!(qcv.gseq, gfit)

#add newest fluctuation to val chunk and compute risk
Expand Down
137 changes: 69 additions & 68 deletions src/strategies.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,42 +34,37 @@ function init!(strat::PreOrdered)
strat
end

# added covar(s) are added to used_covars and deleted from unused_covars
# added covar(s) are added to chunk.used_covars and deleted from chunk.unused_covars
# q is fluctuated
#return best_risk, gfit, new_fluc
function add_covars!{T<:AbstractFloat}(::ForwardStepwise,
q::Qmodel{T},
param::Parameter{T},
W::Matrix{T},
A::Vector{T},
Y::Vector{T},
used_covars::IntSet,
unused_covars::IntSet,
prev_risk::T,
gbounds::Vector{T},
penalize::Bool)
@debug("unused_covars: $(unused_covars)")
@debug("used_covars: $(used_covars)")

next_covar_risk = fill(Inf, maximum(unused_covars))
function add_covar!(::ForwardStepwise, chunk::Qchunk)
q=chunk.q
W=chunk.W
A=chunk.A
Y=chunk.Y
prev_risk=chunk.risk
@debug("unused_covars: $(chunk.unused_covars)")
@debug("used_covars: $(chunk.used_covars)")

next_covar_risk = fill(Inf, maximum(chunk.unused_covars))
g_fluc_dict = Dict{IntSet, Tuple{LR, Fluctuation}}()

#Remove most recent fluctuation (and save for later use)
#So that we can add one covar to the current fluctuation to see if it works
fluc_now = defluctuate!(q)

#try adding each covariate to the current fluctuation
for j in unused_covars
for j in chunk.unused_covars
#estimate g with additional covariate j
#println("Vars: $(union(used_covars, IntSet(j)))")
current_covars = union(used_covars, IntSet(j))
gfit = lreg(W, A, subset=collect(current_covars)) #sslreg(w, a, current_covars)
#println("Vars: $(union(chunk.used_covars, IntSet(j)))")
current_covars = union(chunk.used_covars, IntSet(j))
gfit = give_me_a_g(chunk.gmachine, subset=collect(current_covars))
#fluctuate and get risk
fluc = computefluc(q, param, bound!(predict(gfit, W), gbounds), A, Y)
fluc = computefluc(q, chunk.param, bound!(predict(gfit, W), chunk.gbounds), A, Y)
fluctuate!(q, fluc)
next_covar_risk[j] = risk(q, A, Y, param, penalize)
next_covar_risk[j] = risk(q, A, Y, chunk.param, chunk.penalize)
if isnan(next_covar_risk[j])
@warn("Risk is NaN. used covars: $used_covars")
@warn("Risk is NaN. used covars: $chunk.used_covars")
end
#remove newest fluctuation
defluctuate!(q)
Expand All @@ -81,116 +76,122 @@ function add_covars!{T<:AbstractFloat}(::ForwardStepwise,
@debug("Adding to old fluc, best_j:$best_j")
new_fluc = false
#adding one covariate to current fluc does better, then use that fluctuation
push!(used_covars, best_j)
@debug("used_covars after adding best_j: $used_covars")
delete!(unused_covars, best_j)
gfit, fluc = g_fluc_dict[used_covars]
push!(chunk.used_covars, best_j)
@debug("used_covars after adding best_j: $chunk.used_covars")
delete!(chunk.unused_covars, best_j)
gfit, fluc = g_fluc_dict[chunk.used_covars]
fluctuate!(q, fluc)
else
new_fluc = true
#otherwise, keep current fluctuation find best new fluctuation adding a
#single covariate
fluctuate!(q, fluc_now) #reuse previous best fluctuation
next_covar_risk = fill(Inf, maximum(unused_covars))
for j in unused_covars
current_covars = union(used_covars, IntSet(j))
next_covar_risk = fill(Inf, maximum(chunk.unused_covars))
for j in chunk.unused_covars
current_covars = union(chunk.used_covars, IntSet(j))
gfit = g_fluc_dict[current_covars][1]
fluc = computefluc(q, param, bound!(predict(gfit, W), gbounds), A, Y)
fluc = computefluc(q, chunk.param, bound!(predict(gfit, W), chunk.gbounds), A, Y)
fluctuate!(q, fluc)
next_covar_risk[j] = risk(q, A, Y, param, penalize)
next_covar_risk[j] = risk(q, A, Y, chunk.param, chunk.penalize)
if isnan(next_covar_risk[j])
@warn("Risk is NaN. used covars: $used_covars")
@warn("Risk is NaN. used covars: $chunk.used_covars")
end
defluctuate!(q)
g_fluc_dict[current_covars] = (gfit, fluc)
end
@debug("next_covar_risk: $next_covar_risk")
best_risk, best_j = findmin(next_covar_risk)
if !penalize && best_risk > prev_risk
if !chunk.penalize && best_risk > prev_risk
@warn("Best risk is worse than previous best risk. This doesn't make sense. Continuing anyway...")
end
@debug("best_j: $best_j, unused_covars: $unused_covars")
push!(used_covars, best_j)
delete!(unused_covars, best_j)
gfit, fluc = g_fluc_dict[used_covars]
@debug("best_j: $best_j, unused_covars: $chunk.unused_covars")
push!(chunk.used_covars, best_j)
delete!(chunk.unused_covars, best_j)
gfit, fluc = g_fluc_dict[chunk.used_covars]
fluctuate!(q, fluc)
end
best_risk, gfit, new_fluc
end

# added covar(s) are added to used_covars and deleted from unused_covars
# added covar(s) are added to chunk.used_covars and deleted from chunk.unused_covars
# q is fluctuated
#return best_risk, gfit, new_fluc
function add_covars!{T<:AbstractFloat}(strategy::PreOrdered,
q::Qmodel{T},
param::Parameter{T},
W::Matrix{T},
A::Vector{T},
Y::Vector{T},
used_covars::IntSet,
unused_covars::IntSet,
prev_risk::T,
gbounds::Vector{T},
penalize::Bool)
function add_covar!(strategy::PreOrdered, chunk::Qchunk)
q=chunk.q
W=chunk.W
A=chunk.A
Y=chunk.Y
prev_risk=chunk.risk

if isempty(strategy.covar_order)
append!(strategy.covar_order, order_covars(strategy.ordering, q, param, W, A, Y, unused_covars, gbounds, penalize))
append!(strategy.covar_order, order_covars(strategy.ordering, chunk))
end

ordered_unused_covars = filter(x -> x used_covars, strategy.covar_order)
ordered_unused_covars = filter(x -> x chunk.used_covars, strategy.covar_order)
next_covars = length(ordered_unused_covars) >= strategy.k_at_once ?
ordered_unused_covars[1:strategy.k_at_once] :
ordered_unused_covars


fluc_now = defluctuate!(q)

union!(used_covars, next_covars)
setdiff!(unused_covars, next_covars)
union!(chunk.used_covars, next_covars)
setdiff!(chunk.unused_covars, next_covars)

g_fit = lreg(W, A, subset=collect(used_covars))
gn1 = bound!(predict(g_fit, W), gbounds)
g_fit = give_me_a_g(chunk.gmachine, subset=collect(chunk.used_covars))
gn1 = bound!(predict(g_fit, W), chunk.gbounds)

fluc = computefluc(q, param, gn1, A, Y)
fluc = computefluc(q, chunk.param, gn1, A, Y)
fluctuate!(q, fluc)
new_risk = risk(q, A, Y, param, penalize)
new_risk = risk(q, A, Y, chunk.param, chunk.penalize)

if new_risk < prev_risk
return new_risk, g_fit, false
end

defluctuate!(q)
fluctuate!(q, fluc_now)
fluctuate!(q, param, gn1, A, Y)
return risk(q, A, Y, param, penalize), g_fit, true
fluctuate!(q, chunk.param, gn1, A, Y)
return risk(q, A, Y, chunk.param, chunk.penalize), g_fit, true
end

function order_covars(ordering::LogisticOrdering, q, param, W, A, Y, available_covars, gbounds, penalize)
function order_covars(ordering::LogisticOrdering, chunk)
q=chunk.q
W=chunk.W
A=chunk.A
Y=chunk.Y
available_covars=chunk.unused_covars
logitQnAA = linpred(q, A)
scores = Dict{Int, Float64}()
for i in available_covars
g_fit = lreg(W, A, subset=[i])
gn1 = bound!(predict(g_fit, W), gbounds)
fluc = computefluc(q, param, gn1, A, Y)
g_fit = give_me_a_g(chunk.gmachine, subset=[i])
gn1 = bound!(predict(g_fit, W), chunk.gbounds)
fluc = computefluc(q, chunk.param, gn1, A, Y)
fluctuate!(q, fluc)
scores[i] = risk(q, A, Y, param, penalize)
scores[i] = risk(q, A, Y, chunk.param, chunk.penalize)
defluctuate!(q)
end
sort!(collect(keys(scores)), by = x -> scores[x])
end

function order_covars(ordering::PartialCorrOrdering, q, param, W, A, Y, available_covars, gbounds, penalize)
"""
    order_covars(ordering::PartialCorrOrdering, chunk)

Rank the chunk's unused covariates, strongest first, by the absolute value
of `pcor` computed between each covariate and the residual
`Y - predict(q, A)`.
"""
function order_covars(ordering::PartialCorrOrdering, chunk)
    covars = collect(chunk.unused_covars)
    residual = chunk.Y .- predict(chunk.q, chunk.A)
    # Absolute partial-correlation magnitude for each candidate column.
    strength = abs(pcor(residual, chunk.W[:, covars], chunk.A))
    score_of = Dict(collect(zip(covars, strength)))
    return sort!(collect(keys(score_of)), by = j -> score_of[j], rev=true)
end

function order_covars(ordering::HDPSOrdering, q, param, W, A, Y, available_covars, gbounds, penalize)
"""
    order_covars(ordering::HDPSOrdering, chunk)

Covariate ordering via HDPS is not yet implemented; always throws an
`ErrorException`.
"""
function order_covars(ordering::HDPSOrdering, chunk)
    # Fail loudly with an informative message instead of a bare `error()`,
    # whose empty message gives the caller no clue what went wrong.
    error("order_covars is not implemented for HDPSOrdering")
end

function order_covars(ordering::ManualOrdering, q, param, W, A, Y, available_covars, gbounds, penalize)
function order_covars(ordering::ManualOrdering, chunk)
available_covars=chunk.unused_covars
collect(available_covars) == sort(ordering.indexes) ||
error("ManualOrdering indexes do not match available covariates")
ordering.indexes
Expand Down

0 comments on commit cce735f

Please sign in to comment.