# System Info

In [1]:
# Print the Julia build, platform and environment this notebook was run with.
versioninfo()

Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, skylake)
Environment:
  JULIA_EDITOR = "/home/user/.vscode-server-insiders/bin/d487078dc7fc1c276657cadb61b4f63833a8df55/bin/code-insiders"
  JULIA_GPG = 3673DF529D9049477F76B37566E3C7DC03D6E495
  JULIA_PATH = /usr/local/julia
  JULIA_NUM_THREADS = 3
  JULIA_VERSION = 1.4.1


# Setup

In [2]:
# Move the working directory one level up from where the notebook was started.
cd("../")

In [3]:
# Set the COLUMNS and LINES environment variables to large values
# (presumably so wide/long tables are displayed without truncation — confirm).
ENV["COLUMNS"] = 1000
ENV["LINES"] = 1000
;

# Packages

In [4]:
using Interact

In [5]:
using Gadfly, DataFrames, StatsBase, Statistics, StatsBase, Distributions, LinearAlgebra

In [6]:
using Query

In [7]:
using Gen, GLM

In [8]:
include("../src/Utils.jl")
using .Utils

In [9]:
using RCall

R"""
library(rethinking)
"""

│ Loading required package: StanHeaders
│ Loading required package: ggplot2
│ rstan (Version 2.19.3, GitRev: 2e1f913d3ca3)
│ For execution on a local, multicore CPU with excess RAM we recommend calling
│ options(mc.cores = parallel::detectCores()).
│ To avoid recompilation of unchanged Stan programs, we recommend calling
│ rstan_options(auto_write = TRUE)
│ Loading required package: parallel
│ Loading required package: dagitty
│ rethinking (Version 2.01)
│ 
│ Attaching package: ‘rethinking’
│ 
│ The following object is masked from ‘package:stats’:
│ 
│     rstudent
│ 
└ @ RCall /home/user/.julia/packages/RCall/jOnwc/src/io.jl:160


RObject{StrSxp}
 [1] "rethinking"  "dagitty"     "parallel"    "rstan"       "ggplot2"    
 [6] "StanHeaders" "stats"       "graphics"    "grDevices"   "utils"      
[11] "datasets"    "methods"     "base"       


# Estimate Overfit

Build the dataset:

In [10]:
# Number of observations per dataset and number of predictor columns,
# passed to `build_dataset` as `obs` and `feats`.
N = 20
num_feats = 4

"""
    build_dataset(obs, feats)

Simulate a regression dataset with `obs` rows and `feats` correlated Gaussian predictors.

The predictor mean `μ` and the matrix `Σ` are drawn afresh on every call, so two calls
produce predictors with different distributions. The response depends only on the first
two predictors: `y = 0.15 * x1 - 0.4 * x2 + ε`, where `ε` is normal noise with standard
deviation 0.5.

# Arguments
- `obs`: number of rows to generate.
- `feats`: number of predictor columns; must be at least 2.

# Returns
A `DataFrame` with columns `y, x1, …, x<feats>`.

# Throws
- `ArgumentError` if `feats < 2`.
"""
function build_dataset(obs, feats)
    # y is built from columns 1 and 2 of X, so fewer than two features cannot work.
    feats >= 2 || throw(ArgumentError("feats must be at least 2, got $feats"))

    columns = vcat(:y, [Symbol("x", i) for i in 1:feats])

    # Random symmetric matrix with unit diagonal; multiplied by its adjoint below
    # to obtain the covariance handed to MvNormal.
    Σ = Symmetric(rand(Normal(0, 1), (feats, feats)))
    Σ[diagind(Σ)] .= 1

    μ = rand(Normal(0, 1), feats)

    # `rand` yields one draw per column; the adjoint puts one observation per row.
    X = rand(MvNormal(μ, Σ * Σ'), obs)'

    signal = 0.15 .* X[:, 1] - 0.4 .* X[:, 2]
    Y = rand.(Normal.(signal, 0.5))

    return DataFrame(hcat(Y, X), columns)
end

# Generate a training set and a held-out test set of the same size.
# Note: every `build_dataset` call draws its own μ and Σ, so the predictor
# distribution differs between the two sets (the y | x relationship is the same).
train_set = build_dataset(N, num_feats)
test_set = build_dataset(N, num_feats)

# Preview the first rows of the training data.
first(train_set, 5)

Unnamed: 0_level_0,y,x1,x2,x3,x4
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,-1.30314,1.4716,0.659078,0.408672,-0.888923
2,-1.62796,-0.459274,2.35637,-1.58003,0.376419
3,-0.388343,0.303939,3.15515,1.28453,-0.695851
4,0.673126,0.457454,0.723723,-0.713507,0.794058
5,0.240675,0.464993,-0.780679,-2.28458,1.41734


Build some linear models:

In [11]:
# Least-squares fit of y on x1 and x2; the `+ 0` term drops the intercept.
model1 = lm(@formula(y ~ x1 + x2 + 0), train_set)

# Print the training R² and display the fitted coefficient table.
@show r2(model1)
model1

r2(model1) = 0.43404744207452906


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

y ~ 0 + x1 + x2

Coefficients:
────────────────────────────────────────────────────────────────────
     Estimate  Std. Error   t value  Pr(>|t|)   Lower 95%  Upper 95%
────────────────────────────────────────────────────────────────────
x1   0.17522    0.103348    1.69545    0.1072  -0.0419049   0.392346
x2  -0.429164   0.0880478  -4.87422    0.0001  -0.614146   -0.244183
────────────────────────────────────────────────────────────────────

In [12]:
# Least-squares fit of y on all four predictors; the `+ 0` term drops the intercept.
model2 = lm(@formula(y ~ x1 + x2 + x3 + x4 + 0), train_set) 


# Print the training R² and display the fitted coefficient table.
@show r2(model2)
model2

r2(model2) = 0.5398388021559171


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

y ~ 0 + x1 + x2 + x3 + x4

Coefficients:
────────────────────────────────────────────────────────────────────
     Estimate  Std. Error   t value  Pr(>|t|)   Lower 95%  Upper 95%
────────────────────────────────────────────────────────────────────
x1   0.327723   0.163303    2.00684    0.0620  -0.0184637   0.67391
x2  -0.420157   0.0863643  -4.86494    0.0002  -0.603242   -0.237073
x3   0.23209    0.121717    1.9068     0.0747  -0.0259388   0.49012
x4   0.227684   0.172558    1.31946    0.2056  -0.138123    0.593492
────────────────────────────────────────────────────────────────────

Evaluate using RMSE:

In [13]:
"""
    rmse(ŷ, y)

Return the root-mean-square error between predictions `ŷ` and observed values `y`.
"""
function rmse(ŷ, y)
    squared_errors = (y - ŷ) .^ 2
    return sqrt(sum(squared_errors) / length(y))
end

# RMSE of each linear model on the data it was fitted to (insample)
# and on the held-out test set (outsample).
model1_insample = rmse(predict(model1, train_set), train_set.y)
model1_outsample = rmse(predict(model1, test_set), test_set.y)

model2_insample = rmse(predict(model2, train_set), train_set.y)
model2_outsample = rmse(predict(model2, test_set), test_set.y)

@show model1_insample
@show model1_outsample

@show model2_insample
@show model2_outsample
;

model1_insample = 0.5492533601758237
model1_outsample = 0.4006606250597501
model2_insample = 0.4952650781808787
model2_outsample = 0.8640722345579165


Build Gen models:

In [14]:
# Trace addresses of the coefficients sampled by each Gen model below.
params_model_1 = [:Β₁, :Β₂]
params_model_2 = [:Β₁, :Β₂, :Β₃, :Β₄]

# Training design matrices for the two models, plus the shared response vector.
X_model_1 = Matrix(train_set[:, [:x1, :x2]])
X_model_2 = Matrix(train_set[:, [:x1, :x2, :x3, :x4]])
Y = train_set.y

# Gen generative model: intercept-free linear regression on two predictors.
# Each row of `X` is destructured into (x1, x2), so `X` needs at least two columns
# and only the first two are used.
@gen function model_1(X::Matrix{Float64})
    # define the priors
    Β₁ = @trace(normal(0, 5), :Β₁)
    Β₂ = @trace(normal(0, 5), :Β₂)    
    
    # model the ys
    # every observation is traced at its own address (:y, i), with noise std 1
    for (i, (x1, x2)) in enumerate(eachrow(X))
        μ = Β₁ * x1 + Β₂ * x2
        @trace(normal(μ, 1), (:y, i))
    end
end

# Gen generative model: intercept-free linear regression on four predictors.
# Each row of `X` is destructured into (x1, x2, x3, x4), so `X` needs at least
# four columns and only the first four are used.
@gen function model_2(X::Matrix{Float64})
    # define the priors
    Β₁ = @trace(normal(0, 5), :Β₁)
    Β₂ = @trace(normal(0, 5), :Β₂)
    Β₃ = @trace(normal(0, 5), :Β₃)
    Β₄ = @trace(normal(0, 5), :Β₄)
    
    # model the ys
    # every observation is traced at its own address (:y, i)
    # NOTE(review): the noise std here is 5e-3, whereas model_1 uses 1 and the
    # data generator uses 0.5. With such a narrow likelihood the pointwise
    # densities become extreme, which is what the very large pwaic values for
    # this model reflect — confirm the mismatch with model_1 is intentional.
    for (i, (x1, x2, x3, x4)) in enumerate(eachrow(X))
        μ =  Β₁ * x1 + Β₂ * x2 +  Β₃ * x3 + Β₄ * x4
        @trace(normal(μ, 5e-3), (:y, i))
    end
end
;

Sample some parameters from the posterior:

In [15]:
# NOTE(review): `N` is reused here; earlier in the notebook it held the dataset size (20).
# Presumably N is the number of posterior samples and `budget` the per-sample
# inference budget — confirm against `get_posterior_samples` in Utils.
N = 1000
budget = 100

# Time the sampling run for model_1, then summarize the sampled parameters.
model_1_results = @time get_posterior_samples(model_1, N, budget, X_model_1, Y, params_model_1)
model_1_summary = summarize_posterior_samples(model_1_results, params_model_1)

  6.327124 seconds (21.22 M allocations: 1.011 GiB, 12.10% gc time)


Unnamed: 0_level_0,param,mean,std,5%,95%
Unnamed: 0_level_1,Any,Any,Any,Any,Any
1,Β₁,0.180137,0.546181,-0.654808,1.05582
2,Β₂,-0.408211,0.454506,-1.18321,0.302611


In [16]:
# NOTE(review): `N` is reused here; earlier in the notebook it held the dataset size (20).
# Presumably N is the number of posterior samples and `budget` the per-sample
# inference budget — confirm against `get_posterior_samples` in Utils.
# model_2 is run with a budget 40x larger than the one used for model_1.
N = 1000
budget = 4000


# Time the sampling run for model_2, then summarize the sampled parameters.
model_2_results =  @time get_posterior_samples(model_2, N, budget, X_model_2, Y, params_model_2)
model_2_summary = summarize_posterior_samples(model_2_results, params_model_2)

248.952231 seconds (864.40 M allocations: 39.538 GiB, 18.49% gc time)


Unnamed: 0_level_0,param,mean,std,5%,95%
Unnamed: 0_level_1,Any,Any,Any,Any,Any
1,Β₁,0.316819,0.794988,-0.990631,1.58617
2,Β₂,-0.429965,0.431638,-1.12669,0.243596
3,Β₃,0.243631,0.588763,-0.664512,1.18533
4,Β₄,0.227302,0.841729,-1.06648,1.55931


Evaluate the LPPD for each model:

In [17]:
# In-sample: score the training data under each model's sampled parameters,
# pass the log-probabilities through `lppd`, and add up the result.
lppd_model1_insample = sum(lppd(get_prediction_log_probs(
    DataFrame(model_1_results, params_model_1), X_model_1, Y, model_1)))

lppd_model2_insample = sum(lppd(get_prediction_log_probs(
    DataFrame(model_2_results, params_model_2), X_model_2, Y, model_2)))

# Out-of-sample: same computation, but against the held-out test set.
lppd_model1_outsample = sum(lppd(get_prediction_log_probs(
    DataFrame(model_1_results, params_model_1),
    Matrix(test_set[:, [:x1, :x2]]),
    test_set.y, model_1)))

lppd_model2_outsample = sum(lppd(get_prediction_log_probs(
    DataFrame(model_2_results, params_model_2),
    Matrix(test_set[:, [:x1, :x2, :x3, :x4]]),
    test_set.y, model_2)))

@show lppd_model1_insample
@show lppd_model1_outsample

@show lppd_model2_insample
@show lppd_model2_outsample
;

lppd_model1_insample = -25.741146571373086
lppd_model1_outsample = -34.53622284034154
lppd_model2_insample = -38.99546258578117
lppd_model2_outsample = -47.0870587574426


The pwaic of model 1 and model 2:

In [18]:
# Total of the `pwaic` values for model 1 on the training data.
sum(pwaic(get_prediction_log_probs(
    DataFrame(model_1_results, params_model_1), X_model_1, Y, model_1)))

22.101173572023608

In [19]:
# Total of the `pwaic` values for model 2 on the training data.
sum(pwaic(get_prediction_log_probs(
    DataFrame(model_2_results, params_model_2), X_model_2, Y, model_2)))

4.2017932261213745e10

The waic of model 1 and model 2:

In [20]:
# Total of the `waic` values for model 1 on the training data.
sum(waic(get_prediction_log_probs(
    DataFrame(model_1_results, params_model_1), X_model_1, Y, model_1)))

-47.8423201433967

In [21]:
# Total of the `waic` values for model 2 on the training data.
sum(waic(get_prediction_log_probs(
    DataFrame(model_2_results, params_model_2), X_model_2, Y, model_2)))

-4.2017932300209206e10

# WAIC penalty analysis

In [22]:
# Compute the training-set prediction log-probabilities once per model so the
# cells below can reuse them instead of recomputing.
logprobs_model_1 = get_prediction_log_probs(DataFrame(model_1_results, params_model_1), X_model_1, Y, model_1)
logprobs_model_2 = get_prediction_log_probs(DataFrame(model_2_results, params_model_2), X_model_2, Y, model_2)
;

In [23]:
# `lppd` and `pwaic` outputs for both models, one table column each.
data = [lppd(logprobs_model_1),
        lppd(logprobs_model_2),
        pwaic(logprobs_model_1),
        pwaic(logprobs_model_2)]

obs_analysis = DataFrame(data, [:lppd_m1, :lppd_m2, :pwaic_m1, :pwaic_m2])
round.(obs_analysis, digits=4)

Unnamed: 0_level_0,lppd_m1,lppd_m2,pwaic_m1,pwaic_m2
Unnamed: 0_level_1,Float64,Float64,Float64,Float64
1,-1.6788,-9.5736,1.3006,2200730000.0
2,-1.4209,-1.8329,1.3931,2328510000.0
3,-1.5708,-1.2151,3.6594,3674690000.0
4,-1.3285,-1.0764,0.1137,1031950000.0
5,-1.0315,-0.6662,0.0371,682540000.0
6,-1.568,-1.4671,3.3039,5913780000.0
7,-1.0827,-1.2665,0.0472,1287090000.0
8,-0.9874,-1.228,0.0059,1763950000.0
9,-1.1603,-1.3258,0.2054,264278000.0
10,-1.6095,-6.862,4.9958,4138910000.0


In [24]:
# One column per observation: take the count from the log-probability matrix
# instead of hard-coding 20, so this cell keeps working if the dataset size changes.
n_obs = size(logprobs_model_1, 2)
obs_names = [Symbol(string(i)) for i in 1:n_obs]

# Convert log-probabilities to densities and tag each table with its model.
df1 = DataFrame(exp.(logprobs_model_1), obs_names)
df1[!, :model] .= "1"

df2 = DataFrame(exp.(logprobs_model_2), obs_names)
df2[!, :model] .= "2"

# Reshape to long format: one row per (observation, density) pair, keeping the model tag.
df = vcat(df1, df2)
df = stack(df, 1:n_obs)
rename!(df, [:observation, :density, :model])

first(df, 5)

Unnamed: 0_level_0,observation,density,model
Unnamed: 0_level_1,Symbol,Float64,String
1,1,0.145,1
2,1,0.00959437,1
3,1,0.272811,1
4,1,0.123852,1
5,1,0.0520028,1


In [25]:
# Group the long-format densities by observation and model.
gdf = groupby(df, [:observation, :model])

"""
    f(xs)

Summarize `xs` as a named tuple holding its 0%, 25%, 50%, 75% and 100% quantiles
(fields named `0.0`, `0.25`, `0.5`, `0.75`, `1.0`) followed by `mean` and `var`.
"""
function f(xs)
    probs = [0.0, 0.25, 0.5, 0.75, 1.0]
    labels = Tuple(Symbol.(probs))
    quantiles = NamedTuple{labels}(Tuple(quantile(xs, probs)))
    return merge(quantiles, (mean = mean(xs), var = var(xs)))
end
    
# Apply `f` to the `density` column of every (observation, model) group
# and preview the resulting summary table.
dist = combine(:density => f, gdf)
first(dist, 6)

Unnamed: 0_level_0,observation,model,0.0,0.25,0.5,0.75,1.0,mean,var
Unnamed: 0_level_1,Symbol,String,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,1,0.000261347,0.0776812,0.174014,0.291009,0.398847,0.186606,0.0150654
2,1,2,0.0,0.0,0.0,0.0,0.0655192,6.95384e-05,4.30506e-06
3,2,1,1.9644e-07,0.129407,0.265949,0.358703,0.398942,0.241491,0.0162116
4,2,2,0.0,0.0,0.0,0.0,79.6887,0.159952,12.354
5,3,1,2.27198e-08,0.0697495,0.21692,0.347585,0.398942,0.207879,0.0196104
6,3,2,0.0,0.0,0.0,0.0,72.5894,0.296693,18.1376


In [26]:
# Default canvas size for the Gadfly plots that follow.
set_default_plot_size(20Gadfly.cm, 20Gadfly.cm)

# Interactive widget: page `i` shows observations 5(i-1)+1 through 5i,
# so i = 1:4 covers observations 1 through 20.
@manipulate for i = 1:4
    
    start_ob = 1 + (i - 1) * 5
    end_ob = 5 + (i - 1) * 5

    # `observation` is not an integer column, so each value is converted via
    # string -> Int before being compared with the page bounds.
    subset_df = df |>
    @filter(parse(Int, string(_.observation)) >= start_ob && parse(Int, string(_.observation)) <= end_ob) |>
    DataFrame

    # Box plot of density per observation, one panel per model with its own y-axis.
    plot(subset_df, 
            x = :observation, 
            ygroup = :model, 
            y = :density,
            Guide.title("Distribution of Pointwise Predictive Density"), 
            Geom.subplot_grid(Geom.boxplot, free_y_axis = true))
end