/
binned_resamplings.jl
220 lines (168 loc) · 7.18 KB
/
binned_resamplings.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import ..UncertainValues: UncertainScalarPopulation, UncertainScalarKDE
"""
RawValues
Indicates that instead of summarising each bin, vectors of raw values should
be returned for a binned resampling.
"""
struct RawValues end
# The admissible bin representations: the two uncertain value types, plus
# `RawValues` for returning the unsummarised values of each bin.
const BINREPR = Union{
    UncertainScalarKDE,
    UncertainScalarPopulation,
    RawValues
}
"""
BinnedResampling(left_bin_edges, n::Int; bin_repr = UncertainScalarKDE)
BinnedResampling(UncertainScalarKDE, left_bin_edges, n::Int)
BinnedResampling(UncertainScalarPopulation, left_bin_edges, n::Int)
BinnedResampling(RawValues, left_bin_edges, n::Int)
Indicates that binned resampling should be performed.
## Fields
- `left_bin_edges`. The left edgepoints of the bins. Either a range or some
custom type which implements `minimum` and `step` methods.
- `n`. The number of draws. Each point in the dataset is sampled `n` times.
If there are `m` points in the dataset, then the total number of draws
is `n*m`.
- `bin_repr`. A type of uncertain value indicating how each bin should be
summarised (`UncertainScalarKDE` for kernel density estimated distributions
in each bin, `UncertainScalarPopulation` to represent values in each bin
as an equiprobable population) or not summarise but return raw values
falling in each bin (`RawValues`).
## Examples
```julia
using UncertainData
# Resample on a grid from 0 to 200 in steps of 20
grid = 0:20:200
# The number of samples per point in the dataset
n_draws = 10000
# Create the resampling scheme. Use kernel density estimates to distribution
# in each bin.
resampling = BinnedResampling(grid, n_draws, bin_repr = UncertainScalarKDE)
# Represent each bin as an equiprobably population
resampling = BinnedResampling(grid, n_draws, bin_repr = UncertainScalarPopulation)
# Keep raw values for each bin (essentially the same as UncertainScalarPopulation,
# but avoids storing an additional vector of weights for the population members).
resampling = BinnedResampling(grid, n_draws, bin_repr = RawValues)
```
"""
Base.@kwdef struct BinnedResampling{R, B} <: AbstractBinnedUncertainValueResampling where {R <: BINREPR, B}
bin_repr::Type{R} = UncertainScalarKDE
left_bin_edges::B
n::Int
end
# Convenience constructor: the bin representation is selected with the
# `bin_repr` keyword and forwarded as the first positional argument.
function BinnedResampling(left_bin_edges, n::Int; bin_repr = UncertainScalarKDE)
    return BinnedResampling(bin_repr, left_bin_edges, n)
end
# One-line summary: type name, bin representation type, the type of the bin
# edges, and the number of draws.
#
# `print` is used rather than `println`: `show` must not emit a trailing
# newline, otherwise string interpolation and display inside containers pick up
# a stray line break. `nameof` yields the bare type name; interpolating the
# `name` field prints a `Core.TypeName`, which recent Julia versions render
# wrapped as `typename(...)`.
function Base.show(io::IO, b::BinnedResampling{R, B}) where {R, B}
    T = typeof(b)
    print(io, "$(nameof(T)){bin_repr: $R, left_bin_edges: $B, n=$(b.n)}")
    return nothing
end
"""
BinnedWeightedResampling(left_bin_edges, weights, n::Int; bin_repr = UncertainScalarKDE)
BinnedWeightedResampling(UncertainScalarKDE, left_bin_edges, weights, n::Int)
BinnedWeightedResampling(UncertainScalarPopulation, left_bin_edges, weights, n::Int)
BinnedWeightedResampling(RawValues, left_bin_edges, weights, n::Int)
Indicates that binned resampling should be performed, but weighting each
point in the dataset differently.
## Fields
- `left_bin_edges`. The left edgepoints of the bins. Either a range or some
custom type which implements `minimum` and `step` methods.
- `weights`. The relative probability weights assigned to each point.
- `n`. The total number of draws. These are distributed among the
points of the dataset according to `weights`.
- `bin_repr`. A type of uncertain value indicating how each bin should be
summarised (`UncertainScalarKDE` for kernel density estimated distributions
in each bin, `UncertainScalarPopulation` to represent values in each bin
as an equiprobable population) or not summarise but return raw values
falling in each bin (`RawValues`).
## Examples
```julia
using UncertainData, StatsBase
# Resample on a grid from 0 to 200 in steps of 20
grid = 0:20:200
# Assume our dataset has 50 points. We'll assign random weights to them.
wts = Weights(rand(50))
# The total number of draws (on average 1000000/50 = 20000 draws per point
# if weights are equal)
n_draws = 10000000
# Create the resampling scheme. Use kernel density estimates to distribution
# in each bin.
resampling = BinnedWeightedResampling(grid, wts, n_draws, bin_repr = UncertainScalarKDE)
# Represent each bin as an equiprobably population
resampling = BinnedWeightedResampling(grid, wts, n_draws, bin_repr = UncertainScalarPopulation)
# Keep raw values for each bin (essentially the same as UncertainScalarPopulation,
# but avoids storing an additional vector of weights for the population members).
resampling = BinnedWeightedResampling(grid, wts n_draws, bin_repr = RawValues)
```
"""
Base.@kwdef struct BinnedWeightedResampling{R, B, W} <: AbstractBinnedUncertainValueResampling where {R <: BINREPR, B, W}
bin_repr::Type{R} = UncertainScalarKDE
left_bin_edges::B
weights::W
n::Int
end
# Convenience constructor: the bin representation is selected with the
# `bin_repr` keyword and forwarded as the first positional argument.
function BinnedWeightedResampling(
    left_bin_edges, weights, n::Int; bin_repr = UncertainScalarKDE
)
    return BinnedWeightedResampling(bin_repr, left_bin_edges, weights, n)
end
# One-line summary: type name, bin representation type, the types of the bin
# edges and of the weights, and the number of draws.
#
# `print` is used rather than `println`: `show` must not emit a trailing
# newline, otherwise string interpolation and display inside containers pick up
# a stray line break. `nameof` yields the bare type name; interpolating the
# `name` field prints a `Core.TypeName`, which recent Julia versions render
# wrapped as `typename(...)`.
function Base.show(io::IO, b::BinnedWeightedResampling{R, B, W}) where {R, B, W}
    T = typeof(b)
    print(io, "$(nameof(T)){bin_repr: $R, left_bin_edges: $B, weights: $W, n=$(b.n)}")
    return nothing
end
"""
BinnedMeanResampling
Binned resampling where each bin is summarised using
the mean of all draws falling in that bin.
## Fields
- `left_bin_edges`. The left edgepoints of the bins. Either a range or some
custom type which implements `minimum` and `step` methods.
- `n`. The number of draws. Each point in the dataset is sampled `n` times.
If there are `m` points in the dataset, then the total number of draws
is `n*m`.
## Examples
```julia
using UncertainData
# Resample on a grid from 0 to 200 in steps of 20
grid = 0:20:200
# The number of samples per point in the dataset
n_draws = 10000
# Create the resampling scheme
resampling = BinnedMeanResampling(grid, n_draws)
```
"""
struct BinnedMeanResampling{B} <: AbstractBinnedSummarisedResampling
left_bin_edges::B
n::Int
end
# One-line summary: type name, the bin edges themselves, and the number of
# draws.
#
# `print` is used rather than `println`: `show` must not emit a trailing
# newline, otherwise string interpolation and display inside containers pick up
# a stray line break. `nameof` yields the bare type name; interpolating the
# `name` field prints a `Core.TypeName`, which recent Julia versions render
# wrapped as `typename(...)`.
function Base.show(io::IO, b::BinnedMeanResampling)
    T = typeof(b)
    print(io, "$(nameof(T)){left_bin_edges=$(b.left_bin_edges), n=$(b.n)}")
    return nothing
end
"""
BinnedMeanWeightedResampling
Binned resampling where each bin is summarised using the mean of all draws
falling in that bin. Points in the dataset are sampled with probabilities
according to `weights`.
## Fields
- `left_bin_edges`. The left edgepoints of the bins. Either a range or some
custom type which implements `minimum` and `step` methods.
- `weights`. The relative probability weights assigned to each point.
- `n`. The total number of draws. These are distributed among the
points of the dataset according to `weights`.
## Examples
```julia
using UncertainData, StatsBase
# Resample on a grid from 0 to 200 in steps of 20
grid = 0:20:200
# Assume our dataset has 50 points. We'll assign random weights to them.
wts = Weights(rand(50))
# The total number of draws (on average 1000000/50 = 20000 draws per point
# if weights are equal)
n_draws = 10000000
# Create the resampling scheme
resampling = BinnedMeanWeightedResampling(grid, wts, n_draws)
```
"""
struct BinnedMeanWeightedResampling{B} <: AbstractBinnedSummarisedResampling
left_bin_edges::B
weights
n::Int
end
# Public API: the resampling schemes and the raw-value bin representation marker.
export BinnedResampling, BinnedWeightedResampling
export BinnedMeanResampling, BinnedMeanWeightedResampling
export RawValues