/
mve.jl
180 lines (149 loc) · 6.1 KB
/
mve.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
module MVE
export mve
import DataFrames: DataFrame
import LinearAlgebra: diag, det
import Distributions: median, cov, mean, quantile, sample, Chisq
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, applyColumns
import ..Diagnostics: mahalanobisSquaredMatrix
function enlargesubset(initialsubset, data::DataFrame, dataMatrix::AbstractMatrix, h::Int)
n, _ = size(dataMatrix)
basicsubset = copy(initialsubset)
while length(basicsubset) < h
meanvector = applyColumns(mean, data[basicsubset, :])
covmatrix = cov(dataMatrix[basicsubset, :])
md2mat =
mahalanobisSquaredMatrix(data, meanvector = meanvector, covmatrix = covmatrix)
md2 = diag(md2mat)
md2sortedindex = sortperm(md2)
basicsubset = md2sortedindex[1:(length(basicsubset)+1)]
end
return basicsubset
end
function robcov(data::DataFrame; alpha = 0.01, estimator = :mve)
dataMatrix = Matrix(data)
n, p = size(dataMatrix)
chisquared = Chisq(p)
chisqcrit = quantile(chisquared, 1.0 - alpha)
c = sqrt(chisqcrit)
h = Int(floor((n + p + 1.0) / 2.0))
indices = collect(1:n)
k = p + 1
mingoal = Inf
bestinitialsubset = []
besthsubset = []
maxiter = minimum([p * 500, 3000])
initialsubset = []
hsubset = []
for iter = 1:maxiter
goal = Inf
try
initialsubset = sample(indices, k, replace = false)
hsubset = enlargesubset(initialsubset, data, dataMatrix, h)
covmatrix = cov(dataMatrix[hsubset, :])
if estimator == :mve
meanvector = applyColumns(mean, data[hsubset, :])
md2mat = mahalanobisSquaredMatrix(
data,
meanvector = meanvector,
covmatrix = covmatrix,
)
DJ = sqrt(sort(diag(md2mat))[h])
goal = (DJ / c)^p * det(covmatrix)
else
goal = det(covmatrix)
end
catch e
# Possibly singularity
end
if goal < mingoal
mingoal = goal
bestinitialsubset = initialsubset
besthsubset = hsubset
end
end
meanvector = applyColumns(mean, data[besthsubset, :])
covariancematrix = cov(dataMatrix[besthsubset, :])
md2 = diag(
mahalanobisSquaredMatrix(
data,
meanvector = meanvector,
covmatrix = covariancematrix,
),
)
outlierset = filter(x -> md2[x] > chisqcrit, 1:n)
result = Dict{String, Any}()
result["goal"] = mingoal
result["best.subset"] = sort(besthsubset)
result["robust.location"] = meanvector
result["robust.covariance"] = covariancematrix
result["squared.mahalanobis"] = md2
result["chisq.crit"] = chisqcrit
result["alpha"] = alpha
result["outliers"] = outlierset
return result
end
"""
mve(data; alpha = 0.01)
Performs the Minimum Volume Ellipsoid algorithm for a robust covariance matrix.
# Arguments
- `data::DataFrame`: Multivariate data.
- `alpha::Float64`: Probability for quantiles of Chi-Squared statistic.
# Description
`mve` searches for a robust location vector and a robust scale matrix, e.g covariance matrix.
The method also reports a usable diagnostic measure, Mahalanobis distances, which are calculated using
the robust counterparts instead of mean vector and usual covariance matrix. Mahalanobis distances
are directly comparible with quantiles of a ChiSquare Distribution with `p` degrees of freedom.
# Output
- `["goal"]`: Objective value
- `["best.subset"]`: Indices of best h-subset of observations
- `["robust.location"]`: Vector of robust location measures
- `["robust.covariance"]`: Robust covariance matrix
- `["squared.mahalanobis"]`: Array of Mahalanobis distances calculated using robust location and scale measures.
- `["chisq.crit"]`: Chisquare quantile used in threshold
- `["alpha"]`: Probability used in calculating the Chisquare quantile, e.g `chisq.crit`
- `["outliers"]`: Array of indices of outliers.
# References
Van Aelst, Stefan, and Peter Rousseeuw. "Minimum volume ellipsoid." Wiley
Interdisciplinary Reviews: Computational Statistics 1.1 (2009): 71-82.
"""
function mve(data::DataFrame; alpha = 0.01)
robcov(data, alpha = alpha, estimator = :mve)
end
function mve(data::AbstractMatrix{Float64}; alpha = 0.01)
return mve(DataFrame(data), alpha = alpha)
end
"""
mcd(data; alpha = 0.01)
Performs the Minimum Covariance Determinant algorithm for a robust covariance matrix.
# Arguments
- `data::DataFrame`: Multivariate data.
- `alpha::Float64`: Probability for quantiles of Chi-Squared statistic.
# Description
`mcd` searches for a robust location vector and a robust scale matrix, e.g covariance matrix.
The method also reports a usable diagnostic measure, Mahalanobis distances, which are calculated using
the robust counterparts instead of mean vector and usual covariance matrix. Mahalanobis distances
are directly comparible with quantiles of a ChiSquare Distribution with `p` degrees of freedom.
# Output
- `["goal"]`: Objective value
- `["best.subset"]`: Indices of best h-subset of observations
- `["robust.location"]`: Vector of robust location measures
- `["robust.covariance"]`: Robust covariance matrix
- `["squared.mahalanobis"]`: Array of Mahalanobis distances calculated using robust location and scale measures.
- `["chisq.crit"]`: Chisquare quantile used in threshold
- `["alpha"]`: Probability used in calculating the Chisquare quantile, e.g `chisq.crit`
- `["outliers"]`: Array of indices of outliers.
# Notes
Algorithm is implemented using concentration steps as described in the reference paper.
However, details about number of iterations are slightly different.
# References
Rousseeuw, Peter J., and Katrien Van Driessen. "A fast algorithm for the minimum covariance
determinant estimator." Technometrics 41.3 (1999): 212-223.
"""
function mcd(data::DataFrame; alpha = 0.01)
robcov(data, alpha = alpha, estimator = :mcd)
end
function mcd(data::AbstractMatrix{Float64}; alpha = 0.01)
return mcd(DataFrame(data), alpha = alpha)
end
end # end of module MVE