-
Notifications
You must be signed in to change notification settings - Fork 5
/
imon2005.jl
107 lines (87 loc) · 3.01 KB
/
imon2005.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
module Imon2005
export imon2005
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, applyColumns
import ..OrdinaryLeastSquares: ols, predict, residuals, coef
import ..LTS: lts
"""
imon2005(setting)
Perform the Imon 2005 algorithm for a given regression setting.
# Arguments
- `setting::RegressionSetting`: A regression setting.
# Description
The algorithm estimates the GDFFITS diagnostic, which is an extension of well-known regression
diagnostic DFFITS. Unlikely, GDFFITS is used for detecting multiple outliers whereas the original
one was used for detecting single outliers.
# Output
- `["crit"]`: The critical value used
- `["gdffits"]`: Array of GDFFITS diagnostic calculated for observations
- `["outliers"]`: Array of indices of outliers.
- `["betas"]`: Vector of regression coefficients.
# Notes
The implementation uses LTS rather than LMS as suggested in the paper.
# References
A. H. M. Rahmatullah Imon (2005) Identifying multiple influential observations in linear regression,
Journal of Applied Statistics, 32:9, 929-946, DOI: 10.1080/02664760500163599
"""
function imon2005(setting::RegressionSetting)
X, y = @extractRegressionSetting setting
return imon2005(X, y)
end
function imon2005(X::AbstractMatrix{Float64}, y::AbstractVector{Float64})
function SigmaWithoutIndex(X, y, R, i)
n, p = size(X)
Ri = filter(x -> x != i, R)
XRi = X[Ri, :]
yRi = y[Ri]
reg = ols(XRi, yRi)
betas = coef(reg)
res = y .- X * betas
return sqrt(sum(res .^ 2.0) / (n - p))
end
n, p = size(X)
allindex = collect(1:n)
ltsreg = lts(X, y)
R::Array{Int,1} = ltsreg["hsubset"]
XR = X[R, :]
yR = y[R]
XRXR = transpose(XR) * XR
invXRXR = inv(XRXR)
wiiR = [X[i, :]' * invXRXR * X[i, :] for i in allindex]
wiiRAsterix = zeros(Float64, n)
for i in allindex
if i in R
wiiRAsterix[i] = wiiR[i] / (1.0 - wiiR[i])
else
wiiRAsterix[i] = wiiR[i] / (1.0 + wiiR[i])
end
end
RegR = ols(XR, yR)
BetaHatR = coef(RegR)
resR = y - X * BetaHatR
sigmaR = sum(resR .^ 2.0) / (n - p)
tAsterix = zeros(Float64, n)
for i in allindex
if i in R
tAsterix[i] = resR[i] / (SigmaWithoutIndex(X, y, R, i) * sqrt(1.0 - wiiR[i]))
else
tAsterix[i] = resR[i] / (SigmaWithoutIndex(X, y, R, i) * sqrt(1.0 + wiiR[i]))
end
end
GDFFITS = zeros(Float64, n)
for i in allindex
GDFFITS[i] = sqrt(wiiRAsterix[i]) * tAsterix[i]
end
crit = 3.0 * sqrt(p / length(R))
outlyingindex = filter(i -> abs(GDFFITS[i]) >= crit, allindex)
inlierindex = setdiff(1:n, outlyingindex)
cleanols = ols(X[inlierindex, :], y[inlierindex])
cleanbeta = coef(cleanols)
result::Dict{String,Any} = Dict{String, Any}()
result["crit"] = crit
result["gdffits"] = GDFFITS
result["outliers"] = outlyingindex
result["betas"] = cleanbeta
return result
end
end # end of module Imon2005