% ridge_cross_validation.m
function [lambda_opt, cv_error_lambda] = ridge_cross_validation(y, X, lambda, K, cv_randomized, num_cores)
% Performing cross-validation on ridge regression data to determine the
% optimal value of parameter lambda for a single voxel. The function
% applies K-fold cross-validation, i.e. cross-validation with K
% subsets of elements, removing each subset in turn. Each time, we
% choose one subset as the validation set and the others as the
% training set.
%
% Inputs:
%
% y: n-by-1 vector of observed responses.
%
% X: n-by-p matrix of p predictors at n observations.
%
% Optional inputs:
%
% lambda: A vector of parameters for ridge regression, e.g.
% [0 1 10 100 1000 10^4 10^5 10^6]. Negative entries are clamped to 0.
% Default: [0 1 10 100 1000 10^4 10^5 10^6].
%
% K: The number of folds used in cross-validation. Each fold
% is treated as the validation set in turn. The data is split evenly
% into the folds by its data points. E.g. with K = 2 the two
% folds are the first half and the second half. Default: 2.
%
% cv_randomized: Signifies whether the order of the data points is
% randomized for cross-validation. For instance, they are not
% randomized when the data represents a time series, but randomization
% is more suitable when the data points represent different test
% subjects instead. Possible values: 0 and 1. By default, the data
% points are randomized.
%
% num_cores: The number of cores to be used for parallel processing.
% Default: 1 (non-parallel).
%
% Outputs:
%
% lambda_opt: the optimal value of lambda.
%
% cv_error_lambda: 1-by-m vector of mean cross-validation errors for
% the m values of lambda ([] when only one lambda is supplied).
%
% version 3.0, 2019-03-08; Jonatan Ropponen, Tomi Karjalainen

% Default values. An empty argument also falls back to the default.
if nargin < 3 || isempty(lambda)
    lambda = [0 1 10 100 1000 10^4 10^5 10^6];
end

n_lambda = length(lambda);

% Lambda must not be given negative values; clamp them to zero
% (vectorized, with a single warning message).
if any(lambda < 0)
    lambda(lambda < 0) = 0;
    disp('Lambda must be non-negative.');
end

if nargin < 4 || isempty(K)
    K = 2;
end

if nargin < 5 || isempty(cv_randomized)
    cv_randomized = 1;
end

% By default, parallel computing is not used.
if nargin < 6 || isempty(num_cores) || num_cores < 1
    num_cores = 1;
end

% If only a single value of lambda has been provided, we can simply
% return it.
if n_lambda == 1
    lambda_opt = lambda(1);
    cv_error_lambda = [];
    return;
end

number_of_indices = size(X, 1);

% Determining the validation folds. subset_size may be fractional;
% rounding the fold edges keeps the split as even as possible.
subset_size = number_of_indices / K;

% Randomizing the order of the data points for cross-validation if
% enabled.
if cv_randomized
    indices = randperm(number_of_indices);
else
    indices = 1:number_of_indices;
end

% Preallocate the cell array of per-fold index sets.
training_set_indices = cell(1, K);
for i = 1:K
    first_index = round((i-1) * subset_size) + 1;
    last_index = min(round(i * subset_size), number_of_indices);
    training_set_indices{i} = indices(first_index:last_index);
end

% Choosing the optimal lambda by cross-validation.
% Applying K-fold cross-validation, i.e. cross-validation with K
% subsets of elements, removing each subset in turn. We choose one
% subset as the validation set and the others as training sets.
%
% Preallocating the error matrix also makes it a proper sliced
% variable inside parfor.
cv_error_lambda_i = zeros(K, n_lambda);

if num_cores > 1
    [~, par_workers] = create_parpool(num_cores);
    parfor i = 1:K
        cv_error_lambda_i(i, :) = ridge_cv_error_calculation(y, X, lambda, training_set_indices{i});
    end
    if ~isempty(par_workers)
        delete(par_workers);
    end
else
    for i = 1:K
        cv_error_lambda_i(i, :) = ridge_cv_error_calculation(y, X, lambda, training_set_indices{i});
    end
end

% Determining the overall error for each value of lambda by averaging
% across folds (vectorized over the lambda dimension).
cv_error_lambda = mean(cv_error_lambda_i, 1);

[~, opt_idx] = min(cv_error_lambda);
lambda_opt = lambda(opt_idx(1));

end