/
simulation-srpe.R
366 lines (309 loc) · 15.5 KB
/
simulation-srpe.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# this script is for simulating missing data
# and compare different methods of imputation
library(tidyverse) # for datawrangling
library(mice) # for imputation methods
# so we don't have to deal with scientific notations
# and strings aren't automaticcaly read as factors
options(scipen = 17,
stringsAsFactors = FALSE)
# note that the RPE data is per session (which there can be multiple of per day)
d_rpe_full = read_delim("norwegian_premier_league_football_rpe_anon.csv", delim = ",")
# remove missing
# select vars we need in the simulation, key variables we think are correlated with the level of sRPE
keyvars = c("p_id", "training_date", "mc_day", "week_nr")
d_srpe = d_rpe_full %>% filter(!is.na(rpe) & !is.na(duration)) %>% select(all_of(keyvars), rpe, duration) %>% mutate(srpe = rpe*duration)
# adding a variable for match
# under the assumption that this is very predictive of sRPE
d_srpe = d_srpe %>% mutate(match = ifelse(mc_day == "M", 1, 0))
#------------------------------------------Simulation
# we have all functions needed for performing the simulation in this section
# if you want to look at each step in the simulation, please start at the section Step 1
# All the functions we need are listed first
# logistic function
log_reg = function(tl_coef){
res = 1 / (1 + exp(-tl_coef))
res
}
# linear logistic regression function
# where the level of sRPE effects injury probability
inj_probability_srpe = function(srpe){
y = log_reg(-2 + 0.003*srpe)
y
}
# Create missing completely at random
add_mcar_rpe = function(d, missing_prop){
n_values = nrow(d)
d = d %>% rownames_to_column()
random_spots_rpe = sample(1:n_values, round(missing_prop*n_values))
random_spots_min = sample(1:n_values, round(missing_prop*n_values))
d = d %>% mutate(rpe = ifelse(rowname %in% random_spots_rpe, NA, rpe),
duration = ifelse(rowname %in% random_spots_min, NA, duration)) %>% dplyr::select(-rowname)
d
}
# Create missing with Missing probability based on other variables (Missing at Random)
# linear logistic regression function
# for the relationship between different variables and the probability of missing
mar_function = function(d, corr){
if(corr == "light"){
y = log_reg(-2 + (0.03*d$age) + (0.02*d$sex) + (0.3*d$freeday))
} else if(corr == "medium"){
y = log_reg(-2 + (0.08*d$age) + (0.04*d$sex) + (0.8*d$freeday))
} else if(corr == "strong"){
y = log_reg(-2 + (0.13*d$age) + (0.1*d$sex) + (1.8*d$freeday) + (1.8*d$match))
}
y
}
# adding missing at random to a dataset
add_mar_rpe = function(d, corr){
d = d %>% mutate(na_prop = mar_function(., corr),
na_spot_rpe = rbinom(length(na_prop), 1, prob = na_prop),
na_spot_duration = rbinom(length(na_prop), 1, prob = na_prop),
rpe = ifelse(na_spot_rpe == 1, NA, rpe),
duration = ifelse(na_spot_duration == 1, NA, duration))
d %>% dplyr::select(-starts_with("na"))
}
# impute mean function which calculates the mean by a chosen grouping variable
impute_mean = function(d, group_var){
group_var = enquo(group_var)
# calc mean
d_imp = d %>%
group_by(!!group_var) %>%
mutate(m_rpe = mean(rpe, na.rm = TRUE),
m_duration = mean(duration, na.rm = TRUE)) %>%
ungroup()
# impute with the mean
d_imp = d_imp %>% mutate(rpe = ifelse(is.na(rpe), m_rpe, rpe),
duration = ifelse(is.na(duration), m_duration, duration),
srpe = rpe*duration) %>%
dplyr::select(-m_rpe, -m_duration)
d_imp
}
# fit our models
fit_glm = function(d){
fit = glm(injury ~ srpe, family = "binomial", data = d)
fit
}
# function for obtaining parameters from any model fit
# specify method for a column with the model-type
# and whether its based on a mids object or not
get_params = function(fit, method, pool = FALSE){
if(pool){
d_params = summary(mice::pool(fit), "all", conf.int = TRUE) %>%
dplyr::select(term, std.error, estimate, CI_low = "2.5 %", CI_high = "97.5 %", p = p.value) %>%
as_tibble() %>%
mutate(term = as.character(term))
} else {
d_params = parameters::parameters(fit) %>% tibble() %>%
dplyr::select(term = Parameter, std.error = SE, estimate = Coefficient, CI_low, CI_high, p)
}
d_params = d_params %>% mutate(method = method)
d_params
}
# The final, helper function that performs all the imputations at once, given a dataset with missing
# then, it fits a logistic regression model on the data
# and outputs the needed model parameters for the validation of the imputation models
sim_impfit = function(d_missing, target_param, level = "NULL", rep = 1){
mar_vars = c("sex", "age", "freeday")
if(level == "light" | level == "medium"){
d_missing = d_missing %>% dplyr::select(-all_of(mar_vars))
} else if(level == "strong"){
d_missing = d_missing %>% dplyr::select(-all_of(mar_vars), -match)
}
#-----------------impute using all the different methods
# Mean imputation by the mean per player
d.mean.p_id = impute_mean(d_missing, p_id)
# Mean imputation by the mean per week
d.mean.week = impute_mean(d_missing, week_nr)
# Mulitple Imputation - Regression imputation
mids.reg = mice(d_missing, method = "norm.predict", seed = 1234, m = 1, print = FALSE)
imp.reg = mice::complete(mids.reg, "long", include = TRUE)
imp.reg$srpe = with(imp.reg, rpe*duration)
mids.reg = as.mids(imp.reg)
# Multiple imputation - predicted mean matching
mids.pmm = mice(d_missing, print = FALSE, seed = 1234)
imp.pmm = mice::complete(mids.pmm, "long", include = TRUE)
imp.pmm$srpe = with(imp.pmm, rpe*duration)
mids.itt.pmm = as.mids(imp.pmm)
# complete case analysis
d.cc = na.omit(d_missing) %>% mutate(srpe = rpe*duration)
# fit our models
fit.mean.p_id = fit_glm(d.mean.p_id)
fit.mean.week = fit_glm(d.mean.week)
fit.reg = with(mids.reg, glm(injury ~ srpe, family = binomial))
fit.pmm = with(mids.itt.pmm, glm(injury ~ srpe, family = binomial))
fit.cc = fit_glm(d.cc)
# fetch model parameters
tab1 = get_params(fit.mean.p_id, "Mean Imputation - Mean per player")
tab2 = get_params(fit.mean.week, "Mean Imputation - Mean per week")
tab3 = summary(fit.reg, conf.int = TRUE) %>%
dplyr::select(term, std.error, estimate, CI_low = "conf.low", CI_high = "conf.high", p = p.value) %>%
as_tibble() %>%
mutate(term = as.character(term),
method = "Regression Imputation")
tab4 = get_params(fit.pmm, "MI - PMM", pool = TRUE)
tab5 = get_params(fit.cc, "Complete Case Analysis")
d_fits = bind_rows(target_param, tab1, tab2, tab3, tab4, tab5)
d_fits = d_fits %>% mutate(rep = rep)
d_fits
}
# The first helper function outputs model fit estimated parameters
# we also want the imputed values as is
# function for adding the target srpe values
# to an imputed dataset
add_target_imp = function(d, imp_rows_pos, target, method){
d = d %>% rownames_to_column() %>%
mutate(imp_place = ifelse(rowname %in% imp_rows_pos, 1, 0),
target = target,
method = method) %>%
dplyr::select(method, imp_place, rpe, duration, srpe, target)
d
}
# helper function that imputes using all methods and outputs the
# data with the target srpe we wish to compare
# in theory, it would be more computationally efficient to have 1 function
# that imputes data, 1 function that runs the fits on data from the first function,
# and 1 function that outputs the imputed data from the first function.
# instead, we have 2 functions that both perform step 1, i.e., we are running it double.
# this is because more computationally power-efficient code
# would require more complicated programming. We would have to run fits on lists of data instead of
# a single dataset, and pooling might have to be manually implemented according to Ruben's rules.
# The simulation is, experienced from our derived-variable substudy, not that computationally heavy,
# and so I think this solution is fine, although it breaks the Do-not-Repeat-Yourself (DRY) Principle.
sim_imp = function(d_missing, target, level = "NULL", run = 1){
mar_vars = c("sex", "age", "freeday")
if(level == "light" | level == "medium"){
d_missing = d_missing %>% dplyr::select(-all_of(mar_vars))
} else if(level == "strong"){
d_missing = d_missing %>% dplyr::select(-all_of(mar_vars), -match)
}
# find which rows have missing and need imputation
imp_rows_pos = which(is.na(d_missing$rpe) | is.na(d_missing$duration))
#-----------------impute using all the different methods
# Mean imputation by the mean per player
d.mean.p_id = impute_mean(d_missing, p_id)
# Mean imputation by the mean per week
d.mean.week = impute_mean(d_missing, week_nr)
# Multiple Imputation - Regression imputation
mids.reg = mice(d_missing, method = "norm.predict", seed = 1234, m = 1, print = FALSE)
imp.reg = mice::complete(mids.reg, "long", include = TRUE)
imp.reg$srpe = with(imp.reg, rpe*duration)
mids.reg = as.mids(imp.reg)
# Multiple Imputation - Predicted Mean Matching
mids.pmm = mice(d_missing, print = FALSE, seed = 1234)
imp.pmm = mice::complete(mids.pmm, "long", include = TRUE)
imp.pmm$srpe = with(imp.pmm, rpe*duration)
mids.itt.pmm = as.mids(imp.pmm)
# complete case analysis
d.cc = na.omit(d_missing) %>% mutate(srpe = rpe*duration)
# add column of which row was imputed
d.mean.p_id = d.mean.p_id %>% add_target_imp(., imp_rows_pos, target = target, method = "Mean Imputation - Mean per player")
d.mean.week = d.mean.week %>% add_target_imp(., imp_rows_pos, target = target, method = "Mean Imputation - Mean per week")
# since multiple imputation has 5 datasets, the column is added to a list of datasets
d.reg = mice::complete(mids.reg, "all") %>%
map(. %>% add_target_imp(., imp_rows_pos, target = target, method = "Regression Imputation")) %>%
imap(., ~mutate(., dataset_n = .y)) %>% # a column for which dataset number, as multiple imputation imputes multiple
bind_rows() %>% filter(dataset_n != 0) # unimputed dataset is included in the ITT method, we remove this
d.pmm = mice::complete(mids.itt.pmm, "all") %>%
map(. %>% add_target_imp(., imp_rows_pos, target = target, method = "MI - PMM")) %>%
imap(., ~mutate(., dataset_n = .y)) %>% # a column for which dataset number, as multiple imputation imputes multiple
bind_rows() %>% filter(dataset_n != 0) # unimputed dataset is included in the ITT method, we remove this
# target column can't be added to complete case analysis, because they've been listwise deleted
d.cc = d.cc %>% mutate(method = "Complete Case Analysis") %>% dplyr::select(method, rpe, duration, srpe)
# combine to 1 dataset
d_imp = bind_rows(d.mean.p_id, d.mean.week, d.reg, d.pmm, d.cc) %>% tibble()
d_imp
}
###############Data Preparation
set.seed(1234)
# create fake injuries
d_srpe = d_srpe %>%
mutate(inj_prop = inj_probability_srpe(srpe),
injury = rbinom(length(inj_prop), 1, prob = inj_prop))
# calculate number of injuries
d_srpe %>% summarise(sum(injury))
# logistic regression for comparison
fit.target = glm(injury ~ srpe, family = "binomial", data = d_srpe)
target_param = get_params(fit.target, "No Imputation")
# remove variables we in theory wouldn't know about
# in a real life situation
d_exdata_srpe = d_srpe %>% dplyr::select(-inj_prop, -srpe)
# for missing at random, we create fake variables with correlation to the amount of missing
# we add fake age and sex, and we use the day of the week to determine weekend
# we'll use these variables to create Missing at Random
srpe_p_id = d_srpe %>% distinct(p_id)
srpe_id_base = srpe_p_id %>% mutate(age = sample(18:30, length(srpe_p_id$p_id), replace = TRUE),
sex = sample(0:1, length(srpe_p_id$p_id), replace = TRUE))
d_exdata_mar = d_exdata_srpe %>%
left_join(srpe_id_base, by = "p_id") %>%
mutate(freeday = ifelse(mc_day == "M+1" | mc_day == "M+2", 1, 0))
# fetch our original sRPE column, which is our original, true value, and we aim to target it
target_col = d_srpe$srpe
#####################For-loop simulation
# performing simulations with n runs
# the warnings are caused by collinearity between the variables
# which is expected
base_folder = "my\\future\\data\\folder\\"
folder_fits = paste0(base_folder, "srpe_fits\\")
folder_imps = paste0(base_folder, "srpe_imps\\")
# helper function for performing all needed simulations
# given a missing type (missing), either "mcar" or "mar"
# and amount of missing, a proportion for mcar or "light"/"medium"/"strong" for mar
# not that the datasets have to be ready beforehand, this is NOT a general function
sim_impute = function(missing, missing_amount, rep){
if(missing == "mcar"){
d_mcar = add_mcar_rpe(d_exdata_srpe, missing_amount)
d_sim_fits_mcar = sim_impfit(d_mcar, target_param = target_param, rep) %>% mutate(missing_type = missing, missing_amount = missing_amount)
saveRDS(d_sim_fits_mcar, file=paste0(folder_fits, rep,"_d_srpe_fits_", missing, "_", missing_amount,".rds"))
d_sim_imps_mcar = sim_imp(d_mcar, target = target_col, rep) %>% mutate(missing_type = missing, missing_amount = missing_amount)
saveRDS(d_sim_imps_mcar, file=paste0(folder_imps, rep,"_d_srpe_imps_", missing, "_", missing_amount,".rds"))
} else if(missing == "mar"){
d_mar = add_mar_rpe(d_exdata_mar, missing_amount)
d_sim_fits_mar = sim_impfit(d_mar, target_param = target_param, level = missing_amount, rep) %>% mutate(missing_type = missing, missing_amount = missing_amount)
saveRDS(d_sim_fits_mar, file=paste0(folder_fits, rep,"_d_srpe_fits_", missing, "_", missing_amount,".rds"))
d_sim_imps_mar = sim_imp(d_mar, target = target_col, level = missing_amount, rep) %>% mutate(missing_type = missing, missing_amount = missing_amount)
saveRDS(d_sim_imps_mar, file=paste0(folder_imps, rep,"_d_srpe_imps_", missing, "_", missing_amount,".rds"))
}
}
# vector of chosen missing proportions
# if we ever want to change it or add more proportions, easily done here.
missing_prop_mcar = c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
missing_prop_mar = c("light", "medium", "strong")
# for-loop for runing the simulation
# using just 1 core. See optional
# for-loop using multiple cores below
options(warn=-1)
set.seed(1234)
n_sim = 1900
for(i in 1:n_sim){
# walk will run the function for each missing proportion in the vector
# without attempting to spit out a list (in comparison to map(), which will create a list or die trying)
missing_prop_mcar %>% walk(~sim_impute("mcar", ., rep = i))
missing_prop_mar %>% walk(~sim_impute("mar", ., rep = i))
}
options(warn=0)
#---------------------Optional: using multiple cores
library(foreach) # for using multiple cores
library(doParallel) # for using the cores in parallel
numCores = 4 # number of cores you wish to use
n_sim = 1900
set.seed(1234)
registerDoParallel(numCores)
#foreach (i = 1:n_sim) %dopar% {
# library(tidyverse)
# library(mice)
# purrr::walk(missing_prop_mcar, ~sim_impute("mcar", ., rep = i))
# purrr::walk(missing_prop_mar, ~sim_impute("mar", ., rep = i))
#}
#------------------------------------------------to check if having position changes the results
folder_fits = paste0(base_folder, "srpe_fits_pos\\")
folder_imps = paste0(base_folder, "srpe_imps_pos\\")
options(warn=-1)
set.seed(1234)
n_sim = 1900
for(i in 1:n_sim){
# walk will run the function for each missing proportion in the vector
# without attempting to spit out a list (in comparison to map(), which will create a list or die trying)
missing_prop_mcar %>% walk(~sim_impute("mcar", ., rep = i))
missing_prop_mar %>% walk(~sim_impute("mar", ., rep = i))
}
options(warn=0)