-
Notifications
You must be signed in to change notification settings - Fork 0
/
hypothesis_testing_using_data_exported_from_safe_haven.Rmd
225 lines (192 loc) · 8.93 KB
/
hypothesis_testing_using_data_exported_from_safe_haven.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
---
title: CHASe Hypothesis testing using data exported from Safe Haven
author: "Jan Savinc"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
toc: true
toc_float: true
code_folding: hide
editor_options:
chunk_output_type: console
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Libraries
```{r, warning=FALSE}
library(tidyverse)
library(knitr) # for showing tables inside rmarkdown
library(readxl) # read excel files
library(broom)
```
# Load data
```{r}
consort_data <-
read_csv(file = "../Safe Haven Exports/CONSORT_diagram_data.csv") %>%
mutate(
note = case_when(
note == "Excluded: cases with no hospital records prior to death before age 18" &
N > 1000 ~ "Excluded: controls with no hospital records prior to death before age 18",
TRUE ~ note
)
) # correction: instead of 'cases' it should be 'controls' at the entry for exclusions with no records prior to death or age 18 with N>1000
data_cohort_by_whether_they_had_records_prior_to_death <- read_excel(
path = "X:/R1369/CSO FULL grant/Safe Haven Exports/Descriptive summary of cohort & individuals with any records/Descriptive_summary_cohort_by_whether_they_had_any_records_prior_to_death.xlsx",
sheet = 1,
) %>%
slice(-13) # remove footnote entry in spreadsheet
data_descriptives_carstairs_simd <- read_excel(
path = "X:/R1369/CSO FULL grant/Safe Haven Exports/Exports 2020-10-07/Results_paper_1_tables/Descriptives_carstairs_and_simd.xlsx", sheet = 1
) %>% slice(-nrow(.)) # remove last row - it's the footnote
hypothesis_tests <- list() # empty list for storing hypothesis tests!
```
## Hypothesis: the proportion of males is higher in the suicide group with hospital records than in the control group with hospital records
```{r}
table_cohort_sex_has_any_records_prior_to_death <-
data_cohort_by_whether_they_had_records_prior_to_death %>%
select(case, sex, has_no_records_prior_to_death, N)
## print the table first so we see the data we're working with
table_cohort_sex_has_any_records_prior_to_death %>%
filter(sex!="both" & has_no_records_prior_to_death=="No") %>%
group_by(case) %>%
mutate(percent = scales::percent(N/sum(N), accuracy = 0.1)) %>%
ungroup %>%
kable(caption = "Number of individuals with records prior to death by sex & case/control.")
## the proportions look very similar
table_cohort_sex_has_any_records_prior_to_death %>%
filter(sex!="both" & has_no_records_prior_to_death=="No") %>%
uncount(N) %>%
{table(.$case, .$sex)} %>%
chisq.test()
```
## Hypothesis: the average age of death is different for suicide cases with no prior records to those with prior records
```{r}
mean_cases_with_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="No") %>% .$mean_age
mean_cases_without_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="Yes") %>% .$mean_age
sd_cases_with_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="No") %>% .$sd_age
sd_cases_without_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="Yes") %>% .$sd_age
n_cases_with_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="No") %>% .$N
n_cases_without_records <- data_cohort_by_whether_they_had_records_prior_to_death %>% filter(case=="case" & sex=="both" & has_no_records_prior_to_death=="Yes") %>% .$N
## convert table to a format of one entry per row
age_at_death_tbl <-
data_cohort_by_whether_they_had_records_prior_to_death %>%
filter(!is.na(mean_age)) %>%
select(case, sex, has_no_records_prior_to_death, N, mean_age, sd_age) %>%
mutate(has_no_records_prior_to_death = if_else(has_no_records_prior_to_death=="Yes", "no_records", "had_records")) %>%
pivot_wider(values_from = c(N, mean_age, sd_age), names_from = has_no_records_prior_to_death)
## t test using statistics rather than raw data
hypothesis_tests$t_test_age_difference_cases_with_and_without_records <-
age_at_death_tbl %>%
group_by(case, sex) %>%
summarise(
tidy(BSDA::tsum.test(
mean.x = mean_age_had_records,
s.x = sd_age_had_records,
n.x = N_had_records,
mean.y = mean_age_no_records,
s.y = sd_age_no_records,
n.y = N_no_records,
var.equal = FALSE
)),
.groups = "drop"
) %>%
mutate(
difference = estimate1 - estimate2,
meaning = "1 = had records, 2 = no records"
) %>%
relocate(difference, .before=statistic)
hypothesis_tests$t_test_age_difference_cases_with_and_without_records %>%
kable(caption = "t-test of age difference between cases with records and cases without records", format.args = list(nsmall=4))
```
## Hypotheses: gender differences between characteristics of cases & controls, separately for individuals with records or no records
```{r}
# quantities_to_test_with_t_test <- c("age","carstairs","urban_rural")
# quantities_to_test_with_chisq_test <- c("proportion_not_in_work")
long_data_cohort_by_whether_they_had_records_prior_to_death_for_t_tests <-
bind_rows(
## AGE
data_cohort_by_whether_they_had_records_prior_to_death %>% # this uses deciles for carstairs so we'll take the newer quintile carstairs data from different table
# filter(sex != "both") %>%
mutate(n_age = N, n_carstairs = N, n_urban_rural = N) %>%
select(-matches("median|iqr|q1|q3|proportion"), -N) %>%
pivot_longer(cols = matches("age|carstairs|urban_rural"), names_to = c("statistic","variable"), values_to = "value", names_pattern = "(mean|sd|n)_(.*)") %>%
filter(variable=="age" & case=="case") # only keep age, which only makes sense for cases
,
## CARSTAIRS & SIMD
data_descriptives_carstairs_simd %>%
# filter(measure=="quintile") %>%
mutate(variable=paste0(tolower(geographic_indicator),"_",measure)) %>%
rename_all(~tolower(.)) %>%
select(-matches("median|iqr|q1|q3|proportion|geographic_indicator|^measure")) %>%
pivot_longer(cols=c(n,mean,sd), names_to="statistic",values_to="value")
)
hypothesis_tests$t_tests_sex_differences <-
long_data_cohort_by_whether_they_had_records_prior_to_death_for_t_tests %>%
filter(sex!="both") %>%
pivot_wider(names_from = c("sex","statistic"), values_from = "value") %>%
group_by(case, has_no_records_prior_to_death, variable) %>%
summarise(
BSDA::tsum.test(
mean.x = female_mean,
s.x = female_sd,
n.x = female_n,
mean.y = male_mean,
s.y = male_sd,
n.y = male_n,
var.equal = FALSE
) %>% tidy,
difference = estimate1-estimate2,
meaning = "female - male (1=female)",
.groups = "drop"
) %>%
relocate(meaning, difference, .after = variable)
hypothesis_tests$t_tests_cohort_differences <-
long_data_cohort_by_whether_they_had_records_prior_to_death_for_t_tests %>%
filter(variable!="age") %>% # can't compare age, not sensible for controls
filter(has_no_records_prior_to_death != "Both") %>% # without splitting by had records prior to death, all descriptives are the same! (cases matched to control on geography)
pivot_wider(names_from = c("case","statistic"), values_from = "value") %>%
group_by(sex, has_no_records_prior_to_death, variable) %>%
summarise(
BSDA::tsum.test(
mean.x = case_mean,
s.x = case_sd,
n.x = case_n,
mean.y = control_mean,
s.y = control_sd,
n.y = control_n,
var.equal = FALSE
) %>% tidy,
difference = estimate1-estimate2,
meaning = "case - control (1=case)",
.groups = "drop"
) %>%
relocate(meaning, difference, .after = variable)
## Proportion tests of not in work
hypothesis_tests$prop_tests_sex_differences <-
data_cohort_by_whether_they_had_records_prior_to_death %>%
filter(sex != "both") %>%
filter(!is.na(proportion_not_in_work)) %>%
select(case, sex, has_no_records_prior_to_death, N, proportion_not_in_work) %>%
mutate(n_success = round(N * proportion_not_in_work), n_fail = N-n_success) %>%
select(-N, -proportion_not_in_work) %>%
pivot_longer(cols=c(n_success,n_fail), names_to="outcome", values_to="n", names_pattern="n_(.*)") %>%
uncount(n) %>%
group_by(case, sex, has_no_records_prior_to_death) %>%
summarise(
prop.test(table(outcome)) %>% tidy,
.groups = "drop"
)
```
### Display results
```{r}
hypothesis_tests$t_tests_sex_differences %>%
filter(p.value < .05) %>%
kable(caption = "t-tests of sex difference at p < .05", format.args = list(nsmall=4))
hypothesis_tests$t_tests_cohort_differences %>%
filter(p.value < .05) %>%
kable(caption = "t-tests of cohorty (case - control) difference at p < .05", format.args = list(nsmall=4))
hypothesis_tests$prop_tests_sex_differences %>%
filter(p.value < .05) %>%
kable(caption = "Fisher's z-tests of sex difference at p < .05", format.args = list(nsmall=4))
```