-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_smpl_demog_geocdng.Rmd
97 lines (80 loc) · 3.48 KB
/
01_smpl_demog_geocdng.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
title: "01_smpl_demog_geocdng"
author: "Puvvula"
date: "2024-04-15"
output: pdf_document
---
# Read participant demographics and delivery-date data
```{r}
# Build the participant-level table: link each sample to an MRN, attach
# delivery-date variables, and save the result as pt_demog_final.csv.
# `data` is used as a path prefix for every file here and must be defined
# before this chunk runs.

# MRN <-> participant_id link file.
# MRNs starting with "peach" are kept verbatim; every other MRN is left-padded
# with zeros to 9 characters. pt_id is forced to character so it can be joined
# against the character pt_id derived below.
mrn_link <- read_csv(paste0(data, "mrn_link.csv")) |>
  mutate(
    MRN = ifelse(
      str_detect(MRN, "^peach"),
      MRN,
      str_pad(MRN, width = 9, pad = "0")
    ),
    pt_id = as.character(pt_id)
  )

# Patient demographics file.
# pt_id is the first run of digits in CLIENT_SAMPLE_ID. Samples whose ID starts
# with "PEACH" get a "peach_" prefix; leading zeros are then stripped (a
# "peach_"-prefixed id does not start with a zero, so that step leaves it
# unchanged).
pt_demog <- read_csv(paste0(data, "pt_demograph.csv")) |>
  mutate(
    pt_id = str_extract(CLIENT_SAMPLE_ID, "\\d+"),
    pt_id = ifelse(
      str_detect(CLIENT_SAMPLE_ID, "^PEACH"),
      paste0("peach_", pt_id),
      pt_id
    ),
    pt_id = ifelse(!is.na(pt_id), str_replace(pt_id, "^0+", ""), NA)
  )

pt_demog_w_mrn <- left_join(pt_demog, mrn_link, by = "pt_id")

# Delivery date and additional variables.
# The lmp_date column shipped in the file is dropped and recomputed as
# delivery_date minus gestational age in days.
pt_date <- read_csv(paste0(data, "pt_date_info.csv")) |>
  clean_names() |>
  select(-c(study, lmp_date, height_inch, pp_weight_lb)) |>
  mutate(
    # read_csv() guesses a numeric type for an all-digit id column, and a
    # numeric key cannot be joined to the character pt_id built above.
    pt_id = as.character(pt_id),
    delivery_date = mdy(delivery_date),
    # presumably ga_at_delivery is in weeks (it is multiplied by 7) -- confirm
    ga_days = round(as.numeric(ga_at_delivery * 7)),
    lmp_date = delivery_date - days(ga_days)
  )

# Join dates with the other participant variables and persist the result.
pt_final <- left_join(pt_demog_w_mrn, pt_date, by = "pt_id")
write_csv(pt_final, paste0(data, "pt_demog_final.csv"))
```
# Address geocoding
```{r}
# Geocode participant addresses and export the coordinates.
# Requires `data` and `analy_dat` (path prefixes) and `pt_demog_w_mrn` (built
# in the participant-demographics chunk) to exist in the session.

# Offset added to numeric MRNs while the table is being geocoded and removed
# again before export.
mrn_offset <- 2255612

# Shift all-digit MRNs by `offset` and return them as character.
# Any other value (e.g. an MRN starting with "peach", or NA) is returned
# unchanged instead of being coerced to NA by as.numeric().
# sprintf("%.0f") is used rather than as.character() so that values such as
# 100000 are never written in scientific notation.
shift_mrn <- function(mrn, offset) {
  if (is.numeric(mrn)) {
    mrn <- ifelse(is.na(mrn), NA_character_, sprintf("%.0f", mrn))
  }
  is_numeric_id <- !is.na(mrn) & str_detect(mrn, "^\\d+$")
  mrn[is_numeric_id] <- sprintf(
    "%.0f",
    as.numeric(mrn[is_numeric_id]) + offset
  )
  mrn
}

# Address file for geocoding.
address <- read_csv(paste0(data, "pt_address.csv")) |>
  select(-c(REC_CREATE_DATE)) |>
  mutate(
    # Drop the first "apt"/"unit" token (case-insensitive) and the word after
    # it from the street line.
    add_line_1 = str_replace(add_line_1, "(?i)\\b(?:apt|unit)\\b\\s*\\w*", ""),
    # Keep only a standalone 5-digit ZIP code.
    zip = str_extract(zip, "\\b\\d{5}\\b")
  ) |>
  # Rows from CRIB_rita / PEACH_Aimin are kept only when their MRN appears in
  # pt_demog_w_mrn; rows from any other study_id are always kept.
  filter(!(study_id == "CRIB_rita" | study_id == "PEACH_Aimin") |
    (study_id == "CRIB_rita" | study_id == "PEACH_Aimin") &
      (MRN %in% pt_demog_w_mrn$MRN)) |>
  mutate(MRN = shift_mrn(MRN, mrn_offset)) |>
  mutate(
    # Full state names -> postal abbreviations.
    state = fct_recode(state,
      "CA" = "California",
      "DE" = "Delaware",
      "MD" = "Maryland",
      "NJ" = "New Jersey",
      "PA" = "Pennsylvania"
    ),
    city = str_to_title(city),
    # Single-line address string handed to the geocoder.
    addr = paste(add_line_1, city, state, zip, sep = ", ")
  )

# Look up coordinates for each address string with the 'osm' method.
geocoded_addresses <- address |>
  geocode(addr, method = 'osm', lat = latitude, long = longitude)

# Geocoded address export: undo the MRN offset and drop the rows whose
# study_id is "Dummy_mixing" (they are geocoded above but not exported).
geocoded_addresses_exp <- geocoded_addresses |>
  mutate(MRN = shift_mrn(MRN, -mrn_offset)) |>
  filter(study_id != "Dummy_mixing")

write_csv(geocoded_addresses_exp, paste0(analy_dat, "geocoded_addrs.csv"))
```
# Join geocoded info with participant final data
```{r}
# Join the geocoded addresses onto the participant table and write the
# combined file used for exposure assessment.
# Both inputs are read back from disk, so this chunk does not depend on
# objects left in the session by earlier chunks.

# geocoded_addrs.csv is written to `analy_dat` by the geocoding chunk, so it is
# read from there. MRNs are re-padded to 9 characters ("peach" MRNs are left
# alone) to match the MRN format used in the participant table.
pt_geocoded <- read_csv(paste0(analy_dat, "geocoded_addrs.csv")) |>
  mutate(
    MRN = ifelse(
      str_detect(MRN, "^peach"),
      MRN,
      str_pad(MRN, width = 9, pad = "0")
    )
  )

# Participant table saved by the demographics chunk. The same MRN padding is
# applied in case read_csv() parsed the column as a number and dropped the
# leading zeros.
pt_demog_final <- read_csv(paste0(data, "pt_demog_final.csv")) |>
  mutate(
    MRN = ifelse(
      str_detect(MRN, "^peach"),
      MRN,
      str_pad(MRN, width = 9, pad = "0")
    )
  )

# Join all data for exposure assessment.
df_fin <- left_join(pt_demog_final, pt_geocoded, by = "MRN")
write_csv(df_fin, paste0(analy_dat, "df_for_exposures.csv"))
```
# Mapping geocoded addresses for context
```{r}
# Plot the geocoded addresses over county outlines and save the map as a PDF.
# Rows without a ZIP code and rows located in CA are left off the map.
geocoded_df <- read_csv(paste0(analy_dat, "geocoded_addrs.csv")) |>
  filter(!is.na(zip), state != "CA")

# One point per address, coloured by study; the legend keys are drawn as large
# squares so the colours stay readable next to the size-1 points.
ggplot(data = geocoded_df, mapping = aes(x = longitude, y = latitude)) +
  borders(
    "county",
    regions = c("Pennsylvania", "New Jersey", "Delaware", "Maryland")
  ) +
  geom_point(mapping = aes(color = factor(study_id)), size = 1) +
  ggspatial::annotation_scale(plot_unit = "km", location = "br") +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.justification = "center"
  ) +
  guides(color = guide_legend(override.aes = list(shape = 15, size = 5))) +
  labs(color = "Study ID (n=195)")

# ggsave() is given no plot, so it saves the plot displayed just above.
ggsave(filename = paste0(result, "mapping_participants.pdf"), dpi = 300)
```