# load packages

In [None]:
library(tidyverse)

In [None]:
library(data.table)

# read in input files

In [None]:
# Load the condition-occurrence table, reading only the four columns listed in
# `select` from the tab-separated release file, then preview the first rows.
icd <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_phenotype_condition_occurrence.txt',
  sep = '\t',
  select = c(
    'person_id',
    'condition_start_date',
    'condition_source_value',
    'visit_occurrence_id'
  )
)
head(icd)

In [None]:
# Load the drug-exposure table, reading only the four columns listed in
# `select` from the tab-separated release file, then preview the first rows.
med <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_phenotype_drug_exposure.txt',
  sep = '\t',
  select = c(
    'person_id',
    'drug_exposure_start_date',
    'drug_source_value',
    'visit_occurrence_id'
  )
)
head(med)

In [None]:
# Load the TSH lab table, reading only the person, value, date and visit
# columns from the tab-separated release file, then preview the first rows.
tsh <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_phenotype_labs_tsh.txt',
  sep = '\t',
  select = c(
    'person_id',
    'value_converted',
    'measurement_date',
    'visit_occurrence_id'
  )
)
head(tsh)

In [None]:
# Load the covariate table, keeping only the person identifier and the
# sequenced-gender column, then preview the first rows.
demo <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_covariates.txt',
  sep = '\t',
  select = c('person_id', 'Sequenced_gender')
)
head(demo)

In [None]:
# Load the person table, keeping only the person identifier and the birth
# datetime, then preview the first rows.
dob <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_phenotype_person.txt',
  sep = '\t',
  select = c('person_id', 'birth_datetime')
)
head(dob)

In [None]:
# Load the visit-occurrence table, keeping only the person, visit id and visit
# source value columns, then preview the first rows.
visit <- fread(
  input = 'Phenotypes/3.0/PMBB-Release-2024-3.0_phenotype_visit_occurrence.txt',
  sep = '\t',
  select = c('person_id', 'visit_occurrence_id', 'visit_source_value')
)
# Preview the table that was just read (this cell previously previewed `dob`).
head(visit)

In [None]:
# Load the concept table, keeping only the concept id, concept name and domain
# columns, then preview the first rows.
concept_name <- fread(
  input = 'Phenotypes/3.0/concept.txt',
  sep = '\t',
  select = c('concept_id', 'concept_name', 'domain_id')
)
head(concept_name)

In [None]:
# Load the per-sample ancestry assignments (FID and Class columns only), then
# preview the rows and list the distinct Class labels present.
gia <- fread(
  input = 'Exome/PCA/PMBB-Release-2024-3.0_genetic_exome.norm.commonsnps_samples_ancestries.txt',
  sep = '\t',
  select = c('FID', 'Class')
)
head(gia)
unique(gia[['Class']])

In [None]:
# Load the list of samples from the 3rd-degree-unrelated file. The file is read
# without a header row, so fread names the columns V1, V2, ...
no_related <- fread(
  input = 'Exome/IBD/PMBB-Release-2024-3.0_genetic_exome.3rd_degree_unrelated.txt',
  sep = '\t',
  header = FALSE
)
head(no_related)

# clean demo data

## clean up files

In [None]:
# List the distinct gender labels so the recoding below can cover them.
unique(demo[['Sequenced_gender']])

In [None]:
# Rename the gender column to SEX and recode it numerically:
# 'Male' -> 1, 'Female' -> 2. Labels not listed in case_match() are not mapped
# to either code.
demo_clean <- demo %>%
  rename(SEX = Sequenced_gender) %>%
  mutate(
    SEX = case_match(
      SEX,
      'Male' ~ 1,
      'Female' ~ 2
    )
  )
head(demo_clean)
nrow(demo_clean)

### merge files

In [None]:
# Combine recoded sex with birth datetime; only people present in both tables
# are kept (inner join on person_id).
demo_dob <- inner_join(demo_clean, dob, by = 'person_id')
head(demo_dob)

# clean TSH data

## clean data

In [None]:
# Rename the lab value to TSH, coerce the measurement date to Date, and drop
# every row that has a missing value in any column. Then preview the result
# and count the distinct people remaining.
tsh_clean <- tsh %>%
  rename(TSH = value_converted) %>%
  mutate(measurement_date = as.Date(measurement_date)) %>%
  na.omit()
head(tsh_clean)
length(unique(tsh_clean[['person_id']]))

# clean ICD data for pregnancy data

In [None]:
# Keep condition rows whose source code starts with 63, 64, 65, 66, 67 or O0,
# after coercing the start date to Date and dropping rows with any missing
# value. Then preview, count distinct people, and list the matched codes.
# NOTE(review): '^O0' only matches codes beginning with "O0"; confirm that
# other codes beginning with "O" are meant to be left out.
preg <- icd %>%
  mutate(condition_start_date = as.Date(condition_start_date)) %>%
  na.omit() %>%
  filter(
    grepl('^63|^64|^65|^66|^67|^O0', condition_source_value)
  )
head(preg)
length(unique(preg[['person_id']]))
unique(preg[['condition_source_value']])

In [None]:
# Collapse the pregnancy rows to one row per person by taking the first value
# of each remaining column within the person's rows.
# NOTE(review): `first` follows the row order of `preg`, which is not sorted by
# date here, so this is not guaranteed to be the earliest date. No later cell
# in the visible notebook reads `preg_clean`; confirm whether it is still needed.
preg_clean <- preg %>%
  select(-condition_source_value) %>%
  group_by(person_id) %>%
  summarise(across(everything(), first))
head(preg_clean)
nrow(preg_clean)

# clean omop table for thyroid meds

In [None]:
# List the distinct domains in the concept table to find the drug domain label.
unique(concept_name[['domain_id']])

In [None]:
# Restrict the concept table to rows whose domain is exactly "Drug", then
# preview the result and confirm only that domain remains.
concept_drug <- filter(
  concept_name,
  grepl('^Drug$', domain_id)
)
head(concept_drug)
unique(concept_drug[['domain_id']])

In [None]:
# Build the map of drug concepts whose name mentions any of the search terms
# below (case-insensitive substring match), then list the matched names.
#
# The terms are kept in a vector and joined with "|" so each one can be read
# and checked individually. Three terms were misspelled in the original
# pattern and could therefore never match a correctly spelled concept name;
# they are corrected here: Caprelsa (was "Caprlesa"), Gleevec (was "Gleevac"),
# cediranib (was "cedarinib"). A duplicated "durvalumab" entry was dropped,
# which does not change the match because matching is case-insensitive.
#
# NOTE(review): "PTU" is matched as a bare substring, so it also hits any name
# containing "ptu"; the "eucalyptus" exclusion below removes one such false
# positive. Check the printed names for others.
thyroid_med_terms <- c(
  'Levothyroxine', 'Euthyrox', 'Synthroid', 'Levothroid', 'Unithroid',
  'Tirosint', 'Levo-T', 'Levoxyl', 'Thyrolar', 'Thyroid USP', 'Armour',
  'Nature-Throid', 'Westhroid', 'Liothyronine', 'Cytomel',
  'Methimazole', 'Tapazole', 'Propylthiouracil', 'PTU',
  'Amiodarone', 'Cordarone', 'Pacerone', 'Nexterone',
  'Pembrolizumab', 'Nivolumab', 'Avelumab', 'Cemiplimab', 'Atezolizumab',
  'Durvalumab', 'Ipilimumab',
  'Keytruda', 'Opdivo', 'Bavencio', 'Libtayo', 'Tecentriq', 'Imfinzi', 'Yervoy',
  'sunitinib', 'lenvatinib', 'sorafenib', 'pazopanib', 'axitinib', 'tivozanib',
  'vandetanib', 'regorafenib',
  'Sutent', 'Lenvima', 'NexAVAR', 'Votrient', 'Inlyta', 'Fotivda', 'Caprelsa',
  'Stivarga',
  'lenalidomide', 'pomalidomide', 'brigatinib', 'Cabozantinib', 'Imatinib',
  'Revlimid', 'Pomalyst', 'Alunbrig', 'Abometyx', 'Cometriq', 'Gleevec',
  'MK-3475', 'Interferon', 'Ibrutinib', 'Ruxolitinib', 'cediranib',
  'Gilteritinib', 'tremelimumab'
)
thyroid_med_map <- concept_drug %>%
  filter(grepl(paste(thyroid_med_terms, collapse = '|'),
               concept_name,
               ignore.case = TRUE)) %>%
  filter(!grepl('eucalyptus', concept_name, ignore.case = TRUE))
unique(thyroid_med_map$concept_name)

# clean medication data

## filter to meds in map

In [None]:
# Keep drug-exposure rows whose drug_source_value appears among the concept ids
# of the thyroid medication map, then preview and list the matched values.
# NOTE(review): this matches `drug_source_value` against `concept_id`; confirm
# that the source-value column holds concept ids in this release.
med_thyroid <- med[
  med$drug_source_value %in% thyroid_med_map$concept_id,
]
head(med_thyroid)
unique(med_thyroid[['drug_source_value']])

## check what meds mapped

In [None]:
# Show which concepts from the medication map actually occur in the filtered
# drug-exposure rows (concept id and name only).
mapped_meds <- thyroid_med_map[
  thyroid_med_map$concept_id %in% med_thyroid$drug_source_value,
]
select(mapped_meds, concept_id, concept_name)

## get date of first med

In [None]:
# One row per person holding the date of their earliest thyroid-affecting
# medication exposure.
#
# The earliest date is taken with min() rather than the first row of each
# group: the exposure rows are not sorted by date in this notebook, so the
# first row is not guaranteed to be the earliest exposure. Rows whose date
# cannot be parsed are dropped first so they cannot become a person's value;
# a person with no usable date simply has no row here, which the later join
# treats the same as "no medication date".
med_clean <- med_thyroid %>%
  select(person_id, drug_exposure_start_date) %>%
  mutate(drug_exposure_start_date = as.Date(drug_exposure_start_date)) %>%
  filter(!is.na(drug_exposure_start_date)) %>%
  group_by(person_id) %>%
  summarise(drug_exposure_start_date = min(drug_exposure_start_date),
            .groups = 'drop')
head(med_clean)
nrow(med_clean)

# create EUR and AFR sample lists

In [None]:
# Samples whose ancestry Class is 'EUR'; preview, confirm the single label,
# and count rows.
eur <- filter(gia, Class == 'EUR')
head(eur)
unique(eur[['Class']])
nrow(eur)

In [None]:
# Samples whose ancestry Class is 'AFR'; preview, confirm the single label,
# and count rows.
afr <- gia %>%
  filter(Class == 'AFR')
# Preview the AFR table itself (this cell previously previewed `eur`).
head(afr)
unique(afr$Class)
nrow(afr)

# Remove excluded measurements

## pregnancy

### merge

In [None]:
# Attach each person's earliest pregnancy-coded date to their TSH rows.
#
# `preg` has one row per pregnancy code, so joining it directly repeated every
# TSH measurement once per code. With the later "keep rows where the pregnancy
# date is on/after the lab date" filter, that had two effects: a lab taken
# after the first pregnancy code was still kept whenever any later code
# existed, and surviving labs were duplicated, which skews the per-person mean
# taken further down. Reducing `preg` to one date per person (the minimum)
# makes the join many-to-one and the exclusion relative to the first code.
# People without pregnancy codes get NA for condition_start_date (full join).
tsh_preg <- tsh_clean %>%
  full_join(
    preg %>%
      group_by(person_id) %>%
      summarise(condition_start_date = min(condition_start_date),
                .groups = 'drop'),
    by = 'person_id',
    relationship = 'many-to-one'
  )
head(tsh_preg)

### remove labs after pregnancy

In [None]:
# Drop TSH rows measured after the joined pregnancy date.
#
# difftime_preg_TSH = pregnancy date - measurement date, in days. A negative
# value means the lab was taken after the pregnancy date, so only rows with a
# non-negative difference are kept. Rows where the difference is NA (one of
# the two dates is missing) are kept as well, which is what the original
# string test did: it looked for a "-" in the text form of the difference,
# and NA never matched. The comparison is now done on the numeric value
# instead of on its printed representation.
tsh_no_preg_before <- tsh_preg %>%
  mutate(difftime_preg_TSH = difftime(condition_start_date, measurement_date, units = 'days')) %>%
  filter(is.na(difftime_preg_TSH) | as.numeric(difftime_preg_TSH, units = 'days') >= 0)
head(tsh_no_preg_before)
# Distribution of the remaining differences, in days.
tsh_no_preg_before %>%
  transmute(difftime_preg_TSH = as.numeric(difftime_preg_TSH, units = 'days')) %>%
  summary()
# Row counts after and before the exclusion.
nrow(tsh_no_preg_before)
nrow(tsh_preg)

## thyroid meds

### merge

In [None]:
# Attach each person's medication date from `med_clean` to the remaining TSH
# rows. A full join keeps people that appear in only one of the two tables.
tsh_med <- full_join(
  tsh_no_preg_before,
  med_clean,
  by = 'person_id',
  relationship = "many-to-many"
)
head(tsh_med)

### remove labs after thyroid meds

In [None]:
# Drop TSH rows measured after the joined medication date.
#
# difftime_med_TSH = medication date - measurement date, in days. A negative
# value means the lab was taken after the medication date, so only rows with a
# non-negative difference are kept. Rows where the difference is NA (one of
# the two dates is missing) are kept as well, which is what the original
# string test did: it looked for a "-" in the text form of the difference,
# and NA never matched. The comparison is now done on the numeric value
# instead of on its printed representation.
tsh_no_med_before <- tsh_med %>%
  mutate(difftime_med_TSH = difftime(drug_exposure_start_date, measurement_date, units = 'days')) %>%
  filter(is.na(difftime_med_TSH) | as.numeric(difftime_med_TSH, units = 'days') >= 0)
head(tsh_no_med_before)
# Distribution of the remaining differences, in days.
tsh_no_med_before %>%
  transmute(difftime_med_TSH = as.numeric(difftime_med_TSH, units = 'days')) %>%
  summary()
# Row counts after and before the exclusion.
nrow(tsh_no_med_before)
nrow(tsh_med)

# add covariates

In [None]:
# Add sex and birth date, compute age at measurement in years (days / 365),
# keep measurements taken at age 18 or older, and reduce to the four analysis
# columns with no missing values.
tsh_demo <- tsh_no_med_before %>%
  left_join(demo_dob, by = 'person_id') %>%
  mutate(
    birth_datetime = as.Date(birth_datetime),
    AGE = as.numeric(
      difftime(measurement_date, birth_datetime, units = 'days'),
      units = 'days'
    ) / 365
  ) %>%
  filter(AGE >= 18) %>%
  select(person_id, TSH, SEX, AGE) %>%
  na.omit()
head(tsh_demo)

# remove related

In [None]:
# Keep only people listed in the first column of the unrelated-samples file,
# then compare distinct-person counts after and before the restriction.
tsh_no_related <- tsh_demo[
  tsh_demo$person_id %in% no_related$V1,
]
length(unique(tsh_no_related[['person_id']]))
length(unique(tsh_demo[['person_id']]))
head(tsh_no_related)

# filter to EUR & AFR

In [None]:
# Unrelated people whose id is in the EUR sample list; preview and count
# distinct people.
tsh_eur <- tsh_no_related[
  tsh_no_related$person_id %in% eur$FID,
]
head(tsh_eur)
length(unique(tsh_eur[['person_id']]))

In [None]:
# Unrelated people whose id is in the AFR sample list; preview and count
# distinct people.
tsh_afr <- tsh_no_related[
  tsh_no_related$person_id %in% afr$FID,
]
head(tsh_afr)
length(unique(tsh_afr[['person_id']]))

# take per-person mean TSH & age, then keep only people whose mean TSH is within the normal range

In [None]:
# EUR: average every non-id column per person (TSH, SEX, AGE), then keep
# people whose mean TSH lies within [0.45, 4.5]. Preview, count, and summarise
# the retained TSH means.
tsh_norm_range_eur <- tsh_eur %>%
  group_by(person_id) %>%
  summarise(across(everything(), mean)) %>%
  filter(TSH >= 0.45, TSH <= 4.5)
head(tsh_norm_range_eur)
nrow(tsh_norm_range_eur)
summary(tsh_norm_range_eur[['TSH']])

In [None]:
# AFR: average every non-id column per person (TSH, SEX, AGE), then keep
# people whose mean TSH lies within [0.45, 4.5]. Preview, count, and summarise
# the retained TSH means.
tsh_norm_range_afr <- tsh_afr %>%
  group_by(person_id) %>%
  summarise(across(everything(), mean)) %>%
  filter(TSH >= 0.45, TSH <= 4.5)
head(tsh_norm_range_afr)
nrow(tsh_norm_range_afr)
summary(tsh_norm_range_afr[['TSH']])

# inverse normally transform TSH and square age

In [None]:
# EUR: rank-based inverse normal transform of TSH,
#   qnorm((rank - 0.5) / number of non-missing values),
# plus squared age; the raw TSH and AGE columns are then dropped.
inv_norm_tsh_eur <- tsh_norm_range_eur %>%
  mutate(
    INV_NORMAL_TSH = qnorm(
      (rank(TSH, na.last = "keep") - 0.5) / sum(!is.na(TSH))
    ),
    AGE_AGE = AGE^2
  ) %>%
  select(-TSH, -AGE)
head(inv_norm_tsh_eur)

In [None]:
# AFR: rank-based inverse normal transform of TSH,
#   qnorm((rank - 0.5) / number of non-missing values),
# plus squared age; the raw TSH and AGE columns are then dropped.
inv_norm_tsh_afr <- tsh_norm_range_afr %>%
  mutate(
    INV_NORMAL_TSH = qnorm(
      (rank(TSH, na.last = "keep") - 0.5) / sum(!is.na(TSH))
    ),
    AGE_AGE = AGE^2
  ) %>%
  select(-TSH, -AGE)
head(inv_norm_tsh_afr)

# take EUR random sample

In [None]:
# Cohort sizes before downsampling: AFR first, then EUR.
NROW(inv_norm_tsh_afr)
NROW(inv_norm_tsh_eur)

In [None]:
# Fix the random number generator state so the sampling below is repeatable.
set.seed(seed = 1234)

In [None]:
# Draw 7011 EUR person ids without replacement into a one-column data frame.
# NOTE(review): 7011 is hard-coded; presumably it is the AFR row count printed
# a few cells above. Confirm it still matches if upstream filters change.
eur_random_sample <- data.frame(
  person_id = sample(
    x = inv_norm_tsh_eur$person_id,
    size = 7011,
    replace = FALSE
  )
)
nrow(eur_random_sample)
head(eur_random_sample)

In [None]:
# Restrict the EUR phenotype table to the randomly drawn person ids.
inv_norm_tsh_eur_random <- inv_norm_tsh_eur[
  inv_norm_tsh_eur$person_id %in% eur_random_sample$person_id,
]
head(inv_norm_tsh_eur_random)
nrow(inv_norm_tsh_eur_random)

# create & export sample lists and in progress phenotype files

## create

In [None]:
# One-column list of the sampled EUR person ids.
eur_random_sample_list <- select(inv_norm_tsh_eur_random, person_id)
head(eur_random_sample_list)
nrow(eur_random_sample_list)

In [None]:
# One-column list of all AFR person ids in the phenotype table.
afr_sample_list <- select(inv_norm_tsh_afr, person_id)
head(afr_sample_list)
nrow(afr_sample_list)

## export

In [None]:
# Write the EUR sample list as plain ids: no header, no row names, no quoting.
write.table(
  x = eur_random_sample_list,
  file = 'input/PMBB_v3.TSH.GWAS.EUR.n=7011.sample_list.txt',
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the AFR sample list as plain ids: no header, no row names, no quoting.
write.table(
  x = afr_sample_list,
  file = 'input/PMBB_v3.TSH.GWAS.AFR.n=7011.sample_list.txt',
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Save the in-progress EUR phenotype/covariate table as tab-separated text
# with a header row and no row names or quoting.
write.table(
  x = inv_norm_tsh_eur_random,
  file = 'input/PMBB_v3.TSH.GWAS.EUR.n=7011.phenotype_covariates.in_progress.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Save the in-progress AFR phenotype/covariate table as tab-separated text
# with a header row and no row names or quoting.
write.table(
  x = inv_norm_tsh_afr,
  file = 'input/PMBB_v3.TSH.GWAS.AFR.n=7011.phenotype_covariates.in_progress.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

# read in PCs and in progress phenos

## eigenvec files

In [None]:
# Read the AFR eigenvector file as whitespace-separated columns, skipping the
# first line and treating no line as a header (columns are named V1, V2, ...).
afr_eigenvec <- read.csv(
  file = 'output/pca/PMBB_v3.TSH.GWAS.AFR.PCA.eigenvec',
  sep = '',
  skip = 1,
  header = FALSE
)
dim(afr_eigenvec)
head(afr_eigenvec)

In [None]:
# Read the EUR eigenvector file as whitespace-separated columns, skipping the
# first line and treating no line as a header (columns are named V1, V2, ...).
eur_eigenvec <- read.csv(
  file = 'output/pca/PMBB_v3.TSH.GWAS.EUR.PCA.eigenvec',
  sep = '',
  skip = 1,
  header = FALSE
)
dim(eur_eigenvec)
head(eur_eigenvec)

## eigenval

In [None]:
# Read the AFR eigenvalues (no header row; the single column is named V1).
afr_eigenval <- read.csv(
  file = 'output/pca/PMBB_v3.TSH.GWAS.AFR.PCA.eigenval',
  sep = '\t',
  header = FALSE
)
head(afr_eigenval)

In [None]:
# Read the EUR eigenvalues (no header row; the single column is named V1).
eur_eigenval <- read.csv(
  file = 'output/pca/PMBB_v3.TSH.GWAS.EUR.PCA.eigenval',
  sep = '\t',
  header = FALSE
)
head(eur_eigenval)

## in progress pheno

In [None]:
# Reload the in-progress EUR phenotype/covariate table from disk; this
# replaces any `inv_norm_tsh_eur_random` already in the session.
inv_norm_tsh_eur_random <- read.csv(
  file = 'input/PMBB_v3.TSH.GWAS.EUR.n=7011.phenotype_covariates.in_progress.txt',
  sep = '\t'
)
head(inv_norm_tsh_eur_random)

In [None]:
# Reload the in-progress AFR phenotype/covariate table from disk; this
# replaces any `inv_norm_tsh_afr` already in the session.
inv_norm_tsh_afr <- read.csv(
  file = 'input/PMBB_v3.TSH.GWAS.AFR.n=7011.phenotype_covariates.in_progress.txt',
  sep = '\t'
)
head(inv_norm_tsh_afr)

# create scree plots

## clean eigenval files

In [None]:
# AFR: number the components from the row names and express each eigenvalue
# as a share of the sum of all eigenvalues in the file.
afr_eigenval_clean <- afr_eigenval %>%
  mutate(
    PC = as.numeric(rownames(afr_eigenval)),
    VARIANCE = V1 / sum(V1)
  )
head(afr_eigenval_clean)

In [None]:
# EUR: number the components from the row names and express each eigenvalue
# as a share of the sum of all eigenvalues in the file.
eur_eigenval_clean <- eur_eigenval %>%
  mutate(
    PC = as.numeric(rownames(eur_eigenval)),
    VARIANCE = V1 / sum(V1)
  )
head(eur_eigenval_clean)

## make plots

In [None]:
# AFR scree plot: variance share per component, x axis limited to PCs 1-20
# with a tick at every integer.
ggplot(afr_eigenval_clean, aes(x = PC, y = VARIANCE)) +
  geom_line() +
  scale_x_continuous(
    breaks = seq(1, 20, by = 1),
    limits = c(1, 20)
  )

In [None]:
# EUR scree plot: variance share per component, x axis limited to PCs 1-20
# with a tick at every integer.
ggplot(eur_eigenval_clean, aes(x = PC, y = VARIANCE)) +
  geom_line() +
  scale_x_continuous(
    breaks = seq(1, 20, by = 1),
    limits = c(1, 20)
  )

# add pcs to pheno/covar file

## clean eigenvec

In [None]:
# AFR: keep the sample id and the first principal component, and strip the
# "0:" prefix from the id so it can be joined to person_id.
#
# The original pattern '*0:' began with a quantifier that has nothing to
# repeat, so its meaning depends on the regex engine. An anchored literal
# prefix match is used instead, and surrounding whitespace is trimmed first
# so this cell treats ids the same way as the EUR cell.
# NOTE(review): assumes ids in the eigenvec file look like "0:<person_id>";
# confirm against head(afr_eigenvec).
afr_eigenvec_sub <- afr_eigenvec %>%
  rename('person_id' = 'V1',
         'PC1' = 'V2') %>%
  select(person_id, PC1) %>%
  mutate(person_id = sub('^0:', '', trimws(person_id)))
head(afr_eigenvec_sub)

In [None]:
# EUR: keep the sample id and the first three principal components, and strip
# the "0:" prefix from the id so it can be joined to person_id.
#
# The original pattern '^*0:' applied a quantifier to the start anchor, so its
# meaning depends on the regex engine. An anchored literal prefix match on the
# whitespace-trimmed id is used instead.
# NOTE(review): assumes ids in the eigenvec file look like "0:<person_id>";
# confirm against head(eur_eigenvec).
eur_eigenvec_sub <- eur_eigenvec %>%
  rename('person_id' = 'V1',
         'PC1' = 'V2',
         'PC2' = 'V3',
         'PC3' = 'V4') %>%
  select(person_id, PC1, PC2, PC3) %>%
  mutate(person_id = sub('^0:', '', trimws(person_id)))
head(eur_eigenvec_sub)

## merge

In [None]:
# AFR: add the principal component to the phenotype table, keeping only people
# present in both. The three counts show how many rows each input had and how
# many survived the join.
inv_norm_tsh_afr_pc <- inner_join(
  inv_norm_tsh_afr,
  afr_eigenvec_sub,
  by = 'person_id'
)
nrow(inv_norm_tsh_afr)
nrow(afr_eigenvec_sub)
nrow(inv_norm_tsh_afr_pc)
head(inv_norm_tsh_afr_pc)

In [None]:
# EUR: add the principal components to the phenotype table, keeping only
# people present in both. The three counts show how many rows each input had
# and how many survived the join.
inv_norm_tsh_eur_pc <- inner_join(
  inv_norm_tsh_eur_random,
  eur_eigenvec_sub,
  by = 'person_id'
)
nrow(inv_norm_tsh_eur_random)
nrow(eur_eigenvec_sub)
nrow(inv_norm_tsh_eur_pc)
head(inv_norm_tsh_eur_pc)

# randomly downsample AFR to match new EUR

In [None]:
# Randomly pick as many AFR person ids as there are rows in the EUR table with
# PCs, without replacement.
#
# The seed is set in this cell because the tables above are reloaded from
# disk, so this step may run in a session where the earlier set.seed() cell
# was never executed; without it the draw is not reproducible.
# The sample size is taken from the EUR table instead of a hard-coded 6937 so
# the two cohorts stay matched if upstream filtering changes. sample() raises
# an error if AFR has fewer rows than EUR.
# NOTE(review): the output file names further down still contain "n=6937".
set.seed(1234)
afr_random_sample <- data.frame(
  person_id = sample(
    x = inv_norm_tsh_afr_pc$person_id,
    size = nrow(inv_norm_tsh_eur_pc),
    replace = FALSE
  )
)
nrow(afr_random_sample)
head(afr_random_sample)

In [None]:
# Restrict the AFR table to the randomly drawn person ids and compare its row
# count with the EUR table.
inv_norm_tsh_afr_pc_random <- inv_norm_tsh_afr_pc[
  inv_norm_tsh_afr_pc$person_id %in% afr_random_sample$person_id,
]
nrow(inv_norm_tsh_afr_pc_random)
nrow(inv_norm_tsh_eur_pc)
head(inv_norm_tsh_afr_pc_random)

# change ID col name

In [None]:
# EUR: rename the id column from person_id to IID (in place, same object name).
inv_norm_tsh_eur_pc <- rename(inv_norm_tsh_eur_pc, IID = person_id)
head(inv_norm_tsh_eur_pc)

In [None]:
# AFR: rename the id column from person_id to IID (in place, same object name).
inv_norm_tsh_afr_pc_random <- rename(inv_norm_tsh_afr_pc_random, IID = person_id)
head(inv_norm_tsh_afr_pc_random)

# make new sample lists

In [None]:
# Final EUR sample list: the id column only.
# The id column was renamed from person_id to IID in the preceding step, so
# selecting `person_id` here fails; select `IID` instead.
eur_sample_list_final <- inv_norm_tsh_eur_pc %>%
  select(IID)
nrow(eur_sample_list_final)

In [None]:
# Final AFR sample list: the id column only.
# The id column was renamed from person_id to IID in the preceding step, so
# selecting `person_id` here fails; select `IID` instead.
afr_sample_list_final <- inv_norm_tsh_afr_pc_random %>%
  select(IID)
nrow(afr_sample_list_final)

# export final dfs

In [None]:
# Write the final EUR sample list as plain ids: no header, row names or quotes.
write.table(
  x = eur_sample_list_final,
  file = 'input/PMBB_v3.TSH.GWAS.EUR.n=6937.sample_list.final.txt',
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the final AFR sample list as plain ids: no header, row names or quotes.
write.table(
  x = afr_sample_list_final,
  file = 'input/PMBB_v3.TSH.GWAS.AFR.n=6937.sample_list.final.txt',
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Save the final EUR phenotype/covariate table as tab-separated text with a
# header row and no row names or quoting.
write.table(
  x = inv_norm_tsh_eur_pc,
  file = 'input/PMBB_v3.TSH.GWAS.EUR.n=6937.phenotype_covariates.final.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Save the final AFR phenotype/covariate table as tab-separated text with a
# header row and no row names or quoting.
# NOTE(review): this is the only write that uses an absolute path; the other
# outputs go to the relative 'input/' directory. Confirm both resolve to the
# same place.
write.table(
  x = inv_norm_tsh_afr_pc_random,
  file = '/project/ritchie/projects/TSH/PMBB_v3/input/PMBB_v3.TSH.GWAS.AFR.n=6937.phenotype_covariates.final.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)