# load packages

In [None]:
library(tidyverse)

# read in input files

In [None]:
# VCF sample IDs. The file has no header row, so the single column is
# auto-named V1 (used as vcf_id$V1 below).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
vcf_id <- read.csv(
  'rosmap/ID_mapping/ROSMAP_VCF_IDs.txt',
  header = FALSE
)
head(vcf_id)

In [None]:
# RNAseq ID list; read.csv's default treats the first row as the header
rnaseq_id <- read.csv(file = 'rosmap/ID_mapping/ROSMAP_RNAseq_IDs.txt')
rnaseq_id %>%
  head()

In [None]:
# Biospecimen metadata (maps specimenID to individualID, with assay/tissue).
# The directory is spelled 'Metadata' here to agree with the other two reads
# from the same folder (ROSMAP_assay_wholeGenomeSeq_metadata.csv and
# ROSMAP_clinical.csv); the previous 'MetaData' spelling only resolves on a
# case-insensitive file system.
# NOTE(review): confirm the on-disk directory name is 'Metadata'.
biospec <- read.csv(file = 'rosmap/Metadata/ROSMAP_biospecimen_metadata.csv')
head(biospec)

In [None]:
# WGS assay metadata
wgs_data <- read.csv(
  file = 'rosmap/Metadata/ROSMAP_assay_wholeGenomeSeq_metadata.csv'
)
wgs_data %>%
  head()

In [None]:
# Per-sample WGS QC table (columns used below: WGS_id, QC, Duplicate,
# Concordant_with_RNAseq, projid)
wgs_qc <- read.csv(
  file = 'rosmap/Whole_Genome_Sequencing/WGS_sample_QC_info.csv'
)
wgs_qc %>%
  head()
wgs_qc %>%
  nrow()

In [None]:
# Clinical/phenotype table (projid, Study, individualID are used below)
pheno <- read.csv(file = 'rosmap/Metadata/ROSMAP_clinical.csv')
pheno %>%
  head()

In [None]:
# SNP-array .fam file: whitespace-delimited (sep = '') with no header row.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
# NOTE(review): this object reuses the name of base::array(); consider
# renaming once all downstream uses are known.
array <- read.csv(
  'rosmap/Genomic_Variants_SNP_Array/Raw/ROSMAP_arrayGenotype.fam',
  sep = '',
  header = FALSE
)
head(array)

In [None]:
# ROSMAP <-> ADSP subject ID mapping (cohort_key, subject_key_original and
# subject_key are used below)
adsp <- read.csv(file = 'ADSP/MAP_FILES/ROSMAP_ADSPID_mapping.csv')
adsp %>%
  head()
adsp %>%
  nrow()

In [None]:
# ADSP WGS sample list: whitespace-delimited, no header row (first column is
# auto-named V1 and is matched against SampleID below).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
adsp_wgs_list <- read.csv(
  'rosmap/ID_mapping/ROSMAP_ADSP_WGS_samples.txt',
  sep = '',
  header = FALSE
)
head(adsp_wgs_list)

In [None]:
# ADSP integrated phenotype table (SampleID, SUBJID, Cohort are used below)
adsp_pheno <- read.csv(
  file = 'ADSP/v11_122023/v11_CombinedPhenotypes/ADSPIntegratedPhenotypes_DS_2023.08.08.csv'
)
adsp_pheno %>%
  head()

In [None]:
# ADSP sequencing QC metrics (tab-delimited; Study and Sample.name are used
# below)
adsp_qc <- read.csv(
  file = '/project/ritchie/datasets/ADSP/v11_122023/231211-ADSPv11_Release/downloads/v11-dl-dir/gcad.r4.wgs.36361.VCPA1.1.2022.08.15.v3.qcmetrics.seq.ALL.txt',
  sep = '\t'
)
adsp_qc %>%
  head()

In [None]:
# Methylation array IDs; compare the row count with the number of distinct
# TargetID values to spot repeated IDs
methylation <- read.csv(
  file = '/project/ritchie/projects/AD_KMI/pathway_score/rosmap/ID_mapping/ROSMAP.Methylation.IDs.txt'
)
methylation %>%
  nrow()
length(unique(methylation$TargetID))
methylation %>%
  head()

# remove WGS samples that didn't pass QC

## filter to samples that failed

In [None]:
# distinct values found in the QC column
unique(wgs_qc[['QC']])

In [None]:
# rows whose QC text contains both 'Fail' and 'identical'
wgs_qc %>%
  filter(
    grepl('Fail', QC),
    grepl('identical', QC)
  )

In [None]:
# every sample whose QC text contains 'Fail'
wgs_fail_qc <- filter(wgs_qc, grepl('Fail', QC))
wgs_fail_qc %>%
  head()
wgs_fail_qc %>%
  nrow()

## remove samples

In [None]:
# VCF IDs that are not on the failed-QC list, stored as a data frame with
# column 'ID'; the two row counts show how many IDs were dropped
vcf_pass_qc <- setNames(
  data.frame(vcf_id[!(vcf_id$V1 %in% wgs_fail_qc$WGS_id), ]),
  c('ID')
)
vcf_pass_qc %>%
  head()
vcf_pass_qc %>%
  nrow()
vcf_id %>%
  nrow()

# fix duplicates

## filter to duplicates

In [None]:
# samples flagged 'identical' in the Duplicate column that did not fail QC
wgs_dup <- wgs_qc %>%
  filter(
    grepl('identical', Duplicate),
    !grepl('Fail', QC)
  )
wgs_dup %>%
  nrow()
print(wgs_dup)

## check to see if dups still exist even after removing samples that failed QC

In [None]:
# is SM-CTDVR still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CTDVR', WGS_id)
  ) %>%
  head()

In [None]:
# is SM-CTEGE still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CTEGE', WGS_id)
  )

In [None]:
# is SM-CJEI2 still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CJEI2', WGS_id)
  )

In [None]:
# is SM-CJEL3 still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CJEL3', WGS_id)
  )

In [None]:
# is SM-CTEGU still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CTEGU', WGS_id)
  )

In [None]:
# is SM-CTEDF still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CTEDF', WGS_id)
  )

In [None]:
# is SM-CTEEX still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CTEEX', WGS_id)
  )

In [None]:
# is SM-CJFKP still present among the non-failed samples?
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    grepl('SM-CJFKP', WGS_id)
  )

## see how many samples have RNAseq based on the table

In [None]:
# Non-failed samples that have a (non-missing) Concordant_with_RNAseq value:
# preview first, then count.
# `!is.na(x)` replaces `is.na(x) == F`: it is the idiomatic negation and does
# not depend on the reassignable shortcut `F`.
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    !is.na(Concordant_with_RNAseq)
  ) %>%
  head()
wgs_qc %>%
  filter(
    !grepl('Fail', QC),
    !is.na(Concordant_with_RNAseq)
  ) %>%
  nrow()

# create vcf individual ID map

## filter to individuals in VCF pass QC file

In [None]:
# biospecimen rows whose specimenID is a passing-QC VCF ID; comparing rows,
# distinct individuals and VCF IDs reveals individuals with several specimens
biospec_vcf <- biospec[biospec$specimenID %in% vcf_pass_qc$ID, ]
biospec_vcf %>%
  nrow()
length(unique(biospec_vcf$individualID))
vcf_pass_qc %>%
  nrow()
biospec_vcf %>%
  head()

## check out duplicates

In [None]:
# individualID column only (one row per specimen, so repeats are kept)
biospec_vcf_id <- select(biospec_vcf, individualID)
biospec_vcf_id %>%
  head()

In [None]:
# Export the individualID list (no header, no row names, unquoted).
# NOTE(review): the companion file 'biospec_vcf_pass_qc_dups.txt' read in the
# next cell is not produced anywhere in this notebook — presumably it is
# derived from this export outside R; confirm.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  biospec_vcf_id,
  file = 'rosmap/ID_mapping/biospec_vcf_pass_qc_poss_dups.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# List of duplicated individualIDs (no header row; column is auto-named V1).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
biospec_vcf_id_dup_list <- read.csv(
  'rosmap/ID_mapping/biospec_vcf_pass_qc_dups.txt',
  header = FALSE
)
head(biospec_vcf_id_dup_list)

In [None]:
# biospecimen rows belonging to individuals on the duplicate list, shown
# sorted so each individual's specimens sit together
biospec_vcf_id_dups <- biospec_vcf[
  biospec_vcf$individualID %in% biospec_vcf_id_dup_list$V1,
]
biospec_vcf_id_dups %>%
  nrow()
biospec_vcf_id_dups %>%
  arrange(individualID)

## fix duplicates

### filter to duplicates

In [None]:
# QC rows for the specimens of duplicated individuals
wgs_qc_dup <- wgs_qc[wgs_qc$WGS_id %in% biospec_vcf_id_dups$specimenID, ]
wgs_qc_dup

### keep duplicate with highest GQN

In [None]:
# keep only the hand-picked specimen for each duplicated individual
# (the IDs are hard-coded; per the section heading they are the ones with the
# highest GQN)
wgs_qc_dup_fixed <- filter(
  wgs_qc_dup,
  grepl('SM-CJK4Y|SM-CTEE2|SM-CTEN3|SM-CTED9|SM-CTEIJ|SM-CTEMN', WGS_id)
)
wgs_qc_dup_fixed

### remove dups from biospecimen file

In [None]:
# drop every row of an individual that appears on the duplicate list
biospec_vcf_no_dups <- biospec_vcf[
  !(biospec_vcf$individualID %in% biospec_vcf_id_dup_list$V1),
]
biospec_vcf_no_dups %>%
  nrow()
biospec_vcf %>%
  nrow()

### filter to fixed dups in biospecimen file

In [None]:
# biospecimen rows for the retained duplicate specimens; the two counts
# should agree if every retained specimen was found
biospec_vcf_new_dups <- biospec_vcf[
  biospec_vcf$specimenID %in% wgs_qc_dup_fixed$WGS_id,
]
wgs_qc_dup_fixed %>%
  nrow()
biospec_vcf_new_dups %>%
  nrow()

### concatenate files

In [None]:
# non-duplicated rows plus the retained duplicate rows; rows and distinct
# individuals should now match
biospec_vcf_dups_fixed <- rbind(
  biospec_vcf_no_dups,
  biospec_vcf_new_dups
)
biospec_vcf_dups_fixed %>%
  nrow()
length(unique(biospec_vcf_dups_fixed$individualID))

# create map for RNAseq sample

## get unique RNA seq samples

In [None]:
# de-duplicated RNAseq ID rows
rnaseq_unique <- distinct(rnaseq_id)
rnaseq_unique %>%
  nrow()

## check out duplicates

In [None]:
# Export the raw RNAseq ID list (no header, no row names, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  rnaseq_id,
  file = 'rosmap/ID_mapping/rnaseq_ids.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# List of duplicated RNAseq IDs (no header row; column is auto-named V1).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
rnaseq_dups <- read.csv(
  'rosmap/ID_mapping/rnaseq_ids_dups.txt',
  header = FALSE
)
head(rnaseq_dups)

In [None]:
# RNAseq ID entries that are on the duplicate list
rnaseq_id[rnaseq_id$individualID %in% rnaseq_dups$V1, ]

In [None]:
# biospecimen rows for duplicated RNAseq individuals, then only their
# rnaSeq-assay rows
biospec_rnaseq_dup <- biospec[biospec$individualID %in% rnaseq_dups$V1, ]
biospec_rnaseq_dup %>%
  nrow()
filter(biospec_rnaseq_dup, grepl('rnaSeq', assay))

In [None]:
# rnaSeq specimen counts per tissue for individuals on the RNAseq ID list
# (biospec_rnaseq is reassigned in the '## filter' cell that follows)
biospec_rnaseq <- biospec[biospec$individualID %in% rnaseq_id$individualID, ]
biospec_rnaseq %>%
  filter(grepl('rnaSeq', assay)) %>%
  group_by(tissue) %>%
  summarize(n())

## filter

In [None]:
# passing-QC VCF specimens whose individual also has RNAseq.
# NOTE(review): this filters biospec_vcf_no_dups rather than
# biospec_vcf_dups_fixed, so individuals on the duplicate list are excluded
# here even though one specimen was retained for them — confirm intended.
biospec_rnaseq <- biospec_vcf_no_dups[
  biospec_vcf_no_dups$individualID %in% rnaseq_unique$individualID,
]
biospec_rnaseq %>%
  nrow()
biospec_vcf_no_dups %>%
  nrow()
rnaseq_unique %>%
  nrow()

## check out IDs that didn't map

In [None]:
# RNAseq IDs with no row in biospec_rnaseq, as a one-column data frame
rnaseq_mismatch <- setNames(
  data.frame(
    rnaseq_id[!(rnaseq_id$individualID %in% biospec_rnaseq$individualID), ]
  ),
  c('individualID')
)
rnaseq_mismatch %>%
  nrow()

In [None]:
# distinct assay labels in the biospecimen table
unique(biospec[['assay']])

In [None]:
# all biospecimen rows (any assay) for the unmatched RNAseq individuals
biospec_mismatch <- biospec[
  biospec$individualID %in% rnaseq_mismatch$individualID,
]
biospec_mismatch %>%
  head()

In [None]:
# restrict the unmatched individuals to their wholeGenomeSeq rows
biospec_mismatch_wgs <- filter(
  biospec_mismatch,
  grepl('wholeGenomeSeq', assay)
)
biospec_mismatch_wgs %>%
  nrow()

In [None]:
# drop specimens that are on the failed-QC list
biospec_mismatch_wgs_pass_qc <- biospec_mismatch_wgs[
  !(biospec_mismatch_wgs$specimenID %in% wgs_fail_qc$WGS_id),
]
biospec_mismatch_wgs_pass_qc %>%
  nrow()

In [None]:
# drop individuals that are on the duplicate list
biospec_mismatch_wgs_no_dups <- biospec_mismatch_wgs_pass_qc[
  !(biospec_mismatch_wgs_pass_qc$individualID %in% biospec_vcf_id_dup_list$V1),
]
biospec_mismatch_wgs_no_dups %>%
  nrow()

In [None]:
# the complementary set: unmatched individuals that ARE on the duplicate list
biospec_mismatch_wgs_dups <- biospec_mismatch_wgs_pass_qc[
  biospec_mismatch_wgs_pass_qc$individualID %in% biospec_vcf_id_dup_list$V1,
]
biospec_mismatch_wgs_dups %>%
  arrange(individualID)

In [None]:
# inspect the remaining unmatched WGS rows in full, then count them
biospec_mismatch_wgs_no_dups
biospec_mismatch_wgs_no_dups %>%
  nrow()

## see if we can fix IDs with pheno file

In [None]:
# size of the clinical table
pheno %>%
  nrow()

In [None]:
# clinical rows for the unmatched individuals
# (pheno_mismatch is reassigned later in the ADSP mismatch section)
pheno_mismatch <- pheno[
  pheno$individualID %in% biospec_mismatch_wgs_no_dups$individualID,
]
pheno_mismatch %>%
  nrow()
pheno_mismatch %>%
  head()

In [None]:
# keep only the ID columns needed to link to the WGS QC table
pheno_mismatch_sub <- select(pheno_mismatch, projid, Study, individualID)
pheno_mismatch_sub %>%
  head()

In [None]:
# attach WGS QC rows (and hence WGS_id) through the shared projid column
pheno_mismatch_wgs <- merge(
  pheno_mismatch_sub,
  wgs_qc,
  by = 'projid'
)
pheno_mismatch_wgs %>%
  nrow()
pheno_mismatch_wgs %>%
  head()

In [None]:
# how many VCF IDs appear among the recovered WGS_ids
sum(vcf_id$V1 %in% pheno_mismatch_wgs$WGS_id)

In [None]:
# individualID -> WGS_id lookup
pheno_mismatch_wgs_sub <- select(pheno_mismatch_wgs, individualID, WGS_id)

In [None]:
# add the recovered WGS_id to the unmatched biospecimen rows
biospec_fixed <- merge(
  biospec_mismatch_wgs_no_dups,
  pheno_mismatch_wgs_sub,
  by = 'individualID'
)
biospec_fixed %>%
  nrow()
biospec_fixed %>%
  head()

In [None]:
# Rebuild the biospecimen column layout with WGS_id standing in for
# specimenID. The original specimenID column is dropped by the select() first,
# which is what lets the rename() reuse that name.
biospec_fixed_sub <- biospec_fixed %>%
  select(
    individualID, WGS_id, specimenIdSource, samplingDate,
    organ, tissue, BrodmannArea,
    sampleStatus, tissueWeight, tissueVolume,
    nucleicAcidSource, cellType, fastingState,
    isPostMortem, samplingAge, visitNumber,
    assay, exclude, excludeReason
  ) %>%
  rename('specimenID' = 'WGS_id')
biospec_fixed_sub %>%
  head()

## concatenate to make final RNAseq map

In [None]:
# directly matched rows plus the rows recovered through the pheno file
rnaseq_map_final <- rbind(
  biospec_rnaseq,
  biospec_fixed_sub
)
rnaseq_map_final %>%
  nrow()
rnaseq_map_final %>%
  head()

## subset to IDs

In [None]:
# two-column individualID / specimenID map
rnaseq_map_final_sub <- select(rnaseq_map_final, individualID, specimenID)

## create ID list for plink score

In [None]:
# specimenID column only, for use as a sample list
rnaseq_map_final_id <- select(rnaseq_map_final_sub, specimenID)

In [None]:
# sanity check: number of mapped specimenIDs present in the VCF ID list,
# next to the total number of mapped rows
sum(rnaseq_map_final_sub$specimenID %in% vcf_id$V1)
rnaseq_map_final_sub %>%
  nrow()

## export

In [None]:
# Export the individualID / specimenID map (with header, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  rnaseq_map_final_sub,
  file = 'rosmap/ID_mapping/ROSMAP_RNA_seq_WGS_ID_map.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# Export the bare specimenID list (no header, no row names, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  rnaseq_map_final_id,
  file = 'rosmap/ID_mapping/ROSMAP_RNA_seq_WGS_sample_list.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# create methylation id map

## filter biospec file

In [None]:
# biospecimen rows whose specimenID is a methylation TargetID; compare rows
# with distinct individuals and distinct specimens
biospec_methyl <- biospec[biospec$specimenID %in% methylation$TargetID, ]
biospec_methyl %>%
  nrow()
length(unique(biospec_methyl$individualID))
length(unique(biospec_methyl$specimenID))
biospec_methyl %>%
  head()

## subset to IDs

In [None]:
# two-column individualID / specimenID map for methylation
biospec_methyl_sub <- select(biospec_methyl, individualID, specimenID)
biospec_methyl_sub %>%
  head()

## export

In [None]:
# Export the methylation individualID / specimenID map (with header, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  biospec_methyl_sub,
  file = 'rosmap/ID_mapping/ROSMAP.Methylation_Array.individualID_map.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# create VCF map for all samples

## subset map

In [None]:
# two-column individualID / specimenID view of the de-duplicated VCF rows
biospec_vcf_dups_fixed_sub <- subset(
  biospec_vcf_dups_fixed,
  select = c(individualID, specimenID)
)
biospec_vcf_dups_fixed_sub %>%
  head()

## find samples that didn't match

In [None]:
# passing-QC VCF IDs that have no biospecimen row, as a one-column data frame
vcf_no_match <- setNames(
  data.frame(
    vcf_pass_qc[!(vcf_pass_qc$ID %in% biospec_vcf_dups_fixed$specimenID), ]
  ),
  'ID'
)
vcf_no_match %>%
  nrow()
vcf_no_match %>%
  head()

## find these samples in phenotype file

In [None]:
# build a candidate specimenID for every clinical row by concatenating
# Study and projid (no separator)
pheno_id <- pheno %>%
  select(projid, Study, individualID) %>%
  mutate(
    specimenID = paste0(Study, projid)
  )
pheno_id %>%
  head()

In [None]:
# clinical rows whose constructed specimenID is one of the unmatched VCF IDs
pheno_vcf_no_match <- pheno_id[pheno_id$specimenID %in% vcf_no_match$ID, ]
pheno_vcf_no_match %>%
  nrow()

## subset pheno map

In [None]:
# same two-column layout as the biospecimen-derived map
pheno_vcf_no_match_sub <- select(pheno_vcf_no_match, individualID, specimenID)

## remove samples in VCF map (IDK HOW THEY GOT THERE OKAY)

In [None]:
# drop individuals that the biospecimen-derived map already covers, so the
# concatenated map does not list them twice
pheno_vcf_def_no_match_sub <- pheno_vcf_no_match_sub[
  !(pheno_vcf_no_match_sub$individualID %in% biospec_vcf_dups_fixed_sub$individualID),
]
pheno_vcf_def_no_match_sub %>%
  nrow()

## concatenate

In [None]:
# final VCF map: biospecimen-derived rows plus pheno-derived rows; rows and
# distinct individuals are printed for comparison
vcf_map_final <- rbind(
  biospec_vcf_dups_fixed_sub,
  pheno_vcf_def_no_match_sub
)
vcf_map_final %>%
  head()
vcf_map_final %>%
  nrow()
length(unique(vcf_map_final$individualID))

In [None]:
# rows vs distinct individuals in the biospecimen-derived part
biospec_vcf_dups_fixed_sub %>%
  nrow()
length(unique(biospec_vcf_dups_fixed_sub$individualID))

In [None]:
# rows vs distinct individuals in the pheno-derived part (before overlap
# removal)
pheno_vcf_no_match_sub %>%
  nrow()
length(unique(pheno_vcf_no_match_sub$individualID))

In [None]:
# pheno-derived rows whose individual is already in the biospecimen-derived
# map (the overlap removed above)
test <- pheno_vcf_no_match_sub[
  pheno_vcf_no_match_sub$individualID %in% biospec_vcf_dups_fixed_sub$individualID,
]
test %>%
  nrow()

## create sample list

In [None]:
# specimenID column only, for use as a sample list
vcf_map_final_sample <- select(vcf_map_final, specimenID)

## export

In [None]:
# Export the bare specimenID list (no header, no row names, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  vcf_map_final_sample,
  file = 'rosmap/ID_mapping/ROSMAP.ALL_WGS.pass_qc.sample_list.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# Export the individualID / specimenID map (with header, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  vcf_map_final,
  file = 'rosmap/ID_mapping/ROSMAP.ALL_WGS.pass_qc.ID_map.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# create adsp map

## clean adsp file

In [None]:
# build specimenID as cohort_key + subject_key_original (no separator) so it
# lines up with pheno_id$specimenID, and expose subject_key as ADSP_ID
adsp_id <- adsp %>%
  mutate(
    specimenID = paste0(cohort_key, subject_key_original)
  ) %>%
  select(subject_key, specimenID) %>%
  rename('ADSP_ID' = 'subject_key')
adsp_id %>%
  head()
adsp_id %>%
  nrow()

## map to pheno file

In [None]:
# reminder of the pheno_id layout before merging
pheno_id %>%
  head()

In [None]:
# link clinical rows to ADSP IDs through the constructed specimenID; compare
# the merged row count with the size of the ADSP mapping file
adsp_rosmap <- merge(
  pheno_id,
  adsp_id,
  by = 'specimenID'
)
adsp_rosmap %>%
  nrow()
adsp %>%
  nrow()
adsp_rosmap %>%
  head()

## filter mapped file

In [None]:
# individualID -> ADSP_ID lookup
adsp_rosmap_sub <- select(adsp_rosmap, individualID, ADSP_ID)
adsp_rosmap_sub %>%
  head()

## map to vcf file

In [None]:
# reminder of the VCF map layout and size before merging
vcf_map_final %>%
  head()
vcf_map_final %>%
  nrow()

In [None]:
# add ADSP_ID to the VCF map by individualID; the counts show how many rows
# of each input survived the merge
adsp_rosmap_map <- merge(
  vcf_map_final,
  adsp_rosmap_sub,
  by = 'individualID'
)
adsp_rosmap_map %>%
  nrow()
adsp_rosmap_sub %>%
  nrow()
vcf_map_final %>%
  nrow()
length(unique(adsp_rosmap_map$individualID))
adsp_rosmap_map %>%
  head()

## check if mapped IDs are all in ADSP WGS list

In [None]:
# SampleID / SUBJID pairs from the ADSP phenotype table
adsp_pheno_id <- select(adsp_pheno, SampleID, SUBJID)
adsp_pheno_id %>%
  head()

In [None]:
# ADSP phenotype rows whose SampleID is on the ROSMAP ADSP WGS sample list
adsp_pheno_rosmap_wgs <- adsp_pheno[
  adsp_pheno$SampleID %in% adsp_wgs_list$V1,
]
adsp_pheno_rosmap_wgs %>%
  nrow()
adsp_wgs_list %>%
  nrow()

In [None]:
# mapped rows whose ADSP_ID is a SUBJID with a sample on the WGS list
adsp_rosmap_map_wgs <- adsp_rosmap_map[
  adsp_rosmap_map$ADSP_ID %in% adsp_pheno_rosmap_wgs$SUBJID,
]
adsp_rosmap_map_wgs %>%
  nrow()
adsp_rosmap_map_wgs %>%
  head()

In [None]:
# Attach the ADSP SampleID to each WGS-mapped row.
# The join table is restricted to phenotype rows whose SampleID is on the WGS
# sample list (adsp_pheno_rosmap_wgs). Joining against every phenotype row
# would also pull in any non-WGS SampleIDs recorded for the same SUBJID, which
# does not belong in a WGS ID map. When each SUBJID has a single SampleID the
# result is unchanged.
# NOTE(review): confirm that non-WGS SampleIDs should be excluded from
# ROSMAP.ADSP_WGS.ID_map.txt.
adsp_rosmap_map_wgs_rename <- adsp_rosmap_map_wgs %>%
  rename('SUBJID' = 'ADSP_ID')
adsp_rosmap_map_wgs_all_id <- adsp_rosmap_map_wgs_rename %>%
  left_join(
    select(adsp_pheno_rosmap_wgs, SampleID, SUBJID),
    by = 'SUBJID'
  ) %>%
  select(individualID, specimenID, SampleID, SUBJID)
head(adsp_rosmap_map_wgs_all_id)
nrow(adsp_rosmap_map_wgs_all_id)

In [None]:
# Export the ROSMAP / ADSP WGS ID map (with header, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  adsp_rosmap_map_wgs_all_id,
  file = 'rosmap/ID_mapping/ROSMAP.ADSP_WGS.ID_map.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# mapped rows whose ADSP_ID appears as a SUBJID anywhere in the ADSP
# phenotype table (not only the WGS list)
adsp_rosmap_map_pheno <- adsp_rosmap_map[
  adsp_rosmap_map$ADSP_ID %in% adsp_pheno$SUBJID,
]
adsp_rosmap_map_pheno %>%
  nrow()
adsp_rosmap_map_pheno %>%
  head()

In [None]:
# attach every ADSP SampleID recorded for each SUBJID and keep the four ID
# columns
adsp_rosmap_map_pheno_rename <- rename(
  adsp_rosmap_map_pheno,
  'SUBJID' = 'ADSP_ID'
)
adsp_rosmap_map_pheno_all_id <- adsp_rosmap_map_pheno_rename %>%
  left_join(adsp_pheno_id, by = 'SUBJID') %>%
  select(individualID, specimenID, SampleID, SUBJID)
adsp_rosmap_map_pheno_all_id %>%
  head()
adsp_rosmap_map_pheno_all_id %>%
  nrow()

In [None]:
# Count ADSP-mapped rows whose individual also has RNAseq.
# adsp_rosmap_map_pheno_all_id only carries individualID, specimenID, SampleID
# and SUBJID, so selecting a `specimenID_RNAseq` column fails. The count is
# taken instead by checking membership in the RNAseq map built above
# (rnaseq_map_final_sub).
# NOTE(review): this assumes the intent was "rows with an RNAseq sample";
# confirm against the analysis this count feeds.
adsp_rosmap_map_pheno_all_id %>%
  filter(individualID %in% rnaseq_map_final_sub$individualID) %>%
  nrow()

In [None]:
# Count ADSP-mapped rows whose individual also has methylation array data.
# adsp_rosmap_map_pheno_all_id only carries individualID, specimenID, SampleID
# and SUBJID, so selecting a `specimenID_MethylArray` column fails. The count
# is taken instead by checking membership in the methylation map built above
# (biospec_methyl_sub).
# NOTE(review): this assumes the intent was "rows with a methylation sample";
# confirm against the analysis this count feeds.
adsp_rosmap_map_pheno_all_id %>%
  filter(individualID %in% biospec_methyl_sub$individualID) %>%
  nrow()

In [None]:
# Export the ROSMAP / ADSP phenotype ID map (with header, unquoted).
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  adsp_rosmap_map_pheno_all_id,
  file = 'rosmap/ID_mapping/ROSMAP.ADSP_PHENO.ID_map.txt',
  sep = '\t',
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# number of ADSP phenotype rows whose Cohort contains 'ROS' or 'MAP'
sum(grepl('ROS|MAP', adsp_pheno$Cohort))

In [None]:
# distinct Cohort labels that contain neither 'ROS' nor 'MAP'
adsp_pheno %>%
  filter(!grepl('ROS|MAP', Cohort)) %>%
  distinct(Cohort)

## check out ids that didn't map

### ADSP IDs that aren't in VCF files

In [None]:
# ADSP-linked individuals whose ADSP_ID did not make it into the VCF-based map
adsp_no_map <- adsp_rosmap_sub[
  !(adsp_rosmap_sub$ADSP_ID %in% adsp_rosmap_map$ADSP_ID),
]
adsp_no_map %>%
  nrow()
length(unique(adsp_no_map$individualID))
length(unique(adsp_no_map$ADSP_ID))
adsp_no_map %>%
  head()

In [None]:
# reminder of the de-duplicated VCF biospecimen layout
biospec_vcf_dups_fixed %>%
  head()

In [None]:
# do any unmapped ADSP individuals appear in the VCF biospecimen rows?
biospec_vcf_adsp_no_map <- biospec_vcf_dups_fixed[
  biospec_vcf_dups_fixed$individualID %in% adsp_no_map$individualID,
]
biospec_vcf_adsp_no_map %>%
  nrow()

In [None]:
# size of the full biospecimen table
biospec %>%
  nrow()

In [None]:
# biospecimen rows for individuals absent from the de-duplicated VCF map
biospec_vcf_broken <- biospec[
  !(biospec$individualID %in% biospec_vcf_dups_fixed$individualID),
]
biospec_vcf_broken %>%
  nrow()
length(unique(biospec_vcf_broken$individualID))

In [None]:
# Unmapped ADSP individuals that appear among the biospecimen rows lacking a
# VCF map entry.
# The cell previously printed only nrow(adsp_no_map), so the subset computed
# here was never inspected; both counts are now shown so they can be compared.
adsp_broken_vcf <- adsp_no_map[
  adsp_no_map$individualID %in% biospec_vcf_broken$individualID,
]
nrow(adsp_broken_vcf)
nrow(adsp_no_map)

In [None]:
# the reverse view: non-VCF biospecimen rows for the unmapped ADSP individuals
biospec_vcf_broken_adsp <- biospec_vcf_broken[
  biospec_vcf_broken$individualID %in% adsp_no_map$individualID,
]
biospec_vcf_broken_adsp %>%
  head()

In [None]:
# which assays exist for those individuals
unique(biospec_vcf_broken_adsp[['assay']])

In [None]:
# preview their wholeGenomeSeq rows, if any
filter(biospec_vcf_broken_adsp, grepl('wholeGenomeSeq', assay)) %>%
  head()

### mismatch

#### create mismatch file

In [None]:
# VCF-map rows whose individual has no ADSP ID
adsp_mismatch <- vcf_map_final[
  !(vcf_map_final$individualID %in% adsp_rosmap_sub$individualID),
]
adsp_mismatch %>%
  nrow()
adsp_mismatch %>%
  head()

#### check for leading zeros

In [None]:
# number of ADSP rows whose subject_key_original starts with '0'
# NOTE(review): if read.csv parsed subject_key_original as numeric, any
# leading zeros were already dropped on import and this is always 0 — confirm
# the column type.
adsp %>%
  filter(grepl('^0', subject_key_original)) %>%
  nrow()

In [None]:
# number of clinical rows whose projid starts with '0'
# NOTE(review): if read.csv parsed projid as numeric, any leading zeros were
# already dropped on import and this is always 0 — confirm the column type.
pheno_id %>%
  filter(grepl('^0', projid)) %>%
  nrow()

#### check if all IDs are SM

In [None]:
# how many mismatched rows have an 'SM-' style specimenID
sum(grepl('SM-', adsp_mismatch$specimenID))

In [None]:
# preview the mismatched rows whose specimenID is not 'SM-' style
filter(adsp_mismatch, !grepl('SM-', specimenID)) %>%
  head()

#### see if individualIDs are in pheno file

In [None]:
# clinical ID rows for the ADSP-mismatched individuals
# (this overwrites the pheno_mismatch built in the RNAseq section)
pheno_mismatch <- pheno_id[
  pheno_id$individualID %in% adsp_mismatch$individualID,
]
pheno_mismatch %>%
  nrow()
pheno_mismatch %>%
  head()

In [None]:
# put the VCF specimenID (renamed to vcf_specimenID) next to the specimenID
# constructed from the pheno file
adsp_mismatch_rename <- rename(
  adsp_mismatch,
  'vcf_specimenID' = 'specimenID'
)
pheno_adsp_mismatch <- merge(
  adsp_mismatch_rename,
  pheno_mismatch,
  by = 'individualID'
)
pheno_adsp_mismatch %>%
  nrow()
pheno_adsp_mismatch %>%
  head()

In [None]:
# overlap between the mismatched individuals and the unmapped ADSP
# individuals' non-VCF biospecimen rows
# (adsp_test is reassigned two cells below)
adsp_test <- pheno_adsp_mismatch[
  pheno_adsp_mismatch$individualID %in% biospec_vcf_broken_adsp$individualID,
]
adsp_test %>%
  nrow()
adsp_no_map %>%
  nrow()

In [None]:
# try to recover ADSP IDs for the mismatched individuals via adsp_rosmap
adsp_mismatch_fixed <- merge(
  pheno_adsp_mismatch,
  adsp_rosmap,
  by = 'individualID'
)
adsp_mismatch_fixed %>%
  nrow()
adsp_mismatch_fixed %>%
  head()

In [None]:
# mismatched individuals that appear anywhere in the biospecimen table
adsp_test <- pheno_adsp_mismatch[
  pheno_adsp_mismatch$individualID %in% biospec$individualID,
]
adsp_test %>%
  nrow()
adsp_test %>%
  head()

In [None]:
# reminder of the ADSP mapping layout and size
adsp %>%
  head()
adsp %>%
  nrow()

# see if ROSMAP is in keep quest comb in ADSP

In [None]:
# Study per sample from the QC metrics, with Sample.name exposed as SampleID
# so it can be joined to the phenotype table
adsp_qc_sub <- adsp_qc %>%
  select(Study, Sample.name) %>%
  rename('SampleID' = 'Sample.name')
adsp_qc_sub %>%
  head()
print(nrow(adsp_qc_sub))

In [None]:
# SampleID / Cohort pairs from the ADSP phenotype table
adsp_pheno_study <- select(adsp_pheno, SampleID, Cohort)
adsp_pheno_study %>%
  head()
print(nrow(adsp_pheno_study))

In [None]:
# add the QC file's Study to every phenotype row (unmatched rows get NA)
adsp_pheno_qc <- adsp_pheno_study %>%
  left_join(adsp_qc_sub, by = 'SampleID')
adsp_pheno_qc %>%
  head()
adsp_pheno_qc %>%
  nrow()

In [None]:
# keep rows whose Cohort contains 'ROS' or 'MAP'
adsp_pheno_qc_rosmap <- filter(adsp_pheno_qc, grepl('ROS|MAP', Cohort))
adsp_pheno_qc_rosmap %>%
  head()
adsp_pheno_qc_rosmap %>%
  nrow()

In [None]:
# which QC-file Study labels the ROS/MAP cohort rows fall under
unique(adsp_pheno_qc_rosmap[['Study']])