# load packages

In [None]:
library(tidyverse)

In [None]:
library(data.table)

# read in input files

In [None]:
# Specimen ID / individual ID lookup table (tab-separated); empty fields become NA.
id_map <- read.csv(
  'rosmap/ID_mapping/ROSMAP.Methylation_Array.individualID_map.txt',
  sep = '\t',
  na.strings = ''
)
head(id_map)

In [None]:
# Imputed array methylation table (tab-separated, gzipped); empty fields become NA.
methylation <- fread(
  'rosmap/ROSMAP_arrayMethylation_imputed.tsv.gz',
  sep = '\t',
  na.strings = ''
)
head(methylation)

In [None]:
# Array metadata (one row per TargetID, with RefGene / Accession annotations);
# empty fields become NA.
gene_map <- read.csv(
  'rosmap/ROSMAP_arrayMethylation_metaData.tsv',
  sep = '\t',
  na.strings = ''
)
head(gene_map)

In [None]:
# Ensembl GRCh38 RefSeq cross-reference table (tab-separated, gzipped).
ref <- fread(
  'ensembl_start_stop/Homo_sapiens.GRCh38.113.refseq.tsv.gz',
  sep = '\t'
)
head(ref)

In [None]:
# Raw (non-imputed) array methylation table (tab-separated, gzipped).
raw_methyl <- fread(
  'rosmap/Epigenetics/Epigenetics (DNA methylation array)/ROSMAP_arrayMethylation_raw.gz',
  sep = '\t'
)
# Preview the first rows. The previous `raw_methyl.head()` was Python method
# syntax, which R parses as a call to an undefined function `raw_methyl.head`.
head(raw_methyl)

# look at data

In [None]:
# Rows x columns of the ID lookup table.
id_map %>% dim()

In [None]:
# Rows x columns of the imputed methylation table.
methylation %>% dim()

In [None]:
# Rows x columns of the array metadata table.
gene_map %>% dim()

In [None]:
# Number of metadata rows that carry a RefGene annotation.
nrow(filter(gene_map, !is.na(RefGene)))

In [None]:
# Number of distinct TargetIDs in the methylation table.
methylation$TargetID %>%
  unique() %>%
  length()

In [None]:
# Number of distinct TargetIDs in the metadata table.
gene_map$TargetID %>%
  unique() %>%
  length()

In [None]:
# Number of distinct RefGene values in the metadata table (NA counts as one value).
gene_map$RefGene %>%
  unique() %>%
  length()

# clean up meta data

## filter to non-missing ref genes

In [None]:
# Keep only metadata rows that have a RefGene annotation, then report the
# resulting size and number of distinct RefGene values.
ref_gene_map <- filter(gene_map, !is.na(RefGene))
dim(ref_gene_map)
ref_gene_map$RefGene %>%
  unique() %>%
  length()
head(ref_gene_map)

## subset

In [None]:
# Reduce the metadata to the three columns used downstream.
ref_gene_map_sub <- dplyr::select(ref_gene_map, TargetID, RefGene, Accession)
head(ref_gene_map_sub)

## split rows with multiple genes

In [None]:
# RefGene and Accession hold ';'-separated lists; expand them in parallel so
# that each row carries a single gene / accession pair.
ref_gene_map_explode <- separate_rows(
  ref_gene_map_sub,
  RefGene, Accession,
  sep = ";"
)
# Size of the expanded table and number of distinct values per column.
nrow(ref_gene_map_explode)
ref_gene_map_explode$RefGene %>% unique() %>% length()
ref_gene_map_explode$Accession %>% unique() %>% length()
ref_gene_map_explode$TargetID %>% unique() %>% length()
head(ref_gene_map_explode)

In [None]:
# Spot check: show the expanded rows for one TargetID.
filter(ref_gene_map_explode, grepl('cg00001583', TargetID))

## change month/day genes

### filter to date genes

In [None]:
# First 20 distinct RefGene values in sorted order.
ref_gene_map_explode %>%
  arrange(RefGene) %>%
  distinct(RefGene) %>%
  head(20)

In [None]:
# Rows whose RefGene contains a month abbreviation (Dec / Mar / Sep).
ref_gene_map_explode_date <- filter(
  ref_gene_map_explode,
  grepl('Dec|Mar|Sep', RefGene)
)
unique(ref_gene_map_explode_date$RefGene)
head(ref_gene_map_explode_date)

## create map

### create vectors

In [None]:
# Distinct accessions of the date-like genes, in order of first appearance.
date_accession_vec <- ref_gene_map_explode_date$Accession %>%
  unique() %>%
  as.vector()
date_accession_vec

In [None]:
# Replacement gene symbols. The order matters: element i is paired with
# element i of date_accession_vec when the lookup table is built.
gene_vec <- c(
  'MARCHF4', 'MARCHF7', 'MARCHF1',
  'SEPTIN11', 'MARCHF11', 'MARCHF3',
  'MARCHF6', 'SEPTIN8', 'SEPTIN8',
  'SEPTIN7P2', 'SEPTIN14', 'DELEC1',
  'MARCHF5', 'MARCHF9', 'SEPTIN1',
  'SEPTIN9', 'MARCHF10', 'SEPTIN5'
)
gene_vec

### create df

In [None]:
# Build the Accession -> corrected gene symbol lookup table.
# The two vectors are paired purely by position, and data.frame() silently
# recycles the shorter vector when one length is a whole multiple of the other,
# so fail loudly if the hand-written gene list does not line up one-to-one.
stopifnot(length(date_accession_vec) == length(gene_vec))
all_accession_gene_map <- data.frame(
  Accession = date_accession_vec,
  gene_vec = gene_vec
)
all_accession_gene_map

## merge

In [None]:
# Attach the corrected gene symbol to every date-like row via its Accession.
ref_gene_map_explode_date_correct_genes <- ref_gene_map_explode_date %>%
  inner_join(all_accession_gene_map, by = 'Accession')
# Row count after and before the join. The original printed the joined count
# twice; printing the pre-join count makes dropped or duplicated rows visible.
nrow(ref_gene_map_explode_date_correct_genes)
nrow(ref_gene_map_explode_date)
head(ref_gene_map_explode_date_correct_genes)

## subset

In [None]:
# Swap in the corrected symbol: keep TargetID and Accession, and expose the
# gene_vec column under the name RefGene (the date-like RefGene is dropped).
ref_gene_map_explode_date_correct_genes <- dplyr::select(
  ref_gene_map_explode_date_correct_genes,
  TargetID,
  RefGene = gene_vec,
  Accession
)
head(ref_gene_map_explode_date_correct_genes)

## create df without date genes

In [None]:
# Complement of the date-like subset: rows whose RefGene has no Dec / Mar / Sep.
ref_gene_map_explode_no_date <- filter(
  ref_gene_map_explode,
  !grepl('Dec|Mar|Sep', RefGene)
)
head(ref_gene_map_explode_no_date)

## concatenate non-date rows + fixed date rows

In [None]:
# Stack the untouched rows on top of the rows with corrected gene symbols.
ref_gene_map_explode_fixed <- rbind(
  ref_gene_map_explode_no_date,
  ref_gene_map_explode_date_correct_genes
)
# Row counts of the rebuilt table and of the table it was derived from.
nrow(ref_gene_map_explode_fixed)
nrow(ref_gene_map_explode)
head(ref_gene_map_explode_fixed)

## clean up ref seq file

In [None]:
# Keep only cross-references whose info_type is 'DIRECT'; print the row counts
# before and after the filter.
ref_direct <- filter(ref, info_type == 'DIRECT')
nrow(ref)
nrow(ref_direct)
head(ref_direct)

## filter gene map to ref transcripts

In [None]:
# Keep gene-map rows whose Accession appears in the xref column of the
# DIRECT reference table.
ref_gene_map_ref_trans <- ref_gene_map_explode_fixed[
  ref_gene_map_explode_fixed$Accession %in% ref_direct$xref,
]
# Row counts before / after, and the number of distinct accessions retained.
nrow(ref_gene_map_explode_fixed)
nrow(ref_gene_map_ref_trans)
ref_gene_map_ref_trans$Accession %>% unique() %>% length()
head(ref_gene_map_ref_trans)

In [None]:
# Distinct genes and distinct TargetIDs remaining after the reference filter.
ref_gene_map_ref_trans$RefGene %>% unique() %>% length()
ref_gene_map_ref_trans$TargetID %>% unique() %>% length()

In [None]:
# Every row whose TargetID occurs more than once. Scanning from both ends
# flags the first occurrence as well as the later ones.
dups <- ref_gene_map_ref_trans[
  duplicated(ref_gene_map_ref_trans$TargetID) |
    duplicated(ref_gene_map_ref_trans$TargetID, fromLast = TRUE),
]
head(dups)

In [None]:
# Remove rows that are exact duplicates across all columns; print the row
# counts before and after.
ref_gene_map_ref_trans_no_dup <- distinct(ref_gene_map_ref_trans)
nrow(ref_gene_map_ref_trans)
nrow(ref_gene_map_ref_trans_no_dup)
head(ref_gene_map_ref_trans_no_dup)

In [None]:
# Keep one row per (TargetID, RefGene) pair: the first row encountered wins
# and its remaining columns are retained.
ref_gene_map_ref_trans_no_dup_again <- distinct(
  ref_gene_map_ref_trans_no_dup,
  TargetID, RefGene,
  .keep_all = TRUE
)
nrow(ref_gene_map_ref_trans_no_dup)
nrow(ref_gene_map_ref_trans_no_dup_again)
head(ref_gene_map_ref_trans_no_dup_again)

In [None]:
# TargetIDs that still occur on more than one row after de-duplication
# (all occurrences are kept, not just the repeats).
still_dups <- ref_gene_map_ref_trans_no_dup_again[
  duplicated(ref_gene_map_ref_trans_no_dup_again$TargetID) |
    duplicated(ref_gene_map_ref_trans_no_dup_again$TargetID, fromLast = TRUE),
]
head(still_dups)

## create unique gene list

In [None]:
# One-column table of the distinct gene symbols that survived the filtering.
gene_list <- distinct(ref_gene_map_ref_trans_no_dup_again, RefGene)
nrow(gene_list)
head(gene_list)

# filter methylation data to ref genes

## merge

In [None]:
# Attach the methylation values to each gene-map row; TargetIDs missing from
# either table are dropped by the inner join.
methyl_ref <- inner_join(
  ref_gene_map_ref_trans_no_dup_again,
  methylation,
  by = 'TargetID'
)
# Size of the joined table and the distinct TargetIDs / genes it covers.
nrow(methyl_ref)
methyl_ref$TargetID %>% unique() %>% length()
methyl_ref$RefGene %>% unique() %>% length()
head(methyl_ref)

## create column combining RefGene, TargetID, and Accession

In [None]:
# Build a single label column "<RefGene>_<TargetID>_<Accession>", place it in
# front of the joined table, drop the three source columns, and remove exact
# duplicate rows.
methyl_ref_id <- methyl_ref %>%
  mutate(
    RefGene_TargetID_Accession = paste0(RefGene, '_', TargetID, '_', Accession)
  ) %>%
  select(RefGene_TargetID_Accession) %>%
  cbind(methyl_ref) %>%
  select(-c(TargetID, RefGene, Accession)) %>%
  distinct()
# The row count and the number of distinct labels are printed for comparison.
nrow(methyl_ref_id)
methyl_ref_id$RefGene_TargetID_Accession %>% unique() %>% length()
head(methyl_ref_id)

In [None]:
# Show any rows whose combined label is not unique, sorted by label.
methyl_ref_id %>%
  filter(
    duplicated(RefGene_TargetID_Accession) |
      duplicated(RefGene_TargetID_Accession, fromLast = TRUE)
  ) %>%
  arrange(RefGene_TargetID_Accession) %>%
  head()

# map to individual ids

## transpose

In [None]:
# Transpose so that the original columns become rows. The result is an
# all-character data frame whose first row holds the combined labels.
#
# t() on a data frame that mixes a character column with numeric columns goes
# through as.matrix(), which renders the numeric columns with format()
# (7 significant digits by default) and so silently truncates the values.
# Converting the numeric columns with as.character() first keeps 15
# significant digits in the transposed strings; NA stays NA.
methyl_ref_tranpose <- local({
  as_text <- as.data.frame(methyl_ref_id)
  as_text[] <- lapply(
    as_text,
    function(x) if (is.numeric(x)) as.character(x) else x
  )
  data.frame(t(as_text))
})
head(methyl_ref_tranpose)

## rename columns

In [None]:
# Use the values of the first row as the column names.
names(methyl_ref_tranpose) <- as.character(methyl_ref_tranpose[1, ])
head(methyl_ref_tranpose)

## drop first row

In [None]:
# The first row now duplicates the column names, so remove it.
# NOTE: this reassigns in place, so re-running the cell drops one more row.
methyl_ref_tranpose <- methyl_ref_tranpose[-1L, ]
head(methyl_ref_tranpose)

## add ID column

In [None]:
# Copy the row names of the transposed table into a one-column data frame
# named specimenID.
methyl_ref_tranpose_id <- data.frame(
  specimenID = rownames(methyl_ref_tranpose)
)
head(methyl_ref_tranpose_id)

In [None]:
# Put the specimenID column in front of the transposed values.
methyl_ref_tranpose <- cbind(
  methyl_ref_tranpose_id,
  methyl_ref_tranpose
)
head(methyl_ref_tranpose)

## add individual ID

In [None]:
# Bring in the ID-map columns by matching on specimenID; rows without a match
# on either side are dropped by the inner join.
methyl_ref_tranpose_id_map <- inner_join(
  id_map,
  methyl_ref_tranpose,
  by = 'specimenID'
)
# Row counts of the joined table and of both inputs, for comparison.
nrow(methyl_ref_tranpose_id_map)
nrow(methyl_ref_tranpose)
nrow(id_map)
head(methyl_ref_tranpose_id_map)

## drop specimenID

In [None]:
# specimenID was only needed as the join key; remove it.
methyl_ref_tranpose_id_map <- dplyr::select(
  methyl_ref_tranpose_id_map,
  -specimenID
)
head(methyl_ref_tranpose_id_map)

# convert to m-values (logit transformed beta values)

## split off id col

In [None]:
# Set the individualID column aside so it can be re-attached after the
# numeric transformation.
methyl_id <- dplyr::select(methyl_ref_tranpose_id_map, individualID)

In [None]:
# Everything except individualID; these columns are converted to numeric next.
methyl_no_id <- dplyr::select(methyl_ref_tranpose_id_map, -individualID)

## convert to numeric

In [None]:
# Convert every column to numeric, going through character first. Assigning
# into `[]` keeps the data frame structure and the column names as they are.
to_number <- function(x) as.numeric(as.character(x))
methyl_no_id[] <- lapply(methyl_no_id, to_number)
rm(to_number)
head(methyl_no_id)

## create normalizing function

In [None]:
#' Convert methylation beta values to M-values.
#'
#' Applies the base-2 logit, `log2(beta / (1 - beta))`, element-wise.
#'
#' @param beta Numeric vector of beta values.
#' @param eps Optional single non-negative number below 0.5. When greater
#'   than 0, `beta` is first clamped to `[eps, 1 - eps]` so that betas of
#'   exactly 0 or 1 yield finite M-values instead of -Inf / Inf. The default,
#'   0, leaves the input untouched.
#' @return Numeric vector of M-values, the same length as `beta`; `NA` inputs
#'   stay `NA`.
beta_to_m <- function(beta, eps = 0) {
  stopifnot(is.numeric(eps), length(eps) == 1L, eps >= 0, eps < 0.5)
  if (eps > 0) {
    beta <- pmin(pmax(beta, eps), 1 - eps)
  }
  log2(beta / (1 - beta))
}

## normalize

In [None]:
# Apply the beta -> M-value transform to every column.
# check.names = FALSE is required: by default as.data.frame() on a list runs
# the names through make.names(), which rewrites any column name that is not
# a syntactic R name (for example '-' becomes '.'), so the exported headers
# would no longer match the gene symbols written to the gene list.
methyl_ref_norm <- as.data.frame(
  lapply(methyl_no_id, beta_to_m),
  check.names = FALSE
)
head(methyl_ref_norm)

## add id back in

In [None]:
# Re-attach the individualID column in front of the transformed values
# (rows are in the same order as when the two pieces were split).
methyl_ref_norm_id <- cbind(
  methyl_id,
  methyl_ref_norm
)
head(methyl_ref_norm_id)

# export

In [None]:
# Write the gene list as tab-separated text: header row, no row names,
# no quoting.
write.table(
  gene_list,
  'rosmap/ROSMAP_arrayMethylation_imputed.gene_list.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the M-value table (individualID plus one column per label) as
# tab-separated text: header row, no row names, no quoting.
write.table(
  methyl_ref_norm_id,
  'rosmap/ROSMAP_arrayMethylation_imputed.gene_symbol.individualID.mvalue_norm.txt',
  sep = '\t',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)