# Methylation Markers in DMGs
Within DMGs identified in [DESeq_geneFeatureCounts.ipynb](https://github.com/jgmcdonough/CE18_methylRAD_analysis/blob/master/analysis/DMGs_analysis/DESeq_geneFeatureCounts.ipynb), we want to know where methylation is different (locus by locus)

As of rn (03/31/2025) I'm not sure how to do this, so going to start by extracting the CpG dinucleotides that are contained within DMGs 

## 0. load libraries

In [14]:
library(tidyverse)
library(rtracklayer)

## 1. CpGs within DMGs
using `bedtools intersect` and command line to create new CG file

In [None]:
cpgList="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/2.C_virginica-3.0_CG-motif.bed"

dmgList="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/proportion_overlap/dmg_gff.bed"

outputDir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/"


In [None]:
!bedtools intersect -u -a ${cpgList} -b ${dmgList} > ${outputDir}CpGs_inDMG.bed

There's only 53,220 CpGs located within DMGs

## 2. Generate new counts matrix
using `bedtools multicov` and command line to generate counts matrix 

In [None]:
cd /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/BEDtools/working_BAM_sequences

output_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/"

CG_motif="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/CpGs_inDMG.bed"

bedtools multicov -bams *.bam -bed $CG_motif > ${output_dir}CpGs_inDMG_multicov.csv

## 3. Load in new counts matrix

In [3]:
matrix <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/CpGs_inDMGs_multicov.csv', sep='\t')
head(matrix)

Unnamed: 0_level_0,NC_035780.1,X315523,X315525,CG_motif,X0,X0.1,X0.2,X0.3,X0.4,X0.5,⋯,X0.10,X0.11,X0.12,X0.13,X0.14,X0.15,X0.16,X0.17,X0.18,X0.19
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,NC_035780.1,315549,315551,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,NC_035780.1,315563,315565,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,NC_035780.1,315567,315569,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,NC_035780.1,315618,315620,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,NC_035780.1,315688,315690,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,NC_035780.1,315736,315738,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


adding sample names to the columns

In [6]:
# getting names of the BAM files for each oyster individual
bam_files <- dir(path = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/BEDtools/working_BAM_sequences/", 
                 pattern = "^.*\\.bam$", 
                 full.names = FALSE, 
                 ignore.case = TRUE, 
                 all.files = TRUE)

# clean sample names
bam_files_clean <- gsub("^2018--|-CV_sorted\\.bam$", "", bam_files)
bam_files_clean

# renaming columns with sample name - based on order that appears in the directory
colnames(matrix) <- c('chromosome', 'start', 'stop', 'CG_motif', bam_files_clean)
head(matrix)

Unnamed: 0_level_0,chromosome,start,stop,CG_motif,BBB-WBO-B21,BBB-WBV-B70,BBO-BBO-B16,BBO-BBY-B27,BBO-WBO-B16,BBO-WBV-B64,⋯,WBB-WBV-W69,WBG-BBB-W56,WBG-WBG-W44,WBO-BBR-W03,WBO-WBV-W64,WBR-BBY-W25,WBV-WBO-W23,WBV-WBR-W12,WBY-BBV-W65,WBY-BBY-W30
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,NC_035780.1,315549,315551,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,NC_035780.1,315563,315565,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,NC_035780.1,315567,315569,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,NC_035780.1,315618,315620,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,NC_035780.1,315688,315690,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,NC_035780.1,315736,315738,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


So now I have a counts matrix with CpGs that are only within DMGs for all oyster replicates - it would be helpful to know which gene a given CpG is found in - I wonder if there's a way to add a column that contains the gene accession number

In [11]:
# read in dmg bed file (which was used to generate the matrix above)
dmg.bed <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/proportion_overlap/dmg_gff.bed', 
                    sep = '\t', 
                    header = FALSE)
head(dmg.bed)

Unnamed: 0_level_0,V1,V2,V3
Unnamed: 0_level_1,<chr>,<int>,<int>
1,NC_035780.1,315522,340261
2,NC_035780.1,444907,453310
3,NC_035780.1,9011563,9043404
4,NC_035780.1,9288063,9301976
5,NC_035780.1,13324137,13332152
6,NC_035780.1,13591534,13596982


In [17]:
# read in dmg csv file
dmg <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/deseq_res_files/sig_DMGs/all_sigDMGs.csv')

# read in gff file
gff <- as.data.frame(import.gff('/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Cvirginica_genome/CV_working_genomic.gff'))

# only grab gene features
gff_genes <- gff[grep("LOC", gff$Name),1:12]

# filter to only include genes that are DMG
dmg_gff <- gff_genes[gff_genes$Name %in% dmg$symbol,]

# double check everything looks good
head(dmg_gff)

# are the dimensions the same? 
dim(dmg_gff)
dim(dmg)

Unnamed: 0_level_0,seqnames,start,end,width,strand,source,type,score,phase,ID,Dbxref,Name
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,<list>,<chr>
303,NC_035780.1,315522,340261,24740,+,Gnomon,gene,,,gene-LOC111133260,GeneID:1....,LOC111133260
543,NC_035780.1,444907,453310,8404,-,Gnomon,gene,,,gene-LOC111109809,GeneID:1....,LOC111109809
16195,NC_035780.1,9011563,9043404,31842,-,Gnomon,gene,,,gene-LOC111137635,GeneID:1....,LOC111137635
16555,NC_035780.1,9288063,9301976,13914,-,Gnomon,gene,,,gene-LOC111132155,GeneID:1....,LOC111132155
27864,NC_035780.1,13324137,13332152,8016,+,Gnomon,gene,,,gene-LOC111113022,GeneID:1....,LOC111113022
28276,NC_035780.1,13591534,13596982,5449,-,Gnomon,gene,,,gene-LOC111131329,GeneID:1....,LOC111131329


In [28]:
# create df to use to match the gene names to the matrix
match_info <- dmg_gff %>%
  select(seqnames, start, end, Name) 

colnames(match_info) <- c('chromosome', 'start', 'stop', 'gene')

head(match_info)

Unnamed: 0_level_0,chromosome,start,stop,gene
Unnamed: 0_level_1,<fct>,<int>,<int>,<chr>
303,NC_035780.1,315522,340261,LOC111133260
543,NC_035780.1,444907,453310,LOC111109809
16195,NC_035780.1,9011563,9043404,LOC111137635
16555,NC_035780.1,9288063,9301976,LOC111132155
27864,NC_035780.1,13324137,13332152,LOC111113022
28276,NC_035780.1,13591534,13596982,LOC111131329


matching gene names to the counts matrix CpGs so now we know which genes these CpGs belong to

In [32]:
matrix2 <- matrix %>% 
  inner_join(match_info %>% select(chromosome, start, stop, gene), 
               by = "chromosome") %>% 
  filter(start.x >= start.y, stop.x <= stop.y) %>% 
  select(-c(start.y, stop.y))

head(matrix2)

“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 1 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 1 of `y` matches multiple rows in `x`.
[36mℹ[39m If a many-to-many relationship is expected, set `relationship =


Unnamed: 0_level_0,chromosome,start.x,stop.x,CG_motif,BBB-WBO-B21,BBB-WBV-B70,BBO-BBO-B16,BBO-BBY-B27,BBO-WBO-B16,BBO-WBV-B64,⋯,WBG-BBB-W56,WBG-WBG-W44,WBO-BBR-W03,WBO-WBV-W64,WBR-BBY-W25,WBV-WBO-W23,WBV-WBR-W12,WBY-BBV-W65,WBY-BBY-W30,gene
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,NC_035780.1,315549,315551,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260
2,NC_035780.1,315563,315565,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260
3,NC_035780.1,315567,315569,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260
4,NC_035780.1,315618,315620,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260
5,NC_035780.1,315688,315690,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260
6,NC_035780.1,315736,315738,CG_motif,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,LOC111133260


In [34]:
# for each gene, how many CpGs are there?
matrix2 %>%
group_by(gene) %>%
count()

gene,n
<chr>,<int>
LOC111099548,154
LOC111099571,159
LOC111099585,68
LOC111099930,2103
LOC111099978,1645
LOC111100608,47
LOC111100625,178
LOC111100898,59
LOC111100915,916
LOC111101237,83
