In [1]:
# Collapse the chr6 RefSeq table to a single coordinate pair per gene.
# Column 13 holds the gene name; columns 5 and 6 hold its start and end.

input_bedfile = "/Users/cmdb/qbb2020-answers/assignment6/mm10_refseq_genes_chr6_50M_60M.bed"

# gene name -> [start, end]. A dict gives constant-time "seen before?" checks
# (a list would need a linear scan), and only the first row encountered for
# each gene is kept.
genes = {}
with open(input_bedfile, 'r') as f:
    for row in f:
        columns = row.rstrip().split('\t')
        gene = columns[12]
        genes.setdefault(gene, [columns[4], columns[5]])

# Flatten the dict into (gene name, start, end) tuples. The coordinates are
# still the strings read from the file at this point.
gene_keys = genes.keys()
gene_loci = [(name, genes[name][0], genes[name][1]) for name in gene_keys]


In [2]:
#calculate methylation scores for each gene from the two bedgraph files
# NOTE(review): the variable names below say SRR3083026 / SRR3083029 while the
# file names say SRR3083926 / SRR3083929. The names are kept unchanged because
# the fold-change cell reads these dictionaries by name.
SRR3083026_bedgraph_file = "/Users/cmdb/qbb2020-answers/assignment6/SRR3083926_1.chr6_bismark_bt2_pe.bedGraph"
SRR3083029_bedgraph_file = "/Users/cmdb/qbb2020-answers/assignment6/SRR3083929_1.chr6_bismark_bt2_pe.bedGraph"


def _accumulate_gene_methylation(bedgraph_path, loci):
    """Sum length-weighted methylation per gene from one bedGraph file.

    Parameters
    ----------
    bedgraph_path : str
        Path to a tab-separated bedGraph file. The first line is treated as a
        header and skipped; columns 2, 3 and 4 are read as site start, site
        end and percent methylation.
    loci : iterable of (gene name, start, end)
        Gene intervals; start and end may be strings or ints.

    Returns
    -------
    dict
        gene name -> [bases covered, sum of (site length * percent methylation)].
        Every gene in `loci` is present, with [0, 0] when no site fell inside it.
    """
    # Convert the gene coordinates once instead of once per (site, gene) pair.
    intervals = [(locus[0], int(locus[1]), int(locus[2])) for locus in loci]
    scores = {name: [0, 0] for name, _, _ in intervals}

    with open(bedgraph_path, 'r') as handle:
        next(handle, None)  # skip the header line (no-op on an empty file)
        for line in handle:
            if not line.strip():
                # a blank line (e.g. at end of file) would otherwise raise IndexError
                continue
            fields = line.rstrip().split('\t')
            site_start = int(fields[1])
            site_end = int(fields[2])
            pct_methyl = float(fields[3])
            site_length = site_end - site_start

            # A site counts toward every gene that fully contains it, so
            # overlapping genes each receive the site.
            for name, gene_start, gene_end in intervals:
                if site_start >= gene_start and site_end <= gene_end:
                    scores[name][0] += site_length
                    scores[name][1] += site_length * pct_methyl
    return scores


SRR3083026_gene_methylscore = _accumulate_gene_methylation(SRR3083026_bedgraph_file, gene_loci)
SRR3083029_gene_methylscore = _accumulate_gene_methylation(SRR3083029_bedgraph_file, gene_loci)
                


In [3]:
genes = SRR3083026_gene_methylscore.keys()
methylated_genes = []
fold_changes = []

# Fold change in mean methylation per gene, computed as the SRR3083029 mean
# divided by the SRR3083026 mean. A gene is skipped when either sample
# covered no bases in it, or when the SRR3083026 mean is zero (the ratio
# would be undefined).
for gene in genes:
    e40_bases, e40_weighted = SRR3083026_gene_methylscore[gene]
    if e40_bases == 0:
        continue
    e40_mean = e40_weighted / e40_bases
    if e40_mean == 0:
        continue

    e55_bases, e55_weighted = SRR3083029_gene_methylscore[gene]
    if e55_bases == 0:
        continue
    e55_mean = e55_weighted / e55_bases

    methylated_genes.append(gene)
    fold_changes.append(e55_mean / e40_mean)


In [4]:
# Write one tab-separated row per gene: gene name, then its fold change.
output_file = "/Users/cmdb/qbb2020-answers/assignment6/fold-methylation-bedgraph.txt"
with open(output_file, 'w') as f:
    f.write("gene\tfold_change, E5.5/E4.0\n")
    # methylated_genes and fold_changes are parallel lists
    for gene, fold in zip(methylated_genes, fold_changes):
        f.write(f"{gene}\t{fold}\n")