In [1]:
import pandas as pd
import numpy as np
import pybedtools
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

In [4]:
# merge overlapped gene body
!bedtools merge -i gene.sorted.bed > non_overlap_intragenic.bed
# get intergenic and intragenic region
!bedtools subtract -a mm10.whole_genome.bed -b non_overlap_intragenic.bed > non_overlap_intergenic.bed
!bedtools subtract -a non_overlap_intergenic.bed -b mm10-blacklist.v2.bed > non_overlap_intergenic.subtract_black.bed
!bedtools subtract -a non_overlap_intragenic.bed -b mm10-blacklist.v2.bed > non_overlap_intragenic.subtract_black.bed

# subdivide dmr into intra- and intergenic
!bedtools intersect -a ../DG_DMR.decrease.mch_group.bed -b non_overlap_intergenic.subtract_black.bed -wa -u > DG_DMR.decrease.intergenic.bed
!bedtools intersect -a ../DG_DMR.decrease.mch_group.bed -b non_overlap_intragenic.subtract_black.bed -wa -u > DG_DMR.decrease.intragenic.bed
!bedtools intersect -a ../DG_DMR.increase.mch_group.bed -b non_overlap_intragenic.subtract_black.bed -wa -u > DG_DMR.increase.intragenic.bed
!bedtools intersect -a ../DG_DMR.increase.mch_group.bed -b non_overlap_intergenic.subtract_black.bed -wa -u > DG_DMR.increase.intergenic.bed

# get fasta of each region
!bedtools getfasta -fi /home/hanliu/ref/mouse/genome/fasta/raw/mm10.fa -bed non_overlap_intergenic.subtract_black.bed > intergenic.fa
!bedtools getfasta -fi /home/hanliu/ref/mouse/genome/fasta/raw/mm10.fa -bed non_overlap_intergenic.subtract_black.bed > intergenic.fa



In [2]:
# A DMR is kept only when the spread (max - min) of its four per-group values
# is strictly greater than this cutoff.
delta_cutoff = 0.4

dmr_data = pd.read_csv(
    '../../axis-mch_rms_results_collapsed_passfilter_annotated.csv',
    index_col=0)
# Turn every row label into 'DGmCH_<label>' and name the index.
dmr_data.index = dmr_data.index.map('DGmCH_{}'.format)
dmr_data.index.name = 'DMR_id'

# Per-group values (presumably methylation rates, going by the variable
# name -- confirm) and their range across the four groups.
group_columns = ['low', 'midlow', 'midhigh', 'high']
mc_rate = dmr_data[group_columns]
delta = mc_rate.max(axis=1) - mc_rate.min(axis=1)

passes_delta = delta > delta_cutoff
dmr_data = dmr_data[passes_delta].copy()

dmr_data.head()

Unnamed: 0_level_0,#chr,start,end,number_of_dms,low,midlow,midhigh,high,rho,trend,...,annot-stop_codon.all,annot-TSS.all,annot-TSS.protein_coding,annot-UTR3.all,annot-UTR3.protein_coding,annot-UTR5.all,annot-UTR5.protein_coding,annot-CGI,annot-CGI_shore,annot-Transposon
DMR_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DGmCH_8,chr1,3213139,3213139,1,0.411765,0.704918,0.75,0.882353,0.947496,increasing,...,False,False,False,False,False,False,False,False,False,False
DGmCH_10,chr1,3299623,3299623,1,0.6,0.490196,0.327273,0.170213,-0.996546,decreasing,...,False,False,False,False,False,False,False,False,False,False
DGmCH_11,chr1,3327720,3327720,1,0.64,0.258065,0.306452,0.195652,-0.835279,decreasing,...,False,False,False,False,False,False,False,False,False,True
DGmCH_13,chr1,3387472,3387472,1,0.783784,0.806452,0.69697,0.285714,-0.852106,decreasing,...,False,False,False,False,False,False,False,False,False,False
DGmCH_16,chr1,3397936,3397936,1,0.685185,0.8,0.622642,0.326087,-0.801536,decreasing,...,False,False,False,False,False,False,False,False,False,False


In [3]:
def _load_dmr_bed(bed_path, dmr_table):
    """Load a 4-column DMR BED file restricted to the DMRs in ``dmr_table``.

    Parameters
    ----------
    bed_path : str
        Tab-separated file without header; columns are read as
        chrom, start, end, DMR_id (DMR_id becomes the index).
    dmr_table : pandas.DataFrame
        Table indexed by DMR id with a ``number_of_dms`` column.

    Returns
    -------
    (pandas.DataFrame, number)
        The BED rows whose DMR id is present in ``dmr_table.index`` and the
        sum of ``number_of_dms`` over those rows.
    """
    bed = pd.read_csv(bed_path,
                      header=None, sep='\t', index_col=3,
                      names=['chrom', 'start', 'end', 'DMR_id'])
    bed = bed[bed.index.isin(dmr_table.index)].copy()
    # Single .loc call (row labels + column) instead of chained indexing.
    total_dms = dmr_table.loc[bed.index, 'number_of_dms'].sum()
    return bed, total_dms


# Same loading/filtering for each direction (increase/decrease) and each
# region class (intragenic/intergenic).
inc_intra_bed, inc_intra_total_dms = _load_dmr_bed(
    '../DG_DMR.increase.intragenic.bed', dmr_data)
inc_inter_bed, inc_inter_total_dms = _load_dmr_bed(
    '../DG_DMR.increase.intergenic.bed', dmr_data)
dec_intra_bed, dec_intra_total_dms = _load_dmr_bed(
    '../DG_DMR.decrease.intragenic.bed', dmr_data)
dec_inter_bed, dec_inter_total_dms = _load_dmr_bed(
    '../DG_DMR.decrease.intergenic.bed', dmr_data)

## Gene meta

In [4]:
# Gene-level annotation table, indexed by the 'gene_id' column.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id')

# Gene span, computed as end - start.
gene_meta['length'] = gene_meta['end'] - gene_meta['start']

# Reverse lookup gene_name -> gene_id; when a name occurs more than once the
# last occurrence in the table wins.
gene_name_to_id = dict(zip(gene_meta['gene_name'], gene_meta.index))
gene_meta.head()

Unnamed: 0_level_0,chrom,source,feature,start,end,score,strand,phase,transcript_id,gene_type,...,transcript_type,transcript_status,transcript_name,exon_number,exon_id,level,mgi_id,havana_gene,tag,length
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000102693.1,chr1,HAVANA,gene,3073253,3074322,.,+,.,,TEC,...,,,,,,2,MGI:1918292,OTTMUSG00000049935.1,,1069
ENSMUSG00000064842.1,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,,snRNA,...,,,,,,3,MGI:5455983,,,109
ENSMUSG00000051951.5,chr1,HAVANA,gene,3205901,3671498,.,-,.,,protein_coding,...,,,,,,2,MGI:3528744,OTTMUSG00000026353.2,,465597
ENSMUSG00000102851.1,chr1,HAVANA,gene,3252757,3253236,.,+,.,,processed_pseudogene,...,,,,,,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens,479
ENSMUSG00000103377.1,chr1,HAVANA,gene,3365731,3368549,.,-,.,,TEC,...,,,,,,2,MGI:5610408,OTTMUSG00000049960.1,,2818


## Intragenic and intergenic regions

In [5]:
# Intragenic regions with their gene_ids column; rows are labelled
# 'chrom:start-end'.
intragenic_df = pd.read_csv(
    '../non_overlap_intragenic.subtract_black.gene_id_anno.bed',
    sep='\t',
    header=None,
    index_col=None,
    names=['chrom', 'start', 'end', 'gene_ids'])
intragenic_df.index = (intragenic_df['chrom'] + ':'
                       + intragenic_df['start'].astype(str) + '-'
                       + intragenic_df['end'].astype(str))

# Intergenic regions, labelled the same way.
intergenic_df = pd.read_csv(
    '../non_overlap_intergenic.subtract_black.bed',
    sep='\t',
    header=None,
    index_col=None,
    names=['chrom', 'start', 'end'])
intergenic_df.index = (intergenic_df['chrom'] + ':'
                       + intergenic_df['start'].astype(str) + '-'
                       + intergenic_df['end'].astype(str))


## Background cytosine counts

In [6]:
def count_c(file_path):
    """Count cytosines per record in a FASTA file.

    Both 'c' and 'C' are counted. Counts are accumulated over every sequence
    line of a record, so line-wrapped FASTA files are handled, and blank
    lines are ignored instead of resetting the count of the current record.

    Parameters
    ----------
    file_path : str or path-like
        Path to the FASTA file.

    Returns
    -------
    pandas.Series
        Integer counts indexed by record name (the header line without the
        leading '>' and without the line ending), in file order. A header
        that appears again later restarts its count from zero.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    records = {}
    name = None
    with open(file_path) as f:
        for line in f:
            # Strip only the line ending so a final header without a trailing
            # newline keeps its last character.
            line = line.rstrip('\r\n')
            if not line:
                continue
            if line.startswith('>'):
                name = line[1:]
                records[name] = 0
            elif name is None:
                raise ValueError(
                    f'sequence data found before the first FASTA header in {file_path}')
            else:
                records[name] += line.lower().count('c')
    # Explicit dtype keeps an empty file from producing an object Series.
    return pd.Series(records, dtype='int64')

In [7]:
# Per-region cytosine counts and their grand total, intragenic first.
intragenic_c = count_c('../intragenic.fa')
total_intragenic_c = intragenic_c.sum()

# Same for the intergenic regions.
intergenic_c = count_c('../intergenic.fa')
total_intergenic_c = intergenic_c.sum()

In [8]:
# Display the background cytosine totals (intragenic, intergenic) followed by
# the intragenic DMS totals (increasing, decreasing) as a quick sanity check.
total_intragenic_c, total_intergenic_c, inc_intra_total_dms, dec_intra_total_dms

(253112183, 261977002, 49653, 35809)

## Per-gene DMS

In [9]:
# DMR id -> number_of_dms lookup used by get_dms.
dms_dict = dmr_data['number_of_dms'].to_dict()


def get_dms(value):
    """Total ``number_of_dms`` for a comma-separated string of DMR ids.

    A float ``value`` (i.e. a missing entry read as NaN) yields 0. DMR ids
    that are not keys of ``dms_dict`` contribute 0.
    """
    if not isinstance(value, float):
        total = 0
        for dmr_id in value.split(','):
            total += dms_dict.get(dmr_id, 0)
        return total
    return 0

## Significance cutoff

In [10]:
# Upper bound (exclusive) on the adjusted p-value for a region to be called significant.
p_cutoff = 0.01
# Lower bound (exclusive) on the Fisher odds ratio for a region to be called significant.
odds_cutoff = 1.5

## Test Decreasing DMR

In [11]:
# Intragenic regions with the comma-separated ids of the decreasing DMRs that
# fall in each one ('.' is read as missing).
intra_dec_df = pd.read_csv(
    '../non_overlap_intragenic.subtract_black.dec_dmr.bed',
    sep='\t',
    header=None,
    na_values='.',
    names=['chrom', 'start', 'end', 'DMR_IDs'])

# Label rows 'chrom:start-end'; the subtractions against intragenic_c below
# are aligned on this index.
intra_dec_df.index = (intra_dec_df['chrom'] + ':'
                      + intra_dec_df['start'].astype(str) + '-'
                      + intra_dec_df['end'].astype(str))

# 2x2 contingency counts per region:
#   dms_in    DMS inside the region
#   other_in  remaining cytosines inside the region
#   dms_out   DMS in all other intragenic regions
#   other_out remaining cytosines in all other intragenic regions
intra_dec_df['dms_in'] = intra_dec_df['DMR_IDs'].apply(get_dms)
intra_dec_df['other_in'] = intragenic_c - intra_dec_df['dms_in']
intra_dec_df['dms_out'] = dec_intra_total_dms - intra_dec_df['dms_in']
intra_dec_df['other_out'] = total_intragenic_c - intragenic_c - intra_dec_df['dms_out']

intra_dec_df.head()

Unnamed: 0,chrom,start,end,DMR_IDs,dms_in,other_in,dms_out,other_out
chr1:3073252-3074321,chr1,3073252,3074321,,0,160,35809,253076214
chr1:3102015-3102124,chr1,3102015,3102124,,0,20,35809,253076354
chr1:3205900-3671497,chr1,3205900,3671497,"DGmCH_9,DGmCH_10,DGmCH_11,DGmCH_12,DGmCH_13,DG...",10,89678,35799,252986696
chr1:3680154-3681787,chr1,3680154,3681787,,0,336,35809,253076038
chr1:3752009-3754359,chr1,3752009,3754359,,0,474,35809,253075900


In [12]:
def test_enrich(row):
    return pd.Series(
        fisher_exact([[row['dms_in'], row['dms_out']],
                      [row['other_in'], row['other_out']]], 'greater'))

# Fisher test for every intragenic region (decreasing DMRs).
fisher_table = intra_dec_df.apply(test_enrich, axis=1)
fisher_table.columns = ['odds', 'p']

# Keep only regions with odds ratio above 1.
# NOTE(review): the BH correction below therefore only sees the retained
# tests, not every region tested -- confirm this is intended.
enriched = fisher_table['odds'] > 1
test_result = fisher_table[enriched].copy()

judge, adj_p, _, _ = multipletests(test_result['p'], method='fdr_bh')

test_result['judge'] = judge
test_result['adj_p'] = adj_p
# -log10 of the adjusted p-value.
test_result['-lgp'] = -np.log10(adj_p)

In [13]:
# Regions passing both the adjusted-p and odds-ratio cutoffs.
passes_p = test_result['adj_p'] < p_cutoff
passes_odds = test_result['odds'] > odds_cutoff
sig_dec_intra_result = test_result[passes_p & passes_odds].index

# Side-by-side table: contingency counts, test statistics and gene ids.
total_dec_intra_result = pd.concat(
    [
        intra_dec_df.loc[sig_dec_intra_result],
        test_result.loc[sig_dec_intra_result],
        intragenic_df.loc[sig_dec_intra_result, ['gene_ids']],
    ],
    axis=1,
    sort=True,
)

total_dec_intra_result.to_csv('FINAL.Intragenic.decrease.results.csv')

In [14]:
# Same table as the significant set, but for every region kept in
# test_result (no p-value / odds-ratio cutoff applied).
tested_regions = test_result.index
nofilter_dec_intra_result = pd.concat(
    [
        intra_dec_df.loc[tested_regions],
        test_result,
        intragenic_df.loc[tested_regions, ['gene_ids']],
    ],
    axis=1,
    sort=True,
)
nofilter_dec_intra_result.to_csv('FINAL.nofilter.Intragenic.decrease.results.csv')

In [15]:
# Stricter selection on top of the significant regions: more than 50 DMS
# inside the region and an odds ratio above 2.5.
judges = (total_dec_intra_result['dms_in'] > 50) & \
         (total_dec_intra_result['odds'] > 2.5)

# Genes shorter than this (end - start) are dropped from the gene list.
length_cutoff = 10000

dec_genes = set()
for gene_ids in total_dec_intra_result.loc[judges, 'gene_ids']:
    # A region can carry several comma-separated gene ids.
    for gene in gene_ids.split(','):
        gene_length = gene_meta.at[gene, 'length']
        if gene_length < length_cutoff:
            continue
        dec_genes.add(gene)

# Write the ids without their version suffix. Sorting makes the file
# identical between runs (set iteration order is not stable).
with open('dec_genes.txt', 'w') as f:
    for gene in sorted(dec_genes):
        f.write(gene.split('.')[0] + '\n')
# This number excludes genes shorter than length_cutoff.
len(dec_genes)

64

## Test Increasing DMR

In [16]:
# Intragenic regions with the comma-separated ids of the increasing DMRs that
# fall in each one ('.' is read as missing).
intra_inc_df = pd.read_csv(
    '../non_overlap_intragenic.subtract_black.inc_dmr.bed',
    sep='\t',
    header=None,
    na_values='.',
    names=['chrom', 'start', 'end', 'DMR_IDs'])

# Label rows 'chrom:start-end'; the subtractions against intragenic_c below
# are aligned on this index.
intra_inc_df.index = (intra_inc_df['chrom'] + ':'
                      + intra_inc_df['start'].astype(str) + '-'
                      + intra_inc_df['end'].astype(str))

# 2x2 contingency counts per region:
#   dms_in    DMS inside the region
#   other_in  remaining cytosines inside the region
#   dms_out   DMS in all other intragenic regions
#   other_out remaining cytosines in all other intragenic regions
intra_inc_df['dms_in'] = intra_inc_df['DMR_IDs'].apply(get_dms)
intra_inc_df['other_in'] = intragenic_c - intra_inc_df['dms_in']
intra_inc_df['dms_out'] = inc_intra_total_dms - intra_inc_df['dms_in']
intra_inc_df['other_out'] = total_intragenic_c - intragenic_c - intra_inc_df['dms_out']

intra_inc_df.head()

Unnamed: 0,chrom,start,end,DMR_IDs,dms_in,other_in,dms_out,other_out
chr1:3073252-3074321,chr1,3073252,3074321,,0,160,49653,253062370
chr1:3102015-3102124,chr1,3102015,3102124,,0,20,49653,253062510
chr1:3205900-3671497,chr1,3205900,3671497,"DGmCH_8,DGmCH_22",1,89687,49652,252972843
chr1:3680154-3681787,chr1,3680154,3681787,,0,336,49653,253062194
chr1:3752009-3754359,chr1,3752009,3754359,,0,474,49653,253062056


In [17]:
def test_enrich(row):
    return pd.Series(
        fisher_exact([[row['dms_in'], row['dms_out']],
                      [row['other_in'], row['other_out']]], 'greater'))

# Fisher test for every intragenic region (increasing DMRs).
fisher_table = intra_inc_df.apply(test_enrich, axis=1)
fisher_table.columns = ['odds', 'p']

# Keep only regions with odds ratio above 1.
# NOTE(review): the BH correction below therefore only sees the retained
# tests, not every region tested -- confirm this is intended.
enriched = fisher_table['odds'] > 1
test_result = fisher_table[enriched].copy()

judge, adj_p, _, _ = multipletests(test_result['p'], method='fdr_bh')

test_result['judge'] = judge
test_result['adj_p'] = adj_p
# -log10 of the adjusted p-value.
test_result['-lgp'] = -np.log10(adj_p)

  


In [18]:
# Regions passing both the adjusted-p and odds-ratio cutoffs.
passes_p = test_result['adj_p'] < p_cutoff
passes_odds = test_result['odds'] > odds_cutoff
sig_inc_intra_result = test_result[passes_p & passes_odds].index

# Side-by-side table: contingency counts, test statistics and gene ids.
total_inc_intra_result = pd.concat(
    [
        intra_inc_df.loc[sig_inc_intra_result],
        test_result.loc[sig_inc_intra_result],
        intragenic_df.loc[sig_inc_intra_result, ['gene_ids']],
    ],
    axis=1,
    sort=True,
)

total_inc_intra_result.to_csv('FINAL.Intragenic.increase.results.csv')

In [19]:
# Same table as the significant set, but for every region kept in
# test_result (no p-value / odds-ratio cutoff applied).
tested_regions = test_result.index
nofilter_inc_intra_result = pd.concat(
    [
        intra_inc_df.loc[tested_regions],
        test_result,
        intragenic_df.loc[tested_regions, ['gene_ids']],
    ],
    axis=1,
    sort=True,
)
nofilter_inc_intra_result.to_csv('FINAL.nofilter.Intragenic.increase.results.csv')

In [20]:
# Stricter selection on top of the significant regions: more than 50 DMS
# inside the region and an odds ratio above 2.5.
judges = (total_inc_intra_result['dms_in'] > 50) & \
         (total_inc_intra_result['odds'] > 2.5)

# Genes shorter than this (end - start) are dropped from the gene list.
length_cutoff = 10000

inc_genes = set()
for gene_ids in total_inc_intra_result.loc[judges, 'gene_ids']:
    # A region can carry several comma-separated gene ids.
    for gene in gene_ids.split(','):
        gene_length = gene_meta.at[gene, 'length']
        if gene_length < length_cutoff:
            continue
        inc_genes.add(gene)

# Write the ids without their version suffix. Sorting makes the file
# identical between runs (set iteration order is not stable).
with open('inc_genes.txt', 'w') as f:
    for gene in sorted(inc_genes):
        f.write(gene.split('.')[0] + '\n')
# This number excludes genes shorter than length_cutoff.
len(inc_genes)

71

In [21]:
# Print the names of genes selected in both the decreasing and increasing sets.
shared_genes = dec_genes & inc_genes
for g in shared_genes:
    print(gene_meta.at[g, 'gene_name'])

In [27]:
# Gene names of the selected decreasing-DMR genes. .loc does not accept a set
# as an indexer in current pandas, so pass a sorted list (this also makes the
# output order reproducible).
gene_meta.loc[sorted(dec_genes), 'gene_name'].tolist()

['Gfra1',
 'Cadm1',
 'Slit1',
 'Kcnma1',
 'Lpp',
 'Kirrel3',
 'Glp2r',
 'Fam160a1',
 'Slit3',
 'Nos1ap',
 'Gatb',
 'Gm39038',
 '5830418P13Rik',
 'Gm15942',
 'Gm16252',
 'Ncald',
 'Gas7',
 'Rfx3',
 'Zbtb16',
 'Dcp1b',
 'D930032P07Rik',
 'Rapgef5',
 'Tfb1m',
 'Trio',
 'Glis3',
 'Gm16277',
 'Fam189a1',
 'Slc4a4',
 'Tenm2',
 'Kalrn',
 '4930447A16Rik',
 'Prkce',
 'Tmem108',
 'Tafa5',
 'Zmat4',
 'Bfsp2',
 'Rtn1',
 'Abr',
 'Ahcyl2',
 '4930567K12Rik',
 'Gm49397',
 'Fat4',
 'Dab1',
 'Pip5k1b',
 'Tiam2',
 'Ntrk3',
 'Cacna1c',
 '1700015C17Rik',
 'Glt28d2',
 'Gm32743',
 'Dlgap1',
 'Gm41609',
 'Dlg2',
 'Kirrel3os',
 'Atp2b1',
 '5031415H12Rik',
 'Pip5k1bos',
 'C230034O21Rik',
 'Gm37240',
 'Dlgap2',
 'Cdh13',
 'Gm15941',
 'Adgrb3',
 'St6galnac3']