In [1]:
import pathlib
import pandas as pd
import pybedtools
import seaborn as sns

## Extend (slop) the lifted-over BED regions

In [2]:
# Tag embedded in the lifted BED file name and in the output file names
# (presumably the liftOver minMatch ratio used upstream — TODO confirm).
min_match = 0.5
# Passed to BedTool.slop as b=: number of bases added on both sides of each region.
slop = 250
# Chromosome sizes file passed to BedTool.slop as the genome (g=) argument.
chrom_sizes = '/home/hanliu/ref/human/hg19/genome/hg19.main.chrom.sizes'

In [3]:
# The regions were lifted over from mm10 to hg19, so every coordinate below is hg19.
total_bed = f'/home/hanliu/project/mouse_rostral_brain/study/LiftoverDMR/SubTypeDMR.lifted.{min_match}.bed'
# Pad every lifted region by `slop` bases on both sides, using the hg19 chromosome sizes file.
bed = pybedtools.BedTool(total_bed).slop(b=slop, g=chrom_sizes)

total_bed_df = bed.to_dataframe()
total_bed_df['DMR_length'] = total_bed_df['end'] - total_bed_df['start']

# A region is kept only when both conditions hold:
#   - it is shorter than 10 kb (some DMRs become very long after liftover,
#     while the longest DMR in mm10 is ~5 kb);
#   - its start is positive (guards against a rare liftover artifact).
keep_region = (total_bed_df['DMR_length'] < 10000) & (total_bed_df['start'] > 0)
total_bed_df = total_bed_df[keep_region].set_index('name')
total_bed_df.head()

Unnamed: 0_level_0,chrom,start,end,DMR_length
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Subchr1_8,chr8,56562097,56562806,709
Subchr1_37,chr8,56549652,56550269,617
Subchr1_80,chr8,56538898,56539459,561
Subchr1_82,chr8,56537699,56538333,634
Subchr1_84,chr8,56537064,56537667,603


## Save SubType DMR in hg19 coords

In [6]:
# Source BED files (mm10 coordinates), selected by the '*DMS2.bed' pattern.
dmr_bed_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/HypoBed/')
dmr_list = list(dmr_bed_dir.glob('*DMS2.bed'))
print(len(dmr_list), 'bed files')

# Lifted (hg19) BED files are written here, one per input file.
output_dir = pathlib.Path('InputBed')
output_dir.mkdir(exist_ok=True)

for i, bed_path in enumerate(dmr_list):
    # the coords is mm10; column 3 (the DMR name) becomes the index
    this_bed_df = pd.read_csv(bed_path, sep='\t', header=None, index_col=3)

    # the coords is hg19
    # Keep only the DMR names present both in this file and in the filtered
    # liftover table. Index.intersection is used explicitly: `index & index`
    # as a set operation was deprecated in pandas 1.1 and is no longer a set
    # intersection in pandas 2.x.
    shared_names = this_bed_df.index.intersection(total_bed_df.index)
    use_bed_df = total_bed_df.loc[shared_names]

    # Only chrom / start / end are written out; the name index is dropped.
    use_bed = pybedtools.BedTool.from_dataframe(
        use_bed_df.reset_index(drop=True).iloc[:, :3])

    use_bed = use_bed.sort()
    # bed_path.name[:-3] strips the trailing 'bed' but keeps the dot before it.
    use_bed.saveas(output_dir /
                   (bed_path.name[:-3] + f'hg19-{min_match}-slop{slop}.bed'))
    # Progress line: index, cluster name, percentage of DMRs retained in hg19.
    print(i,
          bed_path.name.split('.')[0],
          int(use_bed_df.shape[0] / this_bed_df.shape[0] * 100),
          sep='\t')

161 bed files
0	CT-L6_Il1rap	67
1	NP-L6_Cntnap5a	66
2	CGE-Lamp5_Sorcs1	72
3	CGE-Vip_Grm8	71
4	LSX-Inh_Dock10	74
5	CGE-Vip_Ccser1	74
6	CGE-Vip_Ntng1	71
7	ASC_cortex-olf	70
8	MGE-Sst_Chodl	70
9	IT-L6_Oxr1	65
10	VLMC_Col4a1	55
11	LSX-Inh_Lats2	69
12	L6b_Kcnk2	67
13	Chd7_Megf11	75
14	MGE-Sst_Bmper	74
15	CT-L6_Megf9	66
16	Chd7_Kcnc2	73
17	DG-po_Kctd8	64
18	DG_dg-all	69
19	NP-L6_Cyp7b1	66
20	D1L-Fstl4_Crim1	71
21	PT-L5_Tenm2	67
22	Unc5c_Unc5c	71
23	CGE-Lamp5_Grid1	70
24	OLF-Exc_Pld5	67
25	PT-L5_Tmtc2	66
26	CLA_Cdh8	65
27	CA3-St18_Tead1	67
28	PAL-Inh_Meis2	83
29	NP-L6_Boc	66
30	IT-L23_Foxp1	65
31	MGC_mgc-all	46
32	Chd7_Trpc7	77
33	LSX-Inh_Nxph1	72
34	CA3-St18_Nuak1	67
35	Gfra1_Gfra1	67
36	MSN-D2_Nrp2	70
37	PT-L5_Kcnh1	67
38	LSX-Inh_Zeb2	70
39	DG-po_Bcl11a	61
40	L6b_Nrp2	68
41	PAL-Inh_Tmem178	79
42	PAL-Inh_Tcf7l2	76
43	CT-L6_Hcrtr2	66
44	OLF-Exc_Cdh9	68
45	PT-L5_Abca12	69
46	MSN-D1_Plxnc1	70
47	D1L-Fstl4_Sipa1l2	70
48	PAL-Inh_Chat	77
49	PT-L5_Unc5b	68
50	CLA_Nrp2	67
51	MGE-Pvalb_Ptprk	70
52	NP

In [None]:
# Copy the local LDSC study directory to the scratch space reachable through salk-login.
# rsync flags: -z compress, -a archive mode, -r recursive, -v verbose.
!rsync -zarv /home/hanliu/project/mouse_rostral_brain/study/LDSC/ salk-login:/gale/netapp/scratch2/hanliu/LDSC/

## Prepare the LDSC annotation commands

In [21]:
# Lifted (hg19) BED files produced earlier in this notebook.
dmr_list = list(output_dir.glob(f'*hg19-{min_match}-slop{slop}.bed'))
print(len(dmr_list), 'bed files')

# Chromosomes 1 through 22.
chroms = list(range(1, 23))

# Loop-invariant locations on the remote cluster.
base_dir = '/gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal'
ldsc_dir = '/gale/netapp/home/hanliu/pkg/ldsc'
baseline_dir = '/gale/netapp/home/hanliu/ref/LDSC/baseline/model_all_snp/all_snp/'

# cluster name -> [mkdir command, then one "make_annot && ldsc" command per chromosome]
commands_by_cluster = {}

for input_bed in dmr_list:
    # The cluster name must be derived BEFORE it is interpolated into mkdir_cmd;
    # otherwise a fresh kernel raises NameError and a re-run silently reuses the
    # previous iteration's cluster name.
    cluster = input_bed.name.split('.')[0]
    cluster_commands = []
    mkdir_cmd = f'mkdir {base_dir}/Annotation/{cluster}'
    for chrom in chroms:
        chrom_bim_file = f'/gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.{chrom}.bim'
        # --bfile takes the file prefix, i.e. the .bim path without its extension.
        bfile_pattern = chrom_bim_file[:-4]
        output_prefix = f'{base_dir}/Annotation/{cluster}/{cluster}.{chrom}'
        # input_bed is a relative path under output_dir, so the BED file is
        # addressed as {base_dir}/InputBed/<file> on the remote side.
        make_annot_cmd = f'python2.7 {ldsc_dir}/make_annot.py --bed-file {base_dir}/{input_bed} --bimfile {chrom_bim_file} --annot-file {output_prefix}.annot.gz'
        # NOTE(review): ldsc.py is taken from a different installation than
        # make_annot.py ({ldsc_dir}) — confirm this is intentional.
        ldsc_cmd = f'python2.7 /gale/netapp/home/chongyuan/application/ldsc/ldsc.py --l2 --bfile {bfile_pattern} --ld-wind-cm 1 --annot {output_prefix}.annot.gz --out {output_prefix} --print-snps {baseline_dir}/baseline.{chrom}.snp  --thin-annot'
        # The LD-score step only runs when the annotation step succeeded.
        total_cmd = f'{make_annot_cmd} && {ldsc_cmd}'
        cluster_commands.append(total_cmd)
    # Retain the commands; previously they were built and then discarded.
    commands_by_cluster[cluster] = [mkdir_cmd] + cluster_commands
        

161 bed files


In [22]:
# Sanity check: show the mkdir command left over from the last loop iteration.
mkdir_cmd

'mkdir /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1'

In [23]:
# Sanity check: show the make_annot command built for the last BED file and last chromosome.
make_annot_cmd

'python2.7 /gale/netapp/home/hanliu/pkg/ldsc/make_annot.py --bed-file /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/InputBed/L6b_Pkhd1.HypoDMR.DMS2.hg19-0.5-slop250.bed --bimfile /gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.22.bim --annot-file /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22.annot.gz'

In [24]:
# Sanity check: show the ldsc command built for the last BED file and last chromosome.
ldsc_cmd

'python2.7 /gale/netapp/home/chongyuan/application/ldsc/ldsc.py --l2 --bfile /gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.22 --ld-wind-cm 1 --annot /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22.annot.gz --out /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22 --print-snps /gale/netapp/home/hanliu/ref/LDSC/baseline/model_all_snp/all_snp//baseline.22.snp  --thin-annot'