In [1]:
import pathlib
import pandas as pd
import pybedtools
import seaborn as sns

## Liftover BED slop

In [2]:
# Chromosome sizes of the hg19 reference; passed to bedtools slop as the genome file.
chrom_sizes = '/home/hanliu/ref/human/hg19/genome/hg19.main.chrom.sizes'
# Number of bases added to both sides of every lifted-over DMR.
slop = 250
# Value embedded in the lifted BED file name
# (presumably the liftOver minMatch setting used upstream - TODO confirm).
min_match = 0.5

In [3]:
# DMRs were called in mm10 and lifted over to hg19: every coordinate below is hg19.
total_bed = f'/home/hanliu/project/mouse_rostral_brain/study/LiftoverDMR/MajorTypeDMR.lifted.{min_match}.bed'
# Extend each region by `slop` bases on both sides.
bed = pybedtools.BedTool(total_bed).slop(b=slop, g=chrom_sizes)

total_bed_df = (
    bed.to_dataframe()
    # Length is measured after slopping, so it includes the added flanks.
    .assign(DMR_length=lambda df: df['end'] - df['start'])
    # Drop regions that became very long after liftover (max DMR length in mm10
    # is ~5kb) and the rare liftover oddities whose start is not positive.
    .loc[lambda df: (df['DMR_length'] < 10000) & (df['start'] > 0)]
    .set_index('name')
)
total_bed_df.head()

Unnamed: 0_level_0,chrom,start,end,DMR_length
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Major_8,chr21,26607978,26608945,967
Major_9,chr8,56561843,56562897,1054
Major_38,chr8,56549542,56550530,988
Major_39,chr8,56549380,56550413,1033
Major_80,chr8,56539385,56540231,846


In [4]:
# (rows, columns) of the slopped and filtered hg19 DMR table built above.
total_bed_df.shape

(1935511, 4)

## Save MajorType DMRs in hg19 coords

In [5]:
# Per-cell-type hypo-DMR BED files (mm10 coordinates); the 4th column holds the DMR id.
dmr_bed_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/DMR/MajorType/HypoBed/')
dmr_list = list(dmr_bed_dir.glob('*DMS1.bed'))
print(len(dmr_list), 'bed files')

# hg19 BED files are written here, one per input file.
output_dir = pathlib.Path('InputBed')
output_dir.mkdir(exist_ok=True)

for i, bed_path in enumerate(dmr_list):
    # the coords is mm10; index is the DMR id
    this_bed_df = pd.read_csv(bed_path, sep='\t', header=None, index_col=3)

    # Keep only the DMRs that survived liftover + filtering; the coords is hg19.
    # Index.intersection is the explicit set operation; `&` between two Index
    # objects is deprecated as a set op and is element-wise in pandas 2.x.
    shared_dmr_ids = this_bed_df.index.intersection(total_bed_df.index)
    use_bed_df = total_bed_df.loc[shared_dmr_ids]
    use_bed = pybedtools.BedTool.from_dataframe(
        use_bed_df.reset_index(drop=True).iloc[:, :3])

    use_bed = use_bed.sort()
    # `stem` drops the trailing ".bed"; the output name keeps the original prefix.
    use_bed.saveas(output_dir /
                   (bed_path.stem + f'.hg19-{min_match}-slop{slop}.bed'))

    # Report: index, cell type, % of DMRs retained, retained count, input count.
    n_used = use_bed_df.shape[0]
    n_input = this_bed_df.shape[0]
    # Guard against an empty input BED so the report never divides by zero.
    pct_retained = int(n_used / n_input * 100) if n_input else 0
    print(i,
          bed_path.name.split('.')[0],
          pct_retained,
          n_used,
          n_input,
          sep='\t')

41 bed files
0	IT-L23	63	287531	287531
1	ASC	67	225133	225133
2	DG-po	62	222168	222168
3	EC	41	557680	557680
4	PAL-Inh	85	5919	5919
5	IT-L6	63	246852	246852
6	LSX-Inh	73	80176	80176
7	IG-CA2	66	389300	389300
8	OLF-Exc	68	164180	164180
9	Unc5c	67	179383	179383
10	MGC	37	674356	674356
11	OLF	68	87791	87791
12	NP-L6	66	77080	77080
13	L6b	66	119475	119475
14	Gfra1	64	287492	287492
15	CT-L6	66	228623	228623
16	ODC	56	254194	254194
17	VLMC-Pia	53	397393	397393
18	IT-L4	63	327129	327129
19	Foxp2	71	96231	96231
20	MSN-D2	68	169580	169580
21	ANP	69	230357	230357
22	CGE-Vip	74	79417	79417
23	Chd7	77	61978	61978
24	CA3-St18	64	328161	328161
25	CGE-Lamp5	68	253206	253206
26	D1L-PAL	67	98521	98521
27	OPC	52	368619	368619
28	MGE-Pvalb	72	128885	128885
29	EP	66	185907	185907
30	PC	40	958587	958587
31	PT-L5	66	261820	261820
32	MGE-Sst	72	64006	64006
33	CLA	65	239452	239452
34	D1L-Fstl4	70	138892	138892
35	CA1	65	330653	330653
36	VLMC	52	369718	369718
37	CA3	65	368749	368749
38	IT-L5	63	292052	292052
3

In [7]:
# !rsync -zarv /home/hanliu/project/mouse_rostral_brain/study/LDSC/MajorDMRTotal salk-login:/gale/netapp/scratch2/hanliu/LDSC/

## Prepare annot command

In [21]:
# Collect the hg19 BED files written earlier for the current min_match / slop setting.
dmr_list = list(output_dir.glob(f'*hg19-{min_match}-slop{slop}.bed'))
print(len(dmr_list), 'bed files')

# Chromosomes 1-22; one annotation + LD score job is built per chromosome.
chroms = list(range(1, 23))

# Remote locations used when composing the commands (loop-invariant, so set once).
base_dir = '/gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal'
ldsc_dir = '/gale/netapp/home/hanliu/pkg/ldsc'
baseline_dir = '/gale/netapp/home/hanliu/ref/LDSC/baseline/model_all_snp/all_snp/'

for input_bed in dmr_list:
    # The cluster name is the first dot-separated field of the file name.
    # It must be set before mkdir_cmd is built: previously it was assigned only
    # inside the chromosome loop, so mkdir_cmd raised NameError on a fresh
    # kernel or silently reused the previous file's cluster.
    cluster = input_bed.name.split('.')[0]
    # Commands for this cluster, one "make_annot && ldsc" entry per chromosome.
    cluster_commands = []
    mkdir_cmd = f'mkdir {base_dir}/Annotation/{cluster}'
    for chrom in chroms:
        chrom_bim_file = f'/gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.{chrom}.bim'
        # Same path with the ".bim" extension stripped, as passed to --bfile.
        bfile_pattern = chrom_bim_file[:-4]
        output_prefix = f'{base_dir}/Annotation/{cluster}/{cluster}.{chrom}'
        make_annot_cmd = f'python2.7 {ldsc_dir}/make_annot.py --bed-file {base_dir}/{input_bed} --bimfile {chrom_bim_file} --annot-file {output_prefix}.annot.gz'
        ldsc_cmd = f'python2.7 /gale/netapp/home/chongyuan/application/ldsc/ldsc.py --l2 --bfile {bfile_pattern} --ld-wind-cm 1 --annot {output_prefix}.annot.gz --out {output_prefix} --print-snps {baseline_dir}/baseline.{chrom}.snp  --thin-annot'
        # The annotation file must exist before LD scores are computed from it.
        total_cmd = f'{make_annot_cmd} && {ldsc_cmd}'
        cluster_commands.append(total_cmd)
        

161 bed files


In [22]:
# Show the mkdir command left over from the last loop iteration as a sanity check.
mkdir_cmd

'mkdir /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1'

In [23]:
# Show the make_annot.py command left over from the last loop iteration as a sanity check.
make_annot_cmd

'python2.7 /gale/netapp/home/hanliu/pkg/ldsc/make_annot.py --bed-file /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/InputBed/L6b_Pkhd1.HypoDMR.DMS2.hg19-0.5-slop250.bed --bimfile /gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.22.bim --annot-file /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22.annot.gz'

In [24]:
# Show the ldsc.py command left over from the last loop iteration as a sanity check.
ldsc_cmd

'python2.7 /gale/netapp/home/chongyuan/application/ldsc/ldsc.py --l2 --bfile /gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC.22 --ld-wind-cm 1 --annot /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22.annot.gz --out /gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal/Annotation/L6b_Pkhd1/L6b_Pkhd1.22 --print-snps /gale/netapp/home/hanliu/ref/LDSC/baseline/model_all_snp/all_snp//baseline.22.snp  --thin-annot'