<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Step1-Prepare-Annotation-and-calc-LD-Score" data-toc-modified-id="Step1-Prepare-Annotation-and-calc-LD-Score-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Step1 Prepare Annotation and calc LD Score</a></span><ul class="toc-item"><li><span><a href="#Notes" data-toc-modified-id="Notes-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Notes</a></span></li></ul></li><li><span><a href="#Step-2-multiple-regression" data-toc-modified-id="Step-2-multiple-regression-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Step 2 multiple regression</a></span></li></ul></div>

In [1]:
# Prepare the BED files from CEMBA
# /home/hanliu/project/mouse_rostral_brain/study/LDSC/

In [2]:
import pathlib
# Create the local `qsub/` directory (relative to the current working directory);
# the command lists and qsub scripts generated below are written into it.
pathlib.Path('qsub').mkdir(exist_ok=True)

## Step1 Prepare Annotation and calc LD Score

In [14]:
# Directory containing the LDSC scripts (make_annot.py and ldsc.py) that the
# generated commands invoke.
ldsc_dir = '/gale/netapp/home/hanliu/pkg/ldsc'


In [5]:
# Step 1: for every input BED file and every autosome, build one
# "make_annot.py && ldsc.py --l2" command, write all commands to
# qsub/ldsc_commands.txt, and write the master qsub script that passes that
# command file to `yap qsub`.
input_dir = pathlib.Path('InputBed/').absolute()

# Sorted so the command file comes out in the same order on every run
# (glob order depends on the filesystem).
dmr_list = sorted(input_dir.glob('*hg19-*-slop*.bed'))
print(len(dmr_list), 'bed files')

# Chromosomes 1-22.
chroms = list(range(1, 23))

base_dir = '/gale/netapp/scratch2/hanliu/LDSC/SubTypeDMRTotal'

# Values that do not change between iterations are defined once, up front.
ldsc_python = '/gale/netapp/home/hanliu/anaconda3/envs/ldsc/bin/python2.7'
plink_prefix = '/gale/netapp/home/hanliu/ref/LDSC/baseline/1000G_EUR_Phase3_plink/1000G.EUR.QC'
baseline_dir = '/gale/netapp/home/hanliu/ref/LDSC/baseline/model_all_snp/all_snp/'

cmds = []
for input_bed in dmr_list:
    # The cluster name is everything before the first '.' in the file name.
    cluster = input_bed.name.split('.')[0]
    cluster_dir = f'{base_dir}/Annotation/{cluster}'
    # parents=True: the Annotation/ directory may not exist on the first run.
    pathlib.Path(cluster_dir).mkdir(parents=True, exist_ok=True)

    for chrom in chroms:
        chrom_bim_file = f'{plink_prefix}.{chrom}.bim'
        # --bfile takes the plink prefix, i.e. the .bim path without its extension.
        bfile_pattern = chrom_bim_file[:-4]
        output_prefix = f'{cluster_dir}/{cluster}.{chrom}'
        make_annot_cmd = (
            f'{ldsc_python} {ldsc_dir}/make_annot.py '
            f'--bed-file {input_bed} --bimfile {chrom_bim_file} '
            f'--annot-file {output_prefix}.annot.gz'
        )
        ldsc_cmd = (
            f'{ldsc_python} {ldsc_dir}/ldsc.py --l2 --bfile {bfile_pattern} '
            f'--ld-wind-cm 1 --annot {output_prefix}.annot.gz --out {output_prefix} '
            f'--print-snps {baseline_dir}/baseline.{chrom}.snp  --thin-annot'
        )
        # The LD-score step only runs if the annotation step succeeded.
        total_cmd = f'{make_annot_cmd} && {ldsc_cmd}'
        cmds.append(total_cmd)

# One command per line.
with open('qsub/ldsc_commands.txt', 'w') as f:
    f.write('\n'.join(cmds))

this_dir = pathlib.Path().absolute()
# The script text starts immediately after the opening quotes so that the
# shebang is the very first line of the generated file.
qsub_script = f"""#!/bin/bash
#$ -N master
#$ -V
#$ -pe smp 1
#$ -l h_vmem=5G
#$ -l h_rt=99:99:99
#$ -l s_rt=99:99:99
#$ -wd {this_dir}
#$ -e {this_dir}/qsub/ldsc.error.log
#$ -o {this_dir}/qsub/ldsc.output.log

yap qsub --command_file_path {this_dir}/qsub/ldsc_commands.txt --working_dir {this_dir}/qsub --project_name ldsc --total_cpu 20 --total_mem 5000 --qsub_global_parms "-pe smp=20;-l h_vmem=5G"
"""
with open('qsub/ldsc_qsub.sh', 'w') as f:
    f.write(qsub_script)


161 bed files


### Notes
- Submit the jobs with `-pe smp 20 -l h_vmem=5G`.
- Each job is CPU-bound: it takes several minutes per chromosome, or about 1 h in total per BED file.

## Step 2 multiple regression

In [29]:
# Reference inputs for the Step 2 ldsc.py --h2 commands; all of them live under
# one shared root directory.
_baseline_root = '/gale/netapp/home/hanliu/ref/LDSC/baseline'

# Directory that is globbed for the GWAS summary-statistics files.
summary_stats_dir = f'{_baseline_root}/model_all_snp/sumstats/'
# Prefix passed to --w-ld-chr.
w_ld_chr_prefix = f'{_baseline_root}/model_all_snp/weights_1000G_EUR_Phase3_no_hla/weights.'
# Prefix appended to the cluster annotations in --ref-ld-chr.
baseline_prefix = f'{_baseline_root}/baseline_roadmap/baseline_roadmap.'
# Prefix passed to --frqfile-chr.
freq_prefix = f'{_baseline_root}/1000G_Phase3_frq/1000G.EUR.QC.'


In [30]:
# Collect every entry in the summary-statistics directory whose name ends in
# "gz", then display how many were found.
stats_root = pathlib.Path(summary_stats_dir)
sum_stats_list = [stats_path for stats_path in stats_root.glob('*gz')]
len(sum_stats_list)

42

In [35]:
# Build the comma-separated list of per-cluster file prefixes that is passed to
# ldsc.py --ref-ld-chr in the next cell. Each prefix has the form
# "<Annotation>/<cluster>/<cluster>.".
# NOTE(review): this path is relative to the current working directory, while
# Step 1 writes its annotations under an absolute scratch path; confirm the
# notebook is run from that directory.
annotation_dir = pathlib.Path('Annotation/').absolute()

# Sorted so the annotation order (and therefore the order of the regression
# output) is the same on every run; glob order depends on the filesystem.
cluster_dirs = [
    f'{cluster_path}/{cluster_path.name.split(".")[0]}.'
    for cluster_path in sorted(annotation_dir.glob('*'))
    if cluster_path.is_dir()
]
if not cluster_dirs:
    # An empty list would silently produce "--ref-ld-chr ,<baseline>" in every
    # generated command, so fail here instead.
    raise FileNotFoundError(f'No cluster annotation directories found in {annotation_dir}')

cluster_str = ','.join(cluster_dirs)
len(cluster_dirs)

161

In [52]:
# Step 2: build one ldsc.py --h2 command per summary-statistics file, using all
# cluster annotations plus the baseline model as --ref-ld-chr. Write the
# commands to qsub/model_commands.txt, then write the master qsub script that
# passes that command file to `yap qsub`.
model_dir = pathlib.Path('Model').absolute()
model_dir.mkdir(exist_ok=True)

cmds = []
# Names already used as an output prefix; two inputs sharing a name would
# overwrite each other's results.
seen_names = set()
for sum_stats_path in sum_stats_list:
    # The trait name is everything before the first '.' in the file name.
    sum_stats_name = sum_stats_path.name.split('.')[0]
    if sum_stats_name in seen_names:
        raise ValueError(f'Summary stats have duplicated name: {sum_stats_name}')
    seen_names.add(sum_stats_name)
    output_prefix = model_dir / sum_stats_name

    cmd = (
        f'/gale/netapp/home/hanliu/anaconda3/envs/ldsc/bin/python2.7 {ldsc_dir}/ldsc.py '
        f'--h2 {sum_stats_path} --w-ld-chr {w_ld_chr_prefix} '
        f'--ref-ld-chr {cluster_str},{baseline_prefix} '
        f'--frqfile-chr {freq_prefix} --overlap-annot '
        f'--out {output_prefix} --print-coefficients'
    )
    cmds.append(cmd)

# One command per line.
with open('qsub/model_commands.txt', 'w') as f:
    f.write('\n'.join(cmds))

this_dir = pathlib.Path().absolute()
# The script text starts immediately after the opening quotes so that the
# shebang is the very first line of the generated file.
qsub_script = f"""#!/bin/bash
#$ -N master
#$ -V
#$ -pe smp 1
#$ -l h_vmem=5G
#$ -l h_rt=99:99:99
#$ -l s_rt=99:99:99
#$ -wd {this_dir}
#$ -e {this_dir}/qsub/model.error.log
#$ -o {this_dir}/qsub/model.output.log

yap qsub --command_file_path {this_dir}/qsub/model_commands.txt --working_dir {this_dir}/qsub --project_name model --total_cpu 100 --total_mem 5000 --qsub_global_parms "-pe smp=20;-l h_vmem=5G"
"""
with open('qsub/model_qsub.sh', 'w') as f:
    f.write(qsub_script)