# MVP Partitioned Heritability Estimate
Author: Jesse Marks

Olivia Corradin wanted us to use LDSC to compute the partitioned heritability estimate for the MVP opioid use disorder GWAS results.

# Olivia's Code

In [None]:
## activate ldsc environment 
## Files listed as ldsc/ directory are from the ldsc wiki/tutorial 

# Template variables -- set both before running:
#   name  : prefix used for every output file and for the LDSC_<name>/ directory
#           (analysis_identifier is a placeholder value)
#   peaks : BED file passed to make_annot.py; it is not assigned anywhere in this
#           snippet, so it must already be set in the shell
name=analysis_identifier

# For each autosome: build the annotation from the BED file, then compute the
# LD scores for that annotation.
for j in {1..22}
do
python make_annot.py --bed-file "$peaks" --bimfile ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC."$j".bim --annot-file "$name"_"$peaks"."$j".annot.gz
python ldsc.py --l2 --bfile ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC."$j" --ld-wind-cm 1 --annot "$name"_"$peaks"."$j".annot.gz --thin-annot --out "$name"_"$peaks"."$j" --print-snps ldsc/1000G_EUR_Phase3_baseline/print_snps.txt
done

# Collect everything produced above into one directory per analysis.
mkdir LDSC_"$name"
mv "$name"* LDSC_"$name"/

# Partitioned heritability: the custom annotation plus the baseline model.
# The prefix must use the same "$peaks" variable as the files written above
# (the snippet previously referenced an unset "$peak" here).
for d in disease_sumstats
do
python ldsc.py --h2 "$d".sumstats --w-ld-chr ldsc/weights_hm3_no_hla/weights. --ref-ld-chr LDSC_"$name"/"$name"_"$peaks".,/lab/corradin_data/ldsc/1000G_EUR_Phase3_baseline/baseline. --overlap-annot --out OUT_"$name"_"$d"_"$peaks" --print-coefficients --frqfile-chr /lab/corradin_data/ldsc/1000G_Phase3_frq/1000G.EUR.QC.
done

# Our Code

## Data Wrangle

In [None]:
# Work in the MVP 0001 data directory.
cd /shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0001/

# Pull the dbGaP analysis files from S3 and unpack every tarball.
aws s3 sync s3://rti-common/dbGaP/phs001672_mvp/RootStudyConsentSet_phs001672.MVP.v4.p1.c1.HMB-MDS/AnalysisFiles/ .
for file in *tar.gz; do
    # quoted so an archive name containing spaces or glob characters stays one argument
    tar -xvzf "$file"
done

# Keep only the two OUD summary-statistics files from the release/ tree
# (presumably created by the extraction above).
cp /shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0001/release/Submissions/sub20200313/OUD_EA_MVP1_MVP2_Mar12.txt.gz .
cp /shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0001/release/Submissions/sub20200313/OUD_EA_MVP1_MVP2_YP_SAGE_Mar12.txt.gz .

# Remove the release/ tree and the tarballs; "--" stops a file name that
# starts with "-" from being read as an option.
rm -rf -- release/ *tar.gz

# Peek at the header and first rows of each file; the commented lines are the
# output that was observed.
zcat OUD_EA_MVP1_MVP2_YP_SAGE_Mar12.txt.gz  | head
#SNP_ID  Chromosome      Position        Allele1 Allele2 EA      EAF     Effective_N     Original_N      Zscore  PValue  Direction
#rs3115860       1       753405  A       C       A       0.8592  30228.7 78187   -0.941  0.3466  --???+
#rs3131970       1       753425  T       C       T       0.1412  29680   77702   0.862   0.3888  ++????
#rs3115858       1       755890  A       T       A       0.1382  30974.6 80209   0.821   0.4118  ++???+
#rs3131962       1       756604  A       G       A       0.1386  30969.5 80219   0.808   0.4189  ++???+
#rs4951929       1       757734  T       C       T       0.8617  30977.7 80201   -0.818  0.4135  --???-

zcat OUD_EA_MVP1_MVP2_Mar12.txt.gz  | head
#SNP_ID  Chromosome      Position        Allele1 Allele2 EA      EAF     SampleSize      N_case  N_control       Effect  SE      PValue  Direction
#rs3115860       1       753405  A       C       A       0.8592  77622   8310    69312   -0.0239 0.0242  0.3241  --
#rs3131970       1       753425  T       C       T       0.1412  77702   8314    69388   0.021   0.0242  0.3862  ++
#rs3115858       1       755890  A       T       A       0.1382  79649   8519    71130   0.0199  0.0241  0.4083  ++
#rs3131962       1       756604  A       G       A       0.1384  79668   8520    71148   0.0192  0.0241  0.4257  ++
#rs4951929       1       757734  T       C       T       0.8617  79634   8518    71116   -0.0199 0.0241  0.409   --

# Decompress the summary statistics. The glob is limited to *.gz so that other
# files whose names merely end in "z" are not handed to gunzip.
gunzip -- *.gz

# Convert both MVP summary-statistics files to python_convert's csv layout.
# The two files carry different columns, so the signed-statistic and
# sample-size options are chosen per file:
#   file1 (..._YP_SAGE_...) : --z Zscore   --n Original_N
#   file2                   : --beta Effect --n SampleSize
for file in OUD_EA_MVP1_MVP2_YP_SAGE_Mar12 OUD_EA_MVP1_MVP2_Mar12; do
    case "$file" in
        *_YP_SAGE_*) column_args=(--z Zscore --n Original_N) ;;     # format file1
        *)           column_args=(--beta Effect --n SampleSize) ;;  # format file2
    esac

    # $PWD is mounted at /data and the python_convert checkout at /data/python_convert
    docker run -it \
        -v "$PWD:/data/" \
        -v "/shared/jmarks/bin/python_convert/:/data/python_convert/" \
        continuumio/anaconda3 \
        python /data/python_convert/sumstats.py csv \
            --sumstats "/data/${file}.txt" \
            --out "/data/${file}.csv" \
            --force \
            --snp SNP_ID \
            --a1 Allele1 \
            --a2 Allele2 \
            "${column_args[@]}" \
            --auto \
            --head 5
done

# SNP CHR BP A1 A2 BETA  PVAL  N 
#awk '{print $1, $2, $3, $5,$6,$8,$4,$7}' OFS="\t" OUD_EA_MVP1_MVP2_YP_SAGE_Mar12.csv \
#    > mvp_oud_eur_p1p2_yp_sage_ldsc_ready.txt
#awk '{print $1, $2, $3, $5,$6,$10,$4,$7}' OFS="\t" OUD_EA_MVP1_MVP2_Mar12.csv \
#    > mvp_oud_eur_p1p2_ldsc_ready.txt

# Compress any *ldsc_ready* files. The awk commands that would create them are
# commented out above, so skip quietly when the glob matches nothing instead of
# passing the literal pattern to gzip.
for ready_file in *ldsc_ready*; do
    [[ -e "$ready_file" ]] || continue
    gzip -- "$ready_file"
done

## upload to S3
#aws s3 cp mvp_oud_eur_p1p2_ldsc_ready.txt.gz s3://rti-shared/ldsc/data/mvp_oud/
#aws s3 cp mvp_oud_eur_p1p2_yp_sage_ldsc_ready.txt.gz s3://rti-shared/ldsc/data/mvp_oud/

# munge sumstats
# Fetch and decompress the SNP list that is handed to --merge-alleles below.
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/w_hm3.snplist.bz2
bzip2 -d w_hm3.snplist.bz2

# Locations shared by both munge runs.
# NOTE(review): these are /data/... paths while the download above lands in the
# current directory -- presumably the munge step runs where that directory is
# mounted under /data; confirm.
munge_script=/opt/ldsc/munge_sumstats.py
mvp_dir=/data/mvp
hm3_snplist=/data/w_hm3.snplist

# MVP1+MVP2 meta: signed statistic is BETA (null value 0)
"$munge_script" \
    --sumstats "$mvp_dir/OUD_EA_MVP1_MVP2_Mar12.csv" \
    --signed-sumstats BETA,0 \
    --N-col N \
    --out "$mvp_dir/mvp_oud_eur_p1p2" \
    --merge-alleles "$hm3_snplist"

# MVP1+MVP2+YP+SAGE meta: signed statistic is Z (null value 0)
"$munge_script" \
    --sumstats "$mvp_dir/OUD_EA_MVP1_MVP2_YP_SAGE_Mar12.csv" \
    --signed-sumstats Z,0 \
    --N-col N \
    --out "$mvp_dir/mvp_oud_eur_p1p2_yp_sage" \
    --merge-alleles "$hm3_snplist"

In [None]:
# upload peaks files
# Copy every file in the local peaks directory to the peaks/ directory on EC2.
local_peaks_dir=/home/jesse/Projects/heroin/ldsc/mvp # local
remote_peaks_dir=ec2-user@34.195.174.206:/shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0001/peaks/

cd "$local_peaks_dir"
scp -i ~/.ssh/gwas_rsa * "$remote_peaks_dir"
# files listed for this upload:
#GVEL_ALL_FDR05
#LVEL_ALL_FDR05
#linear_regression_peaks

In [None]:
# Directory that holds the LDSC reference files used by the analysis cells.
cd /shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0001/ldsc/

# download ldsc files
wget https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/1000G_Phase3_baseline_ldscores.tgz
wget https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/1000G_Phase3_plinkfiles.tgz
wget https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/1000G_Phase3_frq.tgz
wget https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/weights_hm3_no_hla.tgz

# decompress
# Delete a tarball only after it extracted successfully, so a failed or partial
# extraction does not also lose the download.
for file in *tgz; do
    tar xvzf "$file" && rm -- "$file"
done

## LDSC
1. Create annotation file based off of a BED formatted file (file containing chr, chr-start, chr-end)
2. Compute the annotation-specific (partitioned) LD scores.
3. Compute the partitioned heritability estimate. 

MVP_p1p2_YP_SAGE: N=82,707 (10,544 cases and 72,163 controls) <br>
MVP_p1p2: N=79,729 (8,529 cases and 71,200 controls)

In [None]:
# interactive docker container for LDSC
# $PWD is mounted at /data. The commands below all use /data/... paths, so they
# are presumably entered at the container's bash prompt -- confirm.
docker run -it -v "$PWD:/data/" \
  rticode/ldsc:7618f4943d8f31a37cbd207f867ba5742d03373f bash 

# The --h2 step writes to /data/results/OUT_*; create the directory up front so
# that step does not fail on a missing output directory.
mkdir -p /data/results/

# loop through both MVP metas
for name in mvp_p1p2_yp_sage mvp_p1p2; do

    # store processing files for each meta in separate dir
    # (-p keeps a re-run from erroring when the directory already exists)
    mkdir -p "/data/LDSC_${name}/"

    # use sumstats file that corresponds to the meta name for the h2 estimate
    if [[ "$name" == "mvp_p1p2_yp_sage" ]]; then
        stats=mvp_oud_eur_p1p2_yp_sage.sumstats.gz
    else
        stats=mvp_oud_eur_p1p2.sumstats.gz
    fi

    # loop through each BED file
    # NOTE(review): the annotation and LD-score commands take only the BED file
    # and the reference panel as input; $name appears only in their output
    # paths, so the same work is repeated per meta. Left as is to preserve the
    # per-meta directory layout.
    for peaks in GVEL_ALL_FDR05 LVEL_ALL_FDR05 linear_regression_peaks; do
        # loop through each chromosome
        for j in {1..22}; do

            # create annotation files
            python /opt/ldsc/make_annot.py \
                --bed-file "/data/peaks/$peaks" \
                --bimfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j.bim" \
                --annot-file "/data/LDSC_${name}/${name}_${peaks}.$j.annot.gz"

            # compute LD scores
            python /opt/ldsc/ldsc.py \
                --l2 \
                --bfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j" \
                --ld-wind-cm 1 \
                --annot "/data/LDSC_${name}/${name}_${peaks}.$j.annot.gz" \
                --thin-annot \
                --out "/data/LDSC_${name}/${name}_${peaks}.$j" \
                --print-snps "/data/ldsc/1000G_EUR_Phase3_baseline/print_snps.txt"
        done # end chr loop

        # compute the partitioned heritability estimate for this BED file:
        # its LD scores are passed alongside the baseline LD scores
        python /opt/ldsc/ldsc.py \
            --h2 "/data/mvp/$stats" \
            --w-ld-chr "/data/ldsc/weights_hm3_no_hla/weights." \
            --ref-ld-chr "/data/LDSC_${name}/${name}_${peaks}.,/data/ldsc/1000G_EUR_Phase3_baseline/baseline." \
            --overlap-annot \
            --out "/data/results/OUT_${name}_${peaks}" \
            --print-coefficients \
            --frqfile-chr "/data/ldsc/1000G_Phase3_frq/1000G.EUR.QC."

    done # end BED file loop
done # end meta file loop



### 002
New request from Olivia: run this code on one additional list of peaks.

In [None]:
cd /shared/rti-heroin/ldsc_genetic_correlation/data/mvp/0002/ # EC2

# interactive docker container for LDSC
# $PWD is mounted at /data. The commands below all use /data/... paths, so they
# are presumably entered at the container's bash prompt -- confirm.
docker run -it -v "$PWD:/data/" \
  rticode/ldsc:7618f4943d8f31a37cbd207f867ba5742d03373f bash 

# The --h2 step writes to /data/results/OUT_*; create the directory up front so
# that step does not fail on a missing output directory.
mkdir -p /data/results/

# loop through both MVP metas
for name in mvp_p1p2_yp_sage mvp_p1p2; do

    # store processing files for each meta in separate dir
    # (-p keeps a re-run from erroring when the directory already exists)
    mkdir -p "/data/LDSC_${name}/"

    # use sumstats file that corresponds to the meta name for the h2 estimate
    if [[ "$name" == "mvp_p1p2_yp_sage" ]]; then
        stats=mvp_oud_eur_p1p2_yp_sage.sumstats.gz
    else
        stats=mvp_oud_eur_p1p2.sumstats.gz
    fi

    # loop through each BED file (peaks)
    # NOTE(review): the annotation and LD-score commands take only the BED file
    # and the reference panel as input; $name appears only in their output
    # paths, so the same work is repeated per meta. Left as is to preserve the
    # per-meta directory layout.
    for peaks in GVEL_ALL_FDR05 LVEL_ALL_FDR05 linear_regression_peaks FDR05_LVEL_targeting_shared_genes_gr1; do
        # loop through each chromosome
        for j in {1..22}; do

            # create annotation files
            python /opt/ldsc/make_annot.py \
                --bed-file "/data/peaks/$peaks" \
                --bimfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j.bim" \
                --annot-file "/data/LDSC_${name}/${name}_${peaks}.$j.annot.gz"

            # compute LD scores
            python /opt/ldsc/ldsc.py \
                --l2 \
                --bfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j" \
                --ld-wind-cm 1 \
                --annot "/data/LDSC_${name}/${name}_${peaks}.$j.annot.gz" \
                --thin-annot \
                --out "/data/LDSC_${name}/${name}_${peaks}.$j" \
                --print-snps "/data/ldsc/1000G_EUR_Phase3_baseline/print_snps.txt"
        done # end chr loop

        # compute the partitioned heritability estimate for this BED file:
        # its LD scores are passed alongside the baseline LD scores
        python /opt/ldsc/ldsc.py \
            --h2 "/data/mvp/$stats" \
            --w-ld-chr "/data/ldsc/weights_hm3_no_hla/weights." \
            --ref-ld-chr "/data/LDSC_${name}/${name}_${peaks}.,/data/ldsc/1000G_EUR_Phase3_baseline/baseline." \
            --overlap-annot \
            --out "/data/results/OUT_${name}_${peaks}" \
            --print-coefficients \
            --frqfile-chr "/data/ldsc/1000G_Phase3_frq/1000G.EUR.QC."

    done # end BED file loop
done # end meta file loop


# Sandbox

In [None]:
# Scratch run of the annotation and LD-score steps for a single BED file on a
# single chromosome.
# NOTE(review): $name is not assigned in this cell; it must already be set in
# the shell for the file names below to be complete -- confirm.
peaks=GVEL_ALL_FDR05.bed
j=22


# create the annotation file for chromosome $j (written under /data/t0)
python /opt/ldsc/make_annot.py \
    --bed-file "/data/peaks/$peaks" \
    --bimfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j.bim" \
    --annot-file "/data/t0/${name}_${peaks}.$j.annot.gz"

    
# compute LD scores from that annotation
# NOTE(review): the annotation is read from /data/t0 but the LD scores are
# written under /data/t1, so the two outputs end up in different directories.
python /opt/ldsc/ldsc.py \
    --l2 \
    --bfile "/data/ldsc/1000G_EUR_Phase3_plink/1000G.EUR.QC.$j" \
    --ld-wind-cm 1 \
    --annot "/data/t0/${name}_${peaks}.$j.annot.gz" \
    --thin-annot \
    --out "/data/t1/${name}_${peaks}.$j" \
    --print-snps "/data/ldsc/1000G_EUR_Phase3_baseline/print_snps.txt"

In [None]:
# Scratch: one partitioned-heritability run with a hand-built --ref-ld-chr string.

# Note only (kept as a comment so the shell does not try to execute it):
# "/data/LDSC_${name}/${name}_${peaks}.,/data/ldsc/1000G_EUR_Phase3_baseline/baseline."

docker run -it -v "$PWD:/data/" \
  rticode/ldsc:7618f4943d8f31a37cbd207f867ba5742d03373f bash 

# Note only (kept as a comment so the shell does not try to execute it):
# GVEL_ALL_FDR05,LVEL_ALL_FDR05,linear_regression_peaks

# NOTE(review): name is "mvp_p1p1" while the sumstats file is "..._p1p2" --
# confirm whether p1p1 is intended. The ref_string below uses the same p1p1 prefix.
name=mvp_p1p1
stats=mvp_oud_eur_p1p2.sumstats.gz
#ref_string=/data/ldsc/1000G_EUR_Phase3_baseline/baseline.,/data/LDSC_mvp_p1p1/mvp_p1p1_GVEL_ALL_FDR05.,/data/LDSC_mvp_p1p1/mvp_p1p1_LVEL_ALL_FDR05.
ref_string=/data/ldsc/1000G_EUR_Phase3_baseline/baseline.,/data/t1/mvp_p1p1_GVEL_ALL_FDR05.

# computed partitioned heritability estimate
# $peaks is not assigned in this cell; the --out name relies on a value already
# set in the shell. $ref_string is quoted so it is always passed as one argument.
python /opt/ldsc/ldsc.py \
    --h2 "/data/mvp/$stats" \
    --w-ld-chr "/data/ldsc/weights_hm3_no_hla/weights." \
    --ref-ld-chr "$ref_string" \
    --overlap-annot \
    --out "/data/results/OUT_${name}_${peaks}" \
    --print-coefficients \
    --frqfile-chr "/data/ldsc/1000G_Phase3_frq/1000G.EUR.QC."
