In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import tqdm
import os
import sys

In [None]:
from datetime import datetime
import os
import pandas as pd
import numpy as np

USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}


JOB_NAME='get_untrimmed_frq_files'
%env JOB_NAME={JOB_NAME}

%env DOCKER_PREFIX=XXXX # this is censored because pulling docker images from google container registry charges the owner of the image

In [None]:
%%writefile ~/aou_dsub.bash

function aou_dsub () {
  # Thin wrapper around dsub that supplies the workspace-wide settings
  # (project, network, service account, region, log location) and forwards
  # every caller-supplied argument unchanged.

  # Only the part of the owner e-mail before '@' is used as the dsub user,
  # which leaves more characters for the job name.
  local DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

  # For AoU RWB projects the network is named "network" and the subnetwork
  # "subnetwork".
  local network_name=network
  local subnetwork_name=subnetwork

  # Account currently configured in gcloud; the job runs as this account.
  local service_account
  service_account="$(gcloud config get-value account)"

  # Submission timestamp, used to group log files by day and time.
  local submitted_at
  submitted_at="$(date +'%Y%m%d/%H%M%S')"

  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --regions us-central1 \
      --boot-disk-size 40 \
      --network "${network_name}" \
      --subnetwork "${subnetwork_name}" \
      --service-account "${service_account}" \
      --user "${DSUB_USER_NAME}" \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/${submitted_at}/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

# Make variant keep files

In [None]:
# Write one variant-ID list per autosome (chr1..chr22); these files are later
# passed to `plink --extract`.
SNP_EXTRACT_DIR = '/home/jupyter/data/snp_extract_files'

# to_csv does not create missing directories, so make sure the target exists
# (otherwise this cell fails on a fresh VM).
os.makedirs(SNP_EXTRACT_DIR, exist_ok=True)

for chromosome in range(1, 23):
    vids = (
        pd.read_parquet(f'/home/jupyter/data/vat/vat_chr{chromosome}.parquet')
        # First column only -- presumably the variant ID in
        # chrom-pos-ref-alt form; TODO confirm against the VAT schema.
        .iloc[:, 0]
        # Literal replacement of '-' by ':'; regex=False makes this independent
        # of the pandas version's default for `regex`.
        .str.replace('-', ':', regex=False)
        .drop_duplicates()
    )

    # One ID per line, no header and no index column.
    # NOTE(review): the file name suggests missingness/MAF filtering happened
    # upstream; this cell itself does not filter.
    vids.to_csv(
        f'{SNP_EXTRACT_DIR}/afr.and.eur.missingness.05.afr.or.eur.maf.001.{chromosome}.txt',
        index=False,
        header=False,
    )

In [None]:
!gsutil rsync /home/jupyter/data/snp_extract_files/ ${WORKSPACE_BUCKET}/data/snp_extract_files/

# Make plinks

In [None]:
%%bash
# Copy every local *.keep file to the workspace bucket. The dsub jobs below
# read this bucket directory via --input-recursive SAMPLE_KEEP_DIR.
# NOTE(review): the .keep files are not created in this notebook -- they are
# presumably produced by an earlier notebook; confirm before running.
gsutil cp /home/jupyter/data/sample_keep_files/*.keep ${WORKSPACE_BUCKET}/data/sample_keep_files/


In [None]:
# Build the dsub task table for make_plinks.sh: one row (= one task) per
# autosome. The column headers are dsub flags, so each row supplies that
# task's CHROM value and its two --input paths.
chroms = np.arange(1,23)
CDR_STORAGE_PATH = os.environ['CDR_STORAGE_PATH']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

# Per-chromosome variant list uploaded to the workspace bucket.
snp_keep_paths = []
# Wildcard over the files named chr<N>.* in the source PLINK directory.
plink_globs = []
for chrom in chroms:
    snp_keep_paths.append(
        f'{WORKSPACE_BUCKET}/data/snp_extract_files/afr.and.eur.missingness.05.afr.or.eur.maf.001.{chrom}.txt'
    )
    plink_globs.append(
        f'gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/plink_bed/chr{chrom}.*'
    )

task_df = pd.DataFrame(
    {
        '--env CHROM': chroms,
        '--input SNP_KEEP_FILE': snp_keep_paths,
        '--input PLINK_FILES': plink_globs,
    }
)

# Tab-separated, header row included, no index column.
task_df.to_csv('make_plink_task_df.tsv', sep='\t', index=False)

In [None]:
%%writefile make_plinks.sh

# Per-chromosome dsub task: subset the source PLINK fileset to the selected
# variants and to each population's samples, then upload the result.
# Reads CHROM, PLINK_FILES, SNP_KEEP_FILE, SAMPLE_KEEP_DIR and WORKSPACE_BUCKET
# from the environment.

set -o errexit
# Abort on any unset variable. Without this, an unset WORKSPACE_BUCKET would
# make the upload target the local path /data/plink/ and the following rm
# would then delete the freshly computed output.
set -o nounset
set -o xtrace

# PLINK_FILES is a wildcard path; its directory holds this chromosome's files.
PLINK_DIR="$(dirname "${PLINK_FILES}")"
BFILE="${PLINK_DIR}/chr${CHROM}"

# Rewrite the .bim in place, dropping the first three characters of column 1
# and column 2 -- presumably a "chr" prefix, so the IDs match the --extract
# list; TODO confirm against the source .bim.
awk -F'\t' \
    'BEGIN{OFS="\t"} {$1 = substr($1,4); $2 = substr($2, 4); print}' \
    "${BFILE}.bim" > tmp.bim
mv tmp.bim "${BFILE}.bim"

# Same extraction for both populations; only the keep file and output prefix
# differ.
for POP in afr70346 eur70346
do
    OUT_PREFIX="chr${CHROM}_${POP}"

    plink \
        --keep-allele-order \
        --keep "${SAMPLE_KEEP_DIR}/${POP}.keep" \
        --bfile "${BFILE}" \
        --extract "${SNP_KEEP_FILE}" \
        --make-bed \
        --memory 8000 \
        --out "${OUT_PREFIX}"

    # Upload everything plink wrote for this prefix, then remove the local
    # copies before processing the next population.
    gsutil -m cp "${OUT_PREFIX}"* "${WORKSPACE_BUCKET}/data/plink/"
    rm "${OUT_PREFIX}"*
done

In [None]:
%%bash --out JOB_NAME
# Submit make_plinks.sh as a dsub array job. `--out JOB_NAME` stores this
# cell's stdout in the Python variable JOB_NAME.
#
# Fixes relative to the earlier version of this cell:
#  * --image uses double quotes; in single quotes ${DOCKER_PREFIX} was passed
#    to dsub literally instead of being expanded.
#  * --env takes KEY=VALUE; "WORKSPACE_BUCKET <value>" (no '=') is parsed as
#    two separate variables.
#  * --tasks FILE 22 submits only task 22; the range 1-22 submits one task
#    per row of the task file (all autosomes).
source ~/aou_dsub.bash # Defines aou_dsub; written by the %%writefile cell earlier in this notebook.

aou_dsub \
  --machine-type n1-highmem-2 \
  --image "${DOCKER_PREFIX}:1.5" \
  --env WORKSPACE_BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 2000 \
  --tasks make_plink_task_df.tsv 1-22 \
  --input-recursive SAMPLE_KEEP_DIR="${WORKSPACE_BUCKET}/data/sample_keep_files" \
  --script make_plinks.sh

# Compute MAFs

In [None]:
%%writefile calc_afs.sh

# Per-chromosome dsub task: run `plink --freq` once per population.
# Stop at the first failing command and echo every command as it runs.
set -o errexit
set -o xtrace

# PLINK_FILES is a wildcard path; only its directory is needed here.
PLINK_DIR="$(dirname "${PLINK_FILES}")"

for POPULATION in afr70346 eur70346
do
    # Shared name of the input fileset and of the output report.
    FILESET=chr${CHROM}_${POPULATION}

    plink \
        --keep-allele-order \
        --bfile ${PLINK_DIR}/${FILESET} \
        --keep ${SAMPLE_KEEP_DIR}/${POPULATION}.keep \
        --freq \
        --memory 10000 \
        --out ${OUTPUT_PATH}/${FILESET}
done

In [None]:
# Build the dsub task table for calc_afs.sh: one row (= one task) per
# autosome, giving the CHROM value and a wildcard over that chromosome's
# files under data/plink/ in the workspace bucket.
chroms = np.arange(1,23)
CDR_STORAGE_PATH = os.environ['CDR_STORAGE_PATH']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

plink_globs = []
for chrom in chroms:
    plink_globs.append(f'{WORKSPACE_BUCKET}/data/plink/chr{chrom}_*')

task_df = pd.DataFrame(
    {
        '--env CHROM': chroms,
        '--input PLINK_FILES': plink_globs,
    }
)

# Tab-separated, header row included, no index column.
task_df.to_csv('calc_af_task_df.tsv', sep='\t', index=False)

In [None]:
%%bash --out JOB_NAME
# Submit calc_afs.sh as a dsub array job. `--out JOB_NAME` stores this cell's
# stdout in the Python variable JOB_NAME.
#
# --tasks FILE 22 submits only task 22; the range 1-22 submits one task per
# row of the task file, which the merge step below needs (it reads chr1..22).
source ~/aou_dsub.bash # Defines aou_dsub; written by the %%writefile cell earlier in this notebook.

aou_dsub \
  --machine-type n1-highmem-2 \
  --image 'us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.0.11' \
  --tasks calc_af_task_df.tsv 1-22 \
  --disk-size 600 \
  --input-recursive SAMPLE_KEEP_DIR="${WORKSPACE_BUCKET}/data/sample_keep_files" \
  --output-recursive OUTPUT_PATH="${WORKSPACE_BUCKET}/data/maf/" \
  --script "calc_afs.sh"

In [None]:
!gsutil -m rsync ${WORKSPACE_BUCKET}/data/maf/ /home/jupyter/data/maf/

# Make a merged MAF df

In [None]:
# Combine the per-chromosome, per-population .frq reports into one table with
# one row per variant and both populations' frequencies side by side.

# Columns shared by both populations' reports; used as explicit merge keys.
# These are the non-renamed columns of a PLINK 1.9 --freq report, i.e. exactly
# the keys the previous implicit merge used.
FRQ_KEY_COLUMNS = ['CHR', 'SNP', 'A1', 'A2']

maf_df_list = []
# The loop variable is `chrom`, not `chr`, so the builtin chr() is not
# shadowed for the rest of the kernel session.
for chrom in tqdm.tqdm(range(1, 23)):
    maf_chr_list = []
    for pop in ['afr70346', 'eur70346']:
        maf = pd.read_csv(
            f'/home/jupyter/data/maf/chr{chrom}_{pop}.frq',
            sep='\\s+',  # whitespace-delimited
        ).rename(
            # Per-population names for the two value columns so they do not
            # collide in the merge (the NCHROBS suffix is the 3-letter code).
            columns={'MAF': f'MAF_{pop}', 'NCHROBS': f'NCHROBS_{pop[:3]}'}
        )
        maf_chr_list.append(maf)
    # Inner join: keeps variants present in both populations' reports.
    maf_df_list.append(
        maf_chr_list[0].merge(maf_chr_list[1], on=FRQ_KEY_COLUMNS)
    )
maf_df = pd.concat(maf_df_list)

# Fold each frequency to min(f, 1 - f) so the stored value is never above 0.5.
# NOTE(review): reset_index() without drop=True keeps the old per-chromosome
# row number as an 'index' column in the parquet file; this is preserved in
# case downstream code reads it -- confirm whether it is wanted.
maf_df = maf_df.assign(
    MAF_afr70346=lambda df: np.minimum(df.MAF_afr70346, 1 - df.MAF_afr70346),
    MAF_eur70346=lambda df: np.minimum(df.MAF_eur70346, 1 - df.MAF_eur70346),
).reset_index()

maf_df.to_parquet(
    '/home/jupyter/data/maf/full_maf_merged.parquet'
)