In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import tqdm
from statsmodels.stats.proportion import proportion_confint
import statsmodels.api as sm
from scipy.optimize import minimize_scalar

# Get unadmixed allele frequencies (AF)

In [None]:
WORKSPACE_BUCKET = os.getenv("WORKSPACE_BUCKET")
MULTISUSIE_BUCKET=#censored
%env DOCKER_PREFIX=XXXX # this is censored because pulling docker images from google container registry charges the owner of the image

In [None]:
%%writefile afr_unadmixed_af.sh

# Task script: activates the admix-kit conda environment and runs the
# allele-frequency script (AF_SCRIPT) for a single chromosome (CHROM).
# All inputs are read from environment variables supplied by the task runner.

set -o errexit
set -o xtrace

source /opt/miniconda/etc/profile.d/conda.sh
conda init bash
conda activate /root/miniforge3/envs/admix-kit

# BFILE_PREFIX is submitted as a wildcard path (".../chr<N>_afr70346.*"), so keep
# only its directory and rebuild the extension-less prefix for this chromosome.
BFILE_DIR="$(dirname "${BFILE_PREFIX}")"
BFILE_PREFIX="${BFILE_DIR}/chr${CHROM}_afr70346"

# Every expansion is quoted so a path containing whitespace or glob characters
# reaches the script as exactly one argument.
python "${AF_SCRIPT}" \
    --pvar_path "${PVAR_PATH}" \
    --psam_path "${PSAM_PATH}" \
    --sample_keep_path "${SAMPLE_KEEP_PATH}" \
    --lanc_path "${LANC_PATH}" \
    --bfile_prefix "${BFILE_PREFIX}" \
    --chromosome "${CHROM}" \
    --out_dir "${OUT_DIR}"

In [None]:
# Build the dsub task table: one row per autosome (chr1..chr22). Column headers
# are dsub parameter specs; scalar values are broadcast to every row.
chromosomes = list(range(1, 23))

task_columns = {
    'chrom': chromosomes,
    '--input AF_SCRIPT': MULTISUSIE_BUCKET + '/scripts/one_time_scripts/get_unadmixed_allele_frequencies.py',
    # Wildcard so every file sharing the per-chromosome plink prefix is localized.
    '--input BFILE_PREFIX': [
        WORKSPACE_BUCKET + f'/data/plink/chr{chrom}_afr70346.*' for chrom in chromosomes
    ],
    '--input PVAR_PATH': [
        MULTISUSIE_BUCKET + f'/data/lai/chr{chrom}.pvar' for chrom in chromosomes
    ],
    '--input PSAM_PATH': [
        MULTISUSIE_BUCKET + f'/data/lai/chr{chrom}.psam' for chrom in chromosomes
    ],
    # The same keep file is used for every chromosome.
    '--input SAMPLE_KEEP_PATH': WORKSPACE_BUCKET + '/data/sample_keep_files/afr70346.keep',
    '--input LANC_PATH': [
        MULTISUSIE_BUCKET + f'/data/lai/chr{chrom}.lanc' for chrom in chromosomes
    ],
    '--env CHROM': chromosomes,
    '--output-recursive OUT_DIR': WORKSPACE_BUCKET + '/data/maf',
}

task_df = pd.DataFrame(task_columns)

# Tab-separated, no index column: the header row must contain only parameter specs.
task_df.to_csv('afr_unadmixed_af_tasks.tsv', sep='\t', index=False)

In [None]:
%%bash
source ~/aou_dsub.bash

# Submit tasks 1-22 of the task file (one per chromosome) on n2-highmem-4.
# The image argument is double-quoted so bash expands DOCKER_PREFIX; in single
# quotes the literal text ${DOCKER_PREFIX}:1.5 would be passed as the image name.
aou_dsub \
  --name afr_unadmixed_af \
  --tasks afr_unadmixed_af_tasks.tsv 1-22 \
  --image "${DOCKER_PREFIX}:1.5" \
  --machine-type n2-highmem-4 \
  --script "afr_unadmixed_af.sh"

In [None]:
%%bash
source ~/aou_dsub.bash

# Resubmit tasks 1-13 on a larger machine (n2-highmem-8).
# NOTE(review): presumably these are the tasks that did not finish on
# n2-highmem-4 (e.g. ran out of memory) -- confirm before re-running.
# The image argument is double-quoted so bash expands DOCKER_PREFIX; in single
# quotes the literal text ${DOCKER_PREFIX}:1.5 would be passed as the image name.
aou_dsub \
  --name afr_unadmixed_af \
  --tasks afr_unadmixed_af_tasks.tsv 1-13 \
  --image "${DOCKER_PREFIX}:1.5" \
  --machine-type n2-highmem-8 \
  --script "afr_unadmixed_af.sh"

In [None]:
%%bash
source ~/aou_dsub.bash

# Resubmit tasks 1-2 on a still larger machine (n2-highmem-16).
# NOTE(review): presumably these are the tasks that did not finish on
# n2-highmem-8 (e.g. ran out of memory) -- confirm before re-running.
# The image argument is double-quoted so bash expands DOCKER_PREFIX; in single
# quotes the literal text ${DOCKER_PREFIX}:1.5 would be passed as the image name.
aou_dsub \
  --name afr_unadmixed_af \
  --tasks afr_unadmixed_af_tasks.tsv 1-2 \
  --image "${DOCKER_PREFIX}:1.5" \
  --machine-type n2-highmem-16 \
  --script "afr_unadmixed_af.sh"

In [None]:
!gsutil -m cp ${WORKSPACE_BUCKET}/data/maf/chr*_afr_unadmixed.frq /home/jupyter/data/maf/

In [None]:
# Read the tab-separated per-chromosome .frq tables (chr1..chr22, in order) and
# stack them into one frame.
chrom_frq_tables = [
    pd.read_csv(f'/home/jupyter/data/maf/chr{c}_afr_unadmixed.frq', sep='\t')
    for c in range(1, 23)
]

# ignore_index=True gives a clean 0..N-1 index without keeping each file's own row
# numbers as a stray `index` column (which a bare reset_index() would add).
unadmixed_maf = pd.concat(chrom_frq_tables, ignore_index=True)

In [None]:
# Load the merged MAF table that the new column will be attached to.
maf_merged = pd.read_parquet('/home/jupyter/data/maf/full_maf_merged.parquet')

# Validate row alignment BEFORE binding `maf`: `.assign` aligns on the index, so
# both frames must share the same index and list the same SNP in every row.
# Checking first means a failed check never leaves a misaligned `maf` in the
# kernel for a later cell to write out.
assert maf_merged.index.equals(unadmixed_maf.index), \
    'full_maf_merged and unadmixed_maf have different indexes/lengths'
assert (maf_merged.SNP == unadmixed_maf.SNP).all(), \
    'SNP order differs between full_maf_merged and unadmixed_maf'

# Fold the frequency onto [0, 0.5]: the minor-allele frequency is the smaller of
# f and 1 - f.
maf = maf_merged.assign(
    MAF_afr_unadmixed=np.minimum(unadmixed_maf.MAF, 1 - unadmixed_maf.MAF)
)

In [None]:
# Persist the augmented table. This is the same path the table was read from, so
# the file on disk is overwritten with the version that has the added column.
merged_maf_path = '/home/jupyter/data/maf/full_maf_merged.parquet'
maf.to_parquet(merged_maf_path)