In [None]:
import os
import numpy as np
import pandas as pd
import pandas_profiling
import plotnine
from plotnine import *  # Provides a ggplot-like interface to matplotlib.
from IPython.display import display

## Plot setup.
# Every plot drawn after this cell defaults to the black-and-white theme at base font size 11.
plotnine.theme_set(plotnine.theme_bw(base_size=11))

def get_boxplot_fun_data(df):
  """Builds a one-row annotation frame (position + count label) for ggplot boxplots.

  Args:
    df: The values to summarise; anything accepted by the built-in max() and len().
  Returns:
    A data frame with a single row (index 0) holding column y, the maximum of
    the input, and column label, the text 'N = <number of items>'.
  """
  annotation = pd.DataFrame(
      data={'y': max(df), 'label': f'N = {len(df)}'},
      index=[0],
  )
  return annotation

# NOTE: if you get any errors from this cell, restart your kernel and run it again.


In [None]:
# Read the WORKSPACE_BUCKET environment variable (None when it is not set) and show it.
bucket = os.environ.get("WORKSPACE_BUCKET")
bucket

In [None]:
%%writefile ./generate_plink_pops.sh
# Generate population-specific genotypes by chromosome.
#
# NOTE: the %%writefile magic above must stay on the very first line of the
# notebook cell; anything placed before it stops IPython from treating the
# cell as a cell magic, and the script is never written.
#
# Environment variables read by this script (nounset makes each one mandatory):
#   PLINK_FILES  path to a file of the PLINK fileset; its last extension is
#                stripped to build the --bfile prefix
#   PLINK2_SOFT  zip archive unpacked into the working directory; expected to
#                provide the ./plink2 executable called below
#   PCA_DIR      directory holding the aou_v7_<pop>.ids keep-lists
#   OUT_DIR      directory receiving the per-population filesets
#   CHROM        chromosome label used in the output file name

set -o errexit
set -o nounset

PLINK_BFILE="${PLINK_FILES%.*}"
echo "${PLINK_BFILE}"

unzip "${PLINK2_SOFT}"

# Space-separated list of populations to extract.
pops="AFR"

for pop in ${pops}; do

    # Keep only the samples listed for this population and write a new fileset.
    ./plink2 --bfile "${PLINK_BFILE}" \
        --keep "${PCA_DIR}/aou_v7_${pop}.ids" \
        --make-bed \
        --out "${OUT_DIR}/aou_v7_${pop}_chr${CHROM}"

done


In [None]:
%%writefile ./Regenie_GWAS_quant.sh
# Runs REGENIE step 1 and step 2 for each population, without --bt
# (i.e. the phenotype is not declared binary), then moves the step 2
# result files to OUTPUT_PATH.
#
# Environment variables read by this script (nounset makes each one mandatory):
#   bed_file        path to a file of the PLINK fileset; its last extension is
#                   stripped to build the --bed prefix
#   sample_files    path whose parent directory holds aou_v7_training_<pop>.ids
#   step1_snplists  path whose parent directory holds
#                   aou_v7_<pop>_maf001_qcd_rsids.snplist
#   pheno_file      phenotype file passed to --phenoFile
#   phen_col        phenotype column(s) passed to --phenoColList
#   cov_file        covariate file passed to --covarFile
#   prefix, chrom   used to build the --out / --pred file names
#   OUTPUT_PATH     destination for the *.regenie.gz result files

set -o pipefail
set -o errexit
set -o nounset

PLINK_BFILE="${bed_file%.*}"
sample_dir=$(dirname "${sample_files}")
snps_dir=$(dirname "${step1_snplists}")

# Space-separated list of populations to analyse.
pops="EUR AMR AFR"

for pop in ${pops}; do

    step1_snplist="${snps_dir}/aou_v7_${pop}_maf001_qcd_rsids.snplist"

    # regenie pt 1: fit the model on the variants listed in the step 1 SNP list.
    regenie \
        --step 1 \
        --bed "${PLINK_BFILE}" \
        --keep "${sample_dir}/aou_v7_training_${pop}.ids" \
        --phenoFile "${pheno_file}" \
        --phenoColList "${phen_col}" \
        --covarFile "${cov_file}" \
        --catCovarList sex \
        --extract "${step1_snplist}" \
        --covarColList age,age2,age2_sex,age_sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
        --bsize 1000 \
        --verbose \
        --out "${prefix}_step1_chr${chrom}_${pop}"

    # regenie pt 2: association testing using the step 1 predictions.
    # NOTE(review): step 1 adjusts for age2, age2_sex and age_sex but step 2
    # does not -- confirm the differing covariate sets are intentional.
    regenie \
        --step 2 \
        --bed "${PLINK_BFILE}" \
        --keep "${sample_dir}/aou_v7_training_${pop}.ids" \
        --phenoFile "${pheno_file}" \
        --phenoColList "${phen_col}" \
        --covarFile "${cov_file}" \
        --catCovarList sex \
        --covarColList age,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
        --pred "${prefix}_step1_chr${chrom}_${pop}_pred.list" \
        --bsize 400 \
        --verbose --minMAC 20 \
        --gz \
        --out "${prefix}_step2_chr${chrom}_${pop}"
done

# The glob pattern is stored in a variable and expanded UNQUOTED on purpose so
# that it matches every result file; the destination is quoted so a path with
# spaces or glob characters is passed to mv as a single argument.
export regenie_results="*.regenie.gz"
echo "regenie_results: ${regenie_results}"
mv ${regenie_results} "${OUTPUT_PATH}"

In [None]:
%%writefile ./Regenie_GWAS_binary.sh
# Runs REGENIE step 1 and step 2 with --bt (binary trait) for each population,
# then moves the step 2 result files to OUTPUT_PATH.
#
# Environment variables read by this script (nounset makes each one mandatory):
#   bed_file        path to a file of the PLINK fileset; its last extension is
#                   stripped to build the --bed prefix
#   sample_files    path whose parent directory holds aou_v7_training_<pop>.ids
#   step1_snplists  path whose parent directory holds
#                   aou_v7_<pop>_maf001_qcd_rsids.snplist
#   pheno_file      phenotype file passed to --phenoFile
#   phen_col        phenotype column(s) passed to --phenoColList
#   cov_file        covariate file passed to --covarFile
#   prefix, chrom   used to build the --out / --pred file names
#   OUTPUT_PATH     destination for the *.regenie.gz result files

set -o pipefail
set -o errexit
set -o nounset

PLINK_BFILE="${bed_file%.*}"
echo "${PLINK_BFILE}"
sample_dir=$(dirname "${sample_files}")
snps_dir=$(dirname "${step1_snplists}")

# Space-separated list of populations to analyse.
pops="EUR AMR AFR"

for pop in ${pops}; do

    step1_snplist="${snps_dir}/aou_v7_${pop}_maf001_qcd_rsids.snplist"

    # regenie pt 1: fit the model on the variants listed in the step 1 SNP list.
    regenie \
        --step 1 \
        --bed "${PLINK_BFILE}" \
        --keep "${sample_dir}/aou_v7_training_${pop}.ids" \
        --phenoFile "${pheno_file}" \
        --phenoColList "${phen_col}" \
        --covarFile "${cov_file}" \
        --catCovarList sex \
        --covarColList age,age2,age2_sex,age_sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
        --bsize 1000 \
        --extract "${step1_snplist}" \
        --verbose \
        --bt \
        --out "${prefix}_step1_chr${chrom}_${pop}"

    # regenie pt 2: association testing using the step 1 predictions.
    # NOTE(review): step 1 adjusts for age2, age2_sex and age_sex but step 2
    # does not -- confirm the differing covariate sets are intentional.
    regenie \
        --step 2 \
        --bed "${PLINK_BFILE}" \
        --keep "${sample_dir}/aou_v7_training_${pop}.ids" \
        --phenoFile "${pheno_file}" \
        --phenoColList "${phen_col}" \
        --covarFile "${cov_file}" \
        --catCovarList sex \
        --covarColList age,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
        --pred "${prefix}_step1_chr${chrom}_${pop}_pred.list" \
        --bsize 400 \
        --verbose \
        --minMAC 20 \
        --firth --approx \
        --bt --pThresh 0.05 \
        --af-cc \
        --gz \
        --out "${prefix}_step2_chr${chrom}_${pop}"
done

# The glob pattern is stored in a variable and expanded UNQUOTED on purpose so
# that it matches every result file; the destination is quoted so a path with
# spaces or glob characters is passed to mv as a single argument.
export regenie_results="*.regenie.gz"
echo "regenie_results: ${regenie_results}"
mv ${regenie_results} "${OUTPUT_PATH}"