In [None]:
import os
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import tqdm

# Parse the VAT (Variant Annotation Table) using dsub

## First pass

In [None]:
from datetime import datetime
import os
import pandas as pd
import numpy as np

USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}


JOB_NAME='get_vat'
%env JOB_NAME={JOB_NAME}

%env DOCKER_PREFIX=XXXX # this is censored because pulling docker images from google container registry charges the owner of the image


In [None]:
%%writefile ~/aou_dsub.bash

aou_dsub() {
  # Thin wrapper around dsub: fills in the provider, project, network,
  # service account, user, region and log location, then appends whatever
  # arguments the caller passed.

  # User id = the part of OWNER_EMAIL before the '@' (shorter than the full
  # address, which leaves more characters for the job name).
  local user_id="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

  # For AoU RWB projects the network is called "network" and the subnetwork
  # "subnetwork".
  local vpc_network=network
  local vpc_subnetwork=subnetwork

  local service_account="$(gcloud config get-value account)"

  # The {...} placeholders are passed to dsub literally; only the timestamp
  # is filled in here.
  local log_path="${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log"

  local -a fixed_args=(
    --provider google-cls-v2
    --user-project "${GOOGLE_PROJECT}"
    --project "${GOOGLE_PROJECT}"
    --boot-disk-size 40
    --network "${vpc_network}"
    --subnetwork "${vpc_subnetwork}"
    --service-account "${service_account}"
    --user "${user_id}"
    --regions us-central1
    --logging "${log_path}"
  )

  dsub "${fixed_args[@]}" "$@"
}

In [None]:
%%writefile parse_vat.sh

# Stop at the first failing command and echo every command into the job log.
set -o errexit
set -o xtrace

# Copy the complete VAT to the local disk; -u names the project billed for the read.
gsutil -u "${GOOGLE_PROJECT}" -m cp gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/vat/vat_complete.bgz.tsv.gz vat_complete.bgz.tsv.gz

# The Python program below sits inside a double-quoted shell string, so it
# must not contain double quotes, backticks or stray dollar signs.
python -c "
import subprocess

import numpy as np
import pandas as pd
import tqdm

# Both values are substituted by the shell before Python starts. They are
# wrapped in quotes so that the bucket URL becomes a string literal (an
# unquoted gs://... is a syntax error) and a missing index fails in int().
worker_index = int('${WORKER_INDEX}')
WORKSPACE_BUCKET = '${WORKSPACE_BUCKET}'

# Column names of the full VAT, taken from the header row only.
header = pd.read_csv('vat_complete.bgz.tsv.gz', nrows = 0, sep = '\t', dtype = str).columns.tolist()

# Columns whose value differs between the rows of one variant; they are
# comma-joined when a variant is collapsed to a single record.
transcript_specific_columns = [
    'transcript',
    'gene_symbol',
    'transcript_source',
    'aa_change',
    'consequence',
    'dna_change_in_transcript',
    'exon_number',
    'intron_number',
    'gene_id',
    'is_canonical_transcript',
]

# Subset of VAT columns that is read at all.
relevant_columns = ['vid',
 'transcript',
 'contig',
 'position',
 'ref_allele',
 'alt_allele',
 'gvs_afr_ac',
 'gvs_afr_an',
 'gvs_afr_af',
 'gvs_eur_ac',
 'gvs_eur_an',
 'gvs_eur_af',
 'gene_symbol',
 'transcript_source',
 'aa_change',
 'consequence',
 'dna_change_in_transcript',
 'variant_type',
 'exon_number',
 'intron_number',
 'genomic_location',
 'dbsnp_rsid',
 'gene_id',
 'gene_omim_id',
 'is_canonical_transcript',
 'revel',
 'splice_ai_acceptor_gain_score',
 'splice_ai_acceptor_gain_distance',
 'splice_ai_acceptor_loss_score',
 'splice_ai_acceptor_loss_distance',
 'splice_ai_donor_gain_score',
 'splice_ai_donor_gain_distance',
 'splice_ai_donor_loss_score',
 'splice_ai_donor_loss_distance',
 'omim_phenotypes_id',
 'omim_phenotypes_name',
 'clinvar_classification',
 'clinvar_last_updated',
 'clinvar_phenotype']
relevant_column_indices = np.flatnonzero([c in relevant_columns for c in header]).tolist()
# Names in the order the selected columns appear in the file, so that every
# name lands on its own column even if relevant_columns is ordered differently.
relevant_column_names = [header[i] for i in relevant_column_indices]
non_specific_columns = [c for c in relevant_columns if c not in transcript_specific_columns]

def compress_df_chunk(df, extra_columns = None):
    '''Collapse all rows of one variant into a single record.

    df: the rows sharing one vid.
    extra_columns: optional list of additional columns to copy from the
        first row; None means no additional columns.

    Returns a Series holding the non-transcript-specific columns (except vid)
    of the first row, followed by every transcript-specific column joined
    with commas across the rows.
    '''
    first_row_columns = [c for c in non_specific_columns if c != 'vid']
    if extra_columns is not None:
        first_row_columns = first_row_columns + list(extra_columns)
    non_specific_component = df.iloc[0][first_row_columns]
    transcript_specific_component = df[transcript_specific_columns].astype(str).agg(','.join)
    return pd.concat([non_specific_component, transcript_specific_component])

def gsutil_cp(src, dst, clobber = True, quiet = False):
    '''Copy src to dst with gsutil -m cp; raises CalledProcessError on failure.

    clobber = False adds -n to cp, quiet = True adds -q to gsutil.
    '''
    gsutil_options = ['-m']
    if quiet:
        gsutil_options += ['-q']

    cp_options = []
    if not clobber:
        cp_options += ['-n']

    subprocess.run(
        ['gsutil'] + gsutil_options + ['cp'] + cp_options + [src, dst],
        check = True,
        universal_newlines = True
    )

dtypes = {c : str for c in relevant_columns}
dtypes['gvs_afr_ac'] = int
dtypes['gvs_afr_an'] = int
dtypes['gvs_afr_af'] = float
dtypes['gvs_eur_ac'] = int
dtypes['gvs_eur_an'] = int
dtypes['gvs_eur_af'] = float

# Each worker reads its own block of 1e9 lines in chunks of 1e6 rows. With
# header = 0 the first line after the skipped ones is consumed as a header
# row and replaced by the supplied names.
vat_iterator = pd.read_csv(
    'vat_complete.bgz.tsv.gz',
    chunksize = int(1e6),
    sep = '\t',
    header = 0,
    names = relevant_column_names,
    dtype = dtypes,
    usecols = relevant_column_indices,
    nrows = int(1e9),
    skiprows = int(1e9 * worker_index)
)

# NOTE(review): presumably the number of AFR and EUR samples - confirm.
n_afr = 79826
n_eur = 223350

# A variant is kept only if its allele number is at least 95 percent of 2 * n
# in both groups.
allele_number_threshold_afr = n_afr * 2 * .95
allele_number_threshold_eur = n_eur * 2 * .95

old_chromosomes = None
relevant_chromosomes = [f'chr{i}' for i in range(1,23)]
for chunk in tqdm.tqdm(vat_iterator):
    # Only worker 4 restricts its rows to chr1..chr22.
    if worker_index == 4:
        chunk = chunk.query('contig in @relevant_chromosomes')
    chunk_filtered = chunk.assign(
        gvs_afr_an = lambda df: df.gvs_afr_an.astype(int),
        gvs_eur_an = lambda df: df.gvs_eur_an.astype(int),
        gvs_afr_af = lambda df: df.gvs_afr_af.astype(float),
        gvs_eur_af = lambda df: df.gvs_eur_af.astype(float),
    ).query(
        '(0.001 <= gvs_afr_af <= .999) | (0.001 <= gvs_eur_af <= .999)'
    ).query(
        'gvs_afr_an >= @allele_number_threshold_afr'
    ).query(
        'gvs_eur_an >= @allele_number_threshold_eur'
    )
    # Nothing survived the filters: skip the chunk instead of running the
    # groupby on an empty frame. Files written so far are still uploaded
    # by the gsutil command after this program.
    if chunk_filtered.empty:
        continue
    chunk_filtered_collapsed = chunk_filtered.groupby(
        ['vid'], sort = False
    ).apply(
        compress_df_chunk
    ).reset_index(
    ).assign(
        # Despite the names, these two columns hold AN / (2 * n).
        gvs_eur_missingness = lambda df: df.gvs_eur_an / (n_eur * 2),
        gvs_afr_missingness = lambda df: df.gvs_afr_an / (n_afr * 2),
        # Contig without its first three characters, e.g. chr7 becomes 7.
        CHR = lambda df: df.contig.str.slice(3)
    )
    new_chromosomes = chunk_filtered_collapsed.CHR.drop_duplicates().tolist()
    if old_chromosomes is None:
        old_chromosomes = new_chromosomes
    # Append this chunk to the per-chromosome shard files (no header row).
    for chromosome in new_chromosomes:
        chunk_filtered_collapsed.query(
            'CHR == @chromosome'
        ).to_csv(
            f'vat_chr{chromosome}_worker_{worker_index}.tsv',
            mode = 'a',
            header = False,
            index = False,
            sep = '\t'
        )
    # A chromosome seen in the previous chunk but not in this one is
    # finished, so its shard is uploaded right away.
    for chromosome in old_chromosomes:
        if chromosome not in new_chromosomes:
            gsutil_cp(f'vat_chr{chromosome}_worker_{worker_index}.tsv', f'{WORKSPACE_BUCKET}/data/vat/vat_chr{chromosome}_worker_{worker_index}.tsv')
    old_chromosomes = new_chromosomes
"

# Upload every shard, including the one for the chromosome still in progress
# when the input ended.
gsutil -m cp vat_chr* "${WORKSPACE_BUCKET}/data/vat/"

In [None]:
# Task table for the dsub submission: a single column whose header is
# '--env WORKER_INDEX' and whose rows are the worker indices 0 to 4.
worker_indices = list(range(5))
task_df = pd.DataFrame({'--env WORKER_INDEX': worker_indices})
task_df.to_csv('index_df.tsv', index = False, sep = '\t')

In [None]:
%%bash --out JOB_NAME
# Defines aou_dsub; the file is written by the %%writefile ~/aou_dsub.bash cell
# earlier in this notebook.
source ~/aou_dsub.bash

# dsub expects --env as KEY=VALUE. GOOGLE_PROJECT is forwarded as well because
# parse_vat.sh passes it to gsutil -u.
# NOTE(review): the range 2-5 leaves out the first task of index_df.tsv
# (presumably WORKER_INDEX 0) - confirm that task was submitted separately.
aou_dsub \
  --machine-type n1-highmem-2 \
  --tasks index_df.tsv 2-5 \
  --env WORKSPACE_BUCKET="${WORKSPACE_BUCKET}" \
  --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
  --image "${DOCKER_PREFIX}:1.4" \
  --disk-size 1000 \
  --script parse_vat.sh

## Finish broken chromosome 22 separately

In [None]:
%%writefile parse_vat_22.sh

# Echo every command into the job log.
# NOTE(review): unlike parse_vat.sh this script does not set errexit, so the
# final upload runs even when an earlier step fails - confirm this is intended.
set -o xtrace

# Copy the complete VAT to the local disk; -u names the project billed for the read.
gsutil -u "${GOOGLE_PROJECT}" -m cp gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/vat/vat_complete.bgz.tsv.gz vat_complete.bgz.tsv.gz

# The Python program below sits inside a double-quoted shell string, so it
# must not contain double quotes, backticks or stray dollar signs.
python -c "
import subprocess

import numpy as np
import pandas as pd
import tqdm

# Fixed index used only to label the output file of this rerun.
worker_index = 6
# Substituted by the shell before Python starts; the quotes turn the bucket
# URL into a string literal (an unquoted gs://... is a syntax error).
WORKSPACE_BUCKET = '${WORKSPACE_BUCKET}'

# Column names of the full VAT, taken from the header row only.
header = pd.read_csv('vat_complete.bgz.tsv.gz', nrows = 0, sep = '\t', dtype = str).columns.tolist()

# Columns whose value differs between the rows of one variant; they are
# comma-joined when a variant is collapsed to a single record.
transcript_specific_columns = [
    'transcript',
    'gene_symbol',
    'transcript_source',
    'aa_change',
    'consequence',
    'dna_change_in_transcript',
    'exon_number',
    'intron_number',
    'gene_id',
    'is_canonical_transcript',
]

# Subset of VAT columns that is read at all.
relevant_columns = ['vid',
 'transcript',
 'contig',
 'position',
 'ref_allele',
 'alt_allele',
 'gvs_afr_ac',
 'gvs_afr_an',
 'gvs_afr_af',
 'gvs_eur_ac',
 'gvs_eur_an',
 'gvs_eur_af',
 'gene_symbol',
 'transcript_source',
 'aa_change',
 'consequence',
 'dna_change_in_transcript',
 'variant_type',
 'exon_number',
 'intron_number',
 'genomic_location',
 'dbsnp_rsid',
 'gene_id',
 'gene_omim_id',
 'is_canonical_transcript',
 'revel',
 'splice_ai_acceptor_gain_score',
 'splice_ai_acceptor_gain_distance',
 'splice_ai_acceptor_loss_score',
 'splice_ai_acceptor_loss_distance',
 'splice_ai_donor_gain_score',
 'splice_ai_donor_gain_distance',
 'splice_ai_donor_loss_score',
 'splice_ai_donor_loss_distance',
 'omim_phenotypes_id',
 'omim_phenotypes_name',
 'clinvar_classification',
 'clinvar_last_updated',
 'clinvar_phenotype']
relevant_column_indices = np.flatnonzero([c in relevant_columns for c in header]).tolist()
# Names in the order the selected columns appear in the file, so that every
# name lands on its own column even if relevant_columns is ordered differently.
relevant_column_names = [header[i] for i in relevant_column_indices]
non_specific_columns = [c for c in relevant_columns if c not in transcript_specific_columns]

def compress_df_chunk(df, extra_columns = None):
    '''Collapse all rows of one variant into a single record.

    df: the rows sharing one vid.
    extra_columns: optional list of additional columns to copy from the
        first row; None means no additional columns.

    Returns a Series holding the non-transcript-specific columns (except vid)
    of the first row, followed by every transcript-specific column joined
    with commas across the rows.
    '''
    first_row_columns = [c for c in non_specific_columns if c != 'vid']
    if extra_columns is not None:
        first_row_columns = first_row_columns + list(extra_columns)
    non_specific_component = df.iloc[0][first_row_columns]
    transcript_specific_component = df[transcript_specific_columns].astype(str).agg(','.join)
    return pd.concat([non_specific_component, transcript_specific_component])

def gsutil_cp(src, dst, clobber = True, quiet = False):
    '''Copy src to dst with gsutil -m cp; raises CalledProcessError on failure.

    clobber = False adds -n to cp, quiet = True adds -q to gsutil.
    '''
    gsutil_options = ['-m']
    if quiet:
        gsutil_options += ['-q']

    cp_options = []
    if not clobber:
        cp_options += ['-n']

    subprocess.run(
        ['gsutil'] + gsutil_options + ['cp'] + cp_options + [src, dst],
        check = True,
        universal_newlines = True
    )

dtypes = {c : str for c in relevant_columns}
dtypes['gvs_afr_ac'] = int
dtypes['gvs_afr_an'] = int
dtypes['gvs_afr_af'] = float
dtypes['gvs_eur_ac'] = int
dtypes['gvs_eur_an'] = int
dtypes['gvs_eur_af'] = float

# Skip the first 4.69e9 lines and read at most 2e8 rows in chunks of 1e6.
# NOTE(review): the offset presumably lands shortly before chr22 - confirm.
vat_iterator = pd.read_csv(
    'vat_complete.bgz.tsv.gz',
    chunksize = int(1e6),
    sep = '\t',
    header = 0,
    names = relevant_column_names,
    dtype = dtypes,
    usecols = relevant_column_indices,
    nrows = int(2e8),
    skiprows = int(4.69e9)
)

# NOTE(review): presumably the number of AFR and EUR samples - confirm.
n_afr = 79826
n_eur = 223350

# A variant is kept only if its allele number is at least 95 percent of 2 * n
# in both groups.
allele_number_threshold_afr = n_afr * 2 * .95
allele_number_threshold_eur = n_eur * 2 * .95

old_chromosomes = None
late_chromosomes = ['chr21', 'chr22']
latest_chromosome = 'chr22'
for chunk in tqdm.tqdm(vat_iterator):
    # Stop at the first chunk holding neither chr21 nor chr22 rows.
    chunk = chunk.query('contig in @late_chromosomes')
    if chunk.shape[0] == 0:
        break
    # Chunks that only hold chr21 rows are skipped; only chr22 is processed.
    chunk = chunk.query('contig == @latest_chromosome')
    if chunk.shape[0] == 0:
        continue
    chunk_filtered = chunk.assign(
        gvs_afr_an = lambda df: df.gvs_afr_an.astype(int),
        gvs_eur_an = lambda df: df.gvs_eur_an.astype(int),
        gvs_afr_af = lambda df: df.gvs_afr_af.astype(float),
        gvs_eur_af = lambda df: df.gvs_eur_af.astype(float),
    ).query(
        '(0.001 <= gvs_afr_af <= .999) | (0.001 <= gvs_eur_af <= .999)'
    ).query(
        'gvs_afr_an >= @allele_number_threshold_afr'
    ).query(
        'gvs_eur_an >= @allele_number_threshold_eur'
    )
    # Nothing survived the filters: skip the chunk instead of running the
    # groupby on an empty frame.
    if chunk_filtered.empty:
        continue
    # Unlike parse_vat.sh, no missingness columns are added here; the shard
    # ends with CHR.
    chunk_filtered_collapsed = chunk_filtered.groupby(
        ['vid'], sort = False
    ).apply(
        compress_df_chunk
    ).reset_index(
    ).assign(
        # Contig without its first three characters, e.g. chr22 becomes 22.
        CHR = lambda df: df.contig.str.slice(3)
    )
    new_chromosomes = chunk_filtered_collapsed.CHR.drop_duplicates().tolist()
    if old_chromosomes is None:
        old_chromosomes = new_chromosomes
    # Append this chunk to the per-chromosome shard file (no header row).
    for chromosome in new_chromosomes:
        chunk_filtered_collapsed.query(
            'CHR == @chromosome'
        ).to_csv(
            f'vat_chr{chromosome}_worker_{worker_index}.tsv',
            mode = 'a',
            header = False,
            index = False,
            sep = '\t'
        )
    # Upload shards of chromosomes that no longer appear in the current chunk.
    for chromosome in old_chromosomes:
        if chromosome not in new_chromosomes:
            gsutil_cp(f'vat_chr{chromosome}_worker_{worker_index}.tsv', f'{WORKSPACE_BUCKET}/data/vat/vat_chr{chromosome}_worker_{worker_index}.tsv')
    old_chromosomes = new_chromosomes
"

# Upload every shard written above.
gsutil -m cp vat_chr* "${WORKSPACE_BUCKET}/data/vat/"

In [None]:
%%bash --out JOB_NAME
# Defines aou_dsub; the file is written by the %%writefile ~/aou_dsub.bash cell
# earlier in this notebook.
source ~/aou_dsub.bash

# DOCKER_IMAGE is not set anywhere in this notebook, so the image is spelled
# the same way as in the first-pass submission. dsub expects --env as
# KEY=VALUE, and GOOGLE_PROJECT is forwarded because parse_vat_22.sh passes
# it to gsutil -u.
aou_dsub \
  --machine-type n1-highmem-2 \
  --image "${DOCKER_PREFIX}:1.4" \
  --env WORKSPACE_BUCKET="${WORKSPACE_BUCKET}" \
  --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
  --disk-size 1000 \
  --script parse_vat_22.sh

# Process VATs

In [None]:
!gsutil rsync ${WORKSPACE_BUCKET}/data/vat /home/jupyter/data/vat/

In [None]:
!head /home/jupyter/data/vat/vat_chr22_worker_6.tsv

In [None]:
# Column names of the full VAT, taken from the header row only (nrows = 0).
header = list(
    pd.read_csv(
        '/home/jupyter/data/vat/vat_complete.bgz.tsv.gz',
        sep = '\t',
        dtype = str,
        nrows = 0,
    ).columns
)

# Columns that are comma-joined per variant by compress_df_chunk.
transcript_specific_columns = [
    'transcript', 'gene_symbol', 'transcript_source', 'aa_change',
    'consequence', 'dna_change_in_transcript', 'exon_number',
    'intron_number', 'gene_id', 'is_canonical_transcript',
]

# Subset of VAT columns used in this notebook.
relevant_columns = [
    'vid', 'transcript', 'contig', 'position', 'ref_allele', 'alt_allele',
    'gvs_afr_ac', 'gvs_afr_an', 'gvs_afr_af',
    'gvs_eur_ac', 'gvs_eur_an', 'gvs_eur_af',
    'gene_symbol', 'transcript_source', 'aa_change', 'consequence',
    'dna_change_in_transcript', 'variant_type', 'exon_number',
    'intron_number', 'genomic_location', 'dbsnp_rsid', 'gene_id',
    'gene_omim_id', 'is_canonical_transcript', 'revel',
    'splice_ai_acceptor_gain_score', 'splice_ai_acceptor_gain_distance',
    'splice_ai_acceptor_loss_score', 'splice_ai_acceptor_loss_distance',
    'splice_ai_donor_gain_score', 'splice_ai_donor_gain_distance',
    'splice_ai_donor_loss_score', 'splice_ai_donor_loss_distance',
    'omim_phenotypes_id', 'omim_phenotypes_name',
    'clinvar_classification', 'clinvar_last_updated', 'clinvar_phenotype',
]

# Positions, within the file header, of the columns listed above.
relevant_column_indices = [
    position for position, column in enumerate(header)
    if column in relevant_columns
]

# Relevant columns that are not transcript specific, with and without vid.
_transcript_specific = set(transcript_specific_columns)
non_specific_columns = [
    column for column in relevant_columns
    if column not in _transcript_specific
]
non_specific_columns_other_than_vid = [
    column for column in non_specific_columns if column != 'vid'
]

# Column names applied to the header-less worker shards when they are read
# back in; these shards end with the two missingness columns and CHR.
processed_header = [
    'vid',
    *non_specific_columns_other_than_vid,
    *transcript_specific_columns,
    'gvs_eur_missingness',
    'gvs_afr_missingness',
    'CHR',
]

In [None]:
def compress_df_chunk(df, extra_columns = None):
    """Collapse all rows of one variant into a single record.

    Parameters
    ----------
    df : DataFrame with the rows that share one vid.
    extra_columns : optional list of additional columns to copy from the
        first row; None means no additional columns.

    Returns
    -------
    Series with the non-transcript-specific columns (except vid) and the
    extra columns of the first row, followed by every transcript-specific
    column joined with commas across the rows.
    """
    first_row_columns = [c for c in non_specific_columns if c != 'vid']
    if extra_columns is not None:
        first_row_columns = first_row_columns + list(extra_columns)
    non_specific_component = df.iloc[0][first_row_columns]
    transcript_specific_component = df[transcript_specific_columns].astype(str).agg(','.join)
    return pd.concat([non_specific_component, transcript_specific_component])

vat_dir = '/home/jupyter/data/vat/'
unprocessed_vat_shards = [f for f in os.listdir(vat_dir) if '_worker_' in f]

# Chromosomes 1 to 21; chromosome 22 is handled in a separate cell.
for chrom in tqdm.tqdm(list(range(1,22))):

    # Sorted so the shard order (and with it the row kept by iloc[0] when
    # duplicates are merged) does not depend on the order of os.listdir.
    unprocessed_vat_shards_chrom = sorted(
        f for f in unprocessed_vat_shards if f'chr{chrom}_' in f
    )
    if not unprocessed_vat_shards_chrom:
        raise FileNotFoundError(f'no worker shards for chr{chrom} found in {vat_dir}')

    vat_shard_df_list = []
    for f in unprocessed_vat_shards_chrom:
        df_shard = pd.read_csv(
            vat_dir + f,
            sep = '\t',
            names = processed_header
        )
        # Force these two columns to str in every shard.
        df_shard.omim_phenotypes_id = df_shard.omim_phenotypes_id.astype(str)
        df_shard.is_canonical_transcript = df_shard.is_canonical_transcript.astype(str)
        vat_shard_df_list.append(df_shard)
    vat_chrom = pd.concat(vat_shard_df_list)

    # Rows whose vid occurs more than once across the shards are merged into
    # one record per vid; the vectorised mask replaces a per-row list lookup.
    is_duplicated = vat_chrom.vid.duplicated(keep = False)
    if is_duplicated.any():
        vat_chrom_duplicates_merged = vat_chrom[is_duplicated].groupby(
            ['vid'], sort = False
        ).apply(
            lambda df: compress_df_chunk(df, extra_columns = ['CHR', 'gvs_eur_missingness', 'gvs_afr_missingness']),
        ).reset_index(
        )
        vat_chrom_deduped = pd.concat([
            vat_chrom[~is_duplicated],
            vat_chrom_duplicates_merged
        ])
    else:
        vat_chrom_deduped = vat_chrom

    # Sort in both branches so the output order is the same whether or not
    # duplicates were present.
    vat_chrom_deduped = vat_chrom_deduped.sort_values(
        ['position', 'ref_allele', 'alt_allele']
    )

    vat_chrom_deduped.to_parquet(
        vat_dir + f'vat_chr{chrom}.parquet'
    )

In [None]:
# Column names of the full VAT, taken from the header row only (nrows = 0).
header = list(
    pd.read_csv(
        '/home/jupyter/data/vat/vat_complete.bgz.tsv.gz',
        sep = '\t',
        dtype = str,
        nrows = 0,
    ).columns
)

# Columns that are comma-joined per variant by compress_df_chunk.
transcript_specific_columns = [
    'transcript', 'gene_symbol', 'transcript_source', 'aa_change',
    'consequence', 'dna_change_in_transcript', 'exon_number',
    'intron_number', 'gene_id', 'is_canonical_transcript',
]

# Subset of VAT columns used in this notebook.
relevant_columns = [
    'vid', 'transcript', 'contig', 'position', 'ref_allele', 'alt_allele',
    'gvs_afr_ac', 'gvs_afr_an', 'gvs_afr_af',
    'gvs_eur_ac', 'gvs_eur_an', 'gvs_eur_af',
    'gene_symbol', 'transcript_source', 'aa_change', 'consequence',
    'dna_change_in_transcript', 'variant_type', 'exon_number',
    'intron_number', 'genomic_location', 'dbsnp_rsid', 'gene_id',
    'gene_omim_id', 'is_canonical_transcript', 'revel',
    'splice_ai_acceptor_gain_score', 'splice_ai_acceptor_gain_distance',
    'splice_ai_acceptor_loss_score', 'splice_ai_acceptor_loss_distance',
    'splice_ai_donor_gain_score', 'splice_ai_donor_gain_distance',
    'splice_ai_donor_loss_score', 'splice_ai_donor_loss_distance',
    'omim_phenotypes_id', 'omim_phenotypes_name',
    'clinvar_classification', 'clinvar_last_updated', 'clinvar_phenotype',
]

# Positions, within the file header, of the columns listed above.
relevant_column_indices = [
    position for position, column in enumerate(header)
    if column in relevant_columns
]

# Relevant columns that are not transcript specific, with and without vid.
_transcript_specific = set(transcript_specific_columns)
non_specific_columns = [
    column for column in relevant_columns
    if column not in _transcript_specific
]
non_specific_columns_other_than_vid = [
    column for column in non_specific_columns if column != 'vid'
]

# Column names applied to the header-less chromosome 22 shards when they are
# read back in; here the only trailing extra column is CHR.
processed_header = [
    'vid',
    *non_specific_columns_other_than_vid,
    *transcript_specific_columns,
    'CHR',
]

In [None]:
def compress_df_chunk(df, extra_columns = None):
    """Collapse all rows of one variant into a single record.

    Parameters
    ----------
    df : DataFrame with the rows that share one vid.
    extra_columns : optional list of additional columns to copy from the
        first row; None means no additional columns.

    Returns
    -------
    Series with the non-transcript-specific columns (except vid) and the
    extra columns of the first row, followed by every transcript-specific
    column joined with commas across the rows.
    """
    first_row_columns = [c for c in non_specific_columns if c != 'vid']
    if extra_columns is not None:
        first_row_columns = first_row_columns + list(extra_columns)
    non_specific_component = df.iloc[0][first_row_columns]
    transcript_specific_component = df[transcript_specific_columns].astype(str).agg(','.join)
    return pd.concat([non_specific_component, transcript_specific_component])

vat_dir = '/home/jupyter/data/vat/'
unprocessed_vat_shards = [f for f in os.listdir(vat_dir) if '_worker_' in f]

# Chromosome 22 only.
for chrom in tqdm.tqdm(list(range(22,23))):

    # Sorted so the shard order (and with it the row kept by iloc[0] when
    # duplicates are merged) does not depend on the order of os.listdir.
    unprocessed_vat_shards_chrom = sorted(
        f for f in unprocessed_vat_shards if f'chr{chrom}_' in f
    )
    if not unprocessed_vat_shards_chrom:
        raise FileNotFoundError(f'no worker shards for chr{chrom} found in {vat_dir}')

    vat_shard_df_list = []
    for f in unprocessed_vat_shards_chrom:
        df_shard = pd.read_csv(
            vat_dir + f,
            sep = '\t',
            names = processed_header
        )
        # Force these two columns to str in every shard.
        df_shard.omim_phenotypes_id = df_shard.omim_phenotypes_id.astype(str)
        df_shard.is_canonical_transcript = df_shard.is_canonical_transcript.astype(str)
        vat_shard_df_list.append(df_shard)
    vat_chrom = pd.concat(vat_shard_df_list)

    # Rows whose vid occurs more than once across the shards are merged into
    # one record per vid; the vectorised mask replaces a per-row list lookup.
    is_duplicated = vat_chrom.vid.duplicated(keep = False)
    if is_duplicated.any():
        vat_chrom_duplicates_merged = vat_chrom[is_duplicated].groupby(
            ['vid'], sort = False
        ).apply(
            lambda df: compress_df_chunk(df, extra_columns = ['CHR']),
        ).reset_index(
        )
        vat_chrom_deduped = pd.concat([
            vat_chrom[~is_duplicated],
            vat_chrom_duplicates_merged
        ])
    else:
        vat_chrom_deduped = vat_chrom

    # Sort in both branches so the output order is the same whether or not
    # duplicates were present.
    vat_chrom_deduped = vat_chrom_deduped.sort_values(
        ['position', 'ref_allele', 'alt_allele']
    )

    vat_chrom_deduped.to_parquet(
        vat_dir + f'vat_chr{chrom}.parquet'
    )

# Make synonymous and non-synonymous annotations

In [None]:
synonymous_classes = ['synonymous_variant', 'start_retained_variant', 'stop_retained_variant']
nonsynonymous_classes = ['frameshift_variant', 'inframe_deletion', 'inframe_insertion', 'missense_variant',
                         'start_lost', 'stop_gained', 'stop_lost']

# The class names contain only letters and underscores, so joining them with
# '|' gives a regular expression that matches a consequence string containing
# any of them. This replaces the reduce(...) over one mask per class, which
# needed an import the notebook never made.
synonymous_pattern = '|'.join(synonymous_classes)
nonsynonymous_pattern = '|'.join(nonsynonymous_classes)

annotation_dir = '/home/jupyter/data/annotations/synonymous_and_nonsynonymous'
os.makedirs(annotation_dir, exist_ok = True)

# 'chrom' rather than 'chr' so the builtin chr() is not shadowed.
for chrom in tqdm.tqdm(range(1,23)):
    vat_chr = pd.read_parquet(f'/home/jupyter/data/vat/vat_chr{chrom}.parquet')
    vat_chr_with_consequence = vat_chr.dropna(subset = ['consequence'])
    consequence = vat_chr_with_consequence.consequence
    vat_chr_nonsynonymous = vat_chr_with_consequence[consequence.str.contains(nonsynonymous_pattern)]
    vat_chr_synonymous = vat_chr_with_consequence[consequence.str.contains(synonymous_pattern)]

    # The variant ids are compared with the SNP column of the .bim file after
    # replacing '-' by ':' (literal replacement, not a regex).
    nonsynonymous_vids = set(vat_chr_nonsynonymous.vid.str.replace('-', ':', regex = False).tolist())
    synonymous_vids = set(vat_chr_synonymous.vid.str.replace('-', ':', regex = False).tolist())
    # A variant counts as synonymous only if none of its consequences is
    # non-synonymous.
    synonymous_excluding_nonsynonymous_vids = synonymous_vids.difference(nonsynonymous_vids)

    bim = pd.read_csv(
        f'/home/jupyter/data/plink/chr{chrom}_afr70346.bim',
        sep = '\t',
        names = ['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2']
    ).assign(
        # 0/1 indicator columns, one row per variant of the .bim file.
        synonymous = lambda df: df.SNP.isin(synonymous_excluding_nonsynonymous_vids).astype(int),
        non_synonymous = lambda df: df.SNP.isin(nonsynonymous_vids).astype(int),
    )
    bim.to_csv(
        f'{annotation_dir}/chr{chrom}_afr70346.annot.gz',
        index = False,
        sep = '\t'
    )