In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("05_Fetch_Population_Variants")

import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from mjm_tools import zip_res_range, unzip_res_range

from collections import defaultdict

# Paramaters for file locations

In [2]:
# Base directory for whole project
base_dir = "/home/sdw95/3D_SARS2"

# Query GnomAD for Population Variants

In [58]:
# Borrowed From https://gist.github.com/ressy/6fd7f6ee6401ac8e703dc2709399869e
# See also Docs at (https://gnomad.broadinstitute.org/api)
def fetch_gnomAD(jsondata, url="https://gnomad.broadinstitute.org/api"):
    # The server gives a generic error message if the content type isn't
    # explicitly set
    headers = {"Content-Type": "application/json"}
    response = requests.post(url, json=jsondata, headers=headers)
    json = response.json()
    if "errors" in json:
        raise Exception(str(json["errors"]))
    return json
# FUNCTION END

# Borrowed From https://gist.github.com/ressy/6fd7f6ee6401ac8e703dc2709399869e
# See also Docs at (https://gnomad.broadinstitute.org/api)
def queryGnomadByGene(gene_name, dataset="gnomad_r2_1", fields=["gene_id", "gene_symbol", "chrom", "pos", "ref", "alt", "consequence", "rsid", "variantId", "hgvsp"]):
    # Note that this is GraphQL, not JSON.
    fmt_graphql = """
    {
        gene(gene_symbol: "%s", reference_genome:GRCh38) {
          variants(dataset: %s) {
          %s
        }
      }
    }
    """
    # This part will be JSON encoded, but with the GraphQL part left as a
    # glob of text.
    req_variantlist = {
        "query": fmt_graphql % (gene_name, dataset, "\n".join(fields)),
        "variables": {}
        }
    response = fetch_gnomAD(req_variantlist)
    return pd.DataFrame(response["data"]["gene"]["variants"])[fields]
# FUNCTION END

In [147]:
queryGnomadByGene("PRIM2")

Unnamed: 0,gene_id,gene_symbol,chrom,pos,ref,alt,consequence,rsid,variantId,hgvsp
0,ENSG00000146143,PRIM2,6,57183182,T,C,intron_variant,rs1045869235,6-57183182-T-C,
1,ENSG00000146143,PRIM2,6,57183182,T,TC,intron_variant,rs1045869235,6-57183182-T-TC,
2,ENSG00000146143,PRIM2,6,57183182,TC,T,intron_variant,rs1045869235,6-57183182-TC-T,
3,ENSG00000146143,PRIM2,6,57183190,C,A,intron_variant,rs990388135,6-57183190-C-A,
4,ENSG00000146143,PRIM2,6,57183195,T,C,intron_variant,rs763067344,6-57183195-T-C,
5,ENSG00000146143,PRIM2,6,57183201,A,G,intron_variant,rs1250384194,6-57183201-A-G,
6,ENSG00000146143,PRIM2,6,57183204,A,G,intron_variant,rs766506262,6-57183204-A-G,
7,ENSG00000146143,PRIM2,6,57183205,T,C,intron_variant,rs1364833733,6-57183205-T-C,
8,ENSG00000146143,PRIM2,6,57183210,C,T,intron_variant,rs369094479,6-57183210-C-T,
9,ENSG00000146143,PRIM2,6,57183211,G,A,intron_variant,rs749727827,6-57183211-G-A,


In [52]:
# Read in all Human Proteins Involved in Interactome
all_proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")
human_genes = all_proteins[all_proteins["Is_Viral"] == False]["Gene_Name"].to_list()

In [69]:
# Fetch All Variants Listed in gnomAD (filter to only missense variants)
# NOTE: There is a query rate limit exceeded error that gets thrown if you submit
#       too many requests at once. I've jerry rigged this to just keep trying
#       until it works. But the first time I ran it, this worked fine on one attempt.
#
# NOTE: AATF and CISD3 never finish with the current setup (they throw a separate error, no data
#       in gnomad_r2_1?). These weren't a problem when I ran using GRCh37 as the reference genome
#       instead of GRCh38. Comparing the two outputs, the chromosomal positions don't seem to actually
#       change depending on the reference genome selected so I'm not sure what this means. When
#       this code was run originally, the GnomAD API didn't require a reference_genome selection.

import time
#all_variants = dict()

attempt_i = 1
attempts_per_gene = 10
while(True):
    cur_keys = len(all_variants)
    if(cur_keys == len(human_genes + ["ACE2", "TMPRSS2"])):
        break
    print "Starting Attempt", attempt_i, "with", cur_keys, "Genes Parsed"
    
    # Iterate over all genes
    for g in tqdm_notebook(human_genes + ["ACE2", "TMPRSS2"]):
        if(g in all_variants):
            continue
        
        # Try to fetch variants on each gene n times
        tries = 0
        while(True):
            try:
                tries += 1
                pop_variants = queryGnomadByGene(g)
                break
            except Exception:
                time.sleep(0.1)
                if(tries > attempts_per_gene):
                    break
                pass
        if(tries > attempts_per_gene):
            print "Skipped", g
            continue
        if(not g == pop_variants["gene_symbol"].unique()[0]):
            print "MISMATCH: ", g, "-->", pop_variants["gene_symbol"].unique()[0]
            pop_variants["gene_symbol"] = g
        all_variants[g] = pop_variants[pop_variants["consequence"] == "missense_variant"]
    
    print "Finished Attempt", attempt_i, "with", len(all_variants), "Genes Parsed (", (len(all_variants) - cur_keys), " New)"
    print
    attempt_i += 1
    time.sleep(60)
#all_variants = pd.concat(all_variants)

Starting Attempt 1 with 332 Genes Parsed
Starting Attempt 1 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 1 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 1 with 332 Genes Parsed ( 0  New)

Starting Attempt 2 with 332 Genes Parsed
Starting Attempt 2 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 2 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 2 with 332 Genes Parsed ( 0  New)

Starting Attempt 3 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 3 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 3 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 3 with 332 Genes Parsed ( 0  New)

Starting Attempt 4 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 4 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 4 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 4 with 332 Genes Parsed ( 0  New)

Starting Attempt 5 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 5 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 5 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 5 with 332 Genes Parsed ( 0  New)

Starting Attempt 6 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 6 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 6 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 6 with 332 Genes Parsed ( 0  New)

Starting Attempt 7 with 332 Genes Parsed
Starting Attempt 7 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 7 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 7 with 332 Genes Parsed ( 0  New)

Starting Attempt 8 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 8 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 8 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 8 with 332 Genes Parsed ( 0  New)

Starting Attempt 9 with 332 Genes Parsed
Starting Attempt 9 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 9 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 9 with 332 Genes Parsed ( 0  New)

Starting Attempt 10 with 332 Genes Parsed
Starting Attempt 10 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 10 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 10 with 332 Genes Parsed ( 0  New)

Starting Attempt 11 with 332 Genes Parsed
Starting Attempt 11 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 11 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 11 with 332 Genes Parsed ( 0  New)

Starting Attempt 12 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 12 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 12 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 12 with 332 Genes Parsed ( 0  New)

Starting Attempt 13 with 332 Genes Parsed
Starting Attempt 13 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 13 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 13 with 332 Genes Parsed ( 0  New)

Starting Attempt 14 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 14 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 14 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 14 with 332 Genes Parsed ( 0  New)

Starting Attempt 15 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 15 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 15 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 15 with 332 Genes Parsed ( 0  New)

Starting Attempt 16 with 332 Genes Parsed
Starting Attempt 16 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 16 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 16 with 332 Genes Parsed ( 0  New)

Starting Attempt 17 with 332 Genes Parsed
Starting Attempt 17 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 17 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 17 with 332 Genes Parsed ( 0  New)

Starting Attempt 18 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 18 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 18 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 18 with 332 Genes Parsed ( 0  New)

Starting Attempt 19 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 19 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 19 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 19 with 332 Genes Parsed ( 0  New)

Starting Attempt 20 with 332 Genes Parsed
Starting Attempt 20 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 20 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 20 with 332 Genes Parsed ( 0  New)

Starting Attempt 21 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 21 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 21 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 21 with 332 Genes Parsed ( 0  New)

Starting Attempt 22 with 332 Genes Parsed
Starting Attempt 22 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 22 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 22 with 332 Genes Parsed ( 0  New)

Starting Attempt 23 with 332 Genes Parsed
Starting Attempt 23 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 23 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 23 with 332 Genes Parsed ( 0  New)

Starting Attempt 24 with 332 Genes Parsed
Starting Attempt 24 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 24 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 24 with 332 Genes Parsed ( 0  New)

Starting Attempt 25 with 332 Genes Parsed
Starting Attempt 25 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 25 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 25 with 332 Genes Parsed ( 0  New)

Starting Attempt 26 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 26 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 26 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 26 with 332 Genes Parsed ( 0  New)

Starting Attempt 27 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 27 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 27 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 27 with 332 Genes Parsed ( 0  New)

Starting Attempt 28 with 332 Genes Parsed
Starting Attempt 28 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 28 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 28 with 332 Genes Parsed ( 0  New)

Starting Attempt 29 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 29 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 29 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 29 with 332 Genes Parsed ( 0  New)

Starting Attempt 30 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 30 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 30 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 30 with 332 Genes Parsed ( 0  New)

Starting Attempt 31 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 31 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 31 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 31 with 332 Genes Parsed ( 0  New)

Starting Attempt 32 with 332 Genes Parsed
Starting Attempt 32 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 32 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 32 with 332 Genes Parsed ( 0  New)

Starting Attempt 33 with 332 Genes Parsed
Starting Attempt 33 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 33 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 33 with 332 Genes Parsed ( 0  New)

Starting Attempt 34 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 34 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 34 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 34 with 332 Genes Parsed ( 0  New)

Starting Attempt 35 with 332 Genes Parsed
Starting Attempt 35 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 35 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 35 with 332 Genes Parsed ( 0  New)

Starting Attempt 36 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 36 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 36 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 36 with 332 Genes Parsed ( 0  New)

Starting Attempt 37 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 37 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 37 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 37 with 332 Genes Parsed ( 0  New)

Starting Attempt 38 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 38 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 38 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 38 with 332 Genes Parsed ( 0  New)

Starting Attempt 39 with 332 Genes Parsed
Starting Attempt 39 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 39 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 39 with 332 Genes Parsed ( 0  New)

Starting Attempt 40 with 332 Genes Parsed
Starting Attempt 40 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 40 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 40 with 332 Genes Parsed ( 0  New)

Starting Attempt 41 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 41 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 41 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 41 with 332 Genes Parsed ( 0  New)

Starting Attempt 42 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 42 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 42 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 42 with 332 Genes Parsed ( 0  New)

Starting Attempt 43 with 332 Genes Parsed
Starting Attempt 43 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 43 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 43 with 332 Genes Parsed ( 0  New)

Starting Attempt 44 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 44 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 44 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 44 with 332 Genes Parsed ( 0  New)

Starting Attempt 45 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 45 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 45 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 45 with 332 Genes Parsed ( 0  New)

Starting Attempt 46 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 46 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 46 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 46 with 332 Genes Parsed ( 0  New)

Starting Attempt 47 with 332 Genes Parsed
Starting Attempt 47 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 47 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 47 with 332 Genes Parsed ( 0  New)

Starting Attempt 48 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 48 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 48 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 48 with 332 Genes Parsed ( 0  New)

Starting Attempt 49 with 332 Genes Parsed
Starting Attempt 49 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 49 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 49 with 332 Genes Parsed ( 0  New)

Starting Attempt 50 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 50 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 50 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 50 with 332 Genes Parsed ( 0  New)

Starting Attempt 51 with 332 Genes Parsed
Starting Attempt 51 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 51 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 51 with 332 Genes Parsed ( 0  New)

Starting Attempt 52 with 332 Genes Parsed
Starting Attempt 52 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 52 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 52 with 332 Genes Parsed ( 0  New)

Starting Attempt 53 with 332 Genes Parsed
Starting Attempt 53 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 53 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 53 with 332 Genes Parsed ( 0  New)

Starting Attempt 54 with 332 Genes Parsed
Starting Attempt 54 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 54 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 54 with 332 Genes Parsed ( 0  New)

Starting Attempt 55 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 55 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 55 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 55 with 332 Genes Parsed ( 0  New)

Starting Attempt 56 with 332 Genes Parsed
Starting Attempt 56 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 56 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 56 with 332 Genes Parsed ( 0  New)

Starting Attempt 57 with 332 Genes Parsed
Starting Attempt 57 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 57 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 57 with 332 Genes Parsed ( 0  New)

Starting Attempt 58 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Starting Attempt 58 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 58 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 58 with 332 Genes Parsed ( 0  New)

Starting Attempt 59 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Starting Attempt 59 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped AATF
Skipped CISD3

Finished Attempt 59 with 332 Genes Parsed ( 0  New)

Skipped CISD3

Finished Attempt 59 with 332 Genes Parsed ( 0  New)

Starting Attempt 61 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 61 with 332 Genes Parsed ( 0  New)

Starting Attempt 62 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 62 with 332 Genes Parsed ( 0  New)

Starting Attempt 63 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 63 with 332 Genes Parsed ( 0  New)

Starting Attempt 64 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 64 with 332 Genes Parsed ( 0  New)

Starting Attempt 65 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 65 with 332 Genes Parsed ( 0  New)

Starting Attempt 66 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 66 with 332 Genes Parsed ( 0  New)

Starting Attempt 67 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 67 with 332 Genes Parsed ( 0  New)

Starting Attempt 68 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 68 with 332 Genes Parsed ( 0  New)

Starting Attempt 69 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 69 with 332 Genes Parsed ( 0  New)

Starting Attempt 70 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 70 with 332 Genes Parsed ( 0  New)

Starting Attempt 71 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 71 with 332 Genes Parsed ( 0  New)

Starting Attempt 72 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 72 with 332 Genes Parsed ( 0  New)

Starting Attempt 73 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 73 with 332 Genes Parsed ( 0  New)

Starting Attempt 74 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 74 with 332 Genes Parsed ( 0  New)

Starting Attempt 75 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 75 with 332 Genes Parsed ( 0  New)

Starting Attempt 76 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 76 with 332 Genes Parsed ( 0  New)

Starting Attempt 77 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 77 with 332 Genes Parsed ( 0  New)

Starting Attempt 78 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 78 with 332 Genes Parsed ( 0  New)

Starting Attempt 79 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 79 with 332 Genes Parsed ( 0  New)

Starting Attempt 80 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 80 with 332 Genes Parsed ( 0  New)

Starting Attempt 81 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 81 with 332 Genes Parsed ( 0  New)

Starting Attempt 82 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 82 with 332 Genes Parsed ( 0  New)

Starting Attempt 83 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 83 with 332 Genes Parsed ( 0  New)

Starting Attempt 84 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 84 with 332 Genes Parsed ( 0  New)

Starting Attempt 85 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 85 with 332 Genes Parsed ( 0  New)

Starting Attempt 86 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 86 with 332 Genes Parsed ( 0  New)

Starting Attempt 87 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 87 with 332 Genes Parsed ( 0  New)

Starting Attempt 88 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 88 with 332 Genes Parsed ( 0  New)

Starting Attempt 89 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 89 with 332 Genes Parsed ( 0  New)

Starting Attempt 90 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 90 with 332 Genes Parsed ( 0  New)

Starting Attempt 91 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 91 with 332 Genes Parsed ( 0  New)

Starting Attempt 92 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 92 with 332 Genes Parsed ( 0  New)

Starting Attempt 93 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 93 with 332 Genes Parsed ( 0  New)

Starting Attempt 94 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 94 with 332 Genes Parsed ( 0  New)

Starting Attempt 95 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 95 with 332 Genes Parsed ( 0  New)

Starting Attempt 96 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 96 with 332 Genes Parsed ( 0  New)

Starting Attempt 97 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 97 with 332 Genes Parsed ( 0  New)

Starting Attempt 98 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 98 with 332 Genes Parsed ( 0  New)

Starting Attempt 99 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 99 with 332 Genes Parsed ( 0  New)

Starting Attempt 100 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 100 with 332 Genes Parsed ( 0  New)

Starting Attempt 101 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 101 with 332 Genes Parsed ( 0  New)

Starting Attempt 102 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 102 with 332 Genes Parsed ( 0  New)

Starting Attempt 103 with 332 Genes Parsed


HBox(children=(IntProgress(value=0, max=334), HTML(value=u'')))

Skipped AATF
Skipped CISD3

Finished Attempt 103 with 332 Genes Parsed ( 0  New)



KeyboardInterrupt: 

In [73]:
# Combine Variants for each gene into one DF
all_variants = pd.concat(all_variants.values())

In [75]:
# Sort and remove duplicates (no longer necessary?)
# (for some reason POL1A had duplicate entries?)
# (Could not reproduce, but GnomAD API syntax has also changed since first run)
print len(all_variants[all_variants.duplicated(["chrom", "pos", "ref", "alt"], keep=False)])

all_variants["chrom"] = all_variants["chrom"].map(lambda x: int(x) if not x == "X" else x)
print len(all_variants)
all_variants = all_variants.drop_duplicates().sort_values(["chrom", "pos", "ref", "alt"])
print len(all_variants)

0
133126
133126


# Map Variants to UniProt Position using VEP

In [76]:
# NOTE: This step can most easilly be done by manually querying VEP

In [77]:
# Write Input File for VEP
out = open("{0}/VEP_Input.txt".format(base_dir), "w+")
out.write("\n".join(all_variants[["chrom", "pos", "ref", "alt"]].apply(lambda x: "{0} {1} . {2} {3}".format(*x), axis=1)))
out.close()

In [117]:
# NOTE: Either I'm crazy or there's currently something up with VEP
#       When submitting to the grch37 version the SWISSPROT column
#       in the output only shows gene names (e.g. PARP_HUMAN) so the
#       text download cannnot be used to map UniProt IDs (this is part
#       of the sanity check I use to make sure the mapping is verifiable)
#
#       Instead of just submitting to the grch37 like I did originally.
#
#       (https://grch37.ensembl.org/Homo_sapiens/Tools/VEP)
#
#       I instead now need to re-map the GnomAD vairants (only provided in
#       grch37 coordinates) to grch38 using the ensembl assembly converter
#
#       (https://grch37.ensembl.org/Homo_sapiens/Tools/AssemblyConverter?db=core)
#
#       Then submit the mapped input to the regular grch38 VEP
#
#       (https://useast.ensembl.org/Homo_sapiens/Tools/VEP?)
#
#       This then becomes a pain because merging the VEP output (GRCh38) with
#       the original GnomAD (GRCh37) input is not straightforward. The Ensembl
#       assembly converrter drops some variants during the conversion without
#       indicating which ones are dropped so a 1-1 mapping based on order
#       is not possible.
#       
#       Rather than updating and re-runing this script I'm just defaulting to the
#       output from the previously working version of this code / external resources.

# Submit input file here...
#
# https://grch37.ensembl.org/Homo_sapiens/Tools/VEP
#
# Additional Configurations
#
# - Identifiers
#   + Gene Symbol
#   + Transcript version
#   + UniProt (not selected by default)
#
# - Variants and Frequency Data
#   + Find co-located known variants (Yes)
#   + Frequency Data for co-located variants
#     - 1000 Genomes global MAF
#     - gnomAD exomes (not selected by default)
#   + PubMedIDs for citations (Yes)
#
# - Additional Annotations
#   + Default options
#
# - Predictions
#   + Default options (SIFT + PolyPhen prediction and score)
#
# Filtering Options
#   + Defualt
#
# Advanced Options
#   + Default

In [119]:
# Read in VEP Mapped Variants
# Download output from VEP job
# NOTE: From results papge apply filter "Uploaded variant is missense_variant"
#       to limit to only relevant entries
#
# Save in root project directory and rename to match filename below...
vep_mapped = pd.read_csv("{0}/Human_Population_Variants_VEP_Mapped.txt".format(base_dir), sep="\t")

In [123]:
# Join on Key
vep_mapped["Key"] = vep_mapped[["Location", "Allele"]].apply(lambda x: x[0].split("-")[0] + x[1], axis=1)
all_variants["Key"] = all_variants[["chrom", "pos", "alt"]].apply(lambda x: str(x[0]) + ":" + str(x[1]) + x[2], axis=1)

# Make sure the same variants are included in both
# NOTE: This should not match anymore because of GRCh37 to GRCh38 mapping step
s1 = set(all_variants["Key"])
s2 = set(vep_mapped["Key"])
print s1 == s2

# Merge
merged = all_variants.join(vep_mapped.set_index("Key"), how="inner", on="Key")

False


In [125]:
# Get expected UniProt from original input / compare against VEP Mapping
proteins = pd.read_csv("../Data/Proteins.txt", sep="\t")
gene2uniprot = proteins.set_index("Gene_Name")["ID"].to_dict()
gene2uniprot["TMPRSS2"] = "O15393"
gene2uniprot["ACE2"] = "Q9BYF1"
merged["Expected_UniProt"] = merged["gene_symbol"].map(lambda x: gene2uniprot[x])

# Drop any cases with wrong UniProt mapping (there are only 2 here so its not a big deal)
print len(merged)
print len(merged[merged["SWISSPROT"] == merged["Expected_UniProt"]])
merged = merged[merged["SWISSPROT"] == merged["Expected_UniProt"]]

451337
282749


In [126]:
# Select / Rename Columns
merged["AA_Ref"] = merged["Amino_acids"].map(lambda x: x.split("/")[0])
merged["AA_Alt"] = merged["Amino_acids"].map(lambda x: x.split("/")[1])
merged["SIFT_Category"] = merged["SIFT"].map(lambda x: x.split("(")[0] if not x == "-" else np.nan)
merged["SIFT_Score"] = merged["SIFT"].map(lambda x: float(x.split("(")[1].strip(")")) if not x == "-" else np.nan)
merged["PolyPhen_Category"] = merged["PolyPhen"].map(lambda x: x.split("(")[0] if not x == "-" else np.nan)
merged["PolyPhen_Score"] = merged["PolyPhen"].map(lambda x: float(x.split("(")[1].strip(")")) if not x == "-" else np.nan)
merged = merged[["gene_symbol", "gene_id", "SWISSPROT", "chrom", "pos", "ref", "alt", "consequence", "rsid", "IMPACT", "Protein_position", "AA_Ref", "AA_Alt", "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score", "gnomAD_AF", "CLIN_SIG", "SOMATIC", "PHENO"]]

In [127]:
# Rename Columns
merged.columns = ["Gene_Symbol", "Gene_ID", "UniProt", "Chrom", "Pos", "Ref", "Alt", "Consequence", "rsID", "Imact", "AA_Pos", "AA_Ref", "AA_Alt", "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score", "gnomAD_AF", "Clinical_Significance", "Somatic", "Pheno"]

In [128]:
# Update gnomAD_AF to remove "-" blanks
merged["gnomAD_AF"] = merged["gnomAD_AF"].map(lambda x: float(x) if not x == "-" else np.nan)

In [130]:
# Make sure all mapped uniprot positions references match
protein_summary = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")
uni2seq = protein_summary.set_index("ID")["Sequence"].to_dict()
uni2seq["O15393"] = my.get_Fasta("O15393")
uni2seq["Q9BYF1"] = my.get_Fasta("Q9BYF1")
merged["Accurate_Pos"] = merged[["UniProt", "AA_Pos", "AA_Ref"]].apply(lambda x: uni2seq[x[0]][x[1] - 1] == x[2] if x[1] <= len(uni2seq[x[0]]) else False, axis=1)

tmp = merged.sort_values(["Chrom", "Pos", "Accurate_Pos"], ascending=[True, True, False]).drop_duplicates(["Chrom", "Pos", "Ref", "Alt"])

In [131]:
tmp["Accurate_Pos"].mean()

0.95588362526643644

In [132]:
len(tmp)

127141

In [133]:
len(all_variants)

133126

In [134]:
# Only retain variants where the reference matches at the Uniprot position
tmp = tmp[tmp["Accurate_Pos"]]
tmp = tmp.drop("Accurate_Pos", axis=1)

In [135]:
# Save final list of population variants
tmp.to_csv("{0}/Data/Pop_Vars.txt".format(base_dir), sep="\t", index=None)