In [75]:
import logging
import multiprocessing as mp
import os
from collections import OrderedDict

from MODApy.cfg import cfg

import cyvcf2

import matplotlib
import matplotlib.pyplot as plt

import matplotlib_venn as venn

import numpy as np

import pandas as pd
from MODApy.vcfmgr import ParsedVCF
matplotlib.use("agg")
logger = logging.getLogger(__name__)

In [64]:
def parse_vcf_file(vcf):
    """
    Parse a VCF file and return a dictionary of variant information.

    Parameters
    ----------
    vcf : str
        The path to the VCF file to be parsed.

    Returns
    -------
    variants_dict : OrderedDict
        An ordered dictionary of variants with the keys in the format
        CHROM+POS+REF+ALT.
        The values of the dictionary are dictionaries containing information
        about each variant,
        including its ID, QUAL, FILTER, and any additional INFO fields present
        in the VCF file.
    name : str
        The name of the first sample in the VCF file, or the name of the
        VCF file if no samples are present.
    pVCF : cyvcf2.Reader
        A cyvcf2.Reader object representing the VCF file.

    Raises
    ------
    IOError
        If the input VCF file cannot be found or opened.
    """
    logger.info("Parsing VCF File. %s" % vcf)
    pVCF = cyvcf2.Reader(vcf)
    try:
        name = pVCF.samples[0]
    except Exception:
        name = vcf.split("/")[-1]
    variants_dict = OrderedDict()
    for variant in pVCF:
        variants_dict[
            variant.CHROM
            + "+"
            + str(variant.POS)
            + "+"
            + variant.REF
            + "+"
            + ",".join(variant.ALT)
        ] = {
            "ID": variant.ID,
            "QUAL": variant.QUAL,
            "FILTER": variant.FILTER,
        }
        variants_dict[
            variant.CHROM
            + "+"
            + str(variant.POS)
            + "+"
            + variant.REF
            + "+"
            + ",".join(variant.ALT)
        ].update({k: v for (k, v) in variant.INFO})
    return variants_dict, name, pVCF

In [65]:
def create_dataframe(variants_dict):
    """
    Create a pandas DataFrame from the variants dictionary.

    Parameters
    ----------
    variants_dict : dict
        A dictionary of variants data.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing variants data.

    """
    df1 = pd.DataFrame.from_dict(variants_dict, orient="index")
    del variants_dict
    df1.index = df1.index.str.split("+", expand=True)
    df1.index.names = ["CHROM", "POS", "REF", "ALT"]
    df1.reset_index(inplace=True)
    return df1

In [66]:
def split_alternate_alleles(df):
    """
    Splits rows with multiple alternate alleles into separate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame with the genotype data to be processed.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the same columns as `df`, where rows with multiple
        alternate alleles have been split into separate rows.
    """
    splitdf = df.loc[df["ALT"].str.contains(",")].copy()
    if len(splitdf) > 0:
        ALT = (
            splitdf["ALT"]
            .astype(str)
            .str.split(",", n=1, expand=True)
            .stack()
            .rename("ALT")
        )
        ALT.index = ALT.index.droplevel(-1)
        ALT = ALT.to_frame()
        splitdf = splitdf.join(ALT, lsuffix="_x", rsuffix="_y")
        del ALT
        splitdf["ALT"] = splitdf["ALT_y"].combine_first(splitdf["ALT_x"])
        splitdf.drop(columns=["ALT_y", "ALT_x"], inplace=True)
        splitdf.reset_index(inplace=True)
        splitdf.drop(columns="index", inplace=True)
    odd = splitdf.iloc[::2].copy()
    even = splitdf.iloc[1::2].copy()
    splitlist = [
        "ID",
        "AC",
        "AF",
        "SAMPLES_AF",
        "MLEAC",
        "MLEAF",
        "VARTYPE",
        "dbSNPBuildID",
    ]
    splitlist = [x for x in splitlist if x in df.columns]
    splitlist += [x for x in df.columns if x.startswith(("1000", "CLINVAR"))]
    for col in splitlist:
        odd[col] = odd[col].astype(str).str.split(",", n=1).str[0]
        even[col] = even[col].apply(
            lambda x: x
            if len(str(x).split(",")) <= 1
            else str(x).split(",", maxsplit=1)[1]
        )
    splitdf = (
        pd.concat([odd, even])
        .sort_index()
        .replace(to_replace=[r"\(", r"\)"], value="", regex=True)
    )
    del odd, even
    splitdf = splitdf[["CHROM", "POS", "REF", "ALT"] + splitlist]
    df = df.merge(splitdf, on=["CHROM", "POS", "REF"], how="left")
    splitlist.append("ALT")
    xlist = [x + "_x" for x in splitlist]
    ylist = [y + "_y" for y in splitlist]
    del splitdf
    for col in splitlist:
        df[col] = df[col + "_y"].combine_first(df[col + "_x"])
    del splitlist
    df.drop(columns=xlist + ylist, inplace=True)
    del xlist, ylist
    df["POS"] = df["POS"].astype(int)
    return df

In [67]:
def handle_annotations(df, pVCF):
    """
    Parses the 'ANN' column in a pandas DataFrame and extracts functional
    annotations as separate columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with 'ANN' column containing functional annotations
        separated by commas.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with functional annotations as separate columns,
        joined with the original DataFrame.
    """
    if "ANN" in df.columns:
        anndf = df["ANN"]
        annhead = pVCF.get_header_type("ANN")["Description"].strip(
            '"Functional annotations: \'"'
        )
        annheaderlist = [x.strip() for x in annhead.split("|")]
        anndf = anndf.str.split(",", expand=True).stack()
        anndf = anndf.str.split("|", expand=True)
        anndf.columns = annheaderlist
        df.drop(columns="ANN", inplace=True)
        anndf.index = anndf.index.droplevel(1)
        df = df.join(anndf, how="inner")
        del anndf
        del annhead
        del annheaderlist
    return df

In [68]:
def prioritize_variants(df, IMPACT_SEVERITY=None):
    """
    Sort variants in a pandas DataFrame according to their severity.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the variants to be sorted.
    IMPACT_SEVERITY : dict, optional
        A dictionary that maps each variant type to its severity score.
        The keys of the dictionary should be the names of the variant types
        (e.g., 'missense_variant'), and the values should be integers
        representing the severity score. If not provided, the default values for
        severity scores will be used.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the same columns as the input DataFrame, but with
        the variants sorted by their severity.

    Notes
    -----
    This function assumes that the input DataFrame has columns named 'CHROM',
    'POS', 'REF', 'ALT', 'Annotation', and 'HGVS.c'. The 'Annotation' column
    should contain the variant types
    (e.g., 'missense_variant&splice_region_variant'), and the 'HGVS.c' column
    should contain the HGVS coding sequence notation for each variant
    (e.g., 'NM_001005353.2:c.43A>G'). Variants with a null HGVS notation will be
    sorted to the end.
    """
    if IMPACT_SEVERITY is None:
        IMPACT_SEVERITY = {
            "exon_loss_variant": 1,
            "frameshift_variant": 2,
            "stop_gained": 3,
            "stop_lost": 4,
            "start_lost": 5,
            "splice_acceptor_variant": 6,
            "splice_donor_variant": 7,
            "disruptive_inframe_deletion": 8,
            "inframe_insertion": 9,
            "disruptive_inframe_insertion": 10,
            "inframe_deletion": 11,
            "missense_variant": 12,
            "splice_region_variant": 13,
            "stop_retained_variant": 14,
            "initiator_codon_variant": 15,
            "synonymous_variant": 16,
            "start_retained": 17,
            "coding_sequence_variant": 18,
            "5_prime_UTR_variant": 19,
            "3_prime_UTR_variant": 20,
            "5_prime_UTR_premature_start_codon_gain_variant": 21,
            "intron_variant": 22,
            "non_coding_exon_variant": 23,
            "upstream_gene_variant": 24,
            "downstream_gene_variant": 25,
            "TF_binding_site_variant": 26,
            "regulatory_region_variant": 27,
            "intergenic_region": 28,
            "transcript": 29,
        }
    if 'Annotation' in df.columns:
        df["sorter"] = (
            df["Annotation"].str.split("&").str[0].replace(IMPACT_SEVERITY)
        )
        df.loc[df["HGVS.c"].str.contains("null"), "HGVS.c"] = None
        df["sorter2"] = [x[0] == x[1] for x in zip(df["ALT"], df["Allele"])]
        df = df.sort_values(
            by=["CHROM", "POS", "sorter2", "sorter"],
            ascending=[True, True, False, True],
        ).drop_duplicates(["CHROM", "POS", "REF", "ALT"])
        df.drop(columns=["sorter", "sorter2"], inplace=True)
    return df

In [69]:
def format_ann_columns(df, pVCF):
    """
    Formats the columns of a pandas DataFrame containing variant annotation
    data.

    Parameters
    ----------
    df : ParsedVCF (pandas.DataFrame extension)
        A DataFrame containing variant annotation data.

    Returns
    -------
    ParsedVCF (pandas.DataFrame extension)
        The input DataFrame with formatted columns.

    Notes
    -----
    The function applies the following transformations to the input DataFrame:

    - All column names are converted to uppercase.
    - If the DataFrame contains a column named 'HGVS.P', a new column named
    'AMINOCHANGE'
    is added, which contains the result of calling the `aminoChange` function
    on the 'HGVS.P'
    column.
    - If the DataFrame contains a column named 'HOM', its values are converted
    from boolean to
    categorical ('HOM' and 'HET'). The 'HET' column is dropped if present. The
    column name is changed to 'ZIGOSITY'.
    - If the DataFrame contains a column named 'ESP6500_MAF', new columns named
    'ESP6500_MAF_EA', 'ESP6500_MAF_AA', and 'ESP6500_MAF_ALL' are added,
    containing the values of the corresponding fields in the 'ESP6500_MAF'
    column. The values are converted from strings to floats and divided by 100.
    The 'ESP6500_MAF' column is dropped.
    - If the DataFrame contains a column named 'ESP6500_PH', new columns named
    'POLYPHEN_PRED' and 'POLYPHEN_SCORE' are added, containing the values of the
    corresponding fields in the 'ESP6500_PH' column. The 'POLYPHEN_PRED' values
    are cleaned up by removing trailing dots and commas. The 'POLYPHEN_SCORE'
    values are split on commas and the first element is kept. The 'ESP6500_PH'
    column is dropped.
    - The columns named 'ANNOTATION', 'ANNOTATION_IMPACT', and 'ID' are renamed
    to 'EFFECT', 'IMPACT', and 'RSID', respectively.
    - Columns with numeric data (according to the VCF header) are converted to
    floats or integers, as appropriate. The columns named 'ESP6500_MAF_EA',
    'ESP6500_MAF_AA', and 'ESP6500_MAF_ALL' are also converted to floats.
    - The DataFrame is rounded to 6 decimal places.
    - If the DataFrame contains a column named 'CLINVAR_CLNSIG', its values are
    replaced with their corresponding meanings according to the
    `clinvartranslation` dictionary.

    """
    df.columns = df.columns.str.upper()
    if "HGVS.P" in df.columns:
        df["AMINOCHANGE"] = df["HGVS.P"].apply(aminoChange)
    if "HOM" in df.columns:
        df["HOM"] = df["HOM"].replace({True: "HOM", np.nan: "HET", None: "HET"})
        df.drop(columns="HET", inplace=True, errors="ignore")
        df.rename(columns={"HOM": "ZIGOSITY"}, inplace=True)
    if "ESP6500_MAF" in df.columns:
        df[["ESP6500_MAF_EA", "ESP6500_MAF_AA", "ESP6500_MAF_ALL"]] = df[
            "ESP6500_MAF"
        ].str.split(",", expand=True)
        df["ESP6500_MAF_EA"] = df["ESP6500_MAF_EA"].apply(divide, args=(100,))
        df["ESP6500_MAF_AA"] = df["ESP6500_MAF_AA"].apply(divide, args=(100,))
        df["ESP6500_MAF_ALL"] = df["ESP6500_MAF_ALL"].apply(divide, args=(100,))
        df.drop(columns=["ESP6500_MAF"], inplace=True)
    if "ESP6500_PH" in df.columns:
        df[["POLYPHEN_PRED", "POLYPHEN_SCORE"]] = df["ESP6500_PH"].str.split(
            ":", n=1, expand=True
        )
        df["POLYPHEN_PRED"] = df["POLYPHEN_PRED"].str.strip(".").str.strip(".,")
        df["POLYPHEN_SCORE"] = df["POLYPHEN_SCORE"].str.split(",").str[0]
        df.drop(columns=["ESP6500_PH"], inplace=True)
    df.rename(
        columns={
            "ANNOTATION": "EFFECT",
            "ANNOTATION_IMPACT": "IMPACT",
            "ID": "RSID",
        },
        inplace=True,
        errors="ignore",
    )
    numcols = list()
    for x in pVCF.header_iter():
        if x.type == "INFO":
            if x["Type"] in ["Float", "Integer"]:
                numcols.append(x["ID"])
    numcols += ["ESP6500_MAF_EA", "ESP6500_MAF_AA", "ESP6500_MAF_ALL"]
    numcols = list(
        set([x.upper() for x in numcols for y in df.columns if x.upper() == y])
    )
    df[numcols] = df[numcols].apply(pd.to_numeric, errors="coerce", axis=1)
    df = df.round(6)

    if "CLINVAR_CLNSIG" in df.columns:
        clinvartranslation = {
            "255": "other",
            "0": "Uncertain significance",
            "1": "not provided",
            "2": "Benign",
            "3": "Likely Benign",
            "4": "Likely pathogenic",
            "5": "Pathogenic",
            "6": "drug response",
            "7": "histocompatibility",
        }
        for k, v in clinvartranslation.items():
            df["CLINVAR_CLNSIG"] = df["CLINVAR_CLNSIG"].str.replace(k, v)
    return df

In [70]:
def clean_df(df):
    """
    Replace missing and empty values in the DataFrame with the '.' character.
    Convert the 'POS' column to integer type.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame to be cleaned.

    Returns
    -------
    pandas.DataFrame
        Cleaned DataFrame with replaced missing and empty values and 'POS'
        column as integer.

    """
    df.replace(["nan", "", np.nan], ".", inplace=True)
    df.replace(to_replace=[None], value=".", inplace=True, regex=True)
    df = df.astype("str")
    df["POS"] = df["POS"].astype(int)
    return df

In [71]:
def aminoChange(value: str):
    """
    Given a string `value`, extract the amino acid change from the
    `HGVS.P` format.

    Parameters
    ----------
        value (str): A string representing the `HGVS.P` format.

    Returns
    -------
        str: A string representing the amino acid change or
        "." if not applicable.
    """
    try:
        value = value.replace("p.", "")
        if value[:3] != value[-3:]:
            return "CHANGE"
        else:
            return "."
    except Exception:
        return "."

def divide(x, y):
    """
    Method to divide x on y, needed for dividing freqs.
    Parameters
    ----------
    x
        The dividend
    y
        The divisor
    Returns result or x.
    """
    try:
        return float(x) / y
    except Exception:
        return x

In [76]:
def main_parse_vcf(vcf):
    variants_dict, name, pVCF = parse_vcf_file(vcf)
    df1 = create_dataframe(variants_dict)
    del variants_dict
    df1 = split_alternate_alleles(df1)
    df1 = handle_annotations(df1, pVCF)
    #df1 = prioritize_variants(df1)
    df1 = format_ann_columns(df1, pVCF)
    df1 = clean_df(df1)
    df1 = df1.pipe(ParsedVCF)
    df1.name = name
    return df1

In [77]:
vcf_file_path = "../../data/Patients/BM23-48263_MODApy/BM23-48263_MODApy.final.vcf"

In [78]:
df1 = main_parse_vcf(vcf_file_path)

In [86]:
df1

Unnamed: 0,CHROM,POS,REF,QUAL,FILTER,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,GNOMAD_AS_RF_NEGATIVE_TRAIN,LOF,NMD,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,RSID,AC,AF,MLEAC,MLEAF,VARTYPE,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,CLINVAR_CLNVI,CLINVAR_CLNSIGCONF,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,1000GP3_OLD_VARIANT,ALT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
0,chr1,14653,C,48.77,.,2.0,0.217,-0.65,15.0,0.0,40.77,1.083,3.25,0.65,0.784,variant,True,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,T,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,.,n.*244C>T,.,.,.,.,244,.,.
0,chr1,14653,C,48.77,.,2.0,0.217,-0.65,15.0,0.0,40.77,1.083,3.25,0.65,0.784,variant,True,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,T,downstream_gene_variant,MODIFIER,MIR6859-1,MIR6859-1,transcript,NR_106918.1,pseudogene,.,n.*2716G>A,.,.,.,.,2716,.,.
0,chr1,14653,C,48.77,.,2.0,0.217,-0.65,15.0,0.0,40.77,1.083,3.25,0.65,0.784,variant,True,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,T,downstream_gene_variant,MODIFIER,MIR6859-2,MIR6859-2,transcript,NR_107062.1,pseudogene,.,n.*2716G>A,.,.,.,.,2716,.,.
0,chr1,14653,C,48.77,.,2.0,0.217,-0.65,15.0,0.0,40.77,1.083,3.25,0.65,0.784,variant,True,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,T,downstream_gene_variant,MODIFIER,MIR6859-3,MIR6859-3,transcript,NR_107063.1,pseudogene,.,n.*2716G>A,.,.,.,.,2716,.,.
0,chr1,14653,C,48.77,.,2.0,0.217,-0.65,15.0,0.0,40.77,1.083,3.25,0.65,0.784,variant,True,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,T,downstream_gene_variant,MODIFIER,MIR6859-4,MIR6859-4,transcript,NR_128720.1,pseudogene,.,n.*2716G>A,.,.,.,.,2716,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104321,chrY,21154603,A,199.800003,.,2.0,.,.,8.0,0.0,60.0,.,24.98,.,1.609,variant,True,HOM,.,.,0.453488,6.0,0.0,0.0,0.0,0.0,0.0,.,.,6.0,0.0,6.0,0.0,587.0,0.007481,0.0,0.0,0.0,0.0,0.0,.,.,0.012097,0.0,0.012097,0.0,0.037379,802.0,78.0,61.0,33.0,29.0,42.0,.,.,496.0,10.0,496.0,53.0,15704.0,PASS,0.698082,1.23,C|intron_variant&non_coding_transcript_variant...,-0.008,true,6679245.0,2119|382|308|616|1382|2263|2863|2476|1660|866|...,563|15|2|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0,38.0,0.0,36.73,.,.,.,.,.,.,.,.,.,.,.,.,13536|921|222|187|149|102|89|78|65|59|56|40|61...,15|525|26|11|3|2|0|0|1|0|0|0|0|1|0|0|1|0|0|2,99.0,.,.,.,.,.,.,.,.,.,.,.,.,-0.0113,51.220001,-2.961,NFE,17.190001,0.988,1.03,-54.16,FS,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,true,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,rs79788321,2.0,1.0,2.0,1.0,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,TTTY14,TTTY14,transcript,NR_125735.1,pseudogene,1/2,n.503+84328T>G,.,.,.,.,.,.,.
104321,chrY,21154603,A,199.800003,.,2.0,.,.,8.0,0.0,60.0,.,24.98,.,1.609,variant,True,HOM,.,.,0.453488,6.0,0.0,0.0,0.0,0.0,0.0,.,.,6.0,0.0,6.0,0.0,587.0,0.007481,0.0,0.0,0.0,0.0,0.0,.,.,0.012097,0.0,0.012097,0.0,0.037379,802.0,78.0,61.0,33.0,29.0,42.0,.,.,496.0,10.0,496.0,53.0,15704.0,PASS,0.698082,1.23,C|intron_variant&non_coding_transcript_variant...,-0.008,true,6679245.0,2119|382|308|616|1382|2263|2863|2476|1660|866|...,563|15|2|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0,38.0,0.0,36.73,.,.,.,.,.,.,.,.,.,.,.,.,13536|921|222|187|149|102|89|78|65|59|56|40|61...,15|525|26|11|3|2|0|0|1|0|0|0|0|1|0|0|1|0|0|2,99.0,.,.,.,.,.,.,.,.,.,.,.,.,-0.0113,51.220001,-2.961,NFE,17.190001,0.988,1.03,-54.16,FS,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,true,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,rs79788321,2.0,1.0,2.0,1.0,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,TTTY14,TTTY14,transcript,NR_125737.1,pseudogene,2/2,n.139-59876T>G,.,.,.,.,.,.,.
104321,chrY,21154603,A,199.800003,.,2.0,.,.,8.0,0.0,60.0,.,24.98,.,1.609,variant,True,HOM,.,.,0.453488,6.0,0.0,0.0,0.0,0.0,0.0,.,.,6.0,0.0,6.0,0.0,587.0,0.007481,0.0,0.0,0.0,0.0,0.0,.,.,0.012097,0.0,0.012097,0.0,0.037379,802.0,78.0,61.0,33.0,29.0,42.0,.,.,496.0,10.0,496.0,53.0,15704.0,PASS,0.698082,1.23,C|intron_variant&non_coding_transcript_variant...,-0.008,true,6679245.0,2119|382|308|616|1382|2263|2863|2476|1660|866|...,563|15|2|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0,38.0,0.0,36.73,.,.,.,.,.,.,.,.,.,.,.,.,13536|921|222|187|149|102|89|78|65|59|56|40|61...,15|525|26|11|3|2|0|0|1|0|0|0|0|1|0|0|1|0|0|2,99.0,.,.,.,.,.,.,.,.,.,.,.,.,-0.0113,51.220001,-2.961,NFE,17.190001,0.988,1.03,-54.16,FS,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,true,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,rs79788321,2.0,1.0,2.0,1.0,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,TTTY14,TTTY14,transcript,NR_001543.3,pseudogene,1/1,n.504-59876T>G,.,.,.,.,.,.,.
104321,chrY,21154603,A,199.800003,.,2.0,.,.,8.0,0.0,60.0,.,24.98,.,1.609,variant,True,HOM,.,.,0.453488,6.0,0.0,0.0,0.0,0.0,0.0,.,.,6.0,0.0,6.0,0.0,587.0,0.007481,0.0,0.0,0.0,0.0,0.0,.,.,0.012097,0.0,0.012097,0.0,0.037379,802.0,78.0,61.0,33.0,29.0,42.0,.,.,496.0,10.0,496.0,53.0,15704.0,PASS,0.698082,1.23,C|intron_variant&non_coding_transcript_variant...,-0.008,true,6679245.0,2119|382|308|616|1382|2263|2863|2476|1660|866|...,563|15|2|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0,38.0,0.0,36.73,.,.,.,.,.,.,.,.,.,.,.,.,13536|921|222|187|149|102|89|78|65|59|56|40|61...,15|525|26|11|3|2|0|0|1|0|0|0|0|1|0|0|1|0|0|2,99.0,.,.,.,.,.,.,.,.,.,.,.,.,-0.0113,51.220001,-2.961,NFE,17.190001,0.988,1.03,-54.16,FS,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,true,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,rs79788321,2.0,1.0,2.0,1.0,SNP,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,CD24,CD24,transcript,NR_117090.1,pseudogene,1/1,n.100-76T>G,.,.,.,.,.,.,.


In [80]:
df2.shape

(317617, 270)

In [11]:
variants_dict, name, pVCF = parse_vcf_file(vcf_file_path)


In [12]:
df1 = create_dataframe(variants_dict)


In [13]:
df1.shape

(104079, 255)

In [31]:
pd.set_option('display.max_columns',500)

In [37]:
df1.loc[df1['POS']=='69511','GnomAD_CSQ'].values

array(['G|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137|protein_coding|1/1||ENST00000335137.3:c.421A>G|ENSP00000334393.3:p.Thr141Ala|421|421|141|T/A|Aca/Gca|rs2691305&rs75062661&COSM4144171|1||1||SNV||HGNC|14825|YES|||CCDS30547.1|ENSP00000334393|Q8NH21||UPI0000041BC1||tolerated(0.82)|benign(0.003)|Transmembrane_helices:TMhelix&Superfamily_domains:SSF81321&Pfam_domain:PF13853&Gene3D:1.20.1070.10&hmmpanther:PTHR26451&hmmpanther:PTHR26451:SF72&PROSITE_profiles:PS50262|||G:0.9832&G:0.9832||||||G:0.5441&G:0.5441|G:0.8874&G:0.8874|G:0.9507&G:0.9507|G:0.894&G:0.894|G:0.5942&G:0.5942|G:0.9394&G:0.9394|G:0.9994&G:0.9994|G:0.9907&G:0.9907|G:0.9716&G:0.9716|G:0.9597&G:0.9597||0&0&1|0&0&1|||||||||,T|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137|protein_coding|1/1||ENST00000335137.3:c.421A>T|ENSP00000334393.3:p.Thr141Ser|421|421|141|T/S|Aca/Tca|rs2691305&rs75062661&COSM4144171|2||1||SNV||HGNC|14825|YES|||CCDS30547.1|ENSP00000334393|Q8NH21||U

In [33]:
df1.head()

Unnamed: 0,CHROM,POS,REF,ALT,ID,QUAL,FILTER,AC,AF,AN,BaseQRankSum,ClippingRankSum,DP,FS,MLEAC,MLEAF,MQ,MQRankSum,QD,ReadPosRankSum,SOR,set,SNP,HET,VARTYPE,ANN,HOM,GnomAD_AB_HIST_ALL,GnomAD_AB_HIST_ALT,GnomAD_AB_MEDIAN,GnomAD_AC,GnomAD_AC_AFR,GnomAD_AC_AMR,GnomAD_AC_ASJ,GnomAD_AC_EAS,GnomAD_AC_FIN,GnomAD_AC_Female,GnomAD_AC_Male,GnomAD_AC_NFE,GnomAD_AC_OTH,GnomAD_AC_POPMAX,GnomAD_AC_SAS,GnomAD_AC_raw,GnomAD_AF,GnomAD_AF_AFR,GnomAD_AF_AMR,GnomAD_AF_ASJ,GnomAD_AF_EAS,GnomAD_AF_FIN,GnomAD_AF_Female,GnomAD_AF_Male,GnomAD_AF_NFE,GnomAD_AF_OTH,GnomAD_AF_POPMAX,GnomAD_AF_SAS,GnomAD_AF_raw,GnomAD_AN,GnomAD_AN_AFR,GnomAD_AN_AMR,GnomAD_AN_ASJ,GnomAD_AN_EAS,GnomAD_AN_FIN,GnomAD_AN_Female,GnomAD_AN_Male,GnomAD_AN_NFE,GnomAD_AN_OTH,GnomAD_AN_POPMAX,GnomAD_AN_SAS,GnomAD_AN_raw,GnomAD_AS_FilterStatus,GnomAD_AS_RF,GnomAD_BaseQRankSum,GnomAD_CSQ,GnomAD_ClippingRankSum,GnomAD_DB,GnomAD_DP,GnomAD_DP_HIST_ALL,GnomAD_DP_HIST_ALT,GnomAD_DP_MEDIAN,GnomAD_DREF_MEDIAN,GnomAD_FS,GnomAD_GC,GnomAD_GC_AFR,GnomAD_GC_AMR,GnomAD_GC_ASJ,GnomAD_GC_EAS,GnomAD_GC_FIN,GnomAD_GC_Female,GnomAD_GC_Male,GnomAD_GC_NFE,GnomAD_GC_OTH,GnomAD_GC_SAS,GnomAD_GC_raw,GnomAD_GQ_HIST_ALL,GnomAD_GQ_HIST_ALT,GnomAD_GQ_MEDIAN,GnomAD_Hom,GnomAD_Hom_AFR,GnomAD_Hom_AMR,GnomAD_Hom_ASJ,GnomAD_Hom_EAS,GnomAD_Hom_FIN,GnomAD_Hom_Female,GnomAD_Hom_Male,GnomAD_Hom_NFE,GnomAD_Hom_OTH,GnomAD_Hom_SAS,GnomAD_Hom_raw,GnomAD_InbreedingCoeff,GnomAD_MQ,GnomAD_MQRankSum,GnomAD_POPMAX,GnomAD_QD,GnomAD_ReadPosRankSum,GnomAD_SOR,GnomAD_VQSLOD,GnomAD_VQSR_culprit,dbNSFP_GERP++_RS,dbNSFP_GERP++_NR,dbNSFP_1000Gp1_AMR_AF,dbNSFP_1000Gp1_AF,dbNSFP_Uniprot_acc,dbNSFP_LRT_pred,dbNSFP_phastCons100way_vertebrate,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_1000Gp1_ASN_AF,dbNSFP_1000Gp1_AFR_AF,dbNSFP_MutationTaster_pred,dbNSFP_1000Gp1_EUR_AF,dbNSFP_Interpro_domain,dbNSFP_ESP6500_AA_AF,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,dbNSFP_ESP6500_EA_AF,1000Gp3_AA,1000Gp3_AC,1000Gp3_AF,1000Gp3_AFR_AF,1000Gp3_AMR_AF,1000Gp3_AN,1000Gp3_DP,1000Gp3_EAS_AF,1000Gp3_EUR_AF,1000Gp3_NS,1000Gp3_SAS_AF,1000Gp3_VT,GnomAD_STAR_AC,GnomAD_STAR_AC_raw,GnomAD_STAR_Hom,GnomAD_VQSR_NEGATIVE_TRAIN_SITE,GnomAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000Gp3_MULTI_ALLELIC,1000Gp3_EX_TARGET,DEL,GnomAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GnomAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GnomAD_AC_AFR_Female,GnomAD_AC_AFR_Male,GnomAD_AC_AMR_Female,GnomAD_AC_AMR_Male,GnomAD_AC_ASJ_Female,GnomAD_AC_ASJ_Male,GnomAD_AC_EAS_Female,GnomAD_AC_EAS_Male,GnomAD_AC_FIN_Female,GnomAD_AC_FIN_Male,GnomAD_AC_NFE_Female,GnomAD_AC_NFE_Male,GnomAD_AC_OTH_Female,GnomAD_AC_OTH_Male,GnomAD_AC_SAS_Female,GnomAD_AC_SAS_Male,GnomAD_AF_AFR_Female,GnomAD_AF_AFR_Male,GnomAD_AF_AMR_Female,GnomAD_AF_AMR_Male,GnomAD_AF_ASJ_Female,GnomAD_AF_ASJ_Male,GnomAD_AF_EAS_Female,GnomAD_AF_EAS_Male,GnomAD_AF_FIN_Female,GnomAD_AF_FIN_Male,GnomAD_AF_NFE_Female,GnomAD_AF_NFE_Male,GnomAD_AF_OTH_Female,GnomAD_AF_OTH_Male,GnomAD_AF_SAS_Female,GnomAD_AF_SAS_Male,GnomAD_AN_AFR_Female,GnomAD_AN_AFR_Male,GnomAD_AN_AMR_Female,GnomAD_AN_AMR_Male,GnomAD_AN_ASJ_Female,GnomAD_AN_ASJ_Male,GnomAD_AN_EAS_Female,GnomAD_AN_EAS_Male,GnomAD_AN_FIN_Female,GnomAD_AN_FIN_Male,GnomAD_AN_NFE_Female,GnomAD_AN_NFE_Male,GnomAD_AN_OTH_Female,GnomAD_AN_OTH_Male,GnomAD_AN_SAS_Female,GnomAD_AN_SAS_Male,GnomAD_GC_AFR_Female,GnomAD_GC_AFR_Male,GnomAD_GC_AMR_Female,GnomAD_GC_AMR_Male,GnomAD_GC_ASJ_Female,GnomAD_GC_ASJ_Male,GnomAD_GC_EAS_Female,GnomAD_GC_EAS_Male,GnomAD_GC_FIN_Female,GnomAD_GC_FIN_Male,GnomAD_GC_NFE_Female,GnomAD_GC_NFE_Male,GnomAD_GC_OTH_Female,GnomAD_GC_OTH_Male,GnomAD_GC_SAS_Female,GnomAD_GC_SAS_Male,GnomAD_Hemi,GnomAD_Hemi_AFR,GnomAD_Hemi_AMR,GnomAD_Hemi_ASJ,GnomAD_Hemi_EAS,GnomAD_Hemi_FIN,GnomAD_Hemi_NFE,GnomAD_Hemi_OTH,GnomAD_Hemi_SAS,GnomAD_Hemi_raw,GnomAD_STAR_Hemi,1000Gp3_OLD_VARIANT
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,T|downstream_gene_variant|MODIFIER|DDX11L1|DDX...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,chr1,14671,G,C,,202.770004,,1,0.5,2,-1.555,0.4,18,0.0,1,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,True,SNP,C|downstream_gene_variant|MODIFIER|DDX11L1|DDX...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,chr1,69511,A,G,rs2691305,3168.77002,,2,1.0,2,,,107,0.0,2,1.0,44.189999,,29.610001,,0.961,variant,True,,SNP,G|missense_variant|MODERATE|OR4F5|OR4F5|transc...,True,0|0|97|381|730|328|351|261|262|431|248|36|165|...,0|0|97|381|730|328|352|260|263|430|247|36|165|...,0.375,147701.0,6502.0,18452.0,4790.0,16689.0,11021.0,65689.0,82012.0,64577.0,3093.0,16689.0,22577.0,156927.0,0.950616,0.607436,0.95084,0.97795,0.999461,0.991454,0.940914,0.958532,0.97278,0.949939,0.999461,0.985379,0.875211,155374.0,10704.0,19406.0,4898.0,16698.0,11116.0,69814.0,85560.0,66384.0,3256.0,16698.0,22912.0,179302.0,PASS,0.786236,0.667,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,-0.048,True,13873540.0,10415|901|424|815|1996|3382|4906|5867|4523|423...,3902|743|393|808|1992|3376|4427|4396|4184|4172...,65.0,0.0,2.42,"(2440, 2793, 72454, 0, 0, 0)","(1401, 1400, 2551, 0, 0, 0)","(280, 394, 9029, 0, 0, 0)","(37, 34, 2378, 0, 0, 0)","(0, 9, 8340, 0, 0, 0)","(31, 33, 5494, 0, 0, 0)","(1314, 1497, 32096, 0, 0, 0)","(1126, 1296, 40358, 0, 0, 0)","(564, 679, 31949, 0, 0, 0)","(49, 65, 1514, 0, 0, 0)","(78, 179, 11199, 0, 0, 0)","(9156, 4058, 76434, 1, 1, 1)",6159|3645|686|555|279|160|210|312|253|389|494|...,599|2618|627|511|249|150|203|305|249|385|492|4...,99.0,72454.0,2551.0,9029.0,2378.0,8340.0,5494.0,32096.0,40358.0,31949.0,1514.0,11199.0,76434.0,0.6295,31.530001,-0.544,EAS,26.25,-0.944,0.802,-0.903,MQ,1.15,2.31,0.649171,0.652015,Q8NH21,N,0.0,B,0.874126,0.329268,P,0.695251,"GPCR,_rhodopsin-like_superfamily_(1),",0.544101,B,".,T",0.887429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,chr1,752894,T,C,rs3131971,663.77002,basic_snp_filter,2,1.0,2,,,18,0.0,2,1.0,39.950001,,28.790001,,5.892,FilteredInAll,True,,SNP,C|non_coding_transcript_exon_variant|MODIFIER|...,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,.|||,3772.0,0.753195,0.5129,0.8127,5008.0,17740.0,0.8839,0.8419,2504.0,0.8098,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,chr1,762273,G,A,rs3115849,3228.77002,,2,1.0,2,,,103,0.0,2,1.0,45.25,,31.35,,0.973,variant,True,,SNP,A|upstream_gene_variant|MODIFIER|LINC01128|LIN...,True,0|6|150|527|2128|4130|4098|3189|3049|1737|986|...,0|6|150|527|2128|4134|4095|3188|3050|1737|985|...,0.342037,116307.0,2570.0,18700.0,6767.0,9065.0,12016.0,52668.0,63639.0,45820.0,3073.0,9065.0,18296.0,118345.0,0.818303,0.413715,0.795406,0.844187,0.896992,0.802994,0.8114,0.824105,0.860599,0.830541,0.896992,0.817443,0.800862,142132.0,6212.0,23510.0,8016.0,10106.0,14964.0,64910.0,77222.0,53242.0,3700.0,10106.0,22382.0,147772.0,PASS,0.799154,1.03,A|upstream_gene_variant|MODIFIER|LINC01128|ENS...,-0.068,True,7942198.0,1971|149|27|97|351|961|2147|4484|5187|5515|589...,642|75|27|97|350|959|1901|3226|4535|5292|5805|...,66.0,0.0,9.018,"(2785, 20255, 48026)","(953, 1736, 417)","(480, 3850, 7425)","(83, 1083, 2842)","(56, 929, 4068)","(284, 2380, 4818)","(1427, 9388, 21640)","(1358, 10867, 26386)","(481, 6460, 19680)","(51, 525, 1274)","(397, 3292, 7502)","(4200, 21027, 48659)",1092|798|85|69|25|14|30|29|22|56|50|52|103|160...,15|530|53|40|16|14|29|27|22|56|50|52|100|160|1...,99.0,48026.0,417.0,7425.0,2842.0,4068.0,4818.0,21640.0,26386.0,19680.0,1274.0,7502.0,48659.0,0.0858,45.389999,-3.682,EAS,22.559999,0.495,1.369,-0.0105,MQRankSum,,,,,,,,,,,,,,,,,,.|||,3665.0,0.731829,0.3903,0.8112,5008.0,14934.0,0.8879,0.8688,2504.0,0.8354,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104079 entries, 0 to 104078
Columns: 255 entries, CHROM to 1000Gp3_OLD_VARIANT
dtypes: float64(98), int64(2), object(155)
memory usage: 202.5+ MB


In [16]:
df1.describe()

Unnamed: 0,QUAL,AN,BaseQRankSum,ClippingRankSum,DP,FS,MQ,MQRankSum,QD,ReadPosRankSum,...,GnomAD_Hemi_AFR,GnomAD_Hemi_AMR,GnomAD_Hemi_ASJ,GnomAD_Hemi_EAS,GnomAD_Hemi_FIN,GnomAD_Hemi_NFE,GnomAD_Hemi_OTH,GnomAD_Hemi_SAS,GnomAD_Hemi_raw,GnomAD_STAR_Hemi
count,104079.0,104079.0,64688.0,64688.0,104079.0,104079.0,104079.0,64688.0,104079.0,64646.0,...,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,47.0
mean,983.239527,2.0,-0.166657,-0.010453,52.588543,1.456095,59.37145,-0.219687,19.120819,0.331839,...,1149.344346,2624.482846,963.250318,1592.702668,2189.2554,11478.207116,536.855146,4218.631512,29185.322745,107.638298
std,1194.298003,0.0,1.62204,0.907133,56.734379,4.445449,3.362744,1.457357,9.665246,1.128349,...,943.267214,2105.492666,770.736875,1440.129245,1782.100221,9358.938613,411.988722,3446.121345,21098.053287,606.42948
min,30.709999,2.0,-9.104,-3.908,1.0,0.0,23.0,-25.612,0.04,-13.357,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,267.769989,2.0,-1.269,-0.648,18.0,0.0,60.0,-0.75,11.32,-0.392,...,307.5,750.5,265.0,168.5,629.0,3373.5,183.5,1231.0,10712.0,0.0
50%,615.72998,2.0,-0.203,0.0,37.0,0.0,60.0,-0.054,15.84,0.366,...,959.0,2248.0,850.0,1343.0,1882.0,9642.0,476.0,3678.0,27230.0,1.0
75%,1254.77002,2.0,0.91,0.631,69.0,1.775,60.0,0.598,29.639999,1.065,...,1888.0,4208.5,1527.0,2701.5,3496.5,18245.5,828.5,6587.5,43899.0,2.5
max,67382.773438,2.0,10.042,3.739,2469.0,268.968994,70.0,10.067,42.950001,8.02,...,3058.0,7003.0,2555.0,4378.0,5854.0,31526.0,1428.0,11637.0,67455.0,4152.0


In [14]:
df1 = handle_annotations(df1, pVCF)


In [15]:
df1.shape

(316174, 270)

In [40]:
df1.head()

Unnamed: 0,CHROM,POS,REF,ALT,ID,QUAL,FILTER,AC,AF,AN,BaseQRankSum,ClippingRankSum,DP,FS,MLEAC,MLEAF,MQ,MQRankSum,QD,ReadPosRankSum,SOR,set,SNP,HET,VARTYPE,HOM,GnomAD_AB_HIST_ALL,GnomAD_AB_HIST_ALT,GnomAD_AB_MEDIAN,GnomAD_AC,GnomAD_AC_AFR,GnomAD_AC_AMR,GnomAD_AC_ASJ,GnomAD_AC_EAS,GnomAD_AC_FIN,GnomAD_AC_Female,GnomAD_AC_Male,GnomAD_AC_NFE,GnomAD_AC_OTH,GnomAD_AC_POPMAX,GnomAD_AC_SAS,GnomAD_AC_raw,GnomAD_AF,GnomAD_AF_AFR,GnomAD_AF_AMR,GnomAD_AF_ASJ,GnomAD_AF_EAS,GnomAD_AF_FIN,GnomAD_AF_Female,GnomAD_AF_Male,GnomAD_AF_NFE,GnomAD_AF_OTH,GnomAD_AF_POPMAX,GnomAD_AF_SAS,GnomAD_AF_raw,GnomAD_AN,GnomAD_AN_AFR,GnomAD_AN_AMR,GnomAD_AN_ASJ,GnomAD_AN_EAS,GnomAD_AN_FIN,GnomAD_AN_Female,GnomAD_AN_Male,GnomAD_AN_NFE,GnomAD_AN_OTH,GnomAD_AN_POPMAX,GnomAD_AN_SAS,GnomAD_AN_raw,GnomAD_AS_FilterStatus,GnomAD_AS_RF,GnomAD_BaseQRankSum,GnomAD_CSQ,GnomAD_ClippingRankSum,GnomAD_DB,GnomAD_DP,GnomAD_DP_HIST_ALL,GnomAD_DP_HIST_ALT,GnomAD_DP_MEDIAN,GnomAD_DREF_MEDIAN,GnomAD_FS,GnomAD_GC,GnomAD_GC_AFR,GnomAD_GC_AMR,GnomAD_GC_ASJ,GnomAD_GC_EAS,GnomAD_GC_FIN,GnomAD_GC_Female,GnomAD_GC_Male,GnomAD_GC_NFE,GnomAD_GC_OTH,GnomAD_GC_SAS,GnomAD_GC_raw,GnomAD_GQ_HIST_ALL,GnomAD_GQ_HIST_ALT,GnomAD_GQ_MEDIAN,GnomAD_Hom,GnomAD_Hom_AFR,GnomAD_Hom_AMR,GnomAD_Hom_ASJ,GnomAD_Hom_EAS,GnomAD_Hom_FIN,GnomAD_Hom_Female,GnomAD_Hom_Male,GnomAD_Hom_NFE,GnomAD_Hom_OTH,GnomAD_Hom_SAS,GnomAD_Hom_raw,GnomAD_InbreedingCoeff,GnomAD_MQ,GnomAD_MQRankSum,GnomAD_POPMAX,GnomAD_QD,GnomAD_ReadPosRankSum,GnomAD_SOR,GnomAD_VQSLOD,GnomAD_VQSR_culprit,dbNSFP_GERP++_RS,dbNSFP_GERP++_NR,dbNSFP_1000Gp1_AMR_AF,dbNSFP_1000Gp1_AF,dbNSFP_Uniprot_acc,dbNSFP_LRT_pred,dbNSFP_phastCons100way_vertebrate,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_1000Gp1_ASN_AF,dbNSFP_1000Gp1_AFR_AF,dbNSFP_MutationTaster_pred,dbNSFP_1000Gp1_EUR_AF,dbNSFP_Interpro_domain,dbNSFP_ESP6500_AA_AF,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,dbNSFP_ESP6500_EA_AF,1000Gp3_AA,1000Gp3_AC,1000Gp3_AF,1000Gp3_AFR_AF,1000Gp3_AMR_AF,1000Gp3_AN,1000Gp3_DP,1000Gp3_EAS_AF,1000Gp3_EUR_AF,1000Gp3_NS,1000Gp3_SAS_AF,1000Gp3_VT,GnomAD_STAR_AC,GnomAD_STAR_AC_raw,GnomAD_STAR_Hom,GnomAD_VQSR_NEGATIVE_TRAIN_SITE,GnomAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000Gp3_MULTI_ALLELIC,1000Gp3_EX_TARGET,DEL,GnomAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GnomAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GnomAD_AC_AFR_Female,GnomAD_AC_AFR_Male,GnomAD_AC_AMR_Female,GnomAD_AC_AMR_Male,GnomAD_AC_ASJ_Female,GnomAD_AC_ASJ_Male,GnomAD_AC_EAS_Female,GnomAD_AC_EAS_Male,GnomAD_AC_FIN_Female,GnomAD_AC_FIN_Male,GnomAD_AC_NFE_Female,GnomAD_AC_NFE_Male,GnomAD_AC_OTH_Female,GnomAD_AC_OTH_Male,GnomAD_AC_SAS_Female,GnomAD_AC_SAS_Male,GnomAD_AF_AFR_Female,GnomAD_AF_AFR_Male,GnomAD_AF_AMR_Female,GnomAD_AF_AMR_Male,GnomAD_AF_ASJ_Female,GnomAD_AF_ASJ_Male,GnomAD_AF_EAS_Female,GnomAD_AF_EAS_Male,GnomAD_AF_FIN_Female,GnomAD_AF_FIN_Male,GnomAD_AF_NFE_Female,GnomAD_AF_NFE_Male,GnomAD_AF_OTH_Female,GnomAD_AF_OTH_Male,GnomAD_AF_SAS_Female,GnomAD_AF_SAS_Male,GnomAD_AN_AFR_Female,GnomAD_AN_AFR_Male,GnomAD_AN_AMR_Female,GnomAD_AN_AMR_Male,GnomAD_AN_ASJ_Female,GnomAD_AN_ASJ_Male,GnomAD_AN_EAS_Female,GnomAD_AN_EAS_Male,GnomAD_AN_FIN_Female,GnomAD_AN_FIN_Male,GnomAD_AN_NFE_Female,GnomAD_AN_NFE_Male,GnomAD_AN_OTH_Female,GnomAD_AN_OTH_Male,GnomAD_AN_SAS_Female,GnomAD_AN_SAS_Male,GnomAD_GC_AFR_Female,GnomAD_GC_AFR_Male,GnomAD_GC_AMR_Female,GnomAD_GC_AMR_Male,GnomAD_GC_ASJ_Female,GnomAD_GC_ASJ_Male,GnomAD_GC_EAS_Female,GnomAD_GC_EAS_Male,GnomAD_GC_FIN_Female,GnomAD_GC_FIN_Male,GnomAD_GC_NFE_Female,GnomAD_GC_NFE_Male,GnomAD_GC_OTH_Female,GnomAD_GC_OTH_Male,GnomAD_GC_SAS_Female,GnomAD_GC_SAS_Male,GnomAD_Hemi,GnomAD_Hemi_AFR,GnomAD_Hemi_AMR,GnomAD_Hemi_ASJ,GnomAD_Hemi_EAS,GnomAD_Hemi_FIN,GnomAD_Hemi_NFE,GnomAD_Hemi_OTH,GnomAD_Hemi_SAS,GnomAD_Hemi_raw,GnomAD_STAR_Hemi,1000Gp3_OLD_VARIANT,Allele,Annotation,Annotation_Impact,Gene_Name,Gene_ID,Feature_Type,Feature_ID,Transcript_BioType,Rank,HGVS.c,HGVS.p,cDNA.pos / cDNA.length,CDS.pos / CDS.length,AA.pos / AA.length,Distance,ERRORS / WARNINGS / INFO
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,T,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,,n.*244C>T,,,,,244,
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,T,downstream_gene_variant,MODIFIER,MIR6859-1,MIR6859-1,transcript,NR_106918.1,pseudogene,,n.*2716G>A,,,,,2716,
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,T,downstream_gene_variant,MODIFIER,MIR6859-2,MIR6859-2,transcript,NR_107062.1,pseudogene,,n.*2716G>A,,,,,2716,
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,T,downstream_gene_variant,MODIFIER,MIR6859-3,MIR6859-3,transcript,NR_107063.1,pseudogene,,n.*2716G>A,,,,,2716,
0,chr1,14653,C,T,,48.77,,1,0.5,2,0.217,-0.65,15,0.0,1,0.5,40.77,1.083,3.25,0.65,0.784,variant,True,True,SNP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,T,downstream_gene_variant,MODIFIER,MIR6859-4,MIR6859-4,transcript,NR_128720.1,pseudogene,,n.*2716G>A,,,,,2716,


In [19]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316174 entries, 0 to 104078
dtypes: float64(98), int64(2), object(170)
memory usage: 653.7+ MB


In [20]:
df1.describe()

Unnamed: 0,QUAL,AN,BaseQRankSum,ClippingRankSum,DP,FS,MQ,MQRankSum,QD,ReadPosRankSum,...,GnomAD_Hemi_AFR,GnomAD_Hemi_AMR,GnomAD_Hemi_ASJ,GnomAD_Hemi_EAS,GnomAD_Hemi_FIN,GnomAD_Hemi_NFE,GnomAD_Hemi_OTH,GnomAD_Hemi_SAS,GnomAD_Hemi_raw,GnomAD_STAR_Hemi
count,316174.0,316174.0,194432.0,194432.0,316174.0,316174.0,316174.0,194432.0,316174.0,194335.0,...,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,111.0
mean,969.780853,2.0,-0.186557,-0.006459,51.004798,1.452862,59.475862,-0.162875,19.274428,0.337247,...,1183.165005,2726.718948,1005.299184,1666.643699,2293.401632,11843.474615,558.951496,4432.846328,30660.398005,199.108108
std,1155.575181,0.0,1.598716,0.907496,53.164443,5.201441,3.19315,1.332556,9.599713,1.111631,...,964.765073,2156.235401,782.109568,1490.270929,1796.418039,9456.304185,418.339174,3483.463587,21344.193233,863.711582
min,30.709999,2.0,-9.104,-3.908,1.0,0.0,23.0,-25.612,0.04,-13.357,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,265.769989,2.0,-1.279,-0.653,17.0,0.0,60.0,-0.729,11.45,-0.381,...,347.0,747.75,305.0,194.0,782.25,3557.0,195.0,1340.25,12162.0,0.0
50%,611.77002,2.0,-0.217,0.0,37.0,0.0,60.0,-0.04,15.99,0.37,...,1002.0,2317.0,877.5,1379.0,2065.5,9964.5,513.0,3903.0,29266.5,1.0
75%,1249.77002,2.0,0.876,0.637,68.0,1.697,60.0,0.604,29.690001,1.059,...,1926.0,4480.75,1583.5,2934.0,3692.0,18184.0,856.0,7229.0,46659.0,3.0
max,67382.773438,2.0,10.042,3.739,2469.0,268.968994,70.0,10.067,42.950001,8.02,...,3058.0,7003.0,2555.0,4378.0,5854.0,31526.0,1428.0,11637.0,67455.0,4152.0


In [41]:
df2 = prioritize_variants(df1)


In [42]:
df2.shape

(104079, 270)

In [43]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104079 entries, 831 to 104078
dtypes: float64(98), int64(2), object(170)
memory usage: 215.2+ MB


In [44]:
df2.describe()

Unnamed: 0,QUAL,AN,BaseQRankSum,ClippingRankSum,DP,FS,MQ,MQRankSum,QD,ReadPosRankSum,SOR,GnomAD_AN,GnomAD_AN_AFR,GnomAD_AN_AMR,GnomAD_AN_ASJ,GnomAD_AN_EAS,GnomAD_AN_FIN,GnomAD_AN_Female,GnomAD_AN_Male,GnomAD_AN_NFE,GnomAD_AN_OTH,GnomAD_AN_SAS,GnomAD_AN_raw,GnomAD_BaseQRankSum,GnomAD_ClippingRankSum,GnomAD_DP,GnomAD_FS,GnomAD_InbreedingCoeff,GnomAD_MQ,GnomAD_MQRankSum,GnomAD_QD,GnomAD_ReadPosRankSum,GnomAD_SOR,GnomAD_VQSLOD,GnomAD_STAR_AC,GnomAD_STAR_AC_raw,GnomAD_STAR_Hom,CLINVAR_ALLELEID,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GnomAD_AC_AFR_Female,GnomAD_AC_AFR_Male,GnomAD_AC_AMR_Female,GnomAD_AC_AMR_Male,GnomAD_AC_ASJ_Female,GnomAD_AC_ASJ_Male,GnomAD_AC_EAS_Female,GnomAD_AC_EAS_Male,GnomAD_AC_FIN_Female,GnomAD_AC_FIN_Male,GnomAD_AC_NFE_Female,GnomAD_AC_NFE_Male,GnomAD_AC_OTH_Female,GnomAD_AC_OTH_Male,GnomAD_AC_SAS_Female,GnomAD_AC_SAS_Male,GnomAD_AF_AFR_Female,GnomAD_AF_AFR_Male,GnomAD_AF_AMR_Female,GnomAD_AF_AMR_Male,GnomAD_AF_ASJ_Female,GnomAD_AF_ASJ_Male,GnomAD_AF_EAS_Female,GnomAD_AF_EAS_Male,GnomAD_AF_FIN_Female,GnomAD_AF_FIN_Male,GnomAD_AF_NFE_Female,GnomAD_AF_NFE_Male,GnomAD_AF_OTH_Female,GnomAD_AF_OTH_Male,GnomAD_AF_SAS_Female,GnomAD_AF_SAS_Male,GnomAD_AN_AFR_Female,GnomAD_AN_AFR_Male,GnomAD_AN_AMR_Female,GnomAD_AN_AMR_Male,GnomAD_AN_ASJ_Female,GnomAD_AN_ASJ_Male,GnomAD_AN_EAS_Female,GnomAD_AN_EAS_Male,GnomAD_AN_FIN_Female,GnomAD_AN_FIN_Male,GnomAD_AN_NFE_Female,GnomAD_AN_NFE_Male,GnomAD_AN_OTH_Female,GnomAD_AN_OTH_Male,GnomAD_AN_SAS_Female,GnomAD_AN_SAS_Male,GnomAD_Hemi,GnomAD_Hemi_AFR,GnomAD_Hemi_AMR,GnomAD_Hemi_ASJ,GnomAD_Hemi_EAS,GnomAD_Hemi_FIN,GnomAD_Hemi_NFE,GnomAD_Hemi_OTH,GnomAD_Hemi_SAS,GnomAD_Hemi_raw,GnomAD_STAR_Hemi
count,104079.0,104079.0,64688.0,64688.0,104079.0,104079.0,104079.0,64688.0,104079.0,64646.0,104079.0,50924.0,50924.0,50924.0,50924.0,50924.0,50924.0,50919.0,50919.0,50924.0,50924.0,50924.0,50924.0,50895.0,50895.0,50924.0,50924.0,50923.0,50924.0,50895.0,50924.0,50893.0,50924.0,49440.0,3415.0,3415.0,3415.0,6796.0,2575.0,2950.0,2603.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,782.0,777.0,781.0,777.0,781.0,778.0,781.0,774.0,780.0,776.0,784.0,783.0,782.0,778.0,780.0,779.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,47.0
mean,983.239527,2.0,-0.166657,-0.010453,52.588543,1.456095,59.37145,-0.219687,19.120819,0.331839,1.256548,205494.312682,12607.291434,28681.895256,8529.761174,14625.616173,18280.96644,93348.762859,112165.046702,91806.803334,4670.827017,26291.151854,227970.315195,0.171914,0.019173,10771920.0,1.711368,0.042386,57.835582,-0.136736,20.683597,0.309115,0.882855,2.593893,659.923865,1057.205857,160.368082,244446.298852,0.244694,0.248371,0.251737,3844.317662,1149.344346,8372.974587,2624.482846,2045.52986,963.250318,3435.385006,1592.702668,4475.959339,2189.2554,20229.311309,11478.207116,1125.584498,536.855146,3161.561626,4218.631512,0.500582,0.495969,0.492252,0.487294,0.493592,0.487934,0.469079,0.465964,0.4935,0.488717,0.493704,0.48941,0.492999,0.487842,0.480249,0.477549,7807.479034,2352.613723,17083.951715,5399.66709,4178.7446,1987.664549,7398.805591,3460.467598,9128.88183,4511.482846,41401.341804,23719.935197,2300.688691,1107.002541,6635.92122,8931.7446,24752.729352,1149.344346,2624.482846,963.250318,1592.702668,2189.2554,11478.207116,536.855146,4218.631512,29185.322745,107.638298
std,1194.298003,0.0,1.62204,0.907133,56.734379,4.445449,3.362744,1.457357,9.665246,1.128349,0.977261,57387.741614,3977.493632,7243.383247,2080.066923,3987.617988,5965.53336,25705.417476,32158.025129,28156.998594,1172.141346,6607.899892,40542.82359,1.660448,0.078953,8328420.0,8.670718,0.165498,5.35862,1.096958,7.317674,0.367313,0.974417,126.686528,5111.670975,6834.481105,1889.167363,120835.982104,0.146903,0.148701,0.149655,2957.782419,943.267214,6245.266991,2105.492666,1479.281547,770.736875,2963.551454,1440.129245,3395.425239,1782.100221,15176.675996,9358.938613,799.652139,411.988722,2379.324568,3446.121345,0.338504,0.343781,0.339657,0.3434,0.331313,0.336637,0.375376,0.380589,0.334606,0.339804,0.333262,0.339178,0.325728,0.330813,0.340429,0.342973,2333.09049,838.917786,4234.543891,1819.099777,1009.789644,697.604902,1931.181931,1122.668337,2679.16024,1649.532563,11797.344204,8768.813221,554.236892,361.877104,1537.338012,3078.012906,19380.888455,943.267214,2105.492666,770.736875,1440.129245,1782.100221,9358.938613,411.988722,3446.121345,21098.053287,606.42948
min,30.709999,2.0,-9.104,-3.908,1.0,0.0,23.0,-25.612,0.04,-13.357,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,-8.037,-2.315,137.0,0.0,-0.9999,20.0,-13.04,0.06,-7.298,0.001,-27740.0,0.0,1.0,0.0,15136.0,0.0001,0.0002,1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,267.769989,2.0,-1.269,-0.648,18.0,0.0,60.0,-0.75,11.32,-0.392,0.649,174277.5,10927.0,24188.0,8039.5,12620.0,16293.75,80411.0,93514.0,78036.5,3984.0,22792.0,241742.0,-1.117,-0.004,5887525.0,0.0,0.0205,59.130001,0.075,15.86,0.231,0.691,3.49,1.0,1.0,0.0,137838.25,0.11435,0.1206,0.12175,1219.5,307.5,2617.0,750.5,723.0,265.0,480.0,168.5,1508.5,629.0,7429.5,3373.5,431.0,183.5,1019.0,1231.0,0.193234,0.179006,0.177218,0.167873,0.200676,0.180649,0.080591,0.06998,0.202314,0.186775,0.196898,0.186779,0.206162,0.196105,0.166678,0.15604,7644.0,1791.0,15679.0,4583.0,3911.0,1716.5,7321.0,2711.0,9048.0,3744.5,39798.0,17504.5,2146.0,925.5,6200.0,7607.5,8295.5,307.5,750.5,265.0,168.5,629.0,3373.5,183.5,1231.0,10712.0,0.0
50%,615.72998,2.0,-0.203,0.0,37.0,0.0,60.0,-0.054,15.84,0.366,0.941,238237.0,14952.0,32810.0,9594.0,16970.0,21686.0,107814.0,130808.0,108152.0,5322.0,29998.0,246196.0,0.312,0.0,9056779.0,0.0,0.0408,59.439999,0.122,19.91,0.329,0.734,5.35,1.0,3.0,0.0,253185.0,0.2474,0.2462,0.26062,3357.0,959.0,7825.0,2248.0,1901.0,850.0,3014.0,1343.0,4083.0,1882.0,18205.0,9642.0,1062.0,476.0,2885.0,3678.0,0.495966,0.484461,0.485408,0.47637,0.48171,0.473452,0.434605,0.426606,0.484842,0.469333,0.479282,0.481,0.486394,0.470696,0.473313,0.464608,9100.0,2782.0,19376.0,6106.0,4676.0,2265.0,8436.0,3991.0,10484.0,5247.0,47720.0,28024.0,2588.0,1256.0,7428.0,9960.0,21641.0,959.0,2248.0,850.0,1343.0,1882.0,9642.0,476.0,3678.0,27230.0,1.0
75%,1254.77002,2.0,0.91,0.631,69.0,1.775,60.0,0.598,29.639999,1.065,1.609,245476.0,15276.0,33530.0,9830.0,17228.0,22252.0,110990.0,134506.0,111266.0,5468.0,30758.0,246266.0,1.47,0.011,13594480.0,0.733,0.076,59.619999,0.177,25.870001,0.44,0.796,6.18,6.0,20.0,0.0,320605.0,0.3753,0.3778,0.382945,6293.0,1888.0,13445.0,4208.5,3134.5,1527.0,5904.0,2701.5,7141.5,3496.5,31420.5,18245.5,1725.5,828.5,4983.0,6587.5,0.822967,0.824459,0.796321,0.797678,0.770648,0.7704,0.856312,0.874948,0.776299,0.794086,0.788565,0.789178,0.757992,0.771157,0.797579,0.799851,9180.0,3018.0,19558.0,6895.0,4734.0,2524.5,8488.0,4335.0,10584.0,5763.5,48509.0,30833.5,2624.0,1402.0,7506.0,11518.0,38035.5,1888.0,4208.5,1527.0,2701.5,3496.5,18245.5,828.5,6587.5,43899.0,2.5
max,67382.773438,2.0,10.042,3.739,2469.0,268.968994,70.0,10.067,42.950001,8.02,8.155,246272.0,15304.0,33582.0,9850.0,17248.0,22300.0,111362.0,134910.0,111720.0,5486.0,30782.0,246272.0,10.91,2.2,229055900.0,252.589005,0.8654,65.720001,7.39,37.759998,6.54,18.700001,9.0,123465.0,173408.0,59756.0,589474.0,0.4996,0.4998,0.49989,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31526.0,2630.0,1428.0,7508.0,11637.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31529.0,2630.0,1428.0,7508.0,11637.0,67435.0,3058.0,7003.0,2555.0,4378.0,5854.0,31526.0,1428.0,11637.0,67455.0,4152.0


In [33]:
df1 = format_ann_columns(df1, pVCF)

In [46]:
df2 = format_ann_columns(df2, pVCF)

In [47]:
print(df1.shape)
print(df2.shape)

(316174, 272)
(104079, 270)


In [48]:
print(df1.info())
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316174 entries, 0 to 104078
Columns: 272 entries, CHROM to AMINOCHANGE
dtypes: bool(1), float64(198), object(73)
memory usage: 656.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 104079 entries, 831 to 104078
Columns: 270 entries, CHROM to AMINOCHANGE
dtypes: float64(198), object(72)
memory usage: 215.2+ MB
None


In [49]:
df1.describe()

Unnamed: 0,QUAL,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DP,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_1000GP1_EUR_AF,DBNSFP_ESP6500_AA_AF,DBNSFP_ESP6500_EA_AF,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI
count,316174.0,314731.0,314731.0,316174.0,194432.0,194432.0,316174.0,316174.0,314731.0,314731.0,316174.0,194432.0,316174.0,194335.0,316174.0,152828.0,152831.0,152831.0,152831.0,152831.0,152831.0,152831.0,152771.0,152771.0,152831.0,152831.0,152492.0,152831.0,152831.0,152792.0,152386.0,152266.0,152273.0,152102.0,152310.0,152732.0,152719.0,152792.0,152673.0,152492.0,152310.0,152831.0,152891.0,152891.0,152891.0,152891.0,152891.0,152891.0,152831.0,152831.0,152891.0,152891.0,152492.0,152891.0,152891.0,152827.0,152839.0,152839.0,152891.0,152831.0,152831.0,152891.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,152831.0,152771.0,152771.0,152771.0,152771.0,152771.0,152771.0,150565.0,150565.0,152771.0,152771.0,152771.0,152771.0,152890.0,152891.0,152839.0,152891.0,152835.0,152891.0,149341.0,36058.0,36058.0,32676.0,32676.0,36103.0,32676.0,32676.0,32676.0,27555.0,27555.0,284792.0,284792.0,284792.0,284792.0,285304.0,285304.0,284792.0,284792.0,285304.0,284792.0,10193.0,10193.0,10193.0,73743.0,22311.0,8710.0,10169.0,8881.0,882.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2196.0,2184.0,2193.0,2184.0,2194.0,2186.0,2193.0,2178.0,2191.0,2182.0,2200.0,2198.0,2196.0,2187.0,2191.0,2189.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,111.0
mean,969.780854,1.391566,0.695783,2.0,-0.186557,-0.006459,51.004798,1.452862,1.388573,0.694287,59.475862,-0.162875,19.274428,0.337247,1.261055,0.472694,97504.719906,5604.794152,13716.316245,4047.537685,6914.107498,8748.582153,44264.079374,53278.932638,43819.556824,2217.937009,11999.196423,12435.888341,106245.951378,0.467572,0.438319,0.471358,0.468418,0.465997,0.471178,0.467364,0.467872,0.470111,0.468346,0.59318,0.466035,0.463147,207625.432066,12745.363893,28941.557489,8597.691349,14767.397917,18478.347077,94288.641663,113415.576814,92839.6562,4713.087311,22270.711027,26542.33083,229079.396008,0.866143,0.164956,0.01922,10699800.0,56.259306,3.8e-05,1.516782,,,,,,,,,,,,,93.846713,32804.51731,1855.777857,4654.840133,1354.290651,2450.072514,2940.258989,14921.744741,18101.487995,14635.955469,737.656656,4175.665041,35834.756701,0.047078,58.014633,-0.070118,20.837847,0.316503,0.854569,3.034701,0.528424,4.306483,0.481208,0.475192,0.377532,0.478918,0.44804,0.483622,0.441883,0.471579,2393.425402,0.479809,0.455387,0.490504,4989.304016,17428.468875,0.485673,0.491903,2503.995528,0.486748,548.755224,887.103306,116.94506,1.004041,235411.185021,0.236574,0.244541,0.245344,1.336735,3984.133273,1183.165005,8687.468268,2726.718948,2142.614234,1005.299184,3600.451043,1666.643699,4727.80689,2293.401632,21102.618314,11843.474615,1176.358114,558.951496,3329.499547,4432.846328,0.517367,0.512473,0.507546,0.504652,0.513543,0.508721,0.488345,0.485937,0.516504,0.512526,0.51402,0.510575,0.512776,0.509653,0.502529,0.500749,7790.029918,2324.481868,17178.604714,5389.470535,4215.152312,1996.718948,7397.178604,3430.579782,9192.233908,4499.134633,41503.618314,23508.154125,2311.364461,1103.486854,6665.81505,8919.999547,,,,,,,,,,,,,,,,,25710.500907,1183.165005,2726.718948,1005.299184,1666.643699,2293.401632,11843.474615,558.951496,4432.846328,30660.398005,199.108108
std,1155.575181,0.488101,0.244051,0.0,1.598716,0.907496,53.164443,5.201441,0.487427,0.243713,3.19315,1.332556,9.599713,1.111631,0.98594,0.085182,71221.427079,4609.264025,10053.443361,2948.483037,5572.307934,6730.757116,32204.299995,39159.997862,33145.012558,1593.346171,13427.076539,9105.579067,72741.063118,0.305128,0.318823,0.315009,0.313322,0.341798,0.314748,0.304419,0.305805,0.312264,0.306737,0.306854,0.312159,0.303518,57098.338374,3941.532722,7214.029893,2075.398435,3963.533382,5901.690881,25530.053365,31924.058609,27944.005181,1169.614509,22775.345369,6583.266397,39982.548551,0.207496,1.645246,0.081745,7898318.0,41.979815,0.001945,8.238684,,,,,,,,,,,,,16.385287,35429.109292,2202.823691,5040.098345,1485.166929,2783.497331,3334.417415,16021.271595,19565.651391,16489.366371,799.680096,4567.810348,37063.595457,0.153509,5.11916,0.958244,7.103807,0.346561,0.867841,103.504318,3.93673,1.417221,0.307735,0.30408,0.455443,0.341202,0.331168,0.313204,0.325968,0.316869,1480.385979,0.296462,0.32472,0.304195,150.786052,3923.10727,0.338324,0.307727,2.388893,0.310367,4042.073392,5663.244119,1396.561735,0.086268,122710.568115,0.144926,0.148452,0.150148,0.737353,3000.803208,964.765073,6328.247573,2156.235401,1491.728391,782.109568,3034.985697,1490.270929,3422.795222,1796.418039,15318.600301,9456.304185,807.315481,418.339174,2388.263842,3483.463587,0.339478,0.344692,0.342085,0.345762,0.336576,0.341848,0.379689,0.384856,0.335844,0.341339,0.336192,0.342101,0.328354,0.333513,0.339621,0.34193,2296.492023,846.561932,4007.573948,1801.395146,927.898472,676.860338,1886.948452,1129.375372,2527.124765,1609.433265,11348.368251,8747.642365,523.297832,357.264992,1449.460216,3037.295237,,,,,,,,,,,,,,,,,19669.817242,964.765073,2156.235401,782.109568,1490.270929,1796.418039,9456.304185,418.339174,3483.463587,21344.193233,863.711582
min,30.709999,1.0,0.5,2.0,-9.104,-3.908,1.0,0.0,1.0,0.5,23.0,-25.612,0.04,-13.357,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9e-06,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,0.006566,-8.037,-2.315,137.0,1.0,0.0,0.0,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.9999,20.0,-13.04,0.06,-7.298,0.001,-27740.0,-12.2,0.0465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1228.0,-128.0,0.0,0.0,1228.0,0.0,0.0,1.0,0.0,1.0,15136.0,0.0001,0.0002,1e-05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,265.769989,1.0,0.5,2.0,-1.279,-0.653,17.0,0.0,1.0,0.5,60.0,-0.729,11.45,-0.381,0.648,0.464286,37140.0,1522.0,4845.0,1481.0,1770.0,2882.0,16919.0,20132.0,15513.0,858.0,4677.0,4585.0,45455.0,0.210039,0.146657,0.198854,0.198173,0.150048,0.200562,0.212142,0.210323,0.202758,0.210355,0.351672,0.198466,0.207285,180818.0,11635.5,25128.0,8081.0,13140.0,16568.0,84602.0,99173.0,81910.0,4110.0,10108.0,23094.0,243386.0,0.904969,-1.1165,-0.004,5988016.0,29.0,0.0,0.0,,,,,,,,,,,,,99.0,3926.0,92.0,410.0,119.0,91.0,237.0,1799.0,2138.0,1296.0,80.0,402.0,5089.0,0.021,59.169998,0.081,15.9,0.237,0.691,3.6,-1.57,3.55,0.218232,0.217491,0.0,0.166084,0.138211,0.21504,0.139582,0.19314,1144.0,0.229832,0.1558,0.232,5008.0,15818.0,0.1796,0.2306,2504.0,0.2209,1.0,1.0,0.0,1.0,135292.0,0.10785,0.116,0.10841,1.0,1356.25,347.0,2649.0,747.75,863.0,305.0,581.0,194.0,1786.0,782.25,8473.0,3557.0,494.5,195.0,1248.25,1340.25,0.212544,0.195473,0.187443,0.18593,0.215569,0.191799,0.087677,0.074286,0.226393,0.209924,0.221327,0.215283,0.226092,0.209524,0.175511,0.17688,7406.5,1698.5,15704.0,4536.0,3959.0,1667.0,7206.0,2671.0,8841.0,3638.0,39102.0,16242.0,2134.0,911.5,6220.0,7410.0,,,,,,,,,,,,,,,,,8634.0,347.0,747.75,305.0,194.0,782.25,3557.0,195.0,1340.25,12162.0,0.0
50%,611.77002,1.0,0.5,2.0,-0.217,0.0,37.0,0.0,1.0,0.5,60.0,-0.04,15.99,0.37,0.941,0.484848,86981.0,4516.0,12558.0,3680.0,6111.0,7568.0,39521.0,47306.0,38054.0,2011.0,9602.0,11282.0,97042.0,0.446334,0.405037,0.452597,0.443975,0.442963,0.446813,0.446718,0.445705,0.44387,0.446863,0.62178,0.447553,0.433979,239968.0,15024.0,33012.0,9650.0,17034.0,21800.0,108504.0,131748.0,108778.0,5358.0,16436.0,30236.0,246210.0,0.953468,0.311,0.0,9151413.0,47.0,0.0,0.0,,,,,,,,,,,,,99.0,18946.0,823.0,2703.0,747.0,1242.0,1531.0,8666.0,10436.0,7892.0,423.0,2374.0,21878.0,0.0413,59.439999,0.127,19.91,0.333,0.733,5.37,1.27,4.73,0.455801,0.45467,0.015,0.468531,0.410569,0.459103,0.402917,0.437653,2287.0,0.458267,0.4266,0.4726,5008.0,17970.0,0.4633,0.4682,2504.0,0.4663,1.0,3.0,0.0,1.0,252189.0,0.24345,0.244,0.25491,1.0,3493.5,1002.0,8110.5,2317.0,2105.0,877.5,3223.0,1379.0,4588.0,2065.5,18953.0,9964.5,1138.0,513.0,3198.0,3903.0,0.507102,0.50674,0.51882,0.513175,0.486842,0.476451,0.469638,0.454795,0.499717,0.493986,0.488727,0.488811,0.515625,0.504261,0.495868,0.501463,9100.0,2754.0,19385.0,6085.0,4684.0,2253.5,8438.0,3963.0,10494.0,5242.0,47802.0,27619.0,2590.0,1252.0,7427.0,9885.0,,,,,,,,,,,,,,,,,22507.0,1002.0,2317.0,877.5,1379.0,2065.5,9964.5,513.0,3903.0,29266.5,1.0
75%,1249.77002,2.0,1.0,2.0,0.876,0.637,68.0,1.697,2.0,1.0,60.0,0.604,29.690001,1.059,1.609,0.5,149411.0,9172.0,21365.0,6327.0,11199.0,13886.0,67702.0,81527.5,68394.5,3403.0,15049.0,19235.0,160715.0,0.712649,0.718223,0.73345,0.724162,0.773079,0.736675,0.711376,0.714236,0.726121,0.71477,0.872485,0.718927,0.70677,245578.0,15280.0,33540.0,9834.0,17232.0,22258.0,111034.0,134560.0,111322.0,5470.0,22290.0,30762.0,246268.0,0.954732,1.46,0.012,13562260.0,73.0,0.0,0.711,,,,,,,,,,,,,99.0,52044.0,3059.0,7699.0,2215.0,4175.0,4788.5,23762.0,28799.0,23421.0,1192.0,6744.0,57363.0,0.0763,59.610001,0.181,25.91,0.443,0.793,6.19,3.59,5.4,0.740331,0.731227,0.992,0.788462,0.75,0.748021,0.737242,0.741561,3609.0,0.724641,0.7451,0.7478,5008.0,19669.0,0.7937,0.7485,2504.0,0.7444,6.0,17.0,0.0,1.0,315284.0,0.355475,0.3754,0.37654,1.0,6546.25,1926.0,13638.0,4480.75,3346.0,1583.5,6594.0,2934.0,7516.0,3692.0,33238.0,18184.0,1788.5,856.0,5168.0,7229.0,0.852525,0.851986,0.816157,0.829796,0.83481,0.835835,0.905065,0.905071,0.843145,0.845297,0.850771,0.862588,0.837422,0.842342,0.834089,0.826278,9178.0,3013.0,19560.0,6895.75,4734.0,2530.0,8488.0,4340.0,10586.0,5782.0,48504.0,30924.75,2624.0,1408.0,7504.0,11535.0,,,,,,,,,,,,,,,,,38163.25,1926.0,4480.75,1583.5,2934.0,3692.0,18184.0,856.0,7229.0,46659.0,3.0
max,67382.773438,2.0,1.0,2.0,10.042,3.739,2469.0,268.968994,2.0,1.0,70.0,10.067,42.950001,8.02,8.155,1.0,246266.0,15304.0,33582.0,9850.0,17248.0,22300.0,111361.0,134906.0,111720.0,5486.0,111564.0,30782.0,246272.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,246272.0,15304.0,33582.0,9850.0,17248.0,22300.0,111362.0,134910.0,111720.0,5486.0,111720.0,30782.0,246272.0,0.954732,10.91,2.2,229055900.0,1424.0,0.5,252.589005,,,,,,,,,,,,,99.0,123133.0,7652.0,16791.0,4925.0,8624.0,11150.0,55680.0,67453.0,55860.0,2743.0,15391.0,123136.0,0.8654,65.720001,7.39,37.759998,6.54,18.700001,9.0,6.17,6.17,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5008.0,1.0,1.0,1.0,5008.0,141654.0,1.0,1.0,2504.0,1.0,123465.0,173408.0,59756.0,5.0,589474.0,0.4996,0.4998,0.49989,5.0,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31526.0,2630.0,1428.0,7508.0,11637.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31529.0,2630.0,1428.0,7508.0,11637.0,,,,,,,,,,,,,,,,,67435.0,3058.0,7003.0,2555.0,4378.0,5854.0,31526.0,1428.0,11637.0,67455.0,4152.0


In [50]:
df2.describe()

Unnamed: 0,QUAL,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DP,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_1000GP1_EUR_AF,DBNSFP_ESP6500_AA_AF,DBNSFP_ESP6500_EA_AF,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI
count,104079.0,103836.0,103836.0,104079.0,64688.0,64688.0,104079.0,104079.0,103836.0,103836.0,104079.0,64688.0,104079.0,64646.0,104079.0,50911.0,50914.0,50914.0,50914.0,50914.0,50914.0,50914.0,50909.0,50909.0,50914.0,50914.0,50757.0,50914.0,50914.0,50891.0,50746.0,50719.0,50729.0,50663.0,50741.0,50886.0,50879.0,50891.0,50844.0,50757.0,50726.0,50914.0,50924.0,50924.0,50924.0,50924.0,50924.0,50924.0,50919.0,50919.0,50924.0,50924.0,50757.0,50924.0,50924.0,50910.0,50895.0,50895.0,50924.0,50914.0,50914.0,50924.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50914.0,50909.0,50909.0,50909.0,50909.0,50909.0,50909.0,50122.0,50122.0,50909.0,50909.0,50909.0,50909.0,50923.0,50924.0,50895.0,50924.0,50893.0,50924.0,49440.0,12376.0,12376.0,11111.0,11111.0,12416.0,11111.0,11111.0,11111.0,9351.0,9351.0,92808.0,92808.0,92808.0,92808.0,92896.0,92896.0,92808.0,92808.0,92896.0,92808.0,3415.0,3415.0,3415.0,24074.0,6796.0,2575.0,2950.0,2603.0,348.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,782.0,777.0,781.0,777.0,781.0,778.0,781.0,774.0,780.0,776.0,784.0,783.0,782.0,778.0,780.0,779.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,787.0,47.0
mean,983.239528,1.386022,0.693011,2.0,-0.166657,-0.010453,52.588543,1.456095,1.383104,0.691552,59.37145,-0.219687,19.120819,0.331839,1.256548,0.468645,96603.807558,5564.31726,13560.549711,4024.761952,6843.535275,8680.763817,43851.285824,52762.009036,43386.880956,2199.371096,11964.962764,12343.627489,105735.924127,0.467073,0.438821,0.469512,0.468342,0.46486,0.470672,0.466722,0.467316,0.469596,0.467724,0.591428,0.466109,0.463282,205494.312682,12607.291434,28681.895256,8529.761174,14625.616173,18280.96644,93348.762859,112165.046702,91806.803334,4670.827017,22032.731072,26291.151854,227970.315195,0.853284,0.171914,0.019173,10771920.0,56.708037,5.8e-05,1.711368,,,,,,,,,,,,,93.714381,32401.474887,1837.459526,4593.501758,1341.389558,2419.200554,2911.281463,14754.070149,17893.43436,14435.93292,728.990807,4133.718301,35466.483863,0.042386,57.835582,-0.136736,20.683597,0.309115,0.882855,2.593893,0.540345,4.26775,0.481864,0.475767,0.362704,0.478354,0.449764,0.483739,0.443555,0.472184,2405.170427,0.482196,0.457872,0.492225,4988.142859,17444.665551,0.488315,0.494157,2503.986264,0.489348,659.923865,1057.205857,160.368082,1.003738,244446.298852,0.244694,0.248371,0.251737,1.241379,3844.317662,1149.344346,8372.974587,2624.482846,2045.52986,963.250318,3435.385006,1592.702668,4475.959339,2189.2554,20229.311309,11478.207116,1125.584498,536.855146,3161.561626,4218.631512,0.500582,0.495969,0.492252,0.487294,0.493592,0.487934,0.469079,0.465964,0.4935,0.488717,0.493704,0.48941,0.492999,0.487842,0.480249,0.477549,7807.479034,2352.613723,17083.951715,5399.66709,4178.7446,1987.664549,7398.805591,3460.467598,9128.88183,4511.482846,41401.341804,23719.935197,2300.688691,1107.002541,6635.92122,8931.7446,,,,,,,,,,,,,,,,,24752.729352,1149.344346,2624.482846,963.250318,1592.702668,2189.2554,11478.207116,536.855146,4218.631512,29185.322745,107.638298
std,1194.298003,0.486838,0.243419,0.0,1.62204,0.907133,56.734379,4.445449,0.486146,0.243073,3.362744,1.457357,9.665246,1.128349,0.977261,0.092561,70898.319113,4589.833279,10030.03964,2930.542842,5549.082049,6712.470507,32070.318255,38980.075427,32957.139847,1585.175919,13577.093523,9061.328036,72370.370786,0.304898,0.318167,0.315942,0.312182,0.341746,0.314099,0.304322,0.305545,0.311491,0.306382,0.307424,0.31184,0.302545,57387.741614,3977.493632,7243.383247,2080.066923,3987.617988,5965.53336,25705.417476,32158.025129,28156.998594,1172.141346,22629.952645,6607.899892,40542.82359,0.226111,1.660448,0.078953,8328420.0,44.591242,0.00312,8.670718,,,,,,,,,,,,,16.48832,35180.346702,2191.852079,5019.238546,1472.863094,2770.410935,3315.703056,15917.838808,19426.423863,16346.8997,793.411643,4537.200707,36882.86514,0.165498,5.35862,1.096958,7.317674,0.367313,0.974417,126.686528,3.867522,1.446625,0.309011,0.304565,0.450804,0.343225,0.332101,0.312389,0.326887,0.31668,1473.750592,0.295053,0.323592,0.303115,155.542434,3904.329717,0.337258,0.305885,4.18651,0.308937,5111.670975,6834.481105,1889.167363,0.080928,120835.982104,0.146903,0.148701,0.149655,0.541436,2957.782419,943.267214,6245.266991,2105.492666,1479.281547,770.736875,2963.551454,1440.129245,3395.425239,1782.100221,15176.675996,9358.938613,799.652139,411.988722,2379.324568,3446.121345,0.338504,0.343781,0.339657,0.3434,0.331313,0.336637,0.375376,0.380589,0.334606,0.339804,0.333262,0.339178,0.325728,0.330813,0.340429,0.342973,2333.09049,838.917786,4234.543891,1819.099777,1009.789644,697.604902,1931.181931,1122.668337,2679.16024,1649.532563,11797.344204,8768.813221,554.236892,361.877104,1537.338012,3078.012906,,,,,,,,,,,,,,,,,19380.888455,943.267214,2105.492666,770.736875,1440.129245,1782.100221,9358.938613,411.988722,3446.121345,21098.053287,606.42948
min,30.709999,1.0,0.5,2.0,-9.104,-3.908,1.0,0.0,1.0,0.5,23.0,-25.612,0.04,-13.357,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9e-06,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,0.006566,-8.037,-2.315,137.0,1.0,0.0,0.0,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.9999,20.0,-13.04,0.06,-7.298,0.001,-27740.0,-12.2,0.0465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1228.0,-128.0,0.0,0.0,1228.0,0.0,0.0,1.0,0.0,1.0,15136.0,0.0001,0.0002,1e-05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,267.769989,1.0,0.5,2.0,-1.269,-0.648,18.0,0.0,1.0,0.5,60.0,-0.75,11.32,-0.392,0.649,0.462687,36270.25,1503.25,4672.25,1470.0,1751.0,2794.0,16551.0,19634.0,15200.25,845.0,4530.0,4534.0,44740.0,0.207547,0.147453,0.191913,0.198098,0.148401,0.200198,0.208552,0.207025,0.201872,0.20682,0.347838,0.197394,0.205869,174277.5,10927.0,24188.0,8039.5,12620.0,16293.75,80411.0,93514.0,78036.5,3984.0,9916.0,22792.0,241742.0,0.901628,-1.117,-0.004,5887525.0,29.0,0.0,0.0,,,,,,,,,,,,,99.0,3522.0,84.0,357.0,113.0,82.0,217.0,1628.0,1936.25,1208.0,73.0,376.0,4549.0,0.0205,59.130001,0.075,15.86,0.231,0.691,3.49,-1.51,3.48,0.212707,0.212454,0.0,0.162587,0.138211,0.212401,0.138981,0.191754,1157.0,0.232228,0.1596,0.2334,5008.0,15831.0,0.1825,0.2336,2504.0,0.2249,1.0,1.0,0.0,1.0,137838.25,0.11435,0.1206,0.12175,1.0,1219.5,307.5,2617.0,750.5,723.0,265.0,480.0,168.5,1508.5,629.0,7429.5,3373.5,431.0,183.5,1019.0,1231.0,0.193234,0.179006,0.177218,0.167873,0.200676,0.180649,0.080591,0.06998,0.202314,0.186775,0.196898,0.186779,0.206162,0.196105,0.166677,0.15604,7644.0,1791.0,15679.0,4583.0,3911.0,1716.5,7321.0,2711.0,9048.0,3744.5,39798.0,17504.5,2146.0,925.5,6200.0,7607.5,,,,,,,,,,,,,,,,,8295.5,307.5,750.5,265.0,168.5,629.0,3373.5,183.5,1231.0,10712.0,0.0
50%,615.72998,1.0,0.5,2.0,-0.203,0.0,37.0,0.0,1.0,0.5,60.0,-0.054,15.84,0.366,0.941,0.484536,86298.5,4504.0,12315.0,3680.0,5993.0,7528.0,39266.0,46893.0,37761.0,2002.0,9448.0,11201.0,97323.5,0.449626,0.407731,0.453158,0.448898,0.441732,0.452569,0.449601,0.449605,0.449434,0.449636,0.619944,0.450306,0.438621,238237.0,14952.0,32810.0,9594.0,16970.0,21686.0,107814.0,130808.0,108152.0,5322.0,16076.0,29998.0,246196.0,0.953306,0.312,0.0,9056779.0,47.0,0.0,0.0,,,,,,,,,,,,,99.0,18867.0,818.0,2613.0,747.0,1188.0,1529.0,8660.5,10398.0,7840.0,423.0,2337.0,21898.0,0.0408,59.439999,0.122,19.91,0.329,0.734,5.35,1.22,4.7,0.458564,0.45467,0.011,0.458042,0.410569,0.461741,0.399455,0.443945,2309.0,0.463259,0.4289,0.4755,5008.0,17980.0,0.4692,0.4751,2504.0,0.4724,1.0,3.0,0.0,1.0,253185.0,0.2474,0.2462,0.26062,1.0,3357.0,959.0,7825.0,2248.0,1901.0,850.0,3014.0,1343.0,4083.0,1882.0,18205.0,9642.0,1062.0,476.0,2885.0,3678.0,0.495966,0.484461,0.485408,0.47637,0.48171,0.473452,0.434605,0.426606,0.484843,0.469333,0.479282,0.481,0.486394,0.470696,0.473313,0.464608,9100.0,2782.0,19376.0,6106.0,4676.0,2265.0,8436.0,3991.0,10484.0,5247.0,47720.0,28024.0,2588.0,1256.0,7428.0,9960.0,,,,,,,,,,,,,,,,,21641.0,959.0,2248.0,850.0,1343.0,1882.0,9642.0,476.0,3678.0,27230.0,1.0
75%,1254.77002,2.0,1.0,2.0,0.91,0.631,69.0,1.775,2.0,1.0,60.0,0.598,29.639999,1.065,1.609,0.5,147101.0,9097.0,21200.0,6284.0,11098.0,13743.0,66767.0,80274.0,67243.5,3375.0,14962.0,19127.75,159093.75,0.710626,0.717392,0.733642,0.720554,0.771679,0.729707,0.709699,0.711186,0.72196,0.711917,0.871384,0.716987,0.703414,245476.0,15276.0,33530.0,9830.0,17228.0,22252.0,110990.0,134506.0,111266.0,5468.0,22286.0,30758.0,246266.0,0.954732,1.47,0.011,13594480.0,73.0,0.0,0.733,,,,,,,,,,,,,99.0,51423.0,3027.0,7610.0,2192.0,4143.0,4691.0,23484.75,28456.75,22828.0,1173.0,6697.0,56555.0,0.076,59.619999,0.177,25.870001,0.44,0.796,6.18,3.56,5.38,0.743094,0.734432,0.986,0.797203,0.754065,0.745383,0.741059,0.741561,3610.0,0.724641,0.7474,0.7464,5008.0,19697.0,0.7946,0.7475,2504.0,0.7454,6.0,20.0,0.0,1.0,320605.0,0.3753,0.3778,0.382945,1.0,6293.0,1888.0,13445.0,4208.5,3134.5,1527.0,5904.0,2701.5,7141.5,3496.5,31420.5,18245.5,1725.5,828.5,4983.0,6587.5,0.822966,0.824459,0.796321,0.797678,0.770648,0.7704,0.856312,0.874948,0.776299,0.794086,0.788566,0.789179,0.757992,0.771157,0.797579,0.799851,9180.0,3018.0,19558.0,6895.0,4734.0,2524.5,8488.0,4335.0,10584.0,5763.5,48509.0,30833.5,2624.0,1402.0,7506.0,11518.0,,,,,,,,,,,,,,,,,38035.5,1888.0,4208.5,1527.0,2701.5,3496.5,18245.5,828.5,6587.5,43899.0,2.5
max,67382.773438,2.0,1.0,2.0,10.042,3.739,2469.0,268.968994,2.0,1.0,70.0,10.067,42.950001,8.02,8.155,1.0,246266.0,15304.0,33582.0,9850.0,17248.0,22300.0,111361.0,134906.0,111720.0,5486.0,111564.0,30782.0,246272.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,246272.0,15304.0,33582.0,9850.0,17248.0,22300.0,111362.0,134910.0,111720.0,5486.0,111720.0,30782.0,246272.0,0.954732,10.91,2.2,229055900.0,1424.0,0.5,252.589005,,,,,,,,,,,,,99.0,123133.0,7652.0,16791.0,4925.0,8624.0,11150.0,55680.0,67453.0,55860.0,2743.0,15391.0,123136.0,0.8654,65.720001,7.39,37.759998,6.54,18.700001,9.0,6.17,6.17,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5008.0,1.0,1.0,1.0,5008.0,141654.0,1.0,1.0,2504.0,1.0,123465.0,173408.0,59756.0,5.0,589474.0,0.4996,0.4998,0.49989,5.0,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31526.0,2630.0,1428.0,7508.0,11637.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9188.0,3058.0,19576.0,7003.0,4740.0,2555.0,8492.0,4378.0,10592.0,5854.0,48636.0,31529.0,2630.0,1428.0,7508.0,11637.0,,,,,,,,,,,,,,,,,67435.0,3058.0,7003.0,2555.0,4378.0,5854.0,31526.0,1428.0,11637.0,67455.0,4152.0


In [34]:
df1 = clean_df(df1)

In [35]:
df1.shape

(316174, 270)

In [36]:
df2.shape

(317617, 270)

In [31]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104079 entries, 831 to 104078
Columns: 270 entries, CHROM to AMINOCHANGE
dtypes: int64(1), object(269)
memory usage: 215.2+ MB


In [32]:
df1.describe()

Unnamed: 0,POS
count,104079.0
mean,74529230.0
std,58573000.0
min,6115.0
25%,27172950.0
50%,58395500.0
75%,113078200.0
max,249210700.0


In [33]:
df1.columns

Index(['CHROM', 'POS', 'REF', 'ALT', 'RSID', 'QUAL', 'FILTER', 'AC', 'AF',
       'AN',
       ...
       'TRANSCRIPT_BIOTYPE', 'RANK', 'HGVS.C', 'HGVS.P',
       'CDNA.POS / CDNA.LENGTH', 'CDS.POS / CDS.LENGTH', 'AA.POS / AA.LENGTH',
      dtype='object', length=270)

In [34]:
df1.groupby(['CHROM', 'POS', 'REF', 'ALT']).size()

CHROM  POS       REF  ALT
chr1   14653     C    T      1
       14671     G    C      1
       69511     A    G      1
       752894    T    C      1
       762273    G    A      1
                            ..
chrY   21154323  G    A      1
       21154426  G    A      1
       21154466  T    A      1
       21154569  A    G      1
       21154603  A    C      1
Length: 104079, dtype: int64

In [35]:
df1.to_parquet('./test_parquet_prioritized.parquet')

# Reading parquet

In [1]:
!ls

Parquet-test.ipynb               [1m[36mtest3[m[m
[1m[36mtest[m[m                             test_parquet_all_ann.parquet
[1m[36mtest2[m[m                            test_parquet_prioritized.parquet


In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('./test_parquet_all_ann.parquet')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316174 entries, 0 to 104078
Columns: 270 entries, CHROM to AMINOCHANGE
dtypes: int64(1), object(269)
memory usage: 653.7+ MB


In [11]:
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,RSID,QUAL,FILTER,AC,AF,AN,...,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
0,chr1,14653,C,T,,48.77,,1.0,0.5,2.0,...,pseudogene,,n.*244C>T,,,,,244,,
0,chr1,14653,C,T,,48.77,,1.0,0.5,2.0,...,pseudogene,,n.*2716G>A,,,,,2716,,
0,chr1,14653,C,T,,48.77,,1.0,0.5,2.0,...,pseudogene,,n.*2716G>A,,,,,2716,,
0,chr1,14653,C,T,,48.77,,1.0,0.5,2.0,...,pseudogene,,n.*2716G>A,,,,,2716,,
0,chr1,14653,C,T,,48.77,,1.0,0.5,2.0,...,pseudogene,,n.*2716G>A,,,,,2716,,


In [14]:
df.describe()

Unnamed: 0,POS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,...,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE
count,316174.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,73189600.0,,,,,,,,,,...,,,,,,,,,,
std,58301220.0,,,,,,,,,,...,,,,,,,,,,
min,6115.0,,,,,,,,,,...,,,,,,,,,,
25%,26802980.0,,,,,,,,,,...,,,,,,,,,,
50%,57176500.0,,,,,,,,,,...,,,,,,,,,,
75%,111629000.0,,,,,,,,,,...,,,,,,,,,,
max,249210700.0,,,,,,,,,,...,,,,,,,,,,


In [15]:
df.size

85366980

In [16]:
df.shape

(316174, 270)

In [31]:
(df.groupby(['CHROM','POS','REF','ALT']).size()==1).sum()

35966

In [32]:
(df.groupby(['CHROM','POS','REF','ALT']).size()>1).sum()

68113

In [34]:
df.groupby(['CHROM','POS','REF','ALT']).size().count()

104079

In [42]:
pd.set_option('display.max_columns',500)

In [57]:
test = df.loc[df['POS']==14671].copy()


In [60]:
test

Unnamed: 0,CHROM,POS,REF,ALT,RSID,QUAL,FILTER,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,VARTYPE,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,1000GP3_OLD_VARIANT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,,n.*262G>C,,,,,262.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-1,MIR6859-1,transcript,NR_106918.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-2,MIR6859-2,transcript,NR_107062.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-3,MIR6859-3,transcript,NR_107063.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-4,MIR6859-4,transcript,NR_128720.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,non_coding_transcript_exon_variant,MODIFIER,WASH7P,WASH7P,transcript,NR_024540.1,pseudogene,11/11,n.1460C>G,,,,,,,


In [67]:

IMPACT_SEVERITY = {
    "exon_loss_variant": 1,
    "frameshift_variant": 2,
    "stop_gained": 3,
    "stop_lost": 4,
    "start_lost": 5,
    "splice_acceptor_variant": 6,
    "splice_donor_variant": 7,
    "disruptive_inframe_deletion": 8,
    "inframe_insertion": 9,
    "disruptive_inframe_insertion": 10,
    "inframe_deletion": 11,
    "missense_variant": 12,
    "splice_region_variant": 13,
    "stop_retained_variant": 14,
    "initiator_codon_variant": 15,
    "synonymous_variant": 16,
    "start_retained": 17,
    "coding_sequence_variant": 18,
    "5_prime_UTR_variant": 19,
    "3_prime_UTR_variant": 20,
    "5_prime_UTR_premature_start_codon_gain_variant": 21,
    "intron_variant": 22,
    "non_coding_exon_variant": 23,
    "upstream_gene_variant": 24,
    "downstream_gene_variant": 25,
    "TF_binding_site_variant": 26,
    "regulatory_region_variant": 27,
    "intergenic_region": 28,
    "transcript": 29,
}
if 'EFFECT' in test.columns:
    test["sorter"] = (
        test["EFFECT"].str.split("&").str[0].replace(IMPACT_SEVERITY)
    )
    test.loc[test["HGVS.C"].str.contains("null"), "HGVS.C"] = None
    test["sorter2"] = [x[0] == x[1] for x in zip(test["ALT"], test["ALLELE"])]
    test = test.sort_values(
        by=["CHROM", "POS", "sorter2", "sorter"],
        ascending=[True, True, False, True],
    ).drop_duplicates(["CHROM", "POS", "REF", "ALT"])
    test.drop(columns=["sorter", "sorter2"], inplace=True)

In [68]:
test

Unnamed: 0,CHROM,POS,REF,ALT,RSID,QUAL,FILTER,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,VARTYPE,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,1000GP3_OLD_VARIANT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,,n.*262G>C,,,,,262,,


In [69]:
df.loc[df['POS']==14671]

Unnamed: 0,CHROM,POS,REF,ALT,RSID,QUAL,FILTER,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,VARTYPE,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,1000GP3_OLD_VARIANT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,,n.*262G>C,,,,,262.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-1,MIR6859-1,transcript,NR_106918.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-2,MIR6859-2,transcript,NR_107062.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-3,MIR6859-3,transcript,NR_107063.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-4,MIR6859-4,transcript,NR_128720.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,non_coding_transcript_exon_variant,MODIFIER,WASH7P,WASH7P,transcript,NR_024540.1,pseudogene,11/11,n.1460C>G,,,,,,,


In [73]:
df.query("""CHROM == 'chr1' & POS == 14671""")

Unnamed: 0,CHROM,POS,REF,ALT,RSID,QUAL,FILTER,AC,AF,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MLEAC,MLEAF,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,VARTYPE,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,GNOMAD_AS_RF_NEGATIVE_TRAIN,CLINVAR_CLNVI,LOF,CLINVAR_CLNSIGCONF,NMD,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,1000GP3_OLD_VARIANT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,DDX11L1,DDX11L1,transcript,NR_046018.2,pseudogene,,n.*262G>C,,,,,262.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-1,MIR6859-1,transcript,NR_106918.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-2,MIR6859-2,transcript,NR_107062.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-3,MIR6859-3,transcript,NR_107063.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,downstream_gene_variant,MODIFIER,MIR6859-4,MIR6859-4,transcript,NR_128720.1,pseudogene,,n.*2698C>G,,,,,2698.0,,
1,chr1,14671,G,C,,202.770004,,1.0,0.5,2.0,-1.555,0.4,18.0,0.0,1.0,0.5,41.419998,-1.555,11.26,0.133,1.022,variant,True,SNP,HET,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,non_coding_transcript_exon_variant,MODIFIER,WASH7P,WASH7P,transcript,NR_024540.1,pseudogene,11/11,n.1460C>G,,,,,,,


In [18]:
!pip install duckdb

Looking in indexes: https://juan.vazquez%40glovoapp.com:****@artifactory.glovoint.com/artifactory/api/pypi/glovo-pypi/simple
Collecting duckdb
  Using cached https://artifactory.glovoint.com/artifactory/api/pypi/glovo-pypi/packages/packages/9e/45/61c1089f797aa7b3168d1c2b55573a5880b34a438b245da73dfd92057457/duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl (12.6 MB)
Installing collected packages: duckdb
Successfully installed duckdb-0.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
import duckdb

In [2]:
import pandas as pd

In [3]:
df = pd.read_parquet('./test_parquet_all_ann.parquet')

In [33]:
test_parquet = duckdb.read_parquet("./test_parquet_all_ann.parquet")

In [43]:
duckdb.query('''SELECT CHROM,POS,REF,ALT,COUNT(1) FROM test_parquet
             GROUP BY
             "CHROM","POS","REF","ALT"
             ''')

┌─────────┬───────────┬─────────┬─────────┬──────────┐
│  CHROM  │    POS    │   REF   │   ALT   │ count(1) │
│ varchar │   int64   │ varchar │ varchar │  int64   │
├─────────┼───────────┼─────────┼─────────┼──────────┤
│ chr2    │ 232196472 │ T       │ A       │        3 │
│ chr2    │ 233391463 │ A       │ G       │        5 │
│ chr2    │ 233391965 │ A       │ G       │        5 │
│ chr2    │ 233392020 │ A       │ AAAAG   │        5 │
│ chr2    │ 233502984 │ A       │ G       │        3 │
│ chr2    │ 234066969 │ C       │ G       │        2 │
│ chr2    │ 234598991 │ T       │ G       │        6 │
│ chr2    │ 234652347 │ C       │ G       │       10 │
│ chr2    │ 234702905 │ C       │ T       │        1 │
│ chr2    │ 234708428 │ G       │ A       │        1 │
│  ·      │      ·    │ ·       │ ·       │        · │
│  ·      │      ·    │ ·       │ ·       │        · │
│  ·      │      ·    │ ·       │ ·       │        · │
│ chr8    │   2088868 │ T       │ G       │        1 │
│ chr7    

In [7]:
df.shape

(316174, 270)

# Comparing against parquet process in code

In [18]:
df2 = pd.read_parquet('../../data/Patients/BM23-48263_MODApy/BM23-48263_MODApy.final.parquet')

In [19]:
df2.shape

(317617, 270)

In [20]:
df1.shape

(316174, 270)

In [42]:
df2.columns == df1.columns

array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [38]:
df1.columns

Index(['CHROM', 'POS', 'REF', 'ALT', 'RSID', 'QUAL', 'FILTER', 'AC', 'AF',
       'AN',
       ...
       'TRANSCRIPT_BIOTYPE', 'RANK', 'HGVS.C', 'HGVS.P',
       'CDNA.POS / CDNA.LENGTH', 'CDS.POS / CDS.LENGTH', 'AA.POS / AA.LENGTH',
      dtype='object', length=270)

In [39]:
df2.columns

Index(['CHROM', 'POS', 'REF', 'QUAL', 'FILTER', 'AN', 'BASEQRANKSUM',
       'CLIPPINGRANKSUM', 'DP', 'FS',
       ...
       'TRANSCRIPT_BIOTYPE', 'RANK', 'HGVS.C', 'HGVS.P',
       'CDNA.POS / CDNA.LENGTH', 'CDS.POS / CDS.LENGTH', 'AA.POS / AA.LENGTH',
      dtype='object', length=270)

In [40]:
[x for x in df1.columns if x.upper() not in df2.columns]

[]

In [30]:
df1.columns = [x.upper() for x in df1.columns]

In [41]:
[x for x in df2.columns if x.upper() not in df1.columns]

[]

In [87]:
fulldf = df1.merge(df2, how='outer', indicator=True)

In [88]:
fulldf._merge.value_counts()

both          317617
left_only          0
right_only         0
dtype: int64

In [54]:
fulldf.loc[fulldf._merge != 'both']

Unnamed: 0,CHROM,POS,REF,QUAL,FILTER,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,...,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE,_merge
3382,chr1,9792105,G,542.72998,.,2.0,.,.,26.0,0.0,...,.,c.*5001_*5002insA,.,.,.,.,2934,.,.,left_only
3383,chr1,9792105,G,542.72998,.,2.0,.,.,26.0,0.0,...,.,c.*5001_*5002insAA,.,.,.,.,2934,.,.,left_only
3384,chr1,9792105,G,542.72998,.,2.0,.,.,26.0,0.0,...,16/18,c.2428-152dupT,.,.,.,.,.,.,.,left_only
3385,chr1,9792105,G,542.72998,.,2.0,.,.,26.0,0.0,...,16/18,c.2428-153_2428-152dupTT,.,.,.,.,.,.,.,left_only
3386,chr1,9792105,G,542.72998,.,2.0,.,.,26.0,0.0,...,15/17,c.2371-152dupT,.,.,.,.,.,.,.,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319055,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,...,2/9,c.263+17delA,.,.,.,.,.,.,.,right_only
319056,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,...,2/10,c.263+17_263+18delAA,.,.,.,.,.,.,.,right_only
319057,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,...,2/10,c.263+17delA,.,.,.,.,.,.,.,right_only
319058,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,...,2/9,c.263+17_263+18delAA,.,.,.,.,.,.,.,right_only


In [61]:
pd.set_option('display.max_columns',500)

In [62]:
fulldf.loc[fulldf['POS']==118985711]

Unnamed: 0,CHROM,POS,REF,QUAL,FILTER,AN,BASEQRANKSUM,CLIPPINGRANKSUM,DP,FS,MQ,MQRANKSUM,QD,READPOSRANKSUM,SOR,SET,SNP,ZIGOSITY,GNOMAD_AB_HIST_ALL,GNOMAD_AB_HIST_ALT,GNOMAD_AB_MEDIAN,GNOMAD_AC,GNOMAD_AC_AFR,GNOMAD_AC_AMR,GNOMAD_AC_ASJ,GNOMAD_AC_EAS,GNOMAD_AC_FIN,GNOMAD_AC_FEMALE,GNOMAD_AC_MALE,GNOMAD_AC_NFE,GNOMAD_AC_OTH,GNOMAD_AC_POPMAX,GNOMAD_AC_SAS,GNOMAD_AC_RAW,GNOMAD_AF,GNOMAD_AF_AFR,GNOMAD_AF_AMR,GNOMAD_AF_ASJ,GNOMAD_AF_EAS,GNOMAD_AF_FIN,GNOMAD_AF_FEMALE,GNOMAD_AF_MALE,GNOMAD_AF_NFE,GNOMAD_AF_OTH,GNOMAD_AF_POPMAX,GNOMAD_AF_SAS,GNOMAD_AF_RAW,GNOMAD_AN,GNOMAD_AN_AFR,GNOMAD_AN_AMR,GNOMAD_AN_ASJ,GNOMAD_AN_EAS,GNOMAD_AN_FIN,GNOMAD_AN_FEMALE,GNOMAD_AN_MALE,GNOMAD_AN_NFE,GNOMAD_AN_OTH,GNOMAD_AN_POPMAX,GNOMAD_AN_SAS,GNOMAD_AN_RAW,GNOMAD_AS_FILTERSTATUS,GNOMAD_AS_RF,GNOMAD_BASEQRANKSUM,GNOMAD_CSQ,GNOMAD_CLIPPINGRANKSUM,GNOMAD_DB,GNOMAD_DP,GNOMAD_DP_HIST_ALL,GNOMAD_DP_HIST_ALT,GNOMAD_DP_MEDIAN,GNOMAD_DREF_MEDIAN,GNOMAD_FS,GNOMAD_GC,GNOMAD_GC_AFR,GNOMAD_GC_AMR,GNOMAD_GC_ASJ,GNOMAD_GC_EAS,GNOMAD_GC_FIN,GNOMAD_GC_FEMALE,GNOMAD_GC_MALE,GNOMAD_GC_NFE,GNOMAD_GC_OTH,GNOMAD_GC_SAS,GNOMAD_GC_RAW,GNOMAD_GQ_HIST_ALL,GNOMAD_GQ_HIST_ALT,GNOMAD_GQ_MEDIAN,GNOMAD_HOM,GNOMAD_HOM_AFR,GNOMAD_HOM_AMR,GNOMAD_HOM_ASJ,GNOMAD_HOM_EAS,GNOMAD_HOM_FIN,GNOMAD_HOM_FEMALE,GNOMAD_HOM_MALE,GNOMAD_HOM_NFE,GNOMAD_HOM_OTH,GNOMAD_HOM_SAS,GNOMAD_HOM_RAW,GNOMAD_INBREEDINGCOEFF,GNOMAD_MQ,GNOMAD_MQRANKSUM,GNOMAD_POPMAX,GNOMAD_QD,GNOMAD_READPOSRANKSUM,GNOMAD_SOR,GNOMAD_VQSLOD,GNOMAD_VQSR_CULPRIT,DBNSFP_GERP++_RS,DBNSFP_GERP++_NR,DBNSFP_1000GP1_AMR_AF,DBNSFP_1000GP1_AF,DBNSFP_UNIPROT_ACC,DBNSFP_LRT_PRED,DBNSFP_PHASTCONS100WAY_VERTEBRATE,DBNSFP_POLYPHEN2_HDIV_PRED,DBNSFP_1000GP1_ASN_AF,DBNSFP_1000GP1_AFR_AF,DBNSFP_MUTATIONTASTER_PRED,DBNSFP_1000GP1_EUR_AF,DBNSFP_INTERPRO_DOMAIN,DBNSFP_ESP6500_AA_AF,DBNSFP_POLYPHEN2_HVAR_PRED,DBNSFP_SIFT_PRED,DBNSFP_ESP6500_EA_AF,GNOMAD_STAR_AC,GNOMAD_STAR_AC_RAW,GNOMAD_STAR_HOM,GNOMAD_VQSR_NEGATIVE_TRAIN_SITE,GNOMAD_VQSR_POSITIVE_TRAIN_SITE,INS,DEL,GNOMAD_AS_RF_POSITIVE_TRAIN,GNOMAD_AS_RF_NEGATIVE_TRAIN,LOF,NMD,GNOMAD_AC_AFR_FEMALE,GNOMAD_AC_AFR_MALE,GNOMAD_AC_AMR_FEMALE,GNOMAD_AC_AMR_MALE,GNOMAD_AC_ASJ_FEMALE,GNOMAD_AC_ASJ_MALE,GNOMAD_AC_EAS_FEMALE,GNOMAD_AC_EAS_MALE,GNOMAD_AC_FIN_FEMALE,GNOMAD_AC_FIN_MALE,GNOMAD_AC_NFE_FEMALE,GNOMAD_AC_NFE_MALE,GNOMAD_AC_OTH_FEMALE,GNOMAD_AC_OTH_MALE,GNOMAD_AC_SAS_FEMALE,GNOMAD_AC_SAS_MALE,GNOMAD_AF_AFR_FEMALE,GNOMAD_AF_AFR_MALE,GNOMAD_AF_AMR_FEMALE,GNOMAD_AF_AMR_MALE,GNOMAD_AF_ASJ_FEMALE,GNOMAD_AF_ASJ_MALE,GNOMAD_AF_EAS_FEMALE,GNOMAD_AF_EAS_MALE,GNOMAD_AF_FIN_FEMALE,GNOMAD_AF_FIN_MALE,GNOMAD_AF_NFE_FEMALE,GNOMAD_AF_NFE_MALE,GNOMAD_AF_OTH_FEMALE,GNOMAD_AF_OTH_MALE,GNOMAD_AF_SAS_FEMALE,GNOMAD_AF_SAS_MALE,GNOMAD_AN_AFR_FEMALE,GNOMAD_AN_AFR_MALE,GNOMAD_AN_AMR_FEMALE,GNOMAD_AN_AMR_MALE,GNOMAD_AN_ASJ_FEMALE,GNOMAD_AN_ASJ_MALE,GNOMAD_AN_EAS_FEMALE,GNOMAD_AN_EAS_MALE,GNOMAD_AN_FIN_FEMALE,GNOMAD_AN_FIN_MALE,GNOMAD_AN_NFE_FEMALE,GNOMAD_AN_NFE_MALE,GNOMAD_AN_OTH_FEMALE,GNOMAD_AN_OTH_MALE,GNOMAD_AN_SAS_FEMALE,GNOMAD_AN_SAS_MALE,GNOMAD_GC_AFR_FEMALE,GNOMAD_GC_AFR_MALE,GNOMAD_GC_AMR_FEMALE,GNOMAD_GC_AMR_MALE,GNOMAD_GC_ASJ_FEMALE,GNOMAD_GC_ASJ_MALE,GNOMAD_GC_EAS_FEMALE,GNOMAD_GC_EAS_MALE,GNOMAD_GC_FIN_FEMALE,GNOMAD_GC_FIN_MALE,GNOMAD_GC_NFE_FEMALE,GNOMAD_GC_NFE_MALE,GNOMAD_GC_OTH_FEMALE,GNOMAD_GC_OTH_MALE,GNOMAD_GC_SAS_FEMALE,GNOMAD_GC_SAS_MALE,GNOMAD_HEMI,GNOMAD_HEMI_AFR,GNOMAD_HEMI_AMR,GNOMAD_HEMI_ASJ,GNOMAD_HEMI_EAS,GNOMAD_HEMI_FIN,GNOMAD_HEMI_NFE,GNOMAD_HEMI_OTH,GNOMAD_HEMI_SAS,GNOMAD_HEMI_RAW,GNOMAD_STAR_HEMI,RSID,AC,AF,MLEAC,MLEAF,VARTYPE,1000GP3_AA,1000GP3_AC,1000GP3_AF,1000GP3_AFR_AF,1000GP3_AMR_AF,1000GP3_AN,1000GP3_DP,1000GP3_EAS_AF,1000GP3_EUR_AF,1000GP3_NS,1000GP3_SAS_AF,1000GP3_VT,1000GP3_MULTI_ALLELIC,1000GP3_EX_TARGET,CLINVAR_ALLELEID,CLINVAR_CLNDISDB,CLINVAR_CLNDN,CLINVAR_CLNHGVS,CLINVAR_CLNREVSTAT,CLINVAR_CLNSIG,CLINVAR_CLNVC,CLINVAR_CLNVCSO,CLINVAR_GENEINFO,CLINVAR_MC,CLINVAR_ORIGIN,CLINVAR_RS,CLINVAR_AF_ESP,CLINVAR_AF_TGP,CLINVAR_AF_EXAC,CLINVAR_CLNVI,CLINVAR_CLNSIGCONF,CLINVAR_CLNDISDBINCL,CLINVAR_CLNDNINCL,CLINVAR_CLNSIGINCL,1000GP3_OLD_VARIANT,ALT,ALLELE,EFFECT,IMPACT,GENE_NAME,GENE_ID,FEATURE_TYPE,FEATURE_ID,TRANSCRIPT_BIOTYPE,RANK,HGVS.C,HGVS.P,CDNA.POS / CDNA.LENGTH,CDS.POS / CDS.LENGTH,AA.POS / AA.LENGTH,DISTANCE,ERRORS / WARNINGS / INFO,AMINOCHANGE,_merge
314569,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"DEL,DEL",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"C,CT",C,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17_263+18delAA,.,.,.,.,.,.,.,left_only
314570,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"DEL,DEL",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"C,CT",CT,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17delA,.,.,.,.,.,.,.,left_only
314571,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"DEL,DEL",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"C,CT",C,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_023010.3,protein_coding,2/9,c.263+17_263+18delAA,.,.,.,.,.,.,.,left_only
314572,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"DEL,DEL",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"C,CT",CT,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_023010.3,protein_coding,2/9,c.263+17delA,.,.,.,.,.,.,.,left_only
319052,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17_263+18delAA,.,.,.,.,.,.,.,right_only
319053,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,CT,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17delA,.,.,.,.,.,.,.,right_only
319054,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,C,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_023010.3,protein_coding,2/9,c.263+17_263+18delAA,.,.,.,.,.,.,.,right_only
319055,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,C,CT,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_023010.3,protein_coding,2/9,c.263+17delA,.,.,.,.,.,.,.,right_only
319056,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,CT,C,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17_263+18delAA,.,.,.,.,.,.,.,right_only
319057,chrX,118985711,CTT,325.730011,.,2.0,.,.,18.0,0.0,60.0,.,18.1,.,0.693,variant2,.,HET,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,True,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,0.5,1.0,0.5,DEL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,CT,CT,intron_variant,MODIFIER,UPF3B,UPF3B,transcript,NM_080632.2,protein_coding,2/10,c.263+17delA,.,.,.,.,.,.,.,right_only
