# Python Variant Pipeline

Use this python script to connect to impala, find variants in subjects and gene regions of interest, annotate them with the following list of annotations, and save results to a file or impala table: 

- Kaviar allele frequency
- ClinVar significance rating and disorder description
- dbSNP rsID
- DANN pathogenicity rating
- Ensembl gene name and gene id
- Predicted coding consequences

## Setup your parameters

Each of the following parameters should be entered in python list format unless otherwise specified. Or as 'all' to skip that particular filter. 

In [170]:
# list of genes to search in
gene_list = ['CYP%']

# subject id's to include
subject_list = ['102-00511%']

# members to include (NB, M, F) or 'all'
member_list = ['NB', 'F']

# genotype to include ('hom_rec', 'hom_alt', 'het') or 'all'
geno_list = ['hom_rec']

# max kaviar allele frequency to return, formatted as string, or 'all'
max_kav_freq = '.03'

# return only variants markes as non-conflicted significant in clinvar
# (siginicance rating of 4 or 5, but never 2 or 3)
# enter as 'yes' or 'no'
clin_patho = 'yes'

# minimum dann score to return
min_dann = 'all'

# return only variants with 'HIGH' coding consequences
# enter 'yes' or 'no'
coding_impact = 'yes'

# enter variant type as string 'illumina' or 'comgen'
variant_type = 'illumina'

## Query impala

### Connect to impala

In [2]:
import ibis
import os

# connect to impala with ibis
hdfs_port = os.environ.get('glados20', 50070)
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Create functions to parse user arguments

In [173]:
def label_member(tbl_name, trio_arg):
    """
    function to create a sql statement from user trio argument for which
    trio members to include in analysis
    :param tbl_name: alias of tablename used in query as string, ex. 'bv'
    :param trio_arg: list of trio members to include, ex. 'M','F','NB'
    :return: member_arg
    """
    member_list = []
    for member in trio_arg:
        if member == 'NB':
            member_list.append("{}.sample_id LIKE '%03'".format(tbl_name))
        if member == 'M':
            member_list.append("{}.sample_id LIKE '%01'".format(tbl_name))
        if member == 'F':
            member_list.append("{}.sample_id LIKE '%02'".format(tbl_name))
        if member_list == 'all':
            member_list =''
    if len(member_list) > 0:
        member_arg = 'AND (' + ' OR '.join(member_list) + ')'
    # otherwise statment is empty
    else:
        member_arg = ''
    return member_arg

def parse_arg_list(tbl_name, col_name, arg_list):
    """
    format user arg lists 
    :param tbl_name: alias of tablename used in query as string, ex. 'vars'
    :param arg_list: user arg list to parse
    :return: query_arg assign to a variable to create sql statements
    """
    wildcard_arg=[]
    reg_arg = []
    args = []
    if arg_list != 'all':
        for arg in arg_list:
            if '%' in arg:
                wildcard_arg.append(arg)
            else: 
                reg_arg.append(arg)
    else:
        reg_arg = ''
        wildcard_arg = ''
    if len(wildcard_arg) > 1:
        for arg in wildcard_arg:
            args.append("OR {}.{} LIKE ('{}')".format(tbl_name, col_name, arg))
    elif len(wildcard_arg) == 1:
        for arg in wildcard_arg:
            args.append("OR {}.{} LIKE ('{}')".format(tbl_name, col_name, arg))
    elif len(reg_arg) == 1:
        args.append("OR {}.{} = '{}'".format(tbl_name, col_name, "','".join(reg_arg)))
    elif len(reg_arg) > 1:
        args.append("OR {}.{} IN ('{}')".format(tbl_name, col_name, "','".join(reg_arg)))
    query_statement = ' '.join(args).replace('OR', 'AND', 1)
    return query_statement

def parse_clinvar(clin_arg, table):
    if clin_arg.lower() == 'yes':
        clin_statement = "AND ({}.clin_sig NOT REGEXP '3|2[^5]|2$' AND {}.clin_sig REGEXP '4|[^25]5|^5')".format(table, table)
    elif clin_arg.lower() == 'no':
        clin_statement = ''
    else:
        print "Enter a 'yes' or 'no' value for clin_patho."
    return clin_statement

def parse_coding(coding_arg, table):
    if coding_arg.lower() == 'yes':
        coding_statement = "AND {}.impact = 'HIGH'".format(table)
    elif coding_arg.lower() == 'no':
        coding_statement = ''
    else:
        print "Enter a 'yes' or 'no' value for coding_impact."
    return coding_statement

def parse_singles(user_arg, table, column, val_type):
    if user_arg == 'all':
        single_statement = ''
    else:
        if val_type == 'int':
            single_statement = "AND {}.{} = {}".format(table, column, int(user_arg))
        elif val_type == 'float':
            single_statement = "AND {}.{} = {}".format(table, column, float(user_arg))
        elif val_type == 'string':
            single_statement = "AND {}.{} = '{}'".format(table, column, str(user_arg))
    return single_statement

def parse_genotype(gt_arg, tbl_name, platform):
    gt_list = []
    # if gt_arg = 'all' then leave blank
    if gt_arg == 'all':
        gt_statement = ''
    else:
        # if there is more than on gt arg
        if len(gt_arg) > 1:
            # add a statement to gt_list
            gt_list.append("AND {}.gt IN ('{}')".format(tbl_name, "','".join(map(str, gt_arg))))
        # if there is only one arg
        elif len(gt_arg) == 1:
            # add a gt '=' statement to the list
            gt_list.append("AND {}.gt = '{}'".format(tbl_name, ','.join(map(str, gt_arg))))
        return gt_list

### Parse User Args

In [172]:
members = label_member('vars', member_list)
subjects = parse_arg_list('var', 'sample_id', subject_list )                                                
genes = parse_arg_list('var', 'gene_name', gene_list )
geno = parse_genotype(geno_list, 'var', 'illumina')
clin_statement = parse_clinvar(clin_patho, 'vars')
kav = parse_singles(max_kav_freq, 'vars', 'kav_freq', 'float')
dann = parse_singles(min_dann, 'vars', 'dann_score', 'float')
coding = parse_coding(coding_impact, 'vars')

["AND var.gt = 'hom_rec'"]


### View results

In [141]:
print 'Member statement: = ' + members
print 'Subject statement: = ' + subjects
print 'Gene list statement: = ' + genes
print 'Genotype statement: = ' + geno
print 'ClinVar statement: = ' + clin_statement
print 'Kaviar frequency statement: = ' + kav
print 'DANN score statement: = ' + dann
print 'Coding consequences statement: = ' + coding

Member statement: = AND (vars.sample_id LIKE '%03' OR vars.sample_id LIKE '%02')
Subject statement: = AND var.sample_id LIKE ('102-00511%')
Gene list statement: = AND var.gene_name LIKE ('CYP%')


TypeError: cannot concatenate 'str' and 'NoneType' objects

In [None]:
WITH vars AS 
    (
    SELECT * 
    FROM p7_platform.wgs_illumina_variant 
        WHERE (subject_id = '102-00511-03' AND subject_id LIKE '%03')
        AND gt = '0/1'
     ),
    gv as (
        SELECT * 
        FROM p7_ref_grch37.global_variants
        WHERE kav_freq <= .03 
        AND (clin_sig NOT REGEXP '3|2[^5]|2$' AND clin_sig REGEXP '4|[^25]5|^5')
        AND dann_score >= 0.96),
    cd as (
        SELECT * 
        FROM users_selasady.coding_consequences
        WHERE impact = 'HIGH')
     
SELECT vars.*, gv.kav_freq, gv.clin_sig, gv.clin_dbn, gv.rs_id,
    gv.dann_score, gv.ens_gene, gv.ens_geneid, cd.effect, cd.impact,
    cd.feature, cd.feature_id, cd.biotype, cd.rank, cd.hgvs_c, cd.hgvs_p
    FROM vars, gv, cd
    WHERE vars.chrom = gv.chrom
    AND cd.chrom = gv.chrom
    AND vars.pos = gv.pos
    AND cd.pos = gv.pos
    AND vars.ref = gv.ref
    AND cd.ref = gv.ref
    AND vars.alt = gv.alt
    AND cd.alt = gv.alt
    AND gv.ens_gene LIKE 'CYP%' 