# Python Variant Pipeline

Use this python script to connect to impala, find variants in subjects and gene regions of interest, annotate them with the following list of annotations, and save results to a file or impala table: 

- Kaviar allele frequency
- ClinVar significance rating and disorder description
- dbSNP rsID
- DANN pathogenicity rating
- Ensembl gene name and gene id
- Predicted coding consequences

## Setup your parameters

Each of the following parameters should be entered in python list format unless otherwise specified. Or as 'all' to skip that particular filter. 

In [81]:
# list of genes to search in
gene_list = ['CYP%']

# subject id's to include
subject_list = 'all'

# members to include (NB, M, F)
member_list = ['NB']

# genotype to include ('hom_rec', 'hom_alt', 'het')
geno_list = ['hom_rec']

# max kaviar allele frequency to return as a string
max_kav_freq = '.03'

# return only variants markes as non-conflicted significant in clinvar
# (siginicance rating of 4 or 5, but never 2 or 3)
# enter as 'yes' or 'no'
clin_patho = 'yes'

# minimum dann score to return
min_dann = 'all'

# return only variants with 'HIGH' coding consequences
# enter 'yes' or 'no'
coding_impact = 'no'

# enter variant type as string 'illumina' or 'comgen'
variant_type = 'illumina'

## Query impala

### Connect to impala

In [None]:
import ibis
import os

# connect to impala with ibis
hdfs_port = os.environ.get('glados20', 50070)
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Parse User Arguments

In [79]:
def label_member(tbl_name, trio_arg):
    """
    function to create a sql statement from user trio argument for which
    trio members to include in analysis
    :param tbl_name: alias of tablename used in query as string, ex. 'bv'
    :param trio_arg: list of trio members to include, ex. 'M','F','NB'
    :return: member_arg
    """
    member_list = []
    for member in trio_arg:
        if member == 'NB':
            member_list.append("{}.sample_id LIKE '%03'".format(tbl_name))
        if member == 'M':
            member_list.append("{}.sample_id LIKE '%01'".format(tbl_name))
        if member == 'F':
            member_list.append("{}.sample_id LIKE '%02'".format(tbl_name))
        if member_list == 'all':
            member_list =''
    if len(member_list) > 0:
        member_arg = 'AND (' + ' OR '.join(member_list) + ')'
    # otherwise statment is empty
    else:
        member_arg = ''
    return member_arg

def parse_arg_list(tbl_name, col_name, arg_list):
    """
    format user arg lists 
    :param tbl_name: alias of tablename used in query as string, ex. 'vars'
    :param arg_list: user arg list to parse
    :return: query_arg assign to a variable to create sql statements
    """
    wildcard_arg=[]
    reg_arg = []
    args = []
    if arg_list != 'all':
        for arg in arg_list:
            if '%' in arg:
                wildcard_arg.append(arg)
            else: 
                reg_arg.append(arg)
    else:
        reg_arg = ''
        wildcard_arg = ''
    if len(wildcard_arg) > 1:
        for arg in wildcard_arg:
            args.append("OR {}.{} LIKE ('{}')".format(tbl_name, col_name, arg))
    if len(reg_arg) == 1:
        args.append("OR {}.{} = '{}'".format(tbl_name, col_name, "','".join(reg_arg)))
    if len(reg_arg) > 1:
        args.append("OR {}.{} IN ('{}')".format(tbl_name, col_name, "','".join(reg_arg)))
    query_statement = ' '.join(args).replace('OR', 'AND', 1)
    return query_statement

def parse_clinvar(clin_arg, table):
    if clin_arg.lower == 'yes':
        clin_statement = "WHERE ({}.clin_sig NOT REGEXP '3|2[^5]|2$' AND {}.clin_sig REGEXP '4|[^25]5|^5')".format(table, table)
    else:
        clin_statement = ''
    return clin_statement

In [82]:
parse_clinvar(clin_patho, 'vars')

''

In [68]:
members = label_member('vars', member_list)
subjects = parse_arg_list('var', 'sample_id', subject_list )                                                
genes = parse_arg_list('var', 'gene_name', gene_list )
geno = parse_arg_list('var', 'genotype', geno_list)

In [69]:
print geno

AND var.genotype = 'hom_rec'
