# Python Variant Pipeline

Use this Python script to connect to Impala, find variants in the subjects and gene regions of interest, annotate them with the annotations listed below, and save the results to a file or an Impala table:

- Kaviar allele frequency
- ClinVar significance rating and disorder description
- dbSNP rsID
- DANN pathogenicity rating
- Ensembl gene name and gene id
- Predicted coding consequence

## Set up your parameters

In [103]:
# list of genes to search in; an entry containing '%' is treated as a
# wildcard pattern (see parse_arg_list)
gene_list = ['RAB31','CYP%']

# subject ids to include: either the string 'all' or a list of id strings
#subject_list = 'all'
subject_list = ['55', '60']
# trio members to include, any of 'NB', 'M', 'F'
member_list = ['NB']

# max Kaviar allele frequency to return, given as a string
max_kav_freq = '.03'

# return only variants marked as non-conflicted significant in ClinVar
# (significance rating of 4 or 5, but never 2 or 3)
# enter as 'yes' or 'no'
clin_patho = 'no'

# minimum DANN score to return; currently set to the string 'all'
min_dann = 'all'

# return only variants with 'HIGH' coding consequences
# enter 'yes' or 'no'
coding_impact = 'no'

# enter variant type as string 'illumina' or 'comgen'
variant_type = 'illumina'

## Query Impala

### Connect to impala

Update these arguments to match with your server and user name.

In [2]:
import ibis
import os

# connect to impala with ibis
# NOTE(review): this looks up an environment variable literally named
# 'glados20' (the same string used as the HDFS host below) and falls back
# to 50070 when it is unset -- confirm that is the intended variable name.
hdfs_port = os.environ.get('glados20', 50070)
# HDFS client; host and user are hard-coded and must be edited per user
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
# Impala client used for the queries in this notebook
# NOTE(review): `hdfs` is not passed to this connect call -- confirm
# whether the HDFS client is needed by later cells.
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Format user arguments

In [147]:
def label_member(tbl_name, trio_arg):
    """
    Build the SQL clause restricting a query to the requested trio members.

    Each member code is matched on the suffix of ``sample_id``:
    'M' -> LIKE '%01', 'F' -> LIKE '%02', 'NB' -> LIKE '%03'.
    Codes other than these three are ignored.

    :param tbl_name: alias of tablename used in query as string, ex. 'bv'
    :param trio_arg: list of trio members to include, ex. ['M','F','NB'],
        or the string 'all' to apply no member filter
    :return: member_arg, ex. "AND (bv.sample_id LIKE '%03')", or an empty
        string when no member filter applies
    """
    # 'all' means no restriction on trio member, so no clause is emitted.
    # (The original tested its own accumulator list against 'all', which
    # could never be true; the check belongs on the user argument.)
    if trio_arg == 'all':
        return ''

    # sample_id suffix pattern for each supported trio member code
    suffix_by_member = {'M': '%01', 'F': '%02', 'NB': '%03'}

    # one LIKE condition per recognised member, in the order requested
    conditions = [
        "{}.sample_id LIKE '{}'".format(tbl_name, suffix_by_member[member])
        for member in trio_arg
        if member in suffix_by_member
    ]

    # nothing recognised -> empty statement, so the query is left unfiltered
    if not conditions:
        return ''
    return 'AND (' + ' OR '.join(conditions) + ')'

# SQL fragment limiting the 'vars' table alias to the trio members chosen
# in member_list
members = label_member('vars', member_list)

def parse_arg_list(tbl_name, col_name, arg_list):
    """
    Split a user argument list into exact values and wildcard patterns.

    An entry is treated as a wildcard when it contains '%'.

    :param tbl_name: alias of tablename used in query as string, ex. 'vars'
        (accepted but not used by the current body)
    :param col_name: column name the values refer to, ex. 'gene_name'
        (accepted but not used by the current body)
    :param arg_list: user arg list to parse, or the string 'all'
    :return: tuple (reg_arg, wildcard_arg); two lists of strings, or two
        empty strings when arg_list is 'all'
    """
    # 'all' requests no filtering: both parts come back as empty strings
    if arg_list == 'all':
        return '', ''

    exact_values = []
    pattern_values = []
    for entry in arg_list:
        # route each entry to the bucket matching its kind, keeping order
        bucket = pattern_values if '%' in entry else exact_values
        bucket.append(entry)
    return exact_values, pattern_values
                                                  
# split the subject ids into exact values (test1) and wildcard patterns (test2)
test1,test2 = parse_arg_list('var', 'sample_id', subject_list )                                                
# genes holds the (exact values, wildcard patterns) tuple for gene_list
genes = parse_arg_list('var', 'gene_name', gene_list ) 


# def join_args(arg_list):
#     """
#     joins together sql statements, adding 'WITH' to
#     first argument, and AND to each following arg
#     :param arg_list: list of sql statements to join
#     :return: subject_statement
#     """
#     if len(arg_list) > 0:
#         subject_statement = ' '.join(arg_list)
#     # otherwise return empty string
#     else:
#         subject_statement = ''
#     return subject_statement

In [148]:
# show the formatted arguments (Python 2 print statements)
print members
print test1,test2
print genes

AND (vars.sample_id LIKE '%03')
['55', '60'] []
(['RAB31'], ['CYP%'])


In [None]:
if variant_type = 'illumina'