# Python Variant Pipeline

Use this python script to connect to impala, find variants in subjects and gene regions of interest, annotate them with the following list of annotations, and save results to a file or impala table: 

- Kaviar allele frequency
- ClinVar significance rating and disorder description
- dbSNP rsID
- DANN pathogenicity rating
- Ensembl gene name and gene id
- Predicted coding consequences

## Setup your parameters

Each of the following parameters should be entered in python list format unless otherwise specified. Or enter 'all' to skip that particular filter. 

In [361]:
# list of genes to search in
gene_list = ['CYP%']

# subject id's to include
subject_list = ['102-00511-02', '5']
#subject_list = 'all'

# members to include (NB, M, F) or 'all'
member_list = ['NB', 'M']
#member_list = 'all'

# genotype to include ('hom_rec', 'hom_alt', 'het') or 'all'
geno_list = ['0/1']
#geno_list = 'all'

# max kaviar allele frequency to return, formatted as string, or 'all'
max_kav_freq = '.03'

# return only variants markes as non-conflicted significant in clinvar
# (siginicance rating of 4 or 5, but never 2 or 3)
# enter as 'yes' or 'no'
clin_patho = 'yes'

# minimum dann score to return
min_dann = 'all'

# return only variants with 'HIGH' coding consequences
# enter 'yes' or 'no'
coding_impact = 'no'

# enter output method for results as 'tsv' or 'table'
output_type = 'tsv'

## Parse User Arguments

### Create functions to parse user arguments

Run this cell, but don't need to make any changes to it. 

In [362]:
def label_member(trio_arg):
    """
    function to create a sql statement from user trio argument for which
    trio members to include in analysis
    :param tbl_name: alias of tablename used in query as string, ex. 'bv'
    :param trio_arg: list of trio members to include, ex. 'M','F','NB'
    :return: member_arg
    """
    member_list = []
    for member in trio_arg:
        if member == 'NB':
            member_list.append("subject_id LIKE '%03'")
        if member == 'M':
            member_list.append("subject_id LIKE '%01'")
        if member == 'F':
            member_list.append("subject_id LIKE '%02'")
        if member_list == 'all':
            member_list =''
    if len(member_list) > 0:
        member_arg = 'AND (' + ' OR '.join(member_list) + ')'
    # otherwise statement is empty
    else:
        member_arg = ''
    return member_arg

def parse_arg_list(arg_list, col_name):
    """
    format user arg lists 
    :param tbl_name: alias of tablename used in query as string, ex. 'vars'
    :param arg_list: user arg list to parse
    :return: query_arg assign to a variable to create sql statements
    """
    wildcard_arg=[]
    reg_arg = []
    args = []
    # if the user does not equal all
    if arg_list != 'all':
        # for each argument in the list
        for arg in arg_list:
            # if the argument contains a wildcard
            if '%' in arg:
                # append the argument to the wildcard list
                wildcard_arg.append(arg)
            # if no wildcard, append to regular arg list
            else: 
                reg_arg.append(arg)
    # if the user arg = 'all' create empty lists
    else:
        reg_arg = ''
        wildcard_arg = ''
    # create query statement from wildcard list
    for arg in wildcard_arg:
        # add a query statment to args for each arg
        args.append("OR {} LIKE ('{}')".format(col_name, arg))
    for arg in reg_arg:
        # add a query statment to args for each arg
        args.append("OR {} = '{}'".format(col_name, arg))
    # final query statement joins all arguments in args
    query_statement = ' '.join(args).replace('OR', 'AND (', 1)  
    return query_statement 
    
def parse_clinvar(clin_arg):
    if clin_arg.lower() == 'yes':
        clin_statement = "AND (clin_sig NOT REGEXP '3|2[^5]|2$' AND clin_sig REGEXP '4|[^25]5|^5')"
    elif clin_arg.lower() == 'no':
        clin_statement = ''
    else:
        print "Enter a 'yes' or 'no' value for clin_patho."
    return clin_statement

def parse_coding(coding_arg):
    if coding_arg.lower() == 'yes':
        coding_statement = "WHERE impact = 'HIGH'"
    elif coding_arg.lower() == 'no':
        coding_statement = ''
    else:
        print "Enter a 'yes' or 'no' value for coding_impact."
    return coding_statement

def parse_singles(user_arg, column, val_type, op_type):
    if user_arg == 'all':
        single_statement = ''
    else:
        if val_type == 'int':
            single_statement = "AND {} {} {}".format(column, op_type, int(user_arg))
        elif val_type == 'float':
            single_statement = "AND {} {} {}".format(column, op_type, float(user_arg))
        elif val_type == 'string':
            single_statement = "AND {} {} '{}'".format(column, op_type, str(user_arg))
        else:
            print "Please check that the data type of your {} argument is either int, float or string.".format(user_arg)
    return single_statement

def parse_genotype(gt_arg):
    gt_list = []
    # if gt_arg = 'all' then leave blank
    if gt_arg == 'all':
        gt_statement = ''
    else:
        # if there is more than on gt arg
        if len(gt_arg) > 1:
            # add a statement to gt_list
            gt_list.append("AND gt IN ('{}')".format("','".join(map(str, gt_arg))))
        # if there is only one arg
        elif len(gt_arg) == 1:
            # add a gt '=' statement to the list
            gt_list.append("AND gt = '{}'".format(','.join(map(str, gt_arg))))
    geno_statement = ' '.join(gt_list)
    return geno_statement

In [363]:
members = label_member(member_list)
subjects = parse_arg_list(subject_list, 'subject_id')                                                
genes = parse_arg_list(gene_list, 'ens_gene')
geno = parse_genotype(geno_list)
clin_statement = parse_clinvar(clin_patho)
kav = parse_singles(max_kav_freq, 'kav_freq', 'float', '<=')
dann = parse_singles(min_dann, 'dann_score', 'float', '>=')
coding = parse_coding(coding_impact)

### Create Merged Query Statements

In [365]:
var_statement = (members + ' ' + subjects + ' ' + geno).replace('AND', 'WHERE', 1) 
if len(var_statement) > 0:
    var_statement = var_statement + ')'

print 'members = ' + members
print 'subjects = ' + subjects 
print 'geno = ' + geno

# # combine global variants query
# gv_combo = [kav, clin_statement, dann]
# gv_statement = ' '.join(gv_combo).replace('AND', 'WHERE (', 1) + ')'

print var_statement

members = AND (subject_id LIKE '%03' OR subject_id LIKE '%01')
subjects = AND ( subject_id = '102-00511-02' OR subject_id = '5'
geno = AND gt = '0/1'
WHERE (subject_id LIKE '%03' OR subject_id LIKE '%01') AND ( subject_id = '102-00511-02' OR subject_id = '5' AND gt = '0/1')


### Write Query

In [181]:
query = '''
    WITH vars AS 
    (
    SELECT * 
    FROM p7_platform.wgs_illumina_variant 
        {}
        {}
     ),
    gv as (
        SELECT * 
        FROM p7_ref_grch37.global_variants
        {}
        {}),
    cd as (
        SELECT * 
        FROM users_selasady.coding_consequences
        {})
     
    SELECT vars.*, gv.kav_freq, gv.clin_sig, gv.clin_dbn, gv.rs_id,
         gv.dann_score, gv.ens_gene, gv.ens_geneid, cd.effect, cd.impact,
         cd.feature, cd.feature_id, cd.biotype, cd.rank, cd.hgvs_c, cd.hgvs_p
         FROM vars, gv, cd
         WHERE vars.chrom = gv.chrom
         AND cd.chrom = gv.chrom
         AND vars.pos = gv.pos
         AND cd.pos = gv.pos
         AND vars.ref = gv.ref
         AND cd.ref = gv.ref
         AND vars.alt = gv.alt
         AND cd.alt = gv.alt
    '''.format(subject_statement, geno, gv_statement, genes, coding)

print query


    WITH vars AS 
    (
    SELECT * 
    FROM p7_platform.wgs_illumina_variant 
         ))
        AND gt = '0/1'
     ),
    gv as (
        SELECT * 
        FROM p7_ref_grch37.global_variants
        WHERE ( kav_freq <= 0.03 AND (clin_sig NOT REGEXP '3|2[^5]|2$' AND clin_sig REGEXP '4|[^25]5|^5') )
        AND ( ens_gene LIKE ('CYP%'))),
    cd as (
        SELECT * 
        FROM users_selasady.coding_consequences
        )
     
    SELECT vars.*, gv.kav_freq, gv.clin_sig, gv.clin_dbn, gv.rs_id,
         gv.dann_score, gv.ens_gene, gv.ens_geneid, cd.effect, cd.impact,
         cd.feature, cd.feature_id, cd.biotype, cd.rank, cd.hgvs_c, cd.hgvs_p
         FROM vars, gv, cd
         WHERE vars.chrom = gv.chrom
         AND cd.chrom = gv.chrom
         AND vars.pos = gv.pos
         AND cd.pos = gv.pos
         AND vars.ref = gv.ref
         AND cd.ref = gv.ref
         AND vars.alt = gv.alt
         AND cd.alt = gv.alt
    


##  Query Impala

### Connect to impala

In [156]:
# import needed modules
from impala.util import as_pandas
import pandas as pd
from impala.dbapi import connect

# disable extraneous pandas warning
pd.options.mode.chained_assignment = None

### Run Query

In [179]:
def run_query(query_name, out_db, out_name):
    """
    opens odbc connection to impala, drops table if exists, runs query
     saves results as table, closes connection
    :param query_name: name of query statement to run
    :param out_db: name of output database
    :param out_name: name of output table name
    :return: table of results saved on impala in specified output dir
    """
    # create connection object
    conn=connect(host='glados19', port=21050, timeout=120)
    # drop table if it exists
    cur = conn.cursor()
    print 'Removing table if it already exists...'
    cur.execute('DROP TABLE IF EXISTS {}.{}'.format(out_db, out_name))
    # run query
    print 'Running the following query on impala: \n' + query_name
    cur.execute(query_name)
    results_df = as_pandas(cur)
    if len(results_df) > 0:
        print 'Query finished. Closing connection.'
        return results_df
    else:
        print 'No results found.'
    cur.close()
    conn.close()

In [180]:
# run query
results_df = run_query(query, 'users_selasady', 'filtered_variants')

Removing table if it already exists...
Running the following query on impala: 

    WITH vars AS 
    (
    SELECT * 
    FROM p7_platform.wgs_illumina_variant 
         ))
        AND gt = '0/1'
     ),
    gv as (
        SELECT * 
        FROM p7_ref_grch37.global_variants
        WHERE ( kav_freq <= 0.03 AND (clin_sig NOT REGEXP '3|2[^5]|2$' AND clin_sig REGEXP '4|[^25]5|^5') )
        AND ( ens_gene LIKE ('CYP%'))),
    cd as (
        SELECT * 
        FROM users_selasady.coding_consequences
        )
     
    SELECT vars.*, gv.kav_freq, gv.clin_sig, gv.clin_dbn, gv.rs_id,
         gv.dann_score, gv.ens_gene, gv.ens_geneid, cd.effect, cd.impact,
         cd.feature, cd.feature_id, cd.biotype, cd.rank, cd.hgvs_c, cd.hgvs_p
         FROM vars, gv, cd
         WHERE vars.chrom = gv.chrom
         AND cd.chrom = gv.chrom
         AND vars.pos = gv.pos
         AND cd.pos = gv.pos
         AND vars.ref = gv.ref
         AND cd.ref = gv.ref
         AND vars.alt = gv.alt
         AND 

HiveServer2Error: AnalysisException: Syntax error in line 6:
         ))
          ^
Encountered: )
Expected: INSERT, SELECT, VALUES, COMMA

CAUSED BY: Exception: Syntax error

In [165]:
print results_df.type()

AttributeError: 'NoneType' object has no attribute 'type'