# Querying population allele frequencies using an Excel sheet as input
> The Excel workbook must contain a sheet named "All Samples" with the following columns:
>1. "Coordinate" for the nucleotide position
>2. "Chr" for the chromosome number

In [1]:
import os, sqlite3, xlrd, time, datetime
import pandas as pd
from utils_query import *
import subprocess

## 1. Creating the SQL table for the input

In [3]:
# Load the 'All Samples' worksheet of the input workbook into a DataFrame.
xls_file = pd.ExcelFile('query.xlsx')
table = xls_file.parse('All Samples')

# Distinct values of the 'Chr' column, minus the last one returned by unique().
chrom = list(table['Chr'].unique()[:-1])

# Open the SQLite database and store the worksheet as table 'dataFrame'.
# cleardb comes from utils_query; presumably it removes any previous
# 'dataFrame' table so that to_sql does not fail -- confirm against the helper.
conn = sqlite3.connect('dbase_Sqlite')
cleardb('dbase_Sqlite', 'dataFrame')
table.to_sql(name='dataFrame', con=conn)

  chunksize=chunksize, dtype=dtype)


## 2. Querying from Chinese, Malay and Indian populations

In [4]:
# (population table name, number of <population>_ALLELE_FREQ_n columns to select)
POPULATION_FREQ_COLUMNS = [('Indian', 2), ('Malay', 2), ('Chinese', 4)]


def build_population_query(population, n_freq_columns):
    """Build the SQL that left-joins the input table to one population table.

    Parameters
    ----------
    population : str
        Name of the population table ('Indian', 'Malay' or 'Chinese'). It is
        also the prefix of that table's allele-frequency columns. The value is
        interpolated into the SQL text, so only pass trusted, hard-coded names.
    n_freq_columns : int
        How many `<population>_ALLELE_FREQ_n` columns (n = 1..n_freq_columns)
        to select.

    Returns
    -------
    str
        A SELECT DISTINCT statement returning every column of `dataFrame`, the
        requested frequency columns and the population's `ID`, matched on
        Coordinate = POS and Chr = CHROM. Input rows without a match are kept
        (LEFT OUTER JOIN) with NULLs in the population columns.
    """
    freq_columns = ', '.join(
        '{}_ALLELE_FREQ_{}'.format(population, n)
        for n in range(1, n_freq_columns + 1)
    )
    return """
        SELECT DISTINCT
            dataFrame.*, {freq_columns}, {population}.ID
        FROM
            dataFrame LEFT OUTER JOIN
            {population} ON dataFrame.Coordinate={population}.POS
            AND dataFrame.Chr={population}.CHROM
        """.format(freq_columns=freq_columns, population=population)


startTime = time.time()

# Run the same join once per population instead of keeping three hand-copied
# query strings in sync.
population_frames = {}
for population, n_freq_columns in POPULATION_FREQ_COLUMNS:
    query = build_population_query(population, n_freq_columns)
    population_frames[population] = pd.read_sql_query(query, conn)

# Keep the per-population names used by the following cells.
df_I = population_frames['Indian']
df_M = population_frames['Malay']
df_C = population_frames['Chinese']

timeTaken = time.time() - startTime
print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
      '- Data Retrieval for all races completed: Took {} seconds to complete.'.format(timeTaken))

2017/09/18 15:54:03 - Data Retrieval for all races completed: Took 205.01780104637146 seconds to complete.


## 3. Merging the output tables into one table

In [5]:
# Columns carried over from the Chinese and Malay results; 'index' is the row
# number of the input table and serves as the join key.
chinese_freq_columns = ['index', 'Chinese_ALLELE_FREQ_1', 'Chinese_ALLELE_FREQ_2',
                        'Chinese_ALLELE_FREQ_3', 'Chinese_ALLELE_FREQ_4']
malay_freq_columns = ['index', 'Malay_ALLELE_FREQ_1', 'Malay_ALLELE_FREQ_2']

# The population queries select DISTINCT rows *including* ID, so a position
# that carries several IDs yields several rows per input row. Once ID is
# dropped those rows are identical, and merging them as-is would multiply the
# rows of the result. De-duplicate the frequency-only subsets before joining.
df = (
    df_I
    .merge(df_C[chinese_freq_columns].drop_duplicates(), on=['index'], how='outer')
    .merge(df_M[malay_freq_columns].drop_duplicates(), on=['index'], how='outer')
)
df.head(9)

Unnamed: 0,index,Sample,Gene,Variant,Chr,Coordinate,Variant Length,Type,Genotype,Exonic,...,HGNC,Indian_ALLELE_FREQ_1,Indian_ALLELE_FREQ_2,ID,Chinese_ALLELE_FREQ_1,Chinese_ALLELE_FREQ_2,Chinese_ALLELE_FREQ_3,Chinese_ALLELE_FREQ_4,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2
0,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,,,,,,
1,1,T1-MS,SDHB,G>T/T,1,17380497,1,snv,hom,yes,...,SDHB,G:0.0277778,T:0.972222,rs2746462,G:0,T:1,,,G:0,T:1
2,2,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
3,3,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
4,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,,,,,,,,,
5,5,T1-MS,NR5A2,A>A/G,1,200007432,1,snv,het,no,...,NR5A2,A:0.541667,G:0.458333,rs3790844,A:0.397849,G:0.602151,,,A:0.473958,G:0.526042
6,6,T1-MS,,A>A/G,1,222164948,1,snv,het,no,...,,,,,,,,,,
7,7,T1-MS,,G>G/T,2,15782471,1,snv,het,no,...,,,,,,,,,,
8,8,T1-MS,ALK,G>G/C,2,29416366,1,snv,het,yes,...,ALK,G:0.430556,C:0.569444,rs1881421,G:0.258065,C:0.741935,,,G:0.317708,C:0.682292


In [6]:
# Work on an independent copy: df[:] only slices the frame, so assignments made
# to df_working later could write through to df (or trigger
# SettingWithCopyWarning) and silently change the table displayed above.
df_working = df.copy()

In [7]:
# Fill missing <population>_ALLELE_FREQ_1 values with the reference base.
# For every population, rows without a frequency record get "<BASE>:1", where
# <BASE> is the base found at the row's coordinate in the per-chromosome
# reference FASTA (extracted with the external `seqtk subseq` tool).

# Directory holding one reference FASTA per chromosome (chr1.fa ... chr22.fa).
REFERENCE_DIR = '/Volumes/Samsung_T3/SgPopulationGenetics/GRCh37'

for ethnic in ['Indian', 'Malay', 'Chinese']:
    freq_column = ethnic + '_ALLELE_FREQ_1'
    # Rows for which this population returned no frequency record.
    df_Null_temp = df_working[df_working[freq_column].isnull()]

    # Only chromosomes 1-22 are processed; rows on any other chromosome keep
    # their missing values.
    for chrom in range(1, 23):
        # Compare as text so the filter works whether Chr holds strings or ints.
        df_Null_temp_working = df_Null_temp[df_Null_temp.Chr.astype(str) == str(chrom)]
        if df_Null_temp_working.empty:
            # Nothing to look up on this chromosome: skip the seqtk call.
            continue

        # One BED line per row: the interval [pos-1, pos) selects the single
        # base at the 1-based coordinate.
        with open('query.bed', 'w') as bedFile:
            for coordinate in df_Null_temp_working['Coordinate']:
                end = int(coordinate)
                bedFile.write('chr{}\t{}\t{}\n'.format(chrom, end - 1, end))

        # check_call raises CalledProcessError if seqtk fails, instead of
        # leaving an empty output file that would be read back as blank bases.
        with open('output.fa', 'w') as outfile:
            subprocess.check_call(
                ['seqtk', 'subseq', '{}/chr{}.fa'.format(REFERENCE_DIR, chrom), 'query.bed'],
                stdout=outfile)

        # Assumes seqtk emits one header line followed by one sequence line per
        # BED interval, in BED order -- TODO confirm against the seqtk version in use.
        with open('output.fa', 'r') as fasta:
            for row_label in df_Null_temp_working.index:
                fasta.readline()                 # FASTA header line, not needed
                base = fasta.readline().strip()  # the extracted base
                if not base:
                    raise RuntimeError(
                        'seqtk returned no sequence for chr{} (row {})'.format(chrom, row_label))
                # Direct label lookup instead of a full boolean mask per row.
                df_working.at[row_label, freq_column] = base.upper() + ':1'
            

In [11]:
# Preview the first nine rows after the missing frequencies were filled in.
df_working.head(9)

Unnamed: 0,index,Sample,Gene,Variant,Chr,Coordinate,Variant Length,Type,Genotype,Exonic,...,HGNC,Indian_ALLELE_FREQ_1,Indian_ALLELE_FREQ_2,ID,Chinese_ALLELE_FREQ_1,Chinese_ALLELE_FREQ_2,Chinese_ALLELE_FREQ_3,Chinese_ALLELE_FREQ_4,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2
0,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,A:1,,,A:1,,,,A:1,
1,1,T1-MS,SDHB,G>T/T,1,17380497,1,snv,hom,yes,...,SDHB,G:0.0277778,T:0.972222,rs2746462,G:0,T:1,,,G:0,T:1
2,2,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
3,3,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
4,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,C:1,,,C:1,,,,C:1,
5,5,T1-MS,NR5A2,A>A/G,1,200007432,1,snv,het,no,...,NR5A2,A:0.541667,G:0.458333,rs3790844,A:0.397849,G:0.602151,,,A:0.473958,G:0.526042
6,6,T1-MS,,A>A/G,1,222164948,1,snv,het,no,...,,A:1,,,A:1,,,,A:1,
7,7,T1-MS,,G>G/T,2,15782471,1,snv,het,no,...,,G:1,,,G:1,,,,G:1,
8,8,T1-MS,ALK,G>G/C,2,29416366,1,snv,het,yes,...,ALK,G:0.430556,C:0.569444,rs1881421,G:0.258065,C:0.741935,,,G:0.317708,C:0.682292


## 4. Saving the output tables

In [12]:
# Write the merged, gap-filled table to an Excel file next to the notebook.
# NOTE(review): '.xls' is the legacy Excel format -- confirm the installed
# pandas still ships a writer for it; otherwise consider switching to '.xlsx'.
output_path = "output.xls"
df_working.to_excel(output_path)

## 5. Querying individual SNPs

In [6]:
# Open the SQLite database so this cell can be run on its own.
conn = sqlite3.connect('dbase_Sqlite')

# Every Malay record at the listed positions. Only POS is filtered, so a
# position is returned for each chromosome on which it occurs.
query = """
        SELECT DISTINCT
            Malay.*
        FROM
            Malay
        WHERE POS in ('97816327', '14804874', '176637576', '150860471', '47601106')
        """

df_testing = pd.read_sql_query(sql=query, con=conn)
df_testing

Unnamed: 0,CHROM,POS,N_ALLELES,N_CHR,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2,ID
0,2,47601106,2,192,T:0.192708,C:0.807292,rs1126497
1,2,97816327,2,174,T:0.965517,C:0.0344828,.
2,2,97816327,2,174,T:0.965517,C:0.0344828,rs6465657
3,2,97816327,2,174,T:0.965517,C:0.0344828,rs77759206
4,5,176637576,2,192,T:0.536458,C:0.463542,rs28932178
5,7,97816327,2,192,C:0.739583,T:0.260417,.
6,7,97816327,2,192,C:0.739583,T:0.260417,rs6465657
7,7,97816327,2,192,C:0.739583,T:0.260417,rs77759206
