# Querying population allele frequencies using an Excel sheet as the input
> The Excel sheet must contain the following columns:
>1. "Coordinate" for the nucleotide position
>2. "Chr" for the chromosome number

In [1]:
import os, sqlite3, xlrd, time, datetime
import pandas as pd
from utils_query import *
import subprocess

## 1. Creating the SQL table for the input

In [2]:
# Load the 'All Samples' sheet of the input workbook into a DataFrame.
xls_file = pd.ExcelFile('query.xlsx')
table = xls_file.parse('All Samples')
# Unique values of the 'Chr' column, with the last unique value removed by the [:-1] slice.
# NOTE(review): the reason for dropping the last value is not visible here
# (possibly a trailing NaN/blank row) — confirm. `chrom` is not used in the cells shown.
chrom = [x for x in table['Chr'].unique()[:-1]]
# Open the SQLite database file that the later queries read from.
conn = sqlite3.connect('dbase_Sqlite')
# `cleardb` is not defined in this notebook; presumably it comes from the
# `utils_query` star import and removes any existing 'dataFrame' table so that
# the to_sql call below can recreate it — verify against utils_query.
cleardb('dbase_Sqlite', 'dataFrame')
# Store the sheet as the SQL table 'dataFrame'. With pandas' defaults the row
# index is written too (as the 'index' column seen in the query outputs), and
# later cells use that column as the merge key.
table.to_sql('dataFrame', conn)

  chunksize=chunksize, dtype=dtype)


## 2. Querying from Chinese, Malay and Indian populations

In [3]:
def _population_join_query(population, n_freq_columns):
    """Build the query that annotates every input row with one population's frequencies.

    Every row of the ``dataFrame`` table is kept (LEFT OUTER JOIN); rows are
    matched to the population table on coordinate (``Coordinate`` = ``POS``)
    and chromosome (``Chr`` = ``CHROM``).

    Parameters
    ----------
    population : str
        Name of the population table, which is also the prefix of its
        ``<population>_ALLELE_FREQ_<k>`` columns (e.g. ``'Indian'``).
        It is interpolated into the SQL text, so only pass trusted,
        hard-coded table names — never user input.
    n_freq_columns : int
        Number of ``<population>_ALLELE_FREQ_<k>`` columns to select (k = 1..n).

    Returns
    -------
    str
        The SQL text, ready for ``pd.read_sql_query``.
    """
    freq_columns = ', '.join(
        '{}_ALLELE_FREQ_{}'.format(population, k)
        for k in range(1, n_freq_columns + 1)
    )
    return """
        SELECT DISTINCT
            dataFrame.*, {freq}, {pop}.ID
        FROM
            dataFrame LEFT OUTER JOIN
            {pop} ON dataFrame.Coordinate={pop}.POS
            AND dataFrame.Chr={pop}.CHROM
        """.format(freq=freq_columns, pop=population)


startTime = time.time()

# Indian and Malay tables are queried for two allele-frequency columns,
# the Chinese table for four.
query = _population_join_query('Indian', 2)
df_I = pd.read_sql_query(query, conn)

query = _population_join_query('Malay', 2)
df_M = pd.read_sql_query(query, conn)

query = _population_join_query('Chinese', 4)
df_C = pd.read_sql_query(query, conn)

timeTaken = time.time()-startTime
print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
      '- Data Retrieval for all races completed: Took {} seconds to complete.'.format(timeTaken))

2017/09/17 23:52:49 - Data Retrieval for all races completed: Took 334.83163189888 seconds to complete.


## 3. Merging the output tables into one table

In [29]:
# Combine the three per-population results into one table keyed on 'index'
# (the row identifier of the input sheet as stored in the SQL table).
#
# The Chinese/Malay frames are reduced to the key plus their frequency columns.
# That projection drops 'ID', so a locus that matched several IDs in a
# population table becomes several *identical* rows with the same 'index'.
# Merging those as-is multiplies the rows of the result, so exact duplicates
# are removed first.
chinese_freqs = df_C[['index','Chinese_ALLELE_FREQ_1','Chinese_ALLELE_FREQ_2',
                      'Chinese_ALLELE_FREQ_3', 'Chinese_ALLELE_FREQ_4']].drop_duplicates()
malay_freqs = df_M[['index','Malay_ALLELE_FREQ_1', 'Malay_ALLELE_FREQ_2']].drop_duplicates()

df = (df_I
      .merge(chinese_freqs, on=['index'], how='outer')
      .merge(malay_freqs, on=['index'], how='outer'))
df[:9]

Unnamed: 0,index,Sample,Gene,Variant,Chr,Coordinate,Variant Length,Type,Genotype,Exonic,...,HGNC,Indian_ALLELE_FREQ_1,Indian_ALLELE_FREQ_2,ID,Chinese_ALLELE_FREQ_1,Chinese_ALLELE_FREQ_2,Chinese_ALLELE_FREQ_3,Chinese_ALLELE_FREQ_4,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2
0,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,,,,,,
1,1,T1-MS,SDHB,G>T/T,1,17380497,1,snv,hom,yes,...,SDHB,G:0.0277778,T:0.972222,rs2746462,G:0,T:1,,,G:0,T:1
2,2,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
3,3,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
4,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,,,,,,,,,
5,5,T1-MS,NR5A2,A>A/G,1,200007432,1,snv,het,no,...,NR5A2,A:0.541667,G:0.458333,rs3790844,A:0.397849,G:0.602151,,,A:0.473958,G:0.526042
6,6,T1-MS,,A>A/G,1,222164948,1,snv,het,no,...,,,,,,,,,,
7,7,T1-MS,,G>G/T,2,15782471,1,snv,het,no,...,,,,,,,,,,
8,8,T1-MS,ALK,G>G/C,2,29416366,1,snv,het,yes,...,ALK,G:0.430556,C:0.569444,rs1881421,G:0.258065,C:0.741935,,,G:0.317708,C:0.682292


## 4. Saving the output tables

In [5]:
# Write the per-population results and the merged table to Excel files,
# in the order Indian, Malay, Chinese, merged.
for _path, _frame in (
    ("output_I.xls", df_I),
    ("output_M.xls", df_M),
    ("output_C.xls", df_C),
    ("output.xls", df),
):
    _frame.to_excel(_path)

In [6]:
# Spot-check: pull the raw Malay rows for a handful of positions.
# Note that the lookup is by POS only, so the same position on different
# chromosomes is returned too.
conn = sqlite3.connect('dbase_Sqlite')

query = """
        SELECT DISTINCT
            Malay.*
        FROM
            Malay
        WHERE POS in ('97816327', '14804874', '176637576', '150860471', '47601106')
        """

df_testing = pd.read_sql_query(query, conn)
df_testing

Unnamed: 0,CHROM,POS,N_ALLELES,N_CHR,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2,ID
0,2,47601106,2,192,T:0.192708,C:0.807292,rs1126497
1,2,97816327,2,174,T:0.965517,C:0.0344828,.
2,2,97816327,2,174,T:0.965517,C:0.0344828,rs6465657
3,2,97816327,2,174,T:0.965517,C:0.0344828,rs77759206
4,5,176637576,2,192,T:0.536458,C:0.463542,rs28932178
5,7,97816327,2,192,C:0.739583,T:0.260417,.
6,7,97816327,2,192,C:0.739583,T:0.260417,rs6465657
7,7,97816327,2,192,C:0.739583,T:0.260417,rs77759206


In [34]:
# Work on an independent copy: `df[:]` is only a slice of `df`, so assigning
# into it later can write through to `df` (or raise SettingWithCopyWarning),
# depending on the pandas version.
df_try = df.copy()
# Rows for which the Malay query returned no allele frequency.
df_NullMalay = df[df["Malay_ALLELE_FREQ_1"].isnull()]

In [35]:
# Back-fill Malay_ALLELE_FREQ_1 for chromosome-1 rows that have no Malay
# record: the base that seqtk extracts from chr1.fa at the row's coordinate is
# written as "<BASE>:1.0".
df_NullMalay_Chr1 = df_NullMalay[(df_NullMalay.Chr=='1')]

# One BED line per coordinate. BED intervals are 0-based and half-open, so
# [coordinate-1, coordinate) selects the single base at `coordinate`.
with open('query_Chr1.bed','w') as bedFile:
    for coordinate in df_NullMalay_Chr1['Coordinate']:
        bedFile.write('chr1' + '\t' + str(coordinate-1) + '\t' + str(coordinate) + '\n')

# Shell equivalent:
#   seqtk subseq /Volumes/Samsung_T3/SgPopulationGenetics/GRCh37/chr1.fa query_Chr1.bed > output.fa
with open('output.fa', "w") as outfile:
    proc = subprocess.Popen(['seqtk', 'subseq', '/Volumes/Samsung_T3/SgPopulationGenetics/GRCh37/chr1.fa', 'query_Chr1.bed'],
                            stdin=subprocess.PIPE, stdout=outfile)
    # stdout goes to the file and stderr is not piped, so communicate()
    # returns (None, None); it is called only to wait for seqtk to finish.
    proc.communicate()
if proc.returncode != 0:
    # Without this check a failed seqtk run leaves output.fa empty and the
    # loop below would silently write empty strings into the frame.
    raise RuntimeError('seqtk subseq exited with status {}'.format(proc.returncode))

# NOTE(review): the loop assumes output.fa holds exactly one header line plus
# one sequence line per BED entry, in the same order as the BED file, and that
# df_try's row labels equal the values of its 'index' column — confirm both.
with open('output.fa', "r") as fasta:
    for row_label in df_NullMalay_Chr1['index']:
        fasta.readline()  # skip the FASTA header line
        # Sequence line "a\n" becomes "A:1.0".
        df_try.loc[(df_try.index==row_label),('Malay_ALLELE_FREQ_1')] = str.capitalize(fasta.readline().replace('\n', ':1.0'))
df_try[:19]


Unnamed: 0,index,Sample,Gene,Variant,Chr,Coordinate,Variant Length,Type,Genotype,Exonic,...,HGNC,Indian_ALLELE_FREQ_1,Indian_ALLELE_FREQ_2,ID,Chinese_ALLELE_FREQ_1,Chinese_ALLELE_FREQ_2,Chinese_ALLELE_FREQ_3,Chinese_ALLELE_FREQ_4,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2
0,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,,,,,A:1.0,
1,1,T1-MS,SDHB,G>T/T,1,17380497,1,snv,hom,yes,...,SDHB,G:0.0277778,T:0.972222,rs2746462,G:0,T:1,,,G:0,T:1
2,2,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
3,3,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
4,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,,,,,,,,C:1.0,
5,5,T1-MS,NR5A2,A>A/G,1,200007432,1,snv,het,no,...,NR5A2,A:0.541667,G:0.458333,rs3790844,A:0.397849,G:0.602151,,,A:0.473958,G:0.526042
6,6,T1-MS,,A>A/G,1,222164948,1,snv,het,no,...,,,,,,,,,A:1.0,
7,7,T1-MS,,G>G/T,2,15782471,1,snv,het,no,...,,,,,,,,,,
8,8,T1-MS,ALK,G>G/C,2,29416366,1,snv,het,yes,...,ALK,G:0.430556,C:0.569444,rs1881421,G:0.258065,C:0.741935,,,G:0.317708,C:0.682292
9,9,T1-MS,ALK,T>T/C,2,29416481,1,snv,het,yes,...,ALK,T:0.430556,C:0.569444,rs1881420,T:0.252688,C:0.747312,,,T:0.3125,C:0.6875
