# Querying population frequencies using an Excel sheet as the input
> The Excel sheet must contain the following columns:
>1. "Coordinate" for the nucleotide position
>2. "Chr" for the chromosome number

In [4]:
import os, sqlite3, time, datetime, subprocess
import pandas as pd
import multiprocessing
import threading
from threading import BoundedSemaphore
from utils import logDecorator as lD

from utils.createDB import *
from utils.query import *

## 1. Creating the SQL table for the input

In [5]:
# Load the query worksheet. The joins further down read its "Coordinate"
# and "Chr" columns.
xls_file = pd.ExcelFile('query.xlsx')
table = xls_file.parse('All Samples')
# chrom = [x for x in table['Chr'].unique()[:-1]]
conn = sqlite3.connect('dbase_Sqlite')
try:
    # Best-effort removal of a 'dataFrame' table left over from an earlier run.
    cleardb('dbase_Sqlite', 'dataFrame')
except Exception as err:
    # Still tolerated (presumably the table is just absent on a first run --
    # confirm against utils.createDB), but reported instead of silently hidden.
    print('Could not clear the existing dataFrame table:', err)
# if_exists='replace' keeps the load from aborting on a stale table when the
# clean-up above did not succeed.
table.to_sql('dataFrame', conn, if_exists='replace')

  chunksize=chunksize, dtype=dtype)


## 2. Querying from Chinese, Malay and Indian populations

### 2.1 Single thread/cpu 

In [6]:
def _population_query(population, n_freq):
    """Return the SQL that joins ``dataFrame`` to one population table.

    population: name of the population table ('Indian', 'Malay' or
        'Chinese'); its frequency columns are named
        ``<population>_ALLELE_FREQ_<k>``.
    n_freq: how many allele-frequency columns to select (k = 1..n_freq).

    The LEFT OUTER JOIN on coordinate and chromosome keeps every row of
    ``dataFrame``; rows without a match get NULL population columns.
    """
    freq_cols = ', '.join(
        '{0}_ALLELE_FREQ_{1}'.format(population, k)
        for k in range(1, n_freq + 1))
    return """
        SELECT DISTINCT
            dataFrame.*, {cols}, {tbl}.ID
        FROM
            dataFrame LEFT OUTER JOIN
            {tbl} ON dataFrame.Coordinate={tbl}.POS
            AND dataFrame.Chr={tbl}.CHROM
        """.format(cols=freq_cols, tbl=population)


startTime = time.time()

# Indian and Malay select two frequency columns, Chinese selects four.
df_I = pd.read_sql_query(_population_query('Indian', 2), conn)
df_M = pd.read_sql_query(_population_query('Malay', 2), conn)
df_C = pd.read_sql_query(_population_query('Chinese', 4), conn)

timeTaken = time.time() - startTime
print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
      '- Data Retrieval for all races completed: Took {} seconds to complete.'.format(timeTaken))

# 2017/09/18 15:54:03 - Data Retrieval for all races completed: Took 189.973806142807 seconds to complete (only exome).
# 2017/09/20 13:35:04 - Data Retrieval for all races completed: Took 219.73676705360413 seconds to complete (only exome).
# 2017/09/26 14:44:25 - Data Retrieval for all races completed: Took 465.7713861465454 seconds to complete (whole genome).

2017/10/03 14:26:54 - Data Retrieval for all races completed: Took 612.6685707569122 seconds to complete.


### 2.2 Multithreading/cpus 

In [None]:
startTime = time.time()

print('There are', multiprocessing.cpu_count(), "cpu's available in this machine")

# Result store shared by the worker threads; writes go through the lock.
responses = dict()
responses_lock = threading.Lock()

# Upper bound on the number of workers holding a database connection at once.
maxconnections = 8
pool_sema = BoundedSemaphore(maxconnections)

def task(fname):
    """Join ``dataFrame`` to one population table and store the result.

    fname: name of the population table. 'Chinese' selects four
        ``<fname>_ALLELE_FREQ_<k>`` columns, any other name selects two.

    The query runs on its own SQLite connection while holding a slot of the
    module-level ``pool_sema``. The resulting DataFrame is written to the
    module-level ``responses`` dict under ``responses_lock``, keyed by
    ``fname``. Nothing is returned.
    """
    n_freq = 4 if fname == 'Chinese' else 2
    freq_cols = ', '.join(
        '{0}_ALLELE_FREQ_{1}'.format(fname, k) for k in range(1, n_freq + 1))
    query = """
        SELECT DISTINCT
            dataFrame.*, {cols}, {tbl}.ID
        FROM
            dataFrame LEFT OUTER JOIN
            {tbl} ON dataFrame.Coordinate={tbl}.POS
            AND dataFrame.Chr={tbl}.CHROM
        """.format(cols=freq_cols, tbl=fname)

    # The context manager and try/finally release the semaphore slot and
    # close the connection even when the query raises.
    with pool_sema:
        conn = sqlite3.connect('dbase_Sqlite')
        try:
            df = pd.read_sql_query(query, conn)
        finally:
            conn.close()

    with responses_lock:
        responses[fname] = df

pool = []

# Launch one daemon worker thread per population table.
for population in ('Chinese', 'Indian', 'Malay'):
    worker = threading.Thread(target=task, args=(population,), daemon=True)
    pool.append(worker)
    worker.start()

# Block until every worker has finished.
for worker in pool:
    worker.join()

timeTaken = time.time() - startTime
finished_at = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
print(finished_at,
      '- Data Retrieval for all races completed: Took {} seconds to complete.'.format(timeTaken))

# 2017/09/20 13:40:13 - Data Retrieval for all races completed: Took 309.63539934158325 seconds to complete (Exome only).
# 2017/09/20 13:53:34 - Data Retrieval for all races completed: Took 274.6211278438568 seconds to complete (Exome only).
### Slower than the single-threaded run on this machine, probably because of limited memory; the same query completed in under 80 seconds on a 32 GB server instance.
# 2017/09/26 14:56:43 - Data Retrieval for all races completed: Took 610.4648928642273 seconds to complete (Whole genome).

There are 4 cpu's available in this machine


## 3. Merging the output tables into one table

In [7]:
# Pick up the per-population frames produced by the threaded run.
df_M = responses['Malay']
df_I = responses['Indian']
# The Chinese frame belongs in df_C; assigning it to df_I would overwrite the
# Indian result and leave df_C unset or stale.
df_C = responses['Chinese']

# The Indian frame carries all query columns plus the Indian frequencies and
# ID; the Chinese and Malay frequency columns are attached via 'index' (the
# query sheet's row index as stored by to_sql).
# NOTE(review): 'index' repeats when a position has several IDs, so these
# outer merges multiply such rows -- confirm whether de-duplication is wanted.
chinese_cols = ['index', 'Chinese_ALLELE_FREQ_1', 'Chinese_ALLELE_FREQ_2',
                'Chinese_ALLELE_FREQ_3', 'Chinese_ALLELE_FREQ_4']
malay_cols = ['index', 'Malay_ALLELE_FREQ_1', 'Malay_ALLELE_FREQ_2']
df = (df_I.merge(df_C[chinese_cols], on=['index'], how='outer')
          .merge(df_M[malay_cols], on=['index'], how='outer'))

In [8]:
# Work on an independent copy so the merged frame ``df`` stays untouched.
df_working = df.copy()

for ethnic in ['Indian', 'Malay', 'Chinese']:
    # Rows that have no frequency record for this population.
    df_Null_temp = df_working[df_working[ethnic + "_ALLELE_FREQ_1"].isnull()]

    for chrom in list(range(1, 23)) + ['X']:
        # NOTE(review): assumes the Chr column holds strings ('1'..'22', 'X').
        df_Null_temp_working = df_Null_temp[df_Null_temp.Chr == str(chrom)]
        if df_Null_temp_working.empty:
            continue  # nothing to look up on this chromosome

        # One single-base interval (coordinate-1, coordinate) per missing row.
        with open('query.bed', 'w') as bedFile:
            bedFile.writelines(
                'chr' + str(chrom) + '\t' + str(i - 1) + '\t' + str(i) + '\n'
                for i in df_Null_temp_working['Coordinate'])

        # Extract the reference bases; check_call raises if seqtk fails, so a
        # failed run cannot silently fill the table with empty strings.
        with open('output.fa', 'w') as outfile:
            subprocess.check_call(
                ['seqtk', 'subseq', 'GRCh37/chr' + str(chrom) + '.fa', 'query.bed'],
                stdout=outfile)

        # The output is read as (header, sequence) line pairs, in the same
        # order as the rows written to the BED file.
        # NOTE(review): relies on seqtk emitting one record per BED line in
        # input order -- confirm.
        with open('output.fa', 'r') as fasta:
            for i in df_Null_temp_working.index:
                fasta.readline()  # skip the FASTA header line
                base = fasta.readline()
                # Store "<BASE>:1"; presumably the reference allele at
                # frequency 1 for positions absent from the population table.
                df_working.loc[i, ethnic + '_ALLELE_FREQ_1'] = \
                    base.replace('\n', ':1').capitalize()

In [9]:
# Show the first nine rows of the completed table.
df_working.head(9)

Unnamed: 0,index,Sample,Gene,Variant,Chr,Coordinate,Variant Length,Type,Genotype,Exonic,...,HGNC,Indian_ALLELE_FREQ_1,Indian_ALLELE_FREQ_2,ID,Chinese_ALLELE_FREQ_1,Chinese_ALLELE_FREQ_2,Chinese_ALLELE_FREQ_3,Chinese_ALLELE_FREQ_4,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2
0,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,A:0.543011,G:0.456989,,,A:0.666667,G:0.333333
1,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,A:0.543011,G:0.456989,,,A:0.666667,G:0.333333
2,0,T1-MS,,A>A/G,1,14804874,1,snv,het,no,...,,,,,A:0.543011,G:0.456989,,,A:0.666667,G:0.333333
3,1,T1-MS,SDHB,G>T/T,1,17380497,1,snv,hom,yes,...,SDHB,G:0.0277778,T:0.972222,rs2746462,G:0,T:1,,,G:0,T:1
4,2,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
5,3,T1-MS,MUTYH,C>C/G,1,45797505,1,snv,het,yes,...,"MUTYH, HPDL",C:0.819444,G:0.180556,rs3219489,C:0.66129,G:0.33871,,,C:0.583333,G:0.416667
6,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,C:0.638889,T:0.361111,rs7412746,C:0.623656,T:0.376344,,,C:0.557292,T:0.442708
7,4,T1-MS,,C>C/T,1,150860471,1,snv,het,no,...,,C:0.638889,T:0.361111,rs7412746,C:0.623656,T:0.376344,,,C:0.557292,T:0.442708
8,5,T1-MS,NR5A2,A>A/G,1,200007432,1,snv,het,no,...,NR5A2,A:0.541667,G:0.458333,rs3790844,A:0.397849,G:0.602151,,,A:0.473958,G:0.526042


## 4. Saving the output tables

In [28]:
# Write the completed table to an Excel workbook in the working directory.
# NOTE(review): the legacy .xls format needs a matching writer engine installed -- confirm.
df_working.to_excel(excel_writer="output.xls")

## 5. Query individual SNPs

In [6]:
# Open the database first, then look up a handful of positions directly in
# the Malay table (all chromosomes, since only POS is filtered).
conn = sqlite3.connect('dbase_Sqlite')

query = """
        SELECT DISTINCT
            Malay.*
        FROM
            Malay
        WHERE POS in ('97816327', '14804874', '176637576', '150860471', '47601106')
        """

df_testing = pd.read_sql_query(sql=query, con=conn)
df_testing

Unnamed: 0,CHROM,POS,N_ALLELES,N_CHR,Malay_ALLELE_FREQ_1,Malay_ALLELE_FREQ_2,ID
0,2,47601106,2,192,T:0.192708,C:0.807292,rs1126497
1,2,97816327,2,174,T:0.965517,C:0.0344828,.
2,2,97816327,2,174,T:0.965517,C:0.0344828,rs6465657
3,2,97816327,2,174,T:0.965517,C:0.0344828,rs77759206
4,5,176637576,2,192,T:0.536458,C:0.463542,rs28932178
5,7,97816327,2,192,C:0.739583,T:0.260417,.
6,7,97816327,2,192,C:0.739583,T:0.260417,rs6465657
7,7,97816327,2,192,C:0.739583,T:0.260417,rs77759206


In [29]:
# Reload both result workbooks and check that they hold identical data.
# The context managers close each workbook handle once its sheet is parsed.
with pd.ExcelFile('output.xls') as xls_file:
    table = xls_file.parse('Sheet1')

with pd.ExcelFile('output_threading.xls') as xls_file:
    table2 = xls_file.parse('Sheet1')

table.equals(table2)

True