In [1]:
import datetime, multiprocessing, logging, os, re, shutil, sys, sqlite3, subprocess, time
import pandas as pd
from utils import *

##################
### Administration
##################
# Route INFO-and-above records to log.txt, one "<time> - <level> - <message>" line each.
logging.basicConfig(
    level=logging.INFO,
    filename='log.txt',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

## 1. Making new Database and related tables and schemas

In [2]:
# Input folder of each population, relative to the notebook's working directory.
workingFolder_Indian = "SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"

workingFolder_Malay = "SgMalay_vcf/2012_05/snps/"

workingFolder_Chinese = "1000G/Phase3/integrated/"


def _chromosome_sort_key(name):
    """Return (N, name) for a 'chrN...' file name so that chr2 sorts before chr10."""
    return int(re.match(r'chr(\d+)', name).group(1)), name


def _list_chromosome_files(folder, pattern):
    """List the entries of `folder` whose name starts with `pattern`, in chromosome order.

    os.listdir() returns names in arbitrary order, so the result is sorted
    explicitly; otherwise the order in which chromosomes are processed (and
    printed) would depend on the filesystem.
    """
    matching = [f for f in os.listdir(folder) if re.match(pattern, f)]
    return sorted(matching, key=_chromosome_sort_key)


# Collect the per-chromosome frequency (.frq) and rsID files found in each working folder.
_FREQ_PATTERN = r'chr\d+_analysis_exome\.frq'
_RSID_PATTERN = r'chr\d+_rsID'

freqFiles_Indian = _list_chromosome_files(workingFolder_Indian, _FREQ_PATTERN)
rsIDFiles_Indian = _list_chromosome_files(workingFolder_Indian, _RSID_PATTERN)
freqFiles_Malay = _list_chromosome_files(workingFolder_Malay, _FREQ_PATTERN)
rsIDFiles_Malay = _list_chromosome_files(workingFolder_Malay, _RSID_PATTERN)
freqFiles_Chinese = _list_chromosome_files(workingFolder_Chinese, _FREQ_PATTERN)
rsIDFiles_Chinese = _list_chromosome_files(workingFolder_Chinese, _RSID_PATTERN)

# Chromosome IDs are taken from the Indian .frq file names only, yet the loading
# cells below iterate over this same list for every population.
# Each entry is kept as a one-element list (e.g. ['chr1']) because those cells read ID[0].
freqFilesID_pre = re.compile(r'(chr\d+)_analysis_exome\.frq')
freqFilesID = [freqFilesID_pre.findall(file) for file in freqFiles_Indian]

print(freqFilesID)

[['chr1'], ['chr2'], ['chr3'], ['chr4'], ['chr5'], ['chr6'], ['chr7'], ['chr8'], ['chr9'], ['chr10'], ['chr11'], ['chr12'], ['chr13'], ['chr14'], ['chr15'], ['chr16'], ['chr17'], ['chr18'], ['chr19'], ['chr20'], ['chr21'], ['chr22']]


## 2.1 Parsing the Malay vcf manually to generate rsID table
### >>> Needed only for the Singapore Malay VCF, because the command below fails with the following error:
```
    $ bcftools query -f '%CHROM\t%POS\t%ID\n' SSM.chr8.2012_05.genotypes.vcf.gz -o chr8_rsID 
    $ [E::bcf_hdr_add_sample] Empty sample name: trailing spaces/tabs in the header line?
```

In [21]:
# Gunzip the per-chromosome .vcf.gz files (Malay only).
# Best-effort: a chromosome that cannot be unpacked is logged and skipped, never fatal.
startTime = time.time()
for ID in freqFilesID:
    gzPath = os.path.join(workingFolder_Malay, 'SSM.' + ID[0] + '.2012_05.genotypes.vcf.gz')
    try:
        # stderr is captured so gunzip's own error text can be written to the log.
        proc1 = subprocess.Popen(['gunzip', gzPath],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = proc1.communicate()
    except OSError as exc:
        # Raised only when the gunzip executable itself cannot be started.
        logging.error('could not run gunzip on ' + gzPath + ': ' + str(exc))
        continue
    if proc1.returncode == 0:
        logging.info('gunzip ' + gzPath)
    else:
        # A missing or unreadable archive shows up as a non-zero exit status,
        # not as a Python exception, so it has to be checked explicitly.
        logging.warning(gzPath + ' not unpacked (gunzip exit status ' + str(proc1.returncode) + '): ' +
                        err.decode(errors='replace').strip())
timeTaken = time.time()-startTime 
print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'), 
      '- gunzip completed: Took {} seconds to complete.'.format(timeTaken))

2017/09/13 11:03:08 - gunzip completed: Took 0.13221001625061035 seconds to complete.


In [25]:
# Manually load data from the raw (gunzipped) VCFs into Malay_rsID - Malay population only.
# makedb / loaddb_vcf_rsID come from utils; their exact behaviour is not visible here.
startTime = time.time()
makedb('dbase_Sqlite', 'Malay_rsID', "(CHROM int(2), POS int(10), ID char(15))")
for ID in freqFilesID:
    # Build the path once so the file that is loaded and the file that is logged are the same
    # string (the folder name already ends with '/', so os.path.join adds no second slash).
    vcfPath = os.path.join(workingFolder_Malay, 'SSM.' + ID[0] + '.2012_05.genotypes.vcf')
    loaddb_vcf_rsID('Malay_rsID', 'dbase_Sqlite', vcfPath)
    logging.info('Inserting values from ' + vcfPath +
                 ' to Malay_rsID table of dbase_Sqlite database')
timeTaken = time.time()-startTime 
print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'), 
      '- Data Loading for Malay_rsID of dbase_Sqlite database completed: Took {} seconds to complete.'.format(timeTaken))

1083000 rows loaded
1133594 rows loaded
968661 rows loaded
971842 rows loaded
867221 rows loaded
869666 rows loaded
788835 rows loaded
725514 rows loaded
608992 rows loaded
674869 rows loaded
658585 rows loaded
642006 rows loaded
495844 rows loaded
449884 rows loaded
402798 rows loaded
439312 rows loaded
374617 rows loaded
380061 rows loaded
304836 rows loaded
300481 rows loaded
205222 rows loaded
193427 rows loaded
2017/09/13 12:23:22 - Data Loading for rsID of Malay completed: Took 4570.470263719559 seconds to complete.


## 2.2 Processing the rest of the tables

In [39]:
# Malay allele-frequency table: one *_analysis_exome.frq file per chromosome ID.
startTime = time.time()
malayDataSchema = ("(CHROM int(2), POS int(10), N_ALLELES int(1), N_CHR int(4), "
                   "ALLELE_FREQ_1 char(30), ALLELE_FREQ_2 char(30))")
makedb('dbase_Sqlite', 'Malay_Data', malayDataSchema)
for ID in freqFilesID:
    frqPath = workingFolder_Malay + ID[0] + '_analysis_exome.frq'
    loaddb('Malay_Data', 'dbase_Sqlite', frqPath)
    logging.info('Inserting values from ' + frqPath + ' to Malay_Data table of dbase_Sqlite database')
timeTaken = time.time() - startTime
finishedAt = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
print('{} - Data Loading for Malay_Data table of dbase_Sqlite database completed: Took {} '
      'seconds to complete.'.format(finishedAt, timeTaken))

495949 rows loaded
473812 rows loaded
427107 rows loaded
319134 rows loaded
315630 rows loaded
343037 rows loaded
358234 rows loaded
288072 rows loaded
251232 rows loaded
305268 rows loaded
276140 rows loaded
276714 rows loaded
179544 rows loaded
177553 rows loaded
192212 rows loaded
201781 rows loaded
203496 rows loaded
145761 rows loaded
156652 rows loaded
134502 rows loaded
79045 rows loaded
97823 rows loaded
2017/09/13 13:16:25 - Data Loading for Malay_Data table of dbase_Sqlite database completed: Took 136.75834798812866 seconds to complete.


In [6]:
# For Indian Data and rsID
# Each entry: (table name, column schema passed to makedb, per-chromosome file suffix).
indianTables = [
    ('Indian_Data',
     "(CHROM int(2), POS int(10), N_ALLELES int(1), N_CHR int(4), ALLELE_FREQ_1 char(30), ALLELE_FREQ_2 char(30))",
     '_analysis_exome.frq'),
    ('Indian_rsID',
     "(CHROM int(2), POS int(10), ID char(15))",
     '_rsID'),
]

for tableName, schema, suffix in indianTables:
    # Restart the clock for every table: previously the Indian_rsID figure also
    # contained the time spent loading Indian_Data.
    startTime = time.time()
    makedb('dbase_Sqlite', tableName, schema)
    for ID in freqFilesID:
        srcPath = workingFolder_Indian + ID[0] + suffix
        loaddb(tableName, 'dbase_Sqlite', srcPath)
        logging.info('Inserting values from ' + srcPath + ' to ' + tableName +
                     ' table of dbase_Sqlite database')
    timeTaken = time.time() - startTime
    print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
          '- Data Loading for {} table of dbase_Sqlite database completed: Took {} '
          'seconds to complete.'.format(tableName, timeTaken))

360948 rows loaded
351734 rows loaded
319554 rows loaded
239662 rows loaded
229688 rows loaded
262924 rows loaded
264721 rows loaded
217646 rows loaded
187289 rows loaded
238043 rows loaded
212019 rows loaded
209176 rows loaded
133154 rows loaded
131562 rows loaded
141056 rows loaded
149545 rows loaded
153232 rows loaded
110543 rows loaded
119277 rows loaded
99313 rows loaded
56857 rows loaded
72879 rows loaded
2017/09/13 13:39:31 - Data Loading for Indian_Data table of dbase_Sqlite database completed: Took 103.56686115264893 seconds to complete.
785530 rows loaded
827584 rows loaded
710296 rows loaded
709721 rows loaded
626341 rows loaded
656058 rows loaded
577635 rows loaded
540886 rows loaded
428838 rows loaded
519887 rows loaded
493299 rows loaded
475870 rows loaded
368359 rows loaded
316561 rows loaded
285815 rows loaded
312606 rows loaded
278781 rows loaded
283745 rows loaded
231812 rows loaded
216698 rows loaded
138793 rows loaded
137785 rows loaded
2017/09/13 13:42:12 - Data Lo

In [None]:
# For Chinese Data and rsID
# The Chinese frequency table has four allele-frequency columns and its own loader.
query_Data = "(CHROM int(2), POS int(10), N_ALLELES int(1), N_CHR int(4), ALLELE_FREQ_1 char(50), ALLELE_FREQ_2 char(50), ALLELE_FREQ_3 char(50), ALLELE_FREQ_4 char(50))"

# 'char(15)' (not 'chr(15)'): matches the Malay/Indian rsID tables, and in SQLite a
# declared type must contain "CHAR" to get TEXT affinity - "chr" falls through to NUMERIC.
query_rsID = '(CHROM int(2), POS int(10), ID char(15))'

# Each entry: (table name, schema, loader from utils, per-chromosome file suffix).
chineseTables = [
    ('Chinese_Data', query_Data, loaddb_chineseData, '_analysis_exome.frq'),
    ('Chinese_rsID', query_rsID, loaddb, '_rsID'),
]

for tableName, schema, loader, suffix in chineseTables:
    # Restart the clock for every table so the Chinese_rsID figure does not
    # include the time already spent loading Chinese_Data.
    startTime = time.time()
    makedb('dbase_Sqlite', tableName, schema)
    for ID in freqFilesID:
        srcPath = workingFolder_Chinese + ID[0] + suffix
        loader(tableName, 'dbase_Sqlite', srcPath)
        logging.info('Inserting values from ' + srcPath + ' to ' + tableName +
                     ' table of dbase_Sqlite database')
    timeTaken = time.time() - startTime
    print(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
          '- Data Loading for {} table of dbase_Sqlite database completed: Took {} '
          'seconds to complete.'.format(tableName, timeTaken))

Drop table Chinese_Data
3106369 rows loaded
3059890 rows loaded
528812 rows loaded
1995361 rows loaded
2008553 rows loaded
2093915 rows loaded
2244496 rows loaded
1879243 rows loaded
1576940 rows loaded
1876124 rows loaded
1794253 rows loaded
1724928 rows loaded
1068998 rows loaded
1114582 rows loaded
1211192 rows loaded
1291652 rows loaded
1319479 rows loaded
912144 rows loaded
978074 rows loaded
848002 rows loaded
460086 rows loaded
607655 rows loaded
2017/09/14 00:41:32 - Data Loading for Chinese_Data table of dbase_Sqlite database completed: Took 2901.64567899704 seconds to complete.


## 3. Checking the race_Data and race_rsID tables

In [3]:
# Open the database and preview the first five Chinese_rsID rows.
conn = sqlite3.connect('dbase_Sqlite')
df = pd.read_sql_query(sql="select * from Chinese_rsID limit 5", con=conn)
df

Unnamed: 0,CHROM,POS,ID
0,1,10235,rs540431307
1,1,10352,rs555500075
2,1,10505,rs548419688
3,1,10506,rs568405545
4,1,10511,rs534229142


In [4]:
# Preview the first five Chinese_Data rows (reuses the connection opened earlier).
df = pd.read_sql_query(sql="select * from Chinese_Data limit 5", con=conn)
df

Unnamed: 0,CHROM,POS,N_ALLELES,N_CHR,ALLELE_FREQ_1,ALLELE_FREQ_2,ALLELE_FREQ_3,ALLELE_FREQ_4
0,1,13011,2,186,T:1,G:0,,
1,1,13110,2,186,G:1,A:0,,
2,1,13116,2,186,T:0.994624,G:0.00537634,,
3,1,13118,2,186,A:0.994624,G:0.00537634,,
4,1,13156,2,186,G:1,C:0,,


In [10]:
# Row count of Chinese_Data (count(CHROM) counts rows whose CHROM is not NULL).
df = pd.read_sql_query(sql="select count(CHROM) from Chinese_Data", con=conn)
df

Unnamed: 0,count(CHROM)
0,33699437


In [11]:
# Row count of Chinese_rsID (count(CHROM) counts rows whose CHROM is not NULL).
df = pd.read_sql_query(sql="select count(CHROM) from Chinese_rsID", con=conn)
df

Unnamed: 0,count(CHROM)
0,76634002


## 4. Create combined tables (Data + rsID) for Malay, Indian, and Chinese

In [None]:
# Malay
# Attach the rsID to every allele-frequency row.
# A position is only unique within one chromosome, so the join has to match
# CHROM as well as POS; joining on POS alone pairs rows from different
# chromosomes that happen to share a coordinate.
# NOTE(review): assumes CHROM is stored in the same form in Malay_Data and
# Malay_rsID (e.g. both 1, not 1 vs chr1) - confirm before running.

query = '''SELECT 
                Malay_Data.CHROM, 
                Malay_Data.POS, 
                N_ALLELES, 
                N_CHR, 
                ALLELE_FREQ_1, 
                ALLELE_FREQ_2, 
                Malay_rsID.ID 
            FROM 
                Malay_Data 
            INNER JOIN 
                Malay_rsID 
            ON 
                Malay_Data.CHROM = Malay_rsID.CHROM 
                AND Malay_Data.POS = Malay_rsID.POS'''

# combinetables is defined in utils; presumably it stores the query result as table 'Malay'.
combinetables('dbase_Sqlite', 'Malay',  query)

In [None]:
# Indian
# Attach the rsID to every allele-frequency row.
# A position is only unique within one chromosome, so the join has to match
# CHROM as well as POS; joining on POS alone pairs rows from different
# chromosomes that happen to share a coordinate.
# NOTE(review): assumes CHROM is stored in the same form in Indian_Data and
# Indian_rsID (e.g. both 1, not 1 vs chr1) - confirm before running.

query = '''SELECT 
                Indian_Data.CHROM, 
                Indian_Data.POS, 
                N_ALLELES, 
                N_CHR, 
                ALLELE_FREQ_1, 
                ALLELE_FREQ_2, 
                Indian_rsID.ID 
            FROM 
                Indian_Data 
            INNER JOIN 
                Indian_rsID 
            ON 
                Indian_Data.CHROM = Indian_rsID.CHROM 
                AND Indian_Data.POS = Indian_rsID.POS'''

# combinetables is defined in utils; presumably it stores the query result as table 'Indian'.
combinetables('dbase_Sqlite', 'Indian',  query)


In [5]:
# Chinese
# Attach the rsID to every allele-frequency row.
# A position is only unique within one chromosome, so the join has to match
# CHROM as well as POS; joining on POS alone pairs rows from different
# chromosomes that happen to share a coordinate.

query = '''SELECT 
                Chinese_Data.CHROM, 
                Chinese_Data.POS, 
                N_ALLELES, 
                N_CHR, 
                ALLELE_FREQ_1, 
                ALLELE_FREQ_2, 
                ALLELE_FREQ_3,
                ALLELE_FREQ_4,
                Chinese_rsID.ID 
            FROM 
                Chinese_Data 
            INNER JOIN 
                Chinese_rsID 
            ON 
                Chinese_Data.CHROM = Chinese_rsID.CHROM 
                AND Chinese_Data.POS = Chinese_rsID.POS'''

# combinetables is defined in utils; presumably it stores the query result as table 'Chinese'.
combinetables('dbase_Sqlite', 'Chinese',  query)

database table did not exist


## 5. Drop unnecessary race_Data and race_rsID tables

In [None]:
# Clear the six per-population staging tables: <population>_Data first, then <population>_rsID.
# cleardb is defined in utils; presumably it drops the named table.
for population in ('Malay', 'Indian', 'Chinese'):
    for tableSuffix in ('_Data', '_rsID'):
        cleardb('dbase_Sqlite', population + tableSuffix)