# Singapore population genotypic frequency

## First, generate the per-chromosome input files with the commands below (replace `[#]` with the chromosome number):
```
$ vcftools --gzvcf chr[#].consolidate.eff.PPH.vcf.gz --freq --chr [#] --out chr[#]_analysis
$ bcftools query -f '%CHROM\t%POS\t%ID\n' chr[#].consolidate.eff.PPH.vcf.gz -o chr[#]_rsID
```

In [1]:
#!/usr/bin/env python3.4

# Notebook metadata.
__author__ = "mdc_hk"
version = "1.0"

# Description: Build the frequency database as a PySpark DataFrame.
# Usage: -
# Example: -

In [2]:
# Specify schemas
import pyspark.sql.types as typ

# Columns of the frequency table, all declared non-nullable.  The two
# ALLELE_FREQ_* columns are kept as strings (values look like "C:0.680556").
_freq_fields = [
    typ.StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("CHROM", typ.IntegerType()),
        ("POS", typ.IntegerType()),
        ("N_ALLELES", typ.IntegerType()),
        ("N_CHR", typ.IntegerType()),
        ("ALLELE_FREQ_1", typ.StringType()),
        ("ALLELE_FREQ_2", typ.StringType()),
    )
]

# The only nullable column in any of the schemas below.
_id_field = typ.StructField("ID", typ.StringType(), True)

# Schema used when reading the chr*_analysis.frq files.
schema_Freq = typ.StructType(list(_freq_fields))

# Schema used when reading the chr*_rsID files (CHROM, POS, ID).
schema_rsID = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    _id_field,
])

# Schema of the joined table: the frequency columns followed by ID.
schema_Freq_DF = typ.StructType(list(_freq_fields) + [_id_field])


In [3]:
# Setting up File Paths and Lists

import datetime, multiprocessing, os, re, shutil, sys, subprocess, time, logging

# Patterns for the per-chromosome input files.  They are applied with
# fullmatch so that look-alike names such as "chr1_analysis.frq.log" or
# "chr1_rsID.bak" are not picked up as inputs (re.match only anchors the
# start of the name, which would let a chromosome be loaded twice).
freqFilesID_pre = re.compile(r'(chr\d+)_analysis\.frq')
rsIDFiles_pre = re.compile(r'(chr\d+)_rsID')


def list_matching_files(folder, pattern):
    """Return the entries of `folder` whose entire name matches `pattern`.

    folder  -- directory to list.
    pattern -- compiled regular expression; a name is kept only when the
               pattern matches it from the first to the last character.

    The entries are returned in the order os.listdir() yields them.
    """
    return [name for name in os.listdir(folder) if pattern.fullmatch(name)]


workingFolder = os.getcwd()

# Frequency and rsID files found in the working folder.
freqFiles = list_matching_files(workingFolder, freqFilesID_pre)
rsIDFiles = list_matching_files(workingFolder, rsIDFiles_pre)

# One single-element list per frequency file, e.g. [['chr11'], ['chr1'], ...];
# the loading cell reads the chromosome prefix as element [0] of each entry.
freqFilesID = [freqFilesID_pre.findall(name) for name in freqFiles]

print(freqFilesID)

[['chr11'], ['chr1'], ['chr10'], ['chr9'], ['chr2'], ['chr3'], ['chr4'], ['chr5'], ['chr6'], ['chr7'], ['chr8'], ['chr12'], ['chr13'], ['chr14'], ['chr15'], ['chr16'], ['chr17'], ['chr18'], ['chr19'], ['chr20'], ['chr21'], ['chr22']]


In [5]:
# Obtain dataset


def read_chrom_with_rsid(chrom_prefix):
    """Read one chromosome's frequency and rsID files and join them on POS.

    chrom_prefix -- file-name prefix such as 'chr11'; the files read are
                    '<prefix>_analysis.frq' (with header) and '<prefix>_rsID'
                    (no header), both tab-separated.

    Returns a DataFrame with every frequency column followed by the rsID
    table's ID column.
    """
    freq_part = spark.read.csv(chrom_prefix + "_analysis.frq", header=True, schema=schema_Freq, sep='\t').alias('df1')
    rsid_part = spark.read.csv(chrom_prefix + "_rsID", header=False, schema=schema_rsID, sep='\t').alias('df2')
    return rsid_part.join(freq_part, rsid_part.POS == freq_part.POS).select('df1.*', 'df2.ID')


# Start from an empty frame with the joined schema and append one chromosome
# at a time.  Each entry of freqFilesID is a single-element list.
freqDF = spark.createDataFrame([], schema_Freq_DF)
for prefix_match in freqFilesID:
    freqDF = freqDF.union(read_chrom_with_rsid(prefix_match[0]))

# Spot check: look up a single variant in the combined table.
freqDF.select("ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2").filter("ID == 'rs4109479'").show()

+---------+-----+-------------+-------------+
|       ID|CHROM|ALLELE_FREQ_1|ALLELE_FREQ_2|
+---------+-----+-------------+-------------+
|rs4109479|   11|  C:0.0434783|   T:0.956522|
+---------+-----+-------------+-------------+



In [224]:
# Set File Paths
# Each line binds the frequency file and the matching rsID file of one chromosome.

filePathFreqIndianChr1, filePathFreqIndianChr1_rsID = "chr1_analysis.frq", "chr1_rsID"
filePathFreqIndianChr2, filePathFreqIndianChr2_rsID = "chr2_analysis.frq", "chr2_rsID"
filePathFreqIndianChr9, filePathFreqIndianChr9_rsID = "chr9_analysis.frq", "chr9_rsID"
filePathFreqIndianChr10, filePathFreqIndianChr10_rsID = "chr10_analysis.frq", "chr10_rsID"
filePathFreqIndianChr11, filePathFreqIndianChr11_rsID = "chr11_analysis.frq", "chr11_rsID"

In [226]:
# Obtain dataset


def read_freq_and_rsid(freq_path, rsid_path, view_name):
    """Read one chromosome's frequency and rsID tables and register temp views.

    freq_path -- tab-separated frequency file with a header row.
    rsid_path -- tab-separated rsID file without a header row.
    view_name -- temp-view name for the frequency table; the rsID table is
                 registered as '<view_name>_rsID'.

    Returns the pair (frequency DataFrame, rsID DataFrame).

    The files are read with schema_Freq and schema_rsID, the schemas defined
    in the schema cell of this notebook.  The names previously used here
    (schema_FreqIndianChr, schema_FreqIndianChr_rsID) are not defined anywhere
    in the notebook, so the cell failed on a fresh kernel.
    """
    freq_table = spark.read.csv(freq_path, header=True, schema=schema_Freq, sep='\t')
    freq_table.createOrReplaceTempView(view_name)
    rsid_table = spark.read.csv(rsid_path, header=False, schema=schema_rsID, sep='\t')
    rsid_table.createOrReplaceTempView(view_name + "_rsID")
    return freq_table, rsid_table


freqIndianChr1, freqIndianChr1_rsID = read_freq_and_rsid(
    filePathFreqIndianChr1, filePathFreqIndianChr1_rsID, "freqIndianChr1")

freqIndianChr2, freqIndianChr2_rsID = read_freq_and_rsid(
    filePathFreqIndianChr2, filePathFreqIndianChr2_rsID, "freqIndianChr2")

freqIndianChr9, freqIndianChr9_rsID = read_freq_and_rsid(
    filePathFreqIndianChr9, filePathFreqIndianChr9_rsID, "freqIndianChr9")

freqIndianChr10, freqIndianChr10_rsID = read_freq_and_rsid(
    filePathFreqIndianChr10, filePathFreqIndianChr10_rsID, "freqIndianChr10")

freqIndianChr11, freqIndianChr11_rsID = read_freq_and_rsid(
    filePathFreqIndianChr11, filePathFreqIndianChr11_rsID, "freqIndianChr11")

In [212]:
# Preview 10 rows of the chromosome 10 frequency table, first through the
# DataFrame and then through its temp view, followed by the chromosome 11
# rsID table.
freqIndianChr10.show(n=10)
chr10_from_view = spark.sql("select * from freqIndianChr10")
chr10_from_view.show(n=10)
freqIndianChr11_rsID.show(n=10)

+-----+------+---------+-----+-------------+-------------+
|CHROM|   POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2|
+-----+------+---------+-----+-------------+-------------+
|   10| 94426|        2|   72|   C:0.680556|   T:0.319444|
|   10|119679|        2|   72|   G:0.986111|  T:0.0138889|
|   10|121816|        2|   72|   T:0.708333|   C:0.291667|
|   10|122217|        2|   72|   G:0.930556|  T:0.0694444|
|   10|124381|        2|   72|   A:0.680556|   G:0.319444|
|   10|124490|        2|   72|   C:0.986111|  T:0.0138889|
|   10|124703|        2|   72|   C:0.930556|  T:0.0694444|
|   10|124729|        2|   72|   A:0.930556|  C:0.0694444|
|   10|125008|        2|   72|   G:0.333333|   C:0.666667|
|   10|125009|        2|   72|   G:0.333333|   A:0.666667|
+-----+------+---------+-----+-------------+-------------+
only showing top 10 rows

+-----+------+---------+-----+-------------+-------------+
|CHROM|   POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2|
+-----+------+---------+-----+

In [228]:
def attach_rsid(freq_table, rsid_table):
    """Join a frequency table with its rsID table on POS.

    Returns every column of the frequency table followed by the ID column of
    the rsID table.
    """
    freq_side = freq_table.alias('df1')
    rsid_side = rsid_table.alias('df2')
    return rsid_side.join(freq_side, rsid_side.POS == freq_side.POS).select('df1.*', 'df2.ID')


# One joined ("working") table per chromosome.
freqIndianChr1_working = attach_rsid(freqIndianChr1, freqIndianChr1_rsID)
freqIndianChr2_working = attach_rsid(freqIndianChr2, freqIndianChr2_rsID)
freqIndianChr9_working = attach_rsid(freqIndianChr9, freqIndianChr9_rsID)
freqIndianChr10_working = attach_rsid(freqIndianChr10, freqIndianChr10_rsID)
freqIndianChr11_working = attach_rsid(freqIndianChr11, freqIndianChr11_rsID)

In [233]:
# Stack the per-chromosome joined tables into one table, printing the running
# row count after each chromosome is appended.  DataFrame.union is used in
# place of unionAll, which is deprecated since Spark 2.0; both append rows by
# column position and keep duplicates.
freqIndian = freqIndianChr1_working
for next_chrom in (
    freqIndianChr2_working,
    freqIndianChr9_working,
    freqIndianChr10_working,
    freqIndianChr11_working,
):
    freqIndian = freqIndian.union(next_chrom)
    print('Count of rows: {0}'.format(freqIndian.count()))
freqIndian.printSchema()

Count of rows: 1613116
Count of rows: 2041955
Count of rows: 2561843
Count of rows: 3055143
root
 |-- CHROM: integer (nullable = true)
 |-- POS: integer (nullable = true)
 |-- N_ALLELES: integer (nullable = true)
 |-- N_CHR: integer (nullable = true)
 |-- ALLELE_FREQ_1: string (nullable = true)
 |-- ALLELE_FREQ_2: string (nullable = true)
 |-- ID: string (nullable = true)



In [234]:
# Preview the first 10 rows of the combined table.
freqIndian.show(10)

+-----+-------+---------+-----+-------------+-------------+-----------+
|CHROM|    POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2|         ID|
+-----+-------+---------+-----+-------------+-------------+-----------+
|    1| 843684|        2|   72|   G:0.986111|  A:0.0138889|          .|
|    1| 894670|        2|   72|   G:0.986111|  C:0.0138889|          .|
|    1| 909073|        2|   72|   C:0.513889|   T:0.486111|  rs3892467|
|    1| 923459|        2|   72|          A:0|          G:1|  rs9442609|
|    1|1015817|        2|   72|   G:0.138889|   A:0.861111| rs12746483|
|    1|1129920|        2|   72|      G:0.875|      A:0.125| rs12060422|
|    1|1138411|        2|   72|   C:0.986111|  T:0.0138889|          .|
|    1|1144696|        2|   72|      T:0.875|      C:0.125|rs111842318|
|    1|1226757|        2|   72|   G:0.888889|   A:0.111111|  rs6690013|
|    1|1272497|        2|   72|  A:0.0694444|   G:0.930556|   rs307372|
+-----+-------+---------+-----+-------------+-------------+-----

In [235]:
# Look up rs4109479 in the chromosome 11 joined table in three ways:
# Column objects, string expressions, and SQL.
chr11_joined = freqIndianChr11_working

chr11_joined.select(
    chr11_joined.ID, chr11_joined.ALLELE_FREQ_1, chr11_joined.ALLELE_FREQ_2
).filter(chr11_joined.ID == "rs4109479").show()

chr11_joined.select("ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2").filter("ID == 'rs4109479'").show()

# The SQL query reads a temp view, which has to be registered first; no other
# cell registers it, so without this line the query fails on a fresh kernel.
chr11_joined.createOrReplaceTempView("freqChr11Indian_working")
spark.sql("select ID, ALLELE_FREQ_1, ALLELE_FREQ_2 from freqChr11Indian_working where ID = 'rs4109479'").show()

+---------+-------------+-------------+
|       ID|ALLELE_FREQ_1|ALLELE_FREQ_2|
+---------+-------------+-------------+
|rs4109479|  C:0.0434783|   T:0.956522|
+---------+-------------+-------------+

+---------+-----+-------------+-------------+
|       ID|CHROM|ALLELE_FREQ_1|ALLELE_FREQ_2|
+---------+-----+-------------+-------------+
|rs4109479|   11|  C:0.0434783|   T:0.956522|
+---------+-----+-------------+-------------+

+---------+-------------+-------------+
|       ID|ALLELE_FREQ_1|ALLELE_FREQ_2|
+---------+-------------+-------------+
|rs4109479|  C:0.0434783|   T:0.956522|
+---------+-------------+-------------+



In [197]:
# Duplicate checks on the chromosome 11 joined table.
# The input is freqIndianChr11_working, the name the join cell defines; the
# cell previously read freqChr11Indian_working before anything assigned it.
# The input is never reassigned here, so the cell can be re-run safely.
chr11_joined = freqIndianChr11_working

print('Count of rows: {0}'.format(chr11_joined.count()))
print('Count of distinct rows: {0}'.format(chr11_joined.distinct().count()))

# Deduplicated copy, stored under the name this cell has always assigned.
freqChr11Indian_working = chr11_joined.dropDuplicates()
print('Count of rows: {0}'.format(freqChr11Indian_working.count()))
print('Count of distinct rows: {0}'.format(freqChr11Indian_working.distinct().count()))

# Total ID values (one per row) versus distinct ID values.
print('Count of IDs: {0}'.format(freqChr11Indian_working.select(freqChr11Indian_working.ID).count()))
print('Count of distinct IDs: {0}'.format(freqChr11Indian_working.select(freqChr11Indian_working.ID).distinct().count()))

# Rows whose ID is the placeholder '.'.
freqChr11Indian_working.where("ID = '.'").show(10)

Count of rows: 493300
Count of distinct rows: 493300
Count of rows: 493300
Count of distinct rows: 493300
Count of IDs: 493300
Count of distinct IDs: 383063
+-----+-------+---------+-----+-------------+-------------+---+
|CHROM|    POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2| ID|
+-----+-------+---------+-----+-------------+-------------+---+
|   11| 264963|        2|   72|   T:0.986111|  A:0.0138889|  .|
|   11|1260861|        2|   72|   C:0.986111|  T:0.0138889|  .|
|   11|1842597|        2|   72|   G:0.986111|  A:0.0138889|  .|
|   11|1941635|        2|   72|   T:0.986111|  C:0.0138889|  .|
|   11|2001157|        2|   72|   T:0.986111|  C:0.0138889|  .|
|   11|2367314|        2|   72|   G:0.972222|  A:0.0277778|  .|
|   11|2462424|        2|   72|   C:0.986111|  T:0.0138889|  .|
|   11|2828932|        2|   72|   C:0.986111|  T:0.0138889|  .|
|   11|2930639|        2|   72|   G:0.986111|  C:0.0138889|  .|
|   11|3823904|        2|   72|   C:0.972222|  A:0.0277778|  .|
+-----+----