# GenoSpark for Singapore population genotypic frequency

## First, generate the per-chromosome input files with the commands below (replace `[#]` with the chromosome number):
```
$ vcftools --gzvcf chr[#].consolidate.eff.PPH.vcf.gz --freq --chr [#] --out chr[#]_analysis
$ bcftools query -f '%CHROM\t%POS\t%ID\n' chr[#].consolidate.eff.PPH.vcf.gz -o chr[#]_rsID
```

#### * Alternatively, you may use freqGenerator.sh to generate the above files

In [1]:
#!/usr/bin/env python3.5

# Description: Build the allele-frequency database as PySpark DataFrames
# Usage: -
# Example: -

# Notebook metadata
__author__ = "mdc_hk"
version = "1.0"

In [2]:
import datetime, multiprocessing, os, re, shutil, sys, subprocess, time, logging

import pyspark.sql.types as typ
from pyspark.sql.functions import lit


# Schemas for the tab-separated input files and for the joined result

# Columns read from the chr*_analysis.frq files
schema_Freq = (
    typ.StructType()
    .add("CHROM", typ.IntegerType(), False)
    .add("POS", typ.IntegerType(), False)
    .add("N_ALLELES", typ.IntegerType(), False)
    .add("N_CHR", typ.IntegerType(), False)
    .add("ALLELE_FREQ_1", typ.StringType(), False)
    .add("ALLELE_FREQ_2", typ.StringType(), False)
)

# Columns read from the chr*_rsID files (no header row)
schema_rsID = (
    typ.StructType()
    .add("CHROM", typ.IntegerType(), False)
    .add("POS", typ.IntegerType(), False)
    .add("ID", typ.StringType(), True)
)

# Joined layout: every frequency column followed by the ID column.
# The field list is copied so that add() does not also extend schema_Freq.
schema_Freq_DF = typ.StructType(list(schema_Freq.fields)).add(
    "ID", typ.StringType(), True
)


In [3]:
# Working folders (relative to the current directory) and the per-chromosome
# files found in each of them

workingFolder_Indian = os.getcwd() + "/SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"
workingFolder_Malay = os.getcwd() + "/SgMalay_vcf/2012_05/snps/"

# The capture group of this pattern is the chromosome label, e.g. "chr11"
freqFilesID_pre = re.compile(r'(chr\d+)_analysis\.frq')
_rsid_name = re.compile(r'chr\d+_rsID')


def _names_matching(folder, pattern):
    """Return the entries of *folder* whose names start with *pattern*."""
    return [name for name in os.listdir(folder) if pattern.match(name)]


freqFiles_Indian = _names_matching(workingFolder_Indian, freqFilesID_pre)
rsIDFiles_Indian = _names_matching(workingFolder_Indian, _rsid_name)
freqFiles_Malay = _names_matching(workingFolder_Malay, freqFilesID_pre)
rsIDFiles_Malay = _names_matching(workingFolder_Malay, _rsid_name)

# One single-element list per Indian frequency file, e.g. [['chr11'], ['chr1'], ...]
freqFilesID = [freqFilesID_pre.findall(name) for name in freqFiles_Indian]

print(freqFilesID)

[['chr11'], ['chr1'], ['chr10'], ['chr9'], ['chr2'], ['chr3'], ['chr4'], ['chr5'], ['chr6'], ['chr7'], ['chr8'], ['chr12'], ['chr13'], ['chr14'], ['chr15'], ['chr16'], ['chr17'], ['chr18'], ['chr19'], ['chr20'], ['chr21'], ['chr22']]


In [4]:
# Obtain dataset


def _load_population_freq(folder, chrom_ids):
    """Build one DataFrame of allele frequencies plus IDs for a population.

    For each chromosome label in *chrom_ids* this reads
    ``<folder><label>_analysis.frq`` (with header) and ``<folder><label>_rsID``
    (no header), joins them on POS, and appends the result to a running
    union that starts from an empty DataFrame with schema ``schema_Freq_DF``.

    Relies on the ``spark`` session object being available in the running
    environment; it is not created in this file.

    :param folder: folder path ending with "/", used as a string prefix.
    :param chrom_ids: iterable of chromosome labels such as "chr11".
    :return: DataFrame with the frequency columns followed by ID.
    """
    combined = spark.createDataFrame([], schema_Freq_DF)
    for chrom in chrom_ids:
        freq = spark.read.csv(folder + chrom + "_analysis.frq", header=True,
                              schema=schema_Freq, sep='\t').alias('df1')
        rsid = spark.read.csv(folder + chrom + "_rsID", header=False,
                              schema=schema_rsID, sep='\t').alias('df2')
        # Both files describe the same single chromosome, so POS alone is the
        # join key. union() matches columns by position, hence the explicit
        # column order in select().
        per_chrom = rsid.join(freq, rsid.POS == freq.POS).select('df1.*', 'df2.ID')
        combined = combined.union(per_chrom)
    return combined


# Each population is loaded using the chromosome labels found in its OWN
# folder; previously the Malay files were looked up with the labels derived
# from the Indian folder listing.
freqDF_Indian = _load_population_freq(
    workingFolder_Indian,
    [labels[0] for labels in freqFilesID],
)
freqDF_Malay = _load_population_freq(
    workingFolder_Malay,
    [freqFilesID_pre.findall(name)[0] for name in freqFiles_Malay],
)
    

In [5]:
# Tag every row with its population, then stack Malay rows on top of Indian rows
freqDF_Indian_working, freqDF_Malay_working = (
    frame.withColumn("ETHNIC", lit(label))
    for frame, label in ((freqDF_Indian, "Indian"), (freqDF_Malay, "Malay"))
)
freqDF_working = freqDF_Malay_working.union(freqDF_Indian_working)

In [6]:
# Preview the first rows of the combined table
freqDF_working.show(10)

# Look up a single variant by its ID across both populations
_lookup_columns = ("ETHNIC", "ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2")
freqDF_working.select(*_lookup_columns).filter("ID == 'rs4109479'").show()

+-----+------+---------+-----+-------------+-------------+-----------+------+
|CHROM|   POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2|         ID|ETHNIC|
+-----+------+---------+-----+-------------+-------------+-----------+------+
|   11|103739|        2|   30|   T:0.933333|  C:0.0666667|          .|Indian|
|   11|104469|        2|   14|   G:0.714286|   A:0.285714| SSPM_MATCH|Indian|
|   11|105023|        2|   42|   G:0.928571|  A:0.0714286|          .|Indian|
|   11|105073|        2|   46|   G:0.956522|  A:0.0434783|          .|Indian|
|   11|111159|        2|   68|   T:0.897059|   C:0.102941|          .|Indian|
|   11|124986|        2|   24|   G:0.583333|   A:0.416667| SSPM_MATCH|Indian|
|   11|150695|        2|   42|   C:0.880952|   T:0.119048| SSPM_MATCH|Indian|
|   11|153453|        2|   46|   A:0.152174|   G:0.847826|rs187516525|Indian|
|   11|158635|        2|   46|  C:0.0434783|   T:0.956522|  rs4109479|Indian|
|   11|191770|        2|   72|   C:0.930556|  T:0.0694444| SSPM_

In [7]:
# spark.sql("select * from freqDF").show(10)

In [None]:
# Report the size and schema of the combined table.
# The original referenced `freqDF`, which is never defined in this notebook;
# the combined DataFrame is `freqDF_working`.
print('Count of rows: {0}'.format(freqDF_working.count()))
freqDF_working.printSchema()

In [None]:
# Three equivalent ways to look up one variant: column objects, SQL-style
# filter string, and a full SQL query.
freqDF_working.select(freqDF_working.ID, freqDF_working.ALLELE_FREQ_1, freqDF_working.ALLELE_FREQ_2).filter(freqDF_working.ID == "rs4109479").show()
freqDF_working.select("ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2").filter("ID == 'rs4109479'").show()

# spark.sql() resolves table names against registered views, not Python
# variables, so the DataFrame must be registered before it can be queried.
freqDF_working.createOrReplaceTempView("freqDF_working")
spark.sql("select ID, ALLELE_FREQ_1, ALLELE_FREQ_2 from freqDF_working where ID = 'rs4109479'").show()

In [1]:
# print('Count of rows: {0}'.format(freqDF_working.count()))
# print('Count of distinct rows: {0}'.format(freqDF_working.distinct().count()))
# freqDF_working = freqDF_working.dropDuplicates()
# print('Count of rows: {0}'.format(freqDF_working.count()))
# print('Count of distinct rows: {0}'.format(freqDF_working.distinct().count()))
# print('Count of IDs: {0}'.format(freqDF_working.count()))
# print('Count of distinct IDs: {0}'.format(freqDF_working.select(freqDF_working.ID).distinct().count()))
# freqDF_working.where("ID = '.'").show(10)