# GenoSpark for Singapore population genotypic frequency

## First, generate the output files with the command syntax below:
```
$ vcftools --gzvcf chr[#].consolidate.eff.PPH.vcf.gz --freq --chr [#] --out chr[#]_analysis
$ bcftools query -f '%CHROM\t%POS\t%ID\n' chr[#].consolidate.eff.PPH.vcf.gz -o chr[#]_rsID
```

#### * You may try freqGenerator.sh to generate the above files

In [1]:
#!/usr/bin/env python3.5

__author__ = 'mdc_hk'
version = '1.0'

# Description: To build the database on the pyspark DataFrame
# Usage: -
# Example: -

In [2]:
import datetime, multiprocessing, os, re, shutil, sys, subprocess, time, logging

import pyspark.sql.types as typ
from pyspark.sql.functions import lit


# Specify schemas

schema_Freq = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    typ.StructField("N_ALLELES", typ.IntegerType(), False),
    typ.StructField("N_CHR", typ.IntegerType(), False),
    typ.StructField("ALLELE_FREQ_1", typ.StringType(), False),
    typ.StructField("ALLELE_FREQ_2", typ.StringType(), False),
])

schema_rsID = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    typ.StructField("ID", typ.StringType(), True),
])

schema_Freq_DF = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    typ.StructField("N_ALLELES", typ.IntegerType(), False),
    typ.StructField("N_CHR", typ.IntegerType(), False),
    typ.StructField("ALLELE_FREQ_1", typ.StringType(), False),
    typ.StructField("ALLELE_FREQ_2", typ.StringType(), False),
    typ.StructField("ID", typ.StringType(), True),
])


In [3]:
# Setting up File Paths and Lists

workingFolder_Indian = os.getcwd() + "/SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"
workingFolder_Malay = os.getcwd() + "/SgMalay_vcf/2012_05/snps/"

# Filing number of unique samples found in the working folder...

freqFiles_Indian = [f for f in os.listdir(workingFolder_Indian) if re.match(r'chr\d+_analysis\.frq', f)]
rsIDFiles_Indian = [f for f in os.listdir(workingFolder_Indian) if re.match(r'chr\d+_rsID', f)]
freqFiles_Malay = [f for f in os.listdir(workingFolder_Malay) if re.match(r'chr\d+_analysis\.frq', f)]
rsIDFiles_Malay = [f for f in os.listdir(workingFolder_Malay) if re.match(r'chr\d+_rsID', f)]

freqFilesID_pre = re.compile(r'(chr\d+)_analysis\.frq')
freqFilesID = []
for file in freqFiles_Indian:
    freqFilesID.append(freqFilesID_pre.findall(file))

print(freqFilesID)

[['chr11'], ['chr1'], ['chr10'], ['chr9'], ['chr2'], ['chr3'], ['chr4'], ['chr5'], ['chr6'], ['chr7'], ['chr8'], ['chr12'], ['chr13'], ['chr14'], ['chr15'], ['chr16'], ['chr17'], ['chr18'], ['chr19'], ['chr20'], ['chr21'], ['chr22']]


In [4]:
# Obtain dataset

# suffixFreqID = ['_analysis.frq', '_rsID']
freqDF_Indian = spark.createDataFrame([], schema_Freq_DF)
freqDF_Malay = spark.createDataFrame([], schema_Freq_DF)

for ID in freqFilesID:
    df1 = spark.read.csv(workingFolder_Indian + ID[0] + "_analysis.frq", header=True, schema=schema_Freq, sep='\t').alias('df1')
    df2 = spark.read.csv(workingFolder_Indian + ID[0] + "_rsID", header=False, schema=schema_rsID, sep='\t').alias('df2')
    freqChrN_working = df2.join(df1, df2.POS == df1.POS).select('df1.*','df2.ID')
    freqDF_Indian = freqDF_Indian.union(freqChrN_working)
    
for ID in freqFilesID:
    df1 = spark.read.csv(workingFolder_Malay + ID[0] + "_analysis.frq", header=True, schema=schema_Freq, sep='\t').alias('df1')
    df2 = spark.read.csv(workingFolder_Malay + ID[0] + "_rsID", header=False, schema=schema_rsID, sep='\t').alias('df2')
    freqChrN_working = df2.join(df1, df2.POS == df1.POS).select('df1.*','df2.ID')
    freqDF_Malay = freqDF_Malay.union(freqChrN_working)
    

In [7]:
freqDF_Indian_working = freqDF_Indian.withColumn("ETHNIC", lit("Indian"))
freqDF_Malay_working = freqDF_Malay.withColumn("ETHNIC", lit("Malay"))
freqDF_working = freqDF_Malay_working.union(freqDF_Indian_working)

In [6]:
freqDF_working.show(10)
freqDF_working.select("ETHNIC", "ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2").filter("ID == 'rs4109479'").show()

+-----+------+---------+-----+-------------+-------------+-----------+------+
|CHROM|   POS|N_ALLELES|N_CHR|ALLELE_FREQ_1|ALLELE_FREQ_2|         ID|ETHNIC|
+-----+------+---------+-----+-------------+-------------+-----------+------+
|   11|103739|        2|   30|   T:0.933333|  C:0.0666667|          .|Indian|
|   11|104469|        2|   14|   G:0.714286|   A:0.285714| SSPM_MATCH|Indian|
|   11|105023|        2|   42|   G:0.928571|  A:0.0714286|          .|Indian|
|   11|105073|        2|   46|   G:0.956522|  A:0.0434783|          .|Indian|
|   11|111159|        2|   68|   T:0.897059|   C:0.102941|          .|Indian|
|   11|124986|        2|   24|   G:0.583333|   A:0.416667| SSPM_MATCH|Indian|
|   11|150695|        2|   42|   C:0.880952|   T:0.119048| SSPM_MATCH|Indian|
|   11|153453|        2|   46|   A:0.152174|   G:0.847826|rs187516525|Indian|
|   11|158635|        2|   46|  C:0.0434783|   T:0.956522|  rs4109479|Indian|
|   11|191770|        2|   72|   C:0.930556|  T:0.0694444| SSPM_

In [7]:
# spark.sql("select * from freqDF_working").show(10)

In [8]:
print('Count of rows: {0}'.format(freqDF_working.count()))
freqDF_working.printSchema()

Py4JJavaError: An error occurred while calling o1681.count.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange SinglePartition
+- *HashAggregate(keys=[], functions=[partial_count(1)], output=[count#3210L])
   +- Union
      :- *Project
      :  +- Scan ExistingRDD[CHROM#15,POS#16,N_ALLELES#17,N_CHR#18,ALLELE_FREQ_1#19,ALLELE_FREQ_2#20,ID#21]
      :- *Project
      :  +- *BroadcastHashJoin [POS#1563], [POS#1549], Inner, BuildRight
      :     :- *Project [POS#1563]
      :     :  +- *Filter isnotnull(POS#1563)
      :     :     +- *FileScan csv [POS#1563] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1549]
      :           +- *Filter isnotnull(POS#1549)
      :              +- *FileScan csv [POS#1549] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1632], [POS#1618], Inner, BuildRight
      :     :- *Project [POS#1632]
      :     :  +- *Filter isnotnull(POS#1632)
      :     :     +- *FileScan csv [POS#1632] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1618]
      :           +- *Filter isnotnull(POS#1618)
      :              +- *FileScan csv [POS#1618] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1701], [POS#1687], Inner, BuildRight
      :     :- *Project [POS#1701]
      :     :  +- *Filter isnotnull(POS#1701)
      :     :     +- *FileScan csv [POS#1701] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1687]
      :           +- *Filter isnotnull(POS#1687)
      :              +- *FileScan csv [POS#1687] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1770], [POS#1756], Inner, BuildRight
      :     :- *Project [POS#1770]
      :     :  +- *Filter isnotnull(POS#1770)
      :     :     +- *FileScan csv [POS#1770] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1756]
      :           +- *Filter isnotnull(POS#1756)
      :              +- *FileScan csv [POS#1756] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1839], [POS#1825], Inner, BuildRight
      :     :- *Project [POS#1839]
      :     :  +- *Filter isnotnull(POS#1839)
      :     :     +- *FileScan csv [POS#1839] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1825]
      :           +- *Filter isnotnull(POS#1825)
      :              +- *FileScan csv [POS#1825] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1908], [POS#1894], Inner, BuildRight
      :     :- *Project [POS#1908]
      :     :  +- *Filter isnotnull(POS#1908)
      :     :     +- *FileScan csv [POS#1908] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1894]
      :           +- *Filter isnotnull(POS#1894)
      :              +- *FileScan csv [POS#1894] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1977], [POS#1963], Inner, BuildRight
      :     :- *Project [POS#1977]
      :     :  +- *Filter isnotnull(POS#1977)
      :     :     +- *FileScan csv [POS#1977] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1963]
      :           +- *Filter isnotnull(POS#1963)
      :              +- *FileScan csv [POS#1963] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2046], [POS#2032], Inner, BuildRight
      :     :- *Project [POS#2046]
      :     :  +- *Filter isnotnull(POS#2046)
      :     :     +- *FileScan csv [POS#2046] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2032]
      :           +- *Filter isnotnull(POS#2032)
      :              +- *FileScan csv [POS#2032] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2115], [POS#2101], Inner, BuildRight
      :     :- *Project [POS#2115]
      :     :  +- *Filter isnotnull(POS#2115)
      :     :     +- *FileScan csv [POS#2115] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2101]
      :           +- *Filter isnotnull(POS#2101)
      :              +- *FileScan csv [POS#2101] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2184], [POS#2170], Inner, BuildRight
      :     :- *Project [POS#2184]
      :     :  +- *Filter isnotnull(POS#2184)
      :     :     +- *FileScan csv [POS#2184] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2170]
      :           +- *Filter isnotnull(POS#2170)
      :              +- *FileScan csv [POS#2170] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2253], [POS#2239], Inner, BuildRight
      :     :- *Project [POS#2253]
      :     :  +- *Filter isnotnull(POS#2253)
      :     :     +- *FileScan csv [POS#2253] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2239]
      :           +- *Filter isnotnull(POS#2239)
      :              +- *FileScan csv [POS#2239] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2322], [POS#2308], Inner, BuildRight
      :     :- *Project [POS#2322]
      :     :  +- *Filter isnotnull(POS#2322)
      :     :     +- *FileScan csv [POS#2322] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2308]
      :           +- *Filter isnotnull(POS#2308)
      :              +- *FileScan csv [POS#2308] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2391], [POS#2377], Inner, BuildRight
      :     :- *Project [POS#2391]
      :     :  +- *Filter isnotnull(POS#2391)
      :     :     +- *FileScan csv [POS#2391] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2377]
      :           +- *Filter isnotnull(POS#2377)
      :              +- *FileScan csv [POS#2377] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2460], [POS#2446], Inner, BuildRight
      :     :- *Project [POS#2460]
      :     :  +- *Filter isnotnull(POS#2460)
      :     :     +- *FileScan csv [POS#2460] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2446]
      :           +- *Filter isnotnull(POS#2446)
      :              +- *FileScan csv [POS#2446] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2529], [POS#2515], Inner, BuildRight
      :     :- *Project [POS#2529]
      :     :  +- *Filter isnotnull(POS#2529)
      :     :     +- *FileScan csv [POS#2529] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2515]
      :           +- *Filter isnotnull(POS#2515)
      :              +- *FileScan csv [POS#2515] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2598], [POS#2584], Inner, BuildRight
      :     :- *Project [POS#2598]
      :     :  +- *Filter isnotnull(POS#2598)
      :     :     +- *FileScan csv [POS#2598] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2584]
      :           +- *Filter isnotnull(POS#2584)
      :              +- *FileScan csv [POS#2584] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2667], [POS#2653], Inner, BuildRight
      :     :- *Project [POS#2667]
      :     :  +- *Filter isnotnull(POS#2667)
      :     :     +- *FileScan csv [POS#2667] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2653]
      :           +- *Filter isnotnull(POS#2653)
      :              +- *FileScan csv [POS#2653] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2736], [POS#2722], Inner, BuildRight
      :     :- *Project [POS#2736]
      :     :  +- *Filter isnotnull(POS#2736)
      :     :     +- *FileScan csv [POS#2736] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2722]
      :           +- *Filter isnotnull(POS#2722)
      :              +- *FileScan csv [POS#2722] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2805], [POS#2791], Inner, BuildRight
      :     :- *Project [POS#2805]
      :     :  +- *Filter isnotnull(POS#2805)
      :     :     +- *FileScan csv [POS#2805] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2791]
      :           +- *Filter isnotnull(POS#2791)
      :              +- *FileScan csv [POS#2791] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2874], [POS#2860], Inner, BuildRight
      :     :- *Project [POS#2874]
      :     :  +- *Filter isnotnull(POS#2874)
      :     :     +- *FileScan csv [POS#2874] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2860]
      :           +- *Filter isnotnull(POS#2860)
      :              +- *FileScan csv [POS#2860] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#2943], [POS#2929], Inner, BuildRight
      :     :- *Project [POS#2943]
      :     :  +- *Filter isnotnull(POS#2943)
      :     :     +- *FileScan csv [POS#2943] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2929]
      :           +- *Filter isnotnull(POS#2929)
      :              +- *FileScan csv [POS#2929] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#3012], [POS#2998], Inner, BuildRight
      :     :- *Project [POS#3012]
      :     :  +- *Filter isnotnull(POS#3012)
      :     :     +- *FileScan csv [POS#3012] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#2998]
      :           +- *Filter isnotnull(POS#2998)
      :              +- *FileScan csv [POS#2998] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgMalay_vcf/2012_05/snp..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- Scan ExistingRDD[CHROM#0,POS#1,N_ALLELES#2,N_CHR#3,ALLELE_FREQ_1#4,ALLELE_FREQ_2#5,ID#6]
      :- *Project
      :  +- *BroadcastHashJoin [POS#45], [POS#31], Inner, BuildRight
      :     :- *Project [POS#45]
      :     :  +- *Filter isnotnull(POS#45)
      :     :     +- *FileScan csv [POS#45] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#31]
      :           +- *Filter isnotnull(POS#31)
      :              +- *FileScan csv [POS#31] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#114], [POS#100], Inner, BuildRight
      :     :- *Project [POS#114]
      :     :  +- *Filter isnotnull(POS#114)
      :     :     +- *FileScan csv [POS#114] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#100]
      :           +- *Filter isnotnull(POS#100)
      :              +- *FileScan csv [POS#100] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#183], [POS#169], Inner, BuildRight
      :     :- *Project [POS#183]
      :     :  +- *Filter isnotnull(POS#183)
      :     :     +- *FileScan csv [POS#183] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#169]
      :           +- *Filter isnotnull(POS#169)
      :              +- *FileScan csv [POS#169] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#252], [POS#238], Inner, BuildRight
      :     :- *Project [POS#252]
      :     :  +- *Filter isnotnull(POS#252)
      :     :     +- *FileScan csv [POS#252] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#238]
      :           +- *Filter isnotnull(POS#238)
      :              +- *FileScan csv [POS#238] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#321], [POS#307], Inner, BuildRight
      :     :- *Project [POS#321]
      :     :  +- *Filter isnotnull(POS#321)
      :     :     +- *FileScan csv [POS#321] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#307]
      :           +- *Filter isnotnull(POS#307)
      :              +- *FileScan csv [POS#307] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#390], [POS#376], Inner, BuildRight
      :     :- *Project [POS#390]
      :     :  +- *Filter isnotnull(POS#390)
      :     :     +- *FileScan csv [POS#390] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#376]
      :           +- *Filter isnotnull(POS#376)
      :              +- *FileScan csv [POS#376] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#459], [POS#445], Inner, BuildRight
      :     :- *Project [POS#459]
      :     :  +- *Filter isnotnull(POS#459)
      :     :     +- *FileScan csv [POS#459] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#445]
      :           +- *Filter isnotnull(POS#445)
      :              +- *FileScan csv [POS#445] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#528], [POS#514], Inner, BuildRight
      :     :- *Project [POS#528]
      :     :  +- *Filter isnotnull(POS#528)
      :     :     +- *FileScan csv [POS#528] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#514]
      :           +- *Filter isnotnull(POS#514)
      :              +- *FileScan csv [POS#514] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#597], [POS#583], Inner, BuildRight
      :     :- *Project [POS#597]
      :     :  +- *Filter isnotnull(POS#597)
      :     :     +- *FileScan csv [POS#597] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#583]
      :           +- *Filter isnotnull(POS#583)
      :              +- *FileScan csv [POS#583] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#666], [POS#652], Inner, BuildRight
      :     :- *Project [POS#666]
      :     :  +- *Filter isnotnull(POS#666)
      :     :     +- *FileScan csv [POS#666] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#652]
      :           +- *Filter isnotnull(POS#652)
      :              +- *FileScan csv [POS#652] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#735], [POS#721], Inner, BuildRight
      :     :- *Project [POS#735]
      :     :  +- *Filter isnotnull(POS#735)
      :     :     +- *FileScan csv [POS#735] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#721]
      :           +- *Filter isnotnull(POS#721)
      :              +- *FileScan csv [POS#721] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#804], [POS#790], Inner, BuildRight
      :     :- *Project [POS#804]
      :     :  +- *Filter isnotnull(POS#804)
      :     :     +- *FileScan csv [POS#804] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#790]
      :           +- *Filter isnotnull(POS#790)
      :              +- *FileScan csv [POS#790] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#873], [POS#859], Inner, BuildRight
      :     :- *Project [POS#873]
      :     :  +- *Filter isnotnull(POS#873)
      :     :     +- *FileScan csv [POS#873] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#859]
      :           +- *Filter isnotnull(POS#859)
      :              +- *FileScan csv [POS#859] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#942], [POS#928], Inner, BuildRight
      :     :- *Project [POS#942]
      :     :  +- *Filter isnotnull(POS#942)
      :     :     +- *FileScan csv [POS#942] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#928]
      :           +- *Filter isnotnull(POS#928)
      :              +- *FileScan csv [POS#928] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1011], [POS#997], Inner, BuildRight
      :     :- *Project [POS#1011]
      :     :  +- *Filter isnotnull(POS#1011)
      :     :     +- *FileScan csv [POS#1011] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#997]
      :           +- *Filter isnotnull(POS#997)
      :              +- *FileScan csv [POS#997] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1080], [POS#1066], Inner, BuildRight
      :     :- *Project [POS#1080]
      :     :  +- *Filter isnotnull(POS#1080)
      :     :     +- *FileScan csv [POS#1080] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1066]
      :           +- *Filter isnotnull(POS#1066)
      :              +- *FileScan csv [POS#1066] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1149], [POS#1135], Inner, BuildRight
      :     :- *Project [POS#1149]
      :     :  +- *Filter isnotnull(POS#1149)
      :     :     +- *FileScan csv [POS#1149] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1135]
      :           +- *Filter isnotnull(POS#1135)
      :              +- *FileScan csv [POS#1135] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1218], [POS#1204], Inner, BuildRight
      :     :- *Project [POS#1218]
      :     :  +- *Filter isnotnull(POS#1218)
      :     :     +- *FileScan csv [POS#1218] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1204]
      :           +- *Filter isnotnull(POS#1204)
      :              +- *FileScan csv [POS#1204] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1287], [POS#1273], Inner, BuildRight
      :     :- *Project [POS#1287]
      :     :  +- *Filter isnotnull(POS#1287)
      :     :     +- *FileScan csv [POS#1287] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1273]
      :           +- *Filter isnotnull(POS#1273)
      :              +- *FileScan csv [POS#1273] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1356], [POS#1342], Inner, BuildRight
      :     :- *Project [POS#1356]
      :     :  +- *Filter isnotnull(POS#1356)
      :     :     +- *FileScan csv [POS#1356] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1342]
      :           +- *Filter isnotnull(POS#1342)
      :              +- *FileScan csv [POS#1342] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :- *Project
      :  +- *BroadcastHashJoin [POS#1425], [POS#1411], Inner, BuildRight
      :     :- *Project [POS#1425]
      :     :  +- *Filter isnotnull(POS#1425)
      :     :     +- *FileScan csv [POS#1425] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      :     +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
      :        +- *Project [POS#1411]
      :           +- *Filter isnotnull(POS#1411)
      :              +- *FileScan csv [POS#1411] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
      +- *Project
         +- *BroadcastHashJoin [POS#1494], [POS#1480], Inner, BuildRight
            :- *Project [POS#1494]
            :  +- *Filter isnotnull(POS#1494)
            :     +- *FileScan csv [POS#1494] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>
            +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
               +- *Project [POS#1480]
                  +- *Filter isnotnull(POS#1480)
                     +- *FileScan csv [POS#1480] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Volumes/Samsung_T3/SgPopulationGenetics/GenoSpark/SgIndian_vcf/dataFreeze..., PartitionFilters: [], PushedFilters: [IsNotNull(POS)], ReadSchema: struct<POS:int>

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:115)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:252)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:386)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:228)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429)
	at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2429)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
	at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:201)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:123)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:248)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:126)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:197)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:82)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:155)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:36)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:68)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:155)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:88)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:209)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:155)
	at org.apache.spark.sql.execution.FileSourceScanExec.consume(DataSourceScanExec.scala:155)
	at org.apache.spark.sql.execution.FileSourceScanExec.doProduce(DataSourceScanExec.scala:361)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.FileSourceScanExec.produce(DataSourceScanExec.scala:155)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:128)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:88)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:46)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:36)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:77)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:38)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:46)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:80)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:36)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:331)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:372)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.UnionExec$$anonfun$doExecute$1.apply(basicPhysicalOperators.scala:556)
	at org.apache.spark.sql.execution.UnionExec$$anonfun$doExecute$1.apply(basicPhysicalOperators.scala:556)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.AbstractTraversable.map(Traversable.scala:104)
	at org.apache.spark.sql.execution.UnionExec.doExecute(basicPhysicalOperators.scala:556)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:252)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:386)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:88)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:124)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:115)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 35 more


In [None]:
freqDF_working.select(freqDF_working.ID, freqDF_working.ALLELE_FREQ_1, freqDF_working.ALLELE_FREQ_2).filter(freqDF_working.ID == "rs4109479").show()
freqDF_working.select("ID", "CHROM", "ALLELE_FREQ_1", "ALLELE_FREQ_2").filter("ID == 'rs4109479'").show()
spark.sql("select ID, ALLELE_FREQ_1, ALLELE_FREQ_2 from freqDF_working where ID = 'rs4109479'").show()

In [1]:
# print('Count of rows: {0}'.format(freqDF_working.count()))
# print('Count of distinct rows: {0}'.format(freqDF_working.distinct().count()))
# freqDF_working = freqDF_working.dropDuplicates()
# print('Count of rows: {0}'.format(freqDF_working.count()))
# print('Count of distinct rows: {0}'.format(freqDF_working.distinct().count()))
# print('Count of IDs: {0}'.format(freqDF_working.count()))
# print('Count of distinct IDs: {0}'.format(freqDF_working.select(freqDF_working.ID).distinct().count()))
# freqDF_working.where("ID = '.'").show(10)