In [4]:
from pyspark.sql import SparkSession

#start session 
spark_session = SparkSession.builder.appName("VCF_test_FL")\
.config ("spark.sql.shuffle.partitions", "50")\
.config("spark.driver.maxResultSize","5g")\
.config ("spark.sql.execution.arrow.enabled", "true")\
.getOrCreate()

spark_context = spark_session.sparkContext

In [5]:
vcf = spark_context.textFile("File:///home/ubuntu/test.vcf")
vcf.take(20)

['t=VCFv4.0',
 '##fileDate=20090805',
 '##source=myImputationProgramV3.1',
 '##reference=1000GenomesPilot-NCBI36',
 '##phasing=partial',
 '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
 '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
 '##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">',
 '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
 '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
 '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
 '##FILTER=<ID=q10,Description="Quality below 10">',
 '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
 '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
 '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
 '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
 '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
 '#CHROM POS   

In [6]:
def preProcessVCF(file):
    file = file.filter(lambda line: not line.startswith('##'))
    file = file.filter(lambda line: not line.startswith('t=VCF'))
    #file = file.map(lambda line: line.replace(" ","-"))
    file = file.map(lambda line: line.split())
    return file

vcf_processed = preProcessVCF(vcf)
vcf_processed.take(20)

[['#CHROM',
  'POS',
  'ID',
  'REF',
  'ALT',
  'QUAL',
  'FILTER',
  'INFO',
  'FORMAT',
  'NA00001',
  'NA00002',
  'NA00003'],
 ['20',
  '14370',
  'rs6054257',
  'G',
  'A',
  '29',
  'PASS',
  'NS=3;DP=14;AF=0.5;DB;H2',
  'GT:GQ:DP:HQ',
  '0|0:48:1:51,51',
  '1|0:48:8:51,51',
  '1/1:43:5:.,.'],
 ['20',
  '17330',
  '.',
  'T',
  'A',
  '3',
  'q10',
  'NS=3;DP=11;AF=0.017',
  'GT:GQ:DP:HQ',
  '0|0:49:3:58,50',
  '0|1:3:5:65,3',
  '0/0:41:3'],
 ['20',
  '1110696',
  'rs6040355',
  'A',
  'G,T',
  '67',
  'PASS',
  'NS=2;DP=10;AF=0.333,0.667;AA=T;DB',
  'GT:GQ:DP:HQ',
  '1|2:21:6:23,27',
  '2|1:2:0:18,2',
  '2/2:35:4'],
 ['20',
  '1230237',
  '.',
  'T',
  '.',
  '47',
  'PASS',
  'NS=3;DP=13;AA=T',
  'GT:GQ:DP:HQ',
  '0|0:54:7:56,60',
  '0|0:48:4:51,51',
  '0/0:61:2'],
 ['20',
  '1234567',
  'microsat1',
  'GTCT',
  'G,GTACT',
  '50',
  'PASS',
  'NS=3;DP=9;AA=G',
  'GT:GQ:DP',
  '0/1:35:4',
  '0/2:17:2',
  '1/1:40:3']]

In [7]:
df = vcf_processed.toDF(schema = vcf_processed.first())

df.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- NA00001: string (nullable = true)
 |-- NA00002: string (nullable = true)
 |-- NA00003: string (nullable = true)



In [11]:
import pyspark.sql

df = df.filter(df["#CHROM"]!="#CHROM")
df = df.withColumn("POS", df["POS"].cast("float"))

df.printSchema()
df.select("#CHROM","POS").show()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: float (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- NA00001: string (nullable = true)
 |-- NA00002: string (nullable = true)
 |-- NA00003: string (nullable = true)

+------+---------+
|#CHROM|      POS|
+------+---------+
|    20|  14370.0|
|    20|  17330.0|
|    20|1110696.0|
|    20|1230237.0|
|    20|1234567.0|
+------+---------+



In [34]:
from pyspark.sql import functions as F

for i in range(50000,248000000,50000):
    df.filter(df["POS"] > i-50000).show()


+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+------------+
|#CHROM|      POS|       ID| REF|    ALT|QUAL|FILTER|                INFO|     FORMAT|       NA00001|       NA00002|     NA00003|
+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+------------+
|    20|  14370.0|rs6054257|   G|      A|  29|  PASS|NS=3;DP=14;AF=0.5...|GT:GQ:DP:HQ|0|0:48:1:51,51|1|0:48:8:51,51|1/1:43:5:.,.|
|    20|  17330.0|        .|   T|      A|   3|   q10| NS=3;DP=11;AF=0.017|GT:GQ:DP:HQ|0|0:49:3:58,50|  0|1:3:5:65,3|    0/0:41:3|
|    20|1110696.0|rs6040355|   A|    G,T|  67|  PASS|NS=2;DP=10;AF=0.3...|GT:GQ:DP:HQ|1|2:21:6:23,27|  2|1:2:0:18,2|    2/2:35:4|
|    20|1230237.0|        .|   T|      .|  47|  PASS|     NS=3;DP=13;AA=T|GT:GQ:DP:HQ|0|0:54:7:56,60|0|0:48:4:51,51|    0/0:61:2|
|    20|1234567.0|microsat1|GTCT|G,GTACT|  50|  PASS|      NS=3;DP=9;AA=G|   GT:GQ:DP|    

+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+--------+
|#CHROM|      POS|       ID| REF|    ALT|QUAL|FILTER|                INFO|     FORMAT|       NA00001|       NA00002| NA00003|
+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+--------+
|    20|1110696.0|rs6040355|   A|    G,T|  67|  PASS|NS=2;DP=10;AF=0.3...|GT:GQ:DP:HQ|1|2:21:6:23,27|  2|1:2:0:18,2|2/2:35:4|
|    20|1230237.0|        .|   T|      .|  47|  PASS|     NS=3;DP=13;AA=T|GT:GQ:DP:HQ|0|0:54:7:56,60|0|0:48:4:51,51|0/0:61:2|
|    20|1234567.0|microsat1|GTCT|G,GTACT|  50|  PASS|      NS=3;DP=9;AA=G|   GT:GQ:DP|      0/1:35:4|      0/2:17:2|1/1:40:3|
+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+--------+

+------+---------+---------+----+-------+----+------+--------------------+-----------+--------------+--------------+-

KeyboardInterrupt: 