In [6]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook on the standalone master.
# NOTE(review): the executorIdleTimeout setting belongs to dynamic allocation,
# which is switched off just above it, so it presumably has no effect — confirm.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.81:7077")
    .appName("nucleotide_div")
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .getOrCreate()
)

# Low-level context, used below for the RDD-based text read.
spark_context = spark_session.sparkContext

In [7]:
# Read the gzipped chromosome-1 VCF from HDFS as an RDD with one string per line,
# then show the first ten lines as a sanity check.
vcf_path = "hdfs://192.168.2.81:9000//user/LDSA/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
vcf = spark_context.textFile(vcf_path)
vcf.take(10)

['##fileformat=VCFv4.1',
 '##FILTER=<ID=PASS,Description="All filters passed">',
 '##fileDate=20150218',
 '##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz',
 '##source=1000GenomesPhase3Pipeline',
 '##contig=<ID=1,assembly=b37,length=249250621>',
 '##contig=<ID=2,assembly=b37,length=243199373>',
 '##contig=<ID=3,assembly=b37,length=198022430>',
 '##contig=<ID=4,assembly=b37,length=191154276>',
 '##contig=<ID=5,assembly=b37,length=180915260>']

In [8]:
# Remove the '##' meta-information lines and split every remaining line into
# its fields. The single '#CHROM' header line is intentionally kept: the next
# cell uses it as the column names of the DataFrame.
#
# VCF records are tab-delimited, so split on '\t' explicitly; a bare split()
# would also break on any space inside a field and shift the columns.
# (The former startswith('t=VCF') filter was removed: no VCF line starts with
# that text, and the '##fileformat=VCF...' line is already caught below.)
vcf = vcf.filter(lambda line: not line.startswith('##'))\
    .map(lambda line: line.split('\t'))

vcf.take(10)

[['#CHROM',
  'POS',
  'ID',
  'REF',
  'ALT',
  'QUAL',
  'FILTER',
  'INFO',
  'FORMAT',
  'HG00096',
  'HG00097',
  'HG00099',
  'HG00100',
  'HG00101',
  'HG00102',
  'HG00103',
  'HG00105',
  'HG00106',
  'HG00107',
  'HG00108',
  'HG00109',
  'HG00110',
  'HG00111',
  'HG00112',
  'HG00113',
  'HG00114',
  'HG00115',
  'HG00116',
  'HG00117',
  'HG00118',
  'HG00119',
  'HG00120',
  'HG00121',
  'HG00122',
  'HG00123',
  'HG00125',
  'HG00126',
  'HG00127',
  'HG00128',
  'HG00129',
  'HG00130',
  'HG00131',
  'HG00132',
  'HG00133',
  'HG00136',
  'HG00137',
  'HG00138',
  'HG00139',
  'HG00140',
  'HG00141',
  'HG00142',
  'HG00143',
  'HG00145',
  'HG00146',
  'HG00148',
  'HG00149',
  'HG00150',
  'HG00151',
  'HG00154',
  'HG00155',
  'HG00157',
  'HG00158',
  'HG00159',
  'HG00160',
  'HG00171',
  'HG00173',
  'HG00174',
  'HG00176',
  'HG00177',
  'HG00178',
  'HG00179',
  'HG00180',
  'HG00181',
  'HG00182',
  'HG00183',
  'HG00185',
  'HG00186',
  'HG00187',
  'HG00188',

In [9]:
# Turn the RDD of field lists into a DataFrame, taking the column names from
# the first record (the '#CHROM' header row).
header_row = vcf.first()
vcf = vcf.toDF(schema=header_row)

# The header row is still present as a data row; drop it by matching its own
# '#CHROM' value.
vcf = vcf.where(vcf["#CHROM"] != "#CHROM")
vcf.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- HG00096: string (nullable = true)
 |-- HG00097: string (nullable = true)
 |-- HG00099: string (nullable = true)
 |-- HG00100: string (nullable = true)
 |-- HG00101: string (nullable = true)
 |-- HG00102: string (nullable = true)
 |-- HG00103: string (nullable = true)
 |-- HG00105: string (nullable = true)
 |-- HG00106: string (nullable = true)
 |-- HG00107: string (nullable = true)
 |-- HG00108: string (nullable = true)
 |-- HG00109: string (nullable = true)
 |-- HG00110: string (nullable = true)
 |-- HG00111: string (nullable = true)
 |-- HG00112: string (nullable = true)
 |-- HG00113: string (nullable = true)
 |-- HG00114: string (nullable = true)
 |-- HG

In [10]:
import re 
import random

# Number of sample columns to keep for the test run, and the seed that makes
# the random selection repeatable across kernel restarts.
N_KEEP = 100
SEED = 42

# Names of the sample columns (sample IDs start with "HG" or "NA").
# re.match anchors at the start of the string, so this is a strict prefix test.
# The previous pattern "HG*|NA*" meant "H + any number of G" or "N + any number
# of A" and would therefore have matched any column starting with H or N.
sample_prefix = re.compile(r"HG|NA")
columns = [x for x in vcf.schema.names if sample_prefix.match(x)]

# Randomly choose the sample columns to DROP so that N_KEEP remain.
# max(..., 0) avoids a ValueError from sample() when fewer than N_KEEP sample
# columns exist (in that case nothing is dropped).
n_drop = max(len(columns) - N_KEEP, 0)
samples = random.Random(SEED).sample(columns, n_drop)

# Keep the fixed VCF columns plus the N_KEEP surviving sample columns.
if samples:
    vcf = vcf.drop(*samples)

vcf.printSchema()

root
 |-- #CHROM: string (nullable = true)
 |-- POS: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- REF: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- QUAL: string (nullable = true)
 |-- FILTER: string (nullable = true)
 |-- INFO: string (nullable = true)
 |-- FORMAT: string (nullable = true)
 |-- HG00131: string (nullable = true)
 |-- HG00173: string (nullable = true)
 |-- HG00282: string (nullable = true)
 |-- HG00321: string (nullable = true)
 |-- HG00448: string (nullable = true)
 |-- HG00534: string (nullable = true)
 |-- HG00613: string (nullable = true)
 |-- HG00675: string (nullable = true)
 |-- HG00759: string (nullable = true)
 |-- HG01110: string (nullable = true)
 |-- HG01122: string (nullable = true)
 |-- HG01281: string (nullable = true)
 |-- HG01326: string (nullable = true)
 |-- HG01378: string (nullable = true)
 |-- HG01396: string (nullable = true)
 |-- HG01447: string (nullable = true)
 |-- HG01498: string (nullable = true)
 |-- HG

In [11]:
# Save the reduced table as tab-separated text with a header row, replacing any
# previous output in the target directory.
# NOTE(review): the path has no scheme, so it resolves against the cluster's
# default filesystem — confirm that this is the same HDFS the input came from.
vcf.write.csv('/user/LDSA/CHR1', sep='\t', header=True, mode='overwrite')

Py4JJavaError: An error occurred while calling o210.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Master removed our application: KILLED
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 32 more


In [None]:
# Shut down through the session rather than the bare context: this stops the
# underlying SparkContext and also clears the cached session, so a later
# getOrCreate() starts from a clean state.
spark_session.stop()