In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pylab import *
from pyspark.sql.functions import udf, concat, col, lit
from pyspark.sql.types import IntegerType, ArrayType, StringType, DoubleType
import string
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, CountVectorizer, Tokenizer, StopWordsRemover, NGram
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = SparkSession.builder\
    .master("local")\
    .appName("Stock Prediction")\
    .getOrCreate()

In [3]:
data = spark.read.load("data/Combined_News_DJIA.csv", 
                          delimiter=',',
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

In [4]:
djia = spark.read.load("data/DJIA_table.csv", 
                          delimiter=',',
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

In [5]:
djia.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [6]:
data.columns

['Date',
 'Label',
 'Top1',
 'Top2',
 'Top3',
 'Top4',
 'Top5',
 'Top6',
 'Top7',
 'Top8',
 'Top9',
 'Top10',
 'Top11',
 'Top12',
 'Top13',
 'Top14',
 'Top15',
 'Top16',
 'Top17',
 'Top18',
 'Top19',
 'Top20',
 'Top21',
 'Top22',
 'Top23',
 'Top24',
 'Top25']

In [7]:
#replace null values with empty string
data = data.na.fill(' ')

# Only the columns that represent the news
newsColumns = [x for x in data.columns if x not in ['Date', 'Label']]

In [8]:
data.columns

['Date',
 'Label',
 'Top1',
 'Top2',
 'Top3',
 'Top4',
 'Top5',
 'Top6',
 'Top7',
 'Top8',
 'Top9',
 'Top10',
 'Top11',
 'Top12',
 'Top13',
 'Top14',
 'Top15',
 'Top16',
 'Top17',
 'Top18',
 'Top19',
 'Top20',
 'Top21',
 'Top22',
 'Top23',
 'Top24',
 'Top25']

In [9]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
Label,0,1,0,0,1
Top1,"""b""""Georgia 'downs two Russian warplanes' as c...",b'Why wont America and Nato help us? If they w...,b'Remember that adorable 9-year-old who sang a...,b' U.S. refuses Israel weapons to attack Iran:...,b'All the experts admit that we should legalis...
Top2,b'BREAKING: Musharraf to be impeached.',b'Bush puts foot down on Georgian conflict',"""b""""Russia 'ends Georgia operation'""""""","""b""""When the president ordered to attack Tskhi...",b'War in South Osetia - 89 pictures made by a ...
Top3,b'Russia Today: Columns of troops roll into So...,"""b""""Jewish Georgian minister: Thanks to Israel...","""b'""""If we had no sexual harassment we would h...",we knew then we were doomed. How come he didn...,b'Swedish wrestler Ara Abrahamian throws away ...
Top4,b'Russian tanks are moving towards the capital...,"we're fending off Russia """"""","""b""""Al-Qa'eda is losing support in Iraq becaus...",b' Israel clears troops who killed Reuters cam...,b'Russia exaggerated the death toll in South O...
Top5,"""b""""Afghan children raped with 'impunity",b'Georgian army flees in disarray as Russians ...,b'Ceasefire in Georgia: Putin Outmaneuvers the...,"""b'Britain\'s policy of being tough on drugs i...",b'Missile That Killed 9 Inside Pakistan May Ha...
Top6,' U.N. official says - this is sick,"""b""""Olympic opening ceremony fireworks 'faked'""""""",b'Why Microsoft and Intel tried to kill the XO...,says a former civil servant who once ran the ...,"""b""""Rushdie Condemns Random House's Refusal to..."
Top7,a three year old was raped and they do nothin...,b'What were the Mossad with fraudulent New Zea...,b'Stratfor: The Russo-Georgian War and the Bal...,b'Body of 14 year old found in trunk; Latest (...,b'Poland and US agree to missle defense deal. ...
Top8,b'150 Russian tanks have entered South Ossetia...,b'Russia angered by Israeli military sale to G...,"""b""""I'm Trying to Get a Sense of This Whole Ge...",b'China has moved 10 *million* quake survivors...,"b'Will the Russians conquer Tblisi? Bet on it,..."


In [10]:
len(data.columns)

27

In [11]:
len(newsColumns)

25

In [12]:
#merge news from different news sources per day

data = data.withColumn("allNews", data.Top1)
for i in range(2, len(newsColumns)+1):
    colName = 'Top' + str(i)
    data = data.withColumn('allNews', concat(col("allNews"), lit(" "), col(colName)))

In [13]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
Label,0,1,0,0,1
Top1,"""b""""Georgia 'downs two Russian warplanes' as c...",b'Why wont America and Nato help us? If they w...,b'Remember that adorable 9-year-old who sang a...,b' U.S. refuses Israel weapons to attack Iran:...,b'All the experts admit that we should legalis...
Top2,b'BREAKING: Musharraf to be impeached.',b'Bush puts foot down on Georgian conflict',"""b""""Russia 'ends Georgia operation'""""""","""b""""When the president ordered to attack Tskhi...",b'War in South Osetia - 89 pictures made by a ...
Top3,b'Russia Today: Columns of troops roll into So...,"""b""""Jewish Georgian minister: Thanks to Israel...","""b'""""If we had no sexual harassment we would h...",we knew then we were doomed. How come he didn...,b'Swedish wrestler Ara Abrahamian throws away ...
Top4,b'Russian tanks are moving towards the capital...,"we're fending off Russia """"""","""b""""Al-Qa'eda is losing support in Iraq becaus...",b' Israel clears troops who killed Reuters cam...,b'Russia exaggerated the death toll in South O...
Top5,"""b""""Afghan children raped with 'impunity",b'Georgian army flees in disarray as Russians ...,b'Ceasefire in Georgia: Putin Outmaneuvers the...,"""b'Britain\'s policy of being tough on drugs i...",b'Missile That Killed 9 Inside Pakistan May Ha...
Top6,' U.N. official says - this is sick,"""b""""Olympic opening ceremony fireworks 'faked'""""""",b'Why Microsoft and Intel tried to kill the XO...,says a former civil servant who once ran the ...,"""b""""Rushdie Condemns Random House's Refusal to..."
Top7,a three year old was raped and they do nothin...,b'What were the Mossad with fraudulent New Zea...,b'Stratfor: The Russo-Georgian War and the Bal...,b'Body of 14 year old found in trunk; Latest (...,b'Poland and US agree to missle defense deal. ...
Top8,b'150 Russian tanks have entered South Ossetia...,b'Russia angered by Israeli military sale to G...,"""b""""I'm Trying to Get a Sense of This Whole Ge...",b'China has moved 10 *million* quake survivors...,"b'Will the Russians conquer Tblisi? Bet on it,..."


In [14]:
#remove puntuation marks from the news

removePunctuation = udf(lambda x: ''.join([' ' if ch in string.punctuation else ch for ch in x]))
data = data.withColumn('allNews', removePunctuation(data.allNews))

In [15]:
#split the news into words

splitNews = udf(lambda s: [x for x in s.split(' ') if (x != u'' and len(x) >= 2)], ArrayType(StringType(), True))
data = data.withColumn('words', splitNews(data.allNews)).select('Date', 'label', 'words')

In [16]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
label,0,1,0,0,1
words,"[Georgia, downs, two, Russian, warplanes, as, ...","[Why, wont, America, and, Nato, help, us, If, ...","[Remember, that, adorable, year, old, who, san...","[refuses, Israel, weapons, to, attack, Iran, r...","[All, the, experts, admit, that, we, should, l..."


In [17]:
#remove the stop words

myStopwordRemover = StopWordsRemover(inputCol="words", outputCol="stopRemoved")
data = myStopwordRemover.transform(data)

In [18]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
label,0,1,0,0,1
words,"[Georgia, downs, two, Russian, warplanes, as, ...","[Why, wont, America, and, Nato, help, us, If, ...","[Remember, that, adorable, year, old, who, san...","[refuses, Israel, weapons, to, attack, Iran, r...","[All, the, experts, admit, that, we, should, l..."
stopRemoved,"[Georgia, downs, two, Russian, warplanes, coun...","[wont, America, Nato, help, us, wont, help, us...","[Remember, adorable, year, old, sang, opening,...","[refuses, Israel, weapons, attack, Iran, repor...","[experts, admit, legalise, drugs, War, South, ..."


In [19]:
# Create ngrams of size 2

myngram = NGram(inputCol="stopRemoved", outputCol="ngrams", n=2)
data = myngram.transform(data)
data = data.withColumn('ngrams', data.ngrams.cast(ArrayType(StringType(), True)))


In [20]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
label,0,1,0,0,1
words,"[Georgia, downs, two, Russian, warplanes, as, ...","[Why, wont, America, and, Nato, help, us, If, ...","[Remember, that, adorable, year, old, who, san...","[refuses, Israel, weapons, to, attack, Iran, r...","[All, the, experts, admit, that, we, should, l..."
stopRemoved,"[Georgia, downs, two, Russian, warplanes, coun...","[wont, America, Nato, help, us, wont, help, us...","[Remember, adorable, year, old, sang, opening,...","[refuses, Israel, weapons, attack, Iran, repor...","[experts, admit, legalise, drugs, War, South, ..."
ngrams,"[Georgia downs, downs two, two Russian, Russia...","[wont America, America Nato, Nato help, help u...","[Remember adorable, adorable year, year old, o...","[refuses Israel, Israel weapons, weapons attac...","[experts admit, admit legalise, legalise drugs..."


In [21]:
# Apply count vectorizer to convert to vector of counts of the ngrams

myCountVectorizer = CountVectorizer(inputCol="ngrams", outputCol="countVect", minDF=1.0)
data = myCountVectorizer.fit(data).transform(data)

In [22]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
label,0,1,0,0,1
words,"[Georgia, downs, two, Russian, warplanes, as, ...","[Why, wont, America, and, Nato, help, us, If, ...","[Remember, that, adorable, year, old, who, san...","[refuses, Israel, weapons, to, attack, Iran, r...","[All, the, experts, admit, that, we, should, l..."
stopRemoved,"[Georgia, downs, two, Russian, warplanes, coun...","[wont, America, Nato, help, us, wont, help, us...","[Remember, adorable, year, old, sang, opening,...","[refuses, Israel, weapons, attack, Iran, repor...","[experts, admit, legalise, drugs, War, South, ..."
ngrams,"[Georgia downs, downs two, two Russian, Russia...","[wont America, America Nato, Nato help, help u...","[Remember adorable, adorable year, year old, o...","[refuses Israel, Israel weapons, weapons attac...","[experts admit, admit legalise, legalise drugs..."
countVect,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [23]:
# Transform the label using StringINdexer

si_label = StringIndexer(inputCol="label", outputCol="label2", handleInvalid="skip")
data = si_label.fit(data).transform(data)
data.drop('label')
data = data.withColumn('label', data.label2)


In [24]:
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
Date,2008-08-08,2008-08-11,2008-08-12,2008-08-13,2008-08-14
label,1,0,1,1,0
words,"[Georgia, downs, two, Russian, warplanes, as, ...","[Why, wont, America, and, Nato, help, us, If, ...","[Remember, that, adorable, year, old, who, san...","[refuses, Israel, weapons, to, attack, Iran, r...","[All, the, experts, admit, that, we, should, l..."
stopRemoved,"[Georgia, downs, two, Russian, warplanes, coun...","[wont, America, Nato, help, us, wont, help, us...","[Remember, adorable, year, old, sang, opening,...","[refuses, Israel, weapons, attack, Iran, repor...","[experts, admit, legalise, drugs, War, South, ..."
ngrams,"[Georgia downs, downs two, two Russian, Russia...","[wont America, America Nato, Nato help, help u...","[Remember adorable, adorable year, year old, o...","[refuses Israel, Israel weapons, weapons attac...","[experts admit, admit legalise, legalise drugs..."
countVect,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
label2,1,0,1,1,0


In [25]:
# Divide into training and test data

trainData = data[data['Date'] < '20150101']
testData = data[data['Date'] >= '20141231']

In [26]:
# define the random forest classifier model

#rf = RandomForestClassifier(labelCol="label", featuresCol="countVect", numTrees=3, maxDepth=4, maxBins=200)
rf = RandomForestClassifier(labelCol="label", featuresCol="countVect", numTrees=3)

In [None]:
# perform a grid search on a set of parameter values

grid = ParamGridBuilder().addGrid(rf.numTrees, [1, 1])\
                         .addGrid(rf.maxDepth, [1, 1])\
                         .build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(trainData)

In [None]:
evaluator.evaluate(cvModel.transform(testData))

In [None]:
model = rf.fit(trainData)

Py4JJavaError: An error occurred while calling o398.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 20.0 failed 1 times, most recent failure: Lost task 0.0 in stage 20.0 (TID 19, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator$$anonfun$4.apply(ShuffleBlockFetcherIterator.scala:422)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator$$anonfun$4.apply(ShuffleBlockFetcherIterator.scala:422)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply$mcJ$sp(Utils.scala:347)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:332)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:332)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.util.Utils$.copyStream(Utils.scala:353)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:427)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:61)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.util.collection.ExternalAppendOnlyMap.insertAll(ExternalAppendOnlyMap.scala:153)
	at org.apache.spark.Aggregator.combineValuesByKey(Aggregator.scala:41)
	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:90)
	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findSplitsBySorting(RandomForest.scala:928)
	at org.apache.spark.ml.tree.impl.RandomForest$.findSplits(RandomForest.scala:906)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:118)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator$$anonfun$4.apply(ShuffleBlockFetcherIterator.scala:422)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator$$anonfun$4.apply(ShuffleBlockFetcherIterator.scala:422)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply$mcJ$sp(Utils.scala:347)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:332)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:332)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.util.Utils$.copyStream(Utils.scala:353)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:427)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:61)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.util.collection.ExternalAppendOnlyMap.insertAll(ExternalAppendOnlyMap.scala:153)
	at org.apache.spark.Aggregator.combineValuesByKey(Aggregator.scala:41)
	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:90)
	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 47038)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/jmir/miniconda3/envs/spark/lib/python3.6/site-packages/py4j/java_gateway.py", line 1062, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jmir/miniconda3/envs/spark/lib/python3.6/site-packages/py4j/java_gateway.py", line 908, in send_command
    response = connection.send_command(command)
  File "/home/jmir/miniconda3/envs/spark/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
  File "/home/jmir/miniconda