# Jonathan Halverson
# October 12, 2016
# Converting an RDD to a DataFrame in Spark 2

### Converting a Spark DF to a Pandas DF

In [14]:
df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])
df.show()

+---------+
|    words|
+---------+
|[a, b, c]|
+---------+



In [15]:
import pandas as pd
x = df.toPandas()
x

Unnamed: 0,words
0,"[a, b, c]"


### Adding a second row to the DF

In [16]:
df = spark.createDataFrame([(["a", "b", "c"],), (["e", "f", "g"],)], ["words"]).show()

+---------+
|    words|
+---------+
|[a, b, c]|
|[e, f, g]|
+---------+



In [17]:
df = spark.createDataFrame([(["a", "b", "c"],), (["e", "f", "g"],)], schema=["words"])
df.collect()

[Row(words=[u'a', u'b', u'c']), Row(words=[u'e', u'f', u'g'])]

### Working with vectors

In [18]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import DCT

df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
df2 = dct.transform(df1)
df2.show()

+-------------+--------------------+
|          vec|           resultVec|
+-------------+--------------------+
|[5.0,8.0,6.0]|[10.9696551146028...|
+-------------+--------------------+



### toDF

In [19]:
ranges = sc.parallelize([[12, 45], [9, 11], [31, 122], [88, 109], [17, 61]])
print type(ranges)

<class 'pyspark.rdd.RDD'>


In [20]:
df = ranges.toDF(schema=['a', 'b'])
df.show()
print type(df)
print df.printSchema()

+---+---+
|  a|  b|
+---+---+
| 12| 45|
|  9| 11|
| 31|122|
| 88|109|
| 17| 61|
+---+---+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)

None


### Email classifier

In [21]:
from pyspark.ml.feature import HashingTF

tf = HashingTF(numFeatures=2**18, inputCol="words", outputCol="vecs")
df = spark.createDataFrame([(['There', 'will', 'be', 'cake'],), (['I', 'will', 'run', 'again'],)], ["words"])
out = tf.transform(df)
out.show()

+--------------------+--------------------+
|               words|                vecs|
+--------------------+--------------------+
|[There, will, be,...|(262144,[13007,89...|
|[I, will, run, ag...|(262144,[89356,10...|
+--------------------+--------------------+



In [32]:
ham = sc.textFile('ham.txt')
spam = sc.textFile('spam.txt')

In [33]:
hamFeatures = ham.map(lambda email: tf.transform([email.split()]))
spamFeatures = spam.map(lambda email: tf.transform(email.split()))

In [37]:
tramFeatures = ham.map(lambda email: [[1], [email.split()]])
tramFeatures = tramFeatures.toDF(schema=["labels", "words"])
v = tf.transform(tramFeatures)
v.show()

Py4JJavaError: An error occurred while calling o612.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 45.0 failed 1 times, most recent failure: Lost task 0.0 in stage 45.0 (TID 139, localhost): org.apache.spark.SparkException: HashingTF with murmur3 algorithm does not support type scala.collection.mutable.WrappedArray.ofRef of input data.
	at org.apache.spark.mllib.feature.HashingTF$.murmur3Hash(HashingTF.scala:164)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$getHashFunction$1.apply(HashingTF.scala:84)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$getHashFunction$1.apply(HashingTF.scala:84)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$transform$1.apply(HashingTF.scala:101)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$transform$1.apply(HashingTF.scala:100)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.mllib.feature.HashingTF.transform(HashingTF.scala:100)
	at org.apache.spark.ml.feature.HashingTF$$anonfun$1.apply(HashingTF.scala:98)
	at org.apache.spark.ml.feature.HashingTF$$anonfun$1.apply(HashingTF.scala:98)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:85)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1897)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:347)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:39)
	at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2183)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2532)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2182)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2189)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1925)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1924)
	at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2562)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:1924)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2139)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:239)
	at sun.reflect.GeneratedMethodAccessor130.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: HashingTF with murmur3 algorithm does not support type scala.collection.mutable.WrappedArray.ofRef of input data.
	at org.apache.spark.mllib.feature.HashingTF$.murmur3Hash(HashingTF.scala:164)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$getHashFunction$1.apply(HashingTF.scala:84)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$getHashFunction$1.apply(HashingTF.scala:84)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$transform$1.apply(HashingTF.scala:101)
	at org.apache.spark.mllib.feature.HashingTF$$anonfun$transform$1.apply(HashingTF.scala:100)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.mllib.feature.HashingTF.transform(HashingTF.scala:100)
	at org.apache.spark.ml.feature.HashingTF$$anonfun$1.apply(HashingTF.scala:98)
	at org.apache.spark.ml.feature.HashingTF$$anonfun$1.apply(HashingTF.scala:98)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:85)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [30]:
negativeClass.count()

Py4JError: An error occurred while calling o284.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:272)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)



In [28]:
from pyspark.sql import Row

positiveClass = spamFeatures.map(lambda record: Row(label=1, features=record))
negativeClass = hamFeatures.map(lambda record: Row(label=0, features=record))
trainRDD = positiveClass.union(negativeClass)

<class 'pyspark.rdd.PipelinedRDD'>


Py4JError: An error occurred while calling o284.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:272)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)



In [26]:
trainDF = sqlCtx.createDataFrame(trainRDD)

NameError: name 'trainRDD' is not defined

In [None]:
print type(trainDF)

In [None]:
trainDF.show()

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainDF)

In [None]:
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))