
# Jonathan Halverson
# October 12, 2016
# DataFrames and machine learning in Spark 2

### Converting a Spark DF to a Pandas DF

In [14]:
# One-row DataFrame whose single column, "words", holds a list of strings.
df = spark.createDataFrame(
    [(["a", "b", "c"],)],
    schema=["words"],
)
df.show()

+---------+
|    words|
+---------+
|[a, b, c]|
+---------+



In [15]:
# NOTE(review): `pd` is not referenced anywhere in this cell; presumably the
# import only confirms pandas is available for toPandas() -- confirm.
import pandas as pd
# Convert the Spark DataFrame to its pandas counterpart (see the section title).
x = df.toPandas()
# Bare last expression so the notebook renders the converted frame.
x

Unnamed: 0,words
0,"[a, b, c]"


### Adding a second row to the DF

In [16]:
# show() prints the table and returns None, so chaining it onto the assignment
# would leave `df` bound to None instead of the DataFrame. Build the DataFrame
# first, then display it.
df = spark.createDataFrame([(["a", "b", "c"],), (["e", "f", "g"],)], ["words"])
df.show()

+---------+
|    words|
+---------+
|[a, b, c]|
|[e, f, g]|
+---------+



In [17]:
# Two rows, each a 1-tuple wrapping the list that goes into the "words" column.
word_rows = [
    (["a", "b", "c"],),
    (["e", "f", "g"],),
]
df = spark.createDataFrame(word_rows, schema=["words"])
# collect() returns the rows as a list of Row objects; shown as the cell result.
df.collect()

[Row(words=[u'a', u'b', u'c']), Row(words=[u'e', u'f', u'g'])]

### Working with vectors

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import DCT

# Single-row DataFrame with one dense vector in column "vec".
vec_rows = [(Vectors.dense([5.0, 8.0, 6.0]),)]
df1 = spark.createDataFrame(vec_rows, schema=["vec"])

# DCT transformer in forward mode (inverse=False); output lands in "resultVec".
dct = DCT(inputCol="vec", outputCol="resultVec", inverse=False)
df2 = dct.transform(df1)
df2.show()

+-------------+--------------------+
|          vec|           resultVec|
+-------------+--------------------+
|[5.0,8.0,6.0]|[10.9696551146028...|
+-------------+--------------------+



### toDF

In [19]:
# Distribute five two-element integer lists as an RDD.
ranges = sc.parallelize([[12, 45], [9, 11], [31, 122], [88, 109], [17, 61]])
# Call form of print: same output as the Python 2 print statement for a single
# argument, consistent with the print(...) calls used later in this notebook,
# and also valid on Python 3.
print(type(ranges))

<class 'pyspark.rdd.RDD'>


In [20]:
# Turn the RDD of two-element lists into a DataFrame with columns "a" and "b".
df = ranges.toDF(schema=['a', 'b'])
df.show()
# Call form of print: works on Python 2 and Python 3 for a single argument.
print(type(df))
# printSchema() writes the schema tree to stdout itself and returns None, so it
# is called directly; printing its return value would add a stray "None" line.
df.printSchema()

+---+---+
|  a|  b|
+---+---+
| 12| 45|
|  9| 11|
| 31|122|
| 88|109|
| 17| 61|
+---+---+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)

None


### Convert an RDD of LabeledPoints to a DataFrame

In [4]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Two labeled examples, each built from a label and a NumPy feature array.
labeled_points = [
    LabeledPoint(1, np.array([1, 6, 7])),
    LabeledPoint(0, np.array([12, 2, 9])),
]
lp = sc.parallelize(labeled_points)
# Show both the materialized contents and the Python type of the RDD.
lp.collect(), type(lp)

([LabeledPoint(1.0, [1.0,6.0,7.0]), LabeledPoint(0.0, [12.0,2.0,9.0])],
 pyspark.rdd.RDD)

In [7]:
# Convert the RDD of LabeledPoints to a DataFrame without giving a schema and
# display it; the recorded output shows the columns "features" and "label".
lp.toDF().show()

+--------------+-----+
|      features|label|
+--------------+-----+
| [1.0,6.0,7.0]|  1.0|
|[12.0,2.0,9.0]|  0.0|
+--------------+-----+



### Test of LogisticRegression

In [31]:
# Two training rows: an integer label, a dense feature vector, and a filler
# string column named "extra".
lr_rows = [
    (1, Vectors.dense([7, 2, 9]), 'ignored'),
    (0, Vectors.dense([6, 3, 1]), 'useless'),
]
df = spark.createDataFrame(lr_rows, schema=["label", "features", "extra"])
df.show()

+-----+-------------+-------+
|label|     features|  extra|
+-----+-------------+-------+
|    1|[7.0,2.0,9.0]|ignored|
|    0|[6.0,3.0,1.0]|useless|
+-----+-------------+-------+



In [32]:
# Fetch the first row as a Row object to inspect the stored Python values.
df.first()

Row(label=1, features=DenseVector([7.0, 2.0, 9.0]), extra=u'ignored')

In [33]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator: at most 10 iterations, regularization strength 0.3,
# elastic-net mixing parameter 0.8.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Fit on the small DataFrame built above; the fitted model is kept in lrModel.
lrModel = lr.fit(df)

In [30]:
# Report the fitted parameters, one "<caption><value>" line per parameter.
for caption, fitted in (("Coefficients: ", lrModel.coefficients),
                        ("Intercept: ", lrModel.intercept)):
    print(caption + str(fitted))

Coefficients: [0.133590225564,-0.460623607458,0.059944830801]
Intercept: -0.00232930499821


### Email classifier

In [41]:
from pyspark.ml.feature import HashingTF

# Hashing transformer: maps the "words" column to a "features" vector of
# size 2**18.
tf = HashingTF(inputCol="words", outputCol="features", numFeatures=2 ** 18)

# Two labeled word lists to try the transformer on.
labeled_words = [
    (1, ['There', 'will', 'be', 'cake'],),
    (0, ['I', 'will', 'run', 'again'],),
]
df = spark.createDataFrame(labeled_words, schema=["label", "words"])
out = tf.transform(df)
out.show()

+-----+--------------------+--------------------+
|label|               words|            features|
+-----+--------------------+--------------------+
|    1|[There, will, be,...|(262144,[13007,89...|
|    0|[I, will, run, ag...|(262144,[89356,10...|
+-----+--------------------+--------------------+



In [39]:
# Fetch the first transformed row to see the full, untruncated "features" value.
out.first()

Row(labels=1, words=[u'There', u'will', u'be', u'cake'], features=SparseVector(262144, {13007: 1.0, 89356: 1.0, 146559: 1.0, 167152: 1.0}))

Read each line of the files into an RDD:

In [51]:
# Each call yields an RDD with one element per line of the file.
# NOTE(review): the paths are relative, so where they resolve depends on how
# the Spark context is configured -- confirm the files are reachable.
ham = sc.textFile('ham.txt')
spam = sc.textFile('spam.txt')

Apply a map to each RDD and combine the two RDDs:

In [60]:
def _labeled_tokens(label):
    """Return a mapper turning one email line into [label, whitespace-split words]."""
    return lambda email: [label, email.split()]


# Ham lines get label 0, spam lines get label 1; the union is the training set.
hamLabelFeatures = ham.map(_labeled_tokens(0))
spamLabelFeatures = spam.map(_labeled_tokens(1))
trainRDD = hamLabelFeatures.union(spamLabelFeatures)

Convert the RDD to a DataFrame and apply the hashing function:

In [61]:
# Name the two fields of every RDD element, then add the hashed "features"
# column using the HashingTF transformer defined earlier.
labeled_words_df = trainRDD.toDF(schema=["label", "words"])
trainDF = tf.transform(labeled_words_df)
trainDF.show()

+-----+--------------------+--------------------+
|label|               words|            features|
+-----+--------------------+--------------------+
|    0|[Dear, Spark, Lea...|(262144,[9639,142...|
|    0|[Hi, Mom,, Apolog...|(262144,[1576,163...|
|    0|[Wow,, hey, Fred,...|(262144,[18327,28...|
|    0|[Hi, Spark, user,...|(262144,[15889,16...|
|    0|[Thanks, Tom, for...|(262144,[8804,163...|
|    0|[Good, job, yeste...|(262144,[14,25570...|
|    0|[Summit, demo, go...|(262144,[31463,64...|
|    1|[Dear, sir,, I, a...|(262144,[12781,36...|
|    1|[Get, Viagra, rea...|(262144,[9129,261...|
|    1|[Oh, my, gosh, yo...|(262144,[14,12946...|
|    1|[YOUR, COMPUTER, ...|(262144,[24967,36...|
|    1|[THIS, IS, NOT, A...|(262144,[14,12946...|
+-----+--------------------+--------------------+



Train a logistic regression model on the data:

In [62]:
# Same estimator settings as the earlier small test: 10 iterations at most,
# regularization strength 0.3, elastic-net mixing parameter 0.8.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Fit on the hashed email DataFrame.
lrModel = lr.fit(trainDF)

In [58]:
# Report the fitted parameters, one "<caption><value>" line per parameter.
for caption, fitted in (("Coefficients: ", lrModel.coefficients),
                        ("Intercept: ", lrModel.intercept)):
    print(caption + str(fitted))

Coefficients: (262144,[126466,140390,236986],[0.219230668165,-0.143429260905,0.647232520829])
Intercept: -0.465236461182


Predict the outcome of a test case:

In [72]:
# One unlabeled example: a single row with only the "words" column.
test = spark.createDataFrame(
    [(['Fox', 'and', 'two', 'are', 'two', 'things'],)],
    schema=["words"],
)
# Hash the words with the same transformer used for training, then score the
# row with the fitted model and show only the predicted label.
test_features = tf.transform(test)
lrModel.transform(test_features).select('prediction').show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+

