# Jonathan Halverson
# Tuesday, December 27, 2016
# Wine classification in Spark 2

In [14]:
from __future__ import print_function
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[4]").appName("Wine classification").getOrCreate()

In [15]:
df = spark.read.csv('../../machine_learning/wine.csv', header=False, inferSchema=True)
df.sample(False, 0.1).show()

+---+-----+----+----+----+---+----+----+----+----+--------+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7| _c8| _c9|    _c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+----+----+--------+----+----+----+
|  1|13.16|2.36|2.67|18.6|101| 2.8|3.24| 0.3|2.81|    5.68|1.03|3.17|1185|
|  1|14.38|1.87|2.38|12.0|102| 3.3|3.64|0.29|2.96|     7.5| 1.2| 3.0|1547|
|  1|13.83|1.57|2.62|20.0|115|2.95| 3.4| 0.4|1.72|     6.6|1.13|2.57|1130|
|  1| 13.9|1.68|2.12|16.0|101| 3.1|3.39|0.21|2.14|     6.1|0.91|3.33| 985|
|  2|12.08|1.13|2.51|24.0| 78| 2.0|1.58| 0.4| 1.4|     2.2|1.31|2.72| 630|
|  2|12.16|1.61|2.31|22.8| 90|1.78|1.69|0.43|1.56|    2.45|1.33|2.26| 495|
|  2|11.62|1.99|2.28|18.0| 98|3.02|2.26|0.17|1.35|    3.25|1.16|2.96| 345|
|  2| 12.6|1.34| 1.9|18.5| 88|1.45|1.36|0.29|1.35|    2.45|1.04|2.77| 562|
|  2|12.51|1.73|1.98|20.5| 85| 2.2|1.92|0.32|1.48|    2.94|1.04|3.57| 672|
|  3|13.08| 3.9|2.36|21.5|113|1.41|1.39|0.34|1.14|     9.4|0.57|1.33| 550|
|  3|14.34|1.68| 2.7|25.0

Note that in local mode even with the [4] only one partition is being used:

In [16]:
df.rdd.getNumPartitions()

1

In [17]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: integer (nullable = true)



In [18]:
columns = ['Class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', \
           'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', \
           'OD280/OD315 of diluted wines', 'Proline']

In [19]:
for u, v in zip(df.schema.names, columns):
    df = df.withColumnRenamed(u, v)

In [20]:
df.printSchema()

root
 |-- Class: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- Malic acid: double (nullable = true)
 |-- Ash: double (nullable = true)
 |-- Alcalinity of ash: double (nullable = true)
 |-- Magnesium: integer (nullable = true)
 |-- Total phenols: double (nullable = true)
 |-- Flavanoids: double (nullable = true)
 |-- Nonflavanoid phenols: double (nullable = true)
 |-- Proanthocyanins: double (nullable = true)
 |-- Color intensity: double (nullable = true)
 |-- Hue: double (nullable = true)
 |-- OD280/OD315 of diluted wines: double (nullable = true)
 |-- Proline: integer (nullable = true)



In [31]:
wine = reduce(lambda data, i: data.withColumnRenamed(df.schema.names[i], columns[i]), xrange(len(columns)), df)
wine.sample(False, 0.1).show(5)

+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+
|Class|Alcohol|Malic acid| Ash|Alcalinity of ash|Magnesium|Total phenols|Flavanoids|Nonflavanoid phenols|Proanthocyanins|Color intensity| Hue|OD280/OD315 of diluted wines|Proline|
+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+
|    1|  14.39|      1.87|2.45|             14.6|       96|          2.5|      2.52|                 0.3|           1.98|           5.25|1.02|                        3.58|   1290|
|    1|   13.5|      1.81|2.61|             20.0|       96|         2.53|      2.61|                0.28|           1.66|           3.52|1.12|                        3.82|    845|
|    1|  13.39|      1.77|2.62|             16.1|       93|         2.85|      2.94|                

In [33]:
wine.describe().show()

+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+------------------+-----------------+-------------------+----------------------------+-----------------+
|summary|             Class|           Alcohol|        Malic acid|               Ash|Alcalinity of ash|         Magnesium|     Total phenols|        Flavanoids|Nonflavanoid phenols|   Proanthocyanins|  Color intensity|                Hue|OD280/OD315 of diluted wines|          Proline|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+------------------+-----------------+-------------------+----------------------------+-----------------+
|  count|               178|               178|               178|               178|              178|               178|               178| 

In [22]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [23]:
wine = wine.rdd.map(lambda row: Row(label=row.Class, features=Vectors.dense(row[1:]))).toDF()
wine = wine.select(wine.features, (wine.label - 1).alias('label'))
wine.sample(False, 0.1).show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[14.23,1.71,2.43,...|    0|
|[13.64,3.1,2.56,1...|    0|
|[14.22,3.99,2.51,...|    0|
|[13.05,1.77,2.1,1...|    0|
|[14.1,2.02,2.4,18...|    0|
+--------------------+-----+
only showing top 5 rows



In [24]:
wine = wine.filter(wine.label < 2).cache()

In [25]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = Pipeline(stages=[lr])

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=5)
cvModel = crossval.fit(wine)

In [26]:
test, _ = wine.randomSplit([0.1, 0.9])
prediction = cvModel.transform(test)

In [27]:
prediction.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[11.82,1.47,1.99,...|    1|[-1.8929463470622...|[0.13090889367574...|       1.0|
|[12.29,1.61,2.21,...|    1|[-1.1223134862045...|[0.24558240781357...|       1.0|
|[12.99,1.67,2.6,3...|    1|[-1.2313092689777...|[0.22595235519275...|       1.0|
|[13.03,0.9,1.71,1...|    1|[-0.4109430925703...|[0.39868600681839...|       1.0|
|[13.05,1.65,2.55,...|    0|[0.69035237596838...|[0.66604530995566...|       0.0|
|[13.05,1.77,2.1,1...|    0|[0.90565454749549...|[0.71211012636091...|       0.0|
|[13.48,1.81,2.41,...|    0|[0.49021152328035...|[0.62015626055635...|       0.0|
|[13.58,1.66,2.36,...|    0|[2.54559670404492...|[0.92727713978336...|       0.0|
|[13.9,1.68,2.12,1...|    0|[1.82266043917934...|[0.86088505251543...|       0.0|
|[14.22,1.7,2.3,

In [30]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(prediction)
evaluator.evaluate(prediction, {evaluator.metricName: "areaUnderPR"})

1.0