# Jonathan Halverson
# Tuesday, December 27, 2016
# Wine classification in Spark 2

In [1]:
from __future__ import print_function
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[4]").appName("Wine classification").getOrCreate()

In [2]:
df = spark.read.csv('../../machine_learning/wine.csv', header=False, inferSchema=True)
df.sample(False, 0.1).show()

+---+-----+----+----+----+---+----+----+----+----+----+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+----+----+----+----+----+----+
|  1|13.05|1.77| 2.1|17.0|107| 3.0| 3.0|0.28|2.03|5.04|0.88|3.35| 885|
|  2|12.33| 1.1|2.28|16.0|101|2.05|1.09|0.63|0.41|3.27|1.25|1.67| 680|
|  2|12.37|1.13|2.16|19.0| 87| 3.5| 3.1|0.19|1.87|4.45|1.22|2.87| 420|
|  2|11.96|1.09| 2.3|21.0|101|3.38|2.14|0.13|1.65|3.21|0.99|3.13| 886|
|  2|11.66|1.88|1.92|16.0| 97|1.61|1.57|0.34|1.15| 3.8|1.23|2.14| 428|
|  2| 12.7|3.87| 2.4|23.0|101|2.83|2.55|0.43|1.95|2.57|1.19|3.13| 463|
|  2|11.64|2.06|2.46|21.6| 84|1.95|1.69|0.48|1.35| 2.8| 1.0|2.75| 680|
|  2|12.29|3.17|2.21|18.0| 88|2.85|2.99|0.45|2.81| 2.3|1.42|2.83| 406|
|  2| 12.6|1.34| 1.9|18.5| 88|1.45|1.36|0.29|1.35|2.45|1.04|2.77| 562|
|  2|12.42|2.55|2.27|22.0| 90|1.68|1.84|0.66|1.42| 2.7|0.86| 3.3| 315|
|  2|12.72|1.75|2.28|22.5| 84|1.38|1.76|0.48|1.63| 3.3|0.88|2.42| 488|
|  2|1

Class labels must begin with 0 and count up in Spark. Here we will only consider a binary classification problem so we will ignore class 3:

In [3]:
df = df.filter(df._c0 < 3).withColumn('_c0', df['_c0'] - 1)
df.sample(False, 0.1).show()

+---+-----+----+----+----+---+----+----+----+----+----+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+----+----+----+----+----+----+
|  0|14.37|1.95| 2.5|16.8|113|3.85|3.49|0.24|2.18| 7.8|0.86|3.45|1480|
|  0|14.12|1.48|2.32|16.8| 95| 2.2|2.43|0.26|1.57| 5.0|1.17|2.82|1280|
|  0| 13.3|1.72|2.14|17.0| 94| 2.4|2.19|0.27|1.35|3.95|1.02|2.77|1285|
|  0|13.48|1.81|2.41|20.5|100| 2.7|2.98|0.26|1.86| 5.1|1.04|3.47| 920|
|  0|14.22| 1.7| 2.3|16.3|118| 3.2| 3.0|0.26|2.03|6.38|0.94|3.31| 970|
|  1|13.03| 0.9|1.71|16.0| 86|1.95|2.03|0.24|1.46| 4.6|1.19|2.48| 392|
|  1| 12.0|1.51|2.42|22.0| 86|1.45|1.25| 0.5|1.63| 3.6|1.05|2.65| 450|
|  1|12.29|1.41|1.98|16.0| 85|2.55| 2.5|0.29|1.77| 2.9|1.23|2.74| 428|
|  1|13.05| 5.8|2.13|21.5| 86|2.62|2.65| 0.3|2.01| 2.6|0.73| 3.1| 380|
+---+-----+----+----+----+---+----+----+----+----+----+----+----+----+



Note that in local mode even with the [4] only one partition is being used:

In [4]:
df.rdd.getNumPartitions()

1

In [5]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: integer (nullable = true)



Let's change the data type of _c5 and _c13 to double:

In [6]:
df = df.withColumn('_c5', df['_c5'].cast('double'))
df = df.withColumn('_c13', df['_c13'].cast('double'))
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: double (nullable = true)



Let's give the columns more meaningful names:

In [7]:
columns = ['Class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', \
           'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', \
           'OD280/OD315 of diluted wines', 'Proline']

In [8]:
for u, v in zip(df.schema.names, columns):
    df = df.withColumnRenamed(u, v)

In [9]:
df.printSchema()

root
 |-- Class: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- Malic acid: double (nullable = true)
 |-- Ash: double (nullable = true)
 |-- Alcalinity of ash: double (nullable = true)
 |-- Magnesium: double (nullable = true)
 |-- Total phenols: double (nullable = true)
 |-- Flavanoids: double (nullable = true)
 |-- Nonflavanoid phenols: double (nullable = true)
 |-- Proanthocyanins: double (nullable = true)
 |-- Color intensity: double (nullable = true)
 |-- Hue: double (nullable = true)
 |-- OD280/OD315 of diluted wines: double (nullable = true)
 |-- Proline: double (nullable = true)



Here is an alternative version of assigning the column names:

In [10]:
wineRaw = reduce(lambda data, i: data.withColumnRenamed(df.schema.names[i], columns[i]), xrange(len(columns)), df)
wineRaw.sample(False, 0.05).toPandas().applymap(lambda x: round(x, 1))

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,0.0,13.4,1.8,2.6,16.1,93.0,2.9,2.9,0.3,1.4,4.8,0.9,3.2,1195.0
1,0.0,14.4,3.6,2.3,16.0,102.0,3.3,3.2,0.3,2.2,4.9,1.0,3.4,1065.0
2,1.0,11.8,0.9,2.6,18.0,94.0,2.2,2.2,0.2,2.4,3.0,0.8,3.1,520.0
3,1.0,12.4,4.4,2.7,26.5,102.0,2.2,2.1,0.4,1.7,2.1,0.9,3.1,365.0


Here are the descriptive statistics -- of course, no standardization has been performed yet:

In [11]:
wineRaw.select(wineRaw.schema.names[1:]).toPandas().describe().applymap(lambda x: round(x, 1))

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
count,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0
mean,12.9,2.0,2.3,18.8,99.9,2.5,2.5,0.3,1.8,4.2,1.1,3.0,790.1
std,0.9,0.9,0.3,3.4,15.4,0.5,0.7,0.1,0.5,1.6,0.2,0.5,352.5
min,11.0,0.7,1.4,10.6,70.0,1.1,0.6,0.1,0.4,1.3,0.7,1.6,278.0
25%,12.2,1.5,2.2,16.5,88.0,2.1,2.0,0.3,1.4,2.9,0.9,2.7,474.0
50%,13.0,1.7,2.3,18.6,98.0,2.6,2.6,0.3,1.7,3.9,1.1,3.0,716.0
75%,13.7,2.1,2.5,20.9,108.0,2.9,3.0,0.4,2.0,5.4,1.2,3.3,1063.8
max,14.8,5.8,3.2,30.0,162.0,3.9,5.1,0.7,3.6,8.9,1.7,4.0,1680.0


Reformat the data into a new dataframe with features as a vector:

In [12]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [13]:
wineRaw = wineRaw.rdd.map(lambda row: Row(label=row.Class, features=Vectors.dense(row[1:]))).toDF()
wineRaw.sample(False, 0.1).show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[14.06,2.15,2.61,...|    0|
|[13.86,1.35,2.27,...|    0|
|[13.24,3.98,2.29,...|    0|
|[13.56,1.73,2.46,...|    0|
|[12.37,0.94,1.36,...|    1|
+--------------------+-----+
only showing top 5 rows



Now that we have the correct format, let's standardize the data by making the mean and variance 0 and 1, respectively, for each column:

In [14]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(wineRaw)
wine = scalerModel.transform(wineRaw).cache()

In [15]:
wine.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [16]:
wine.show(5)

+--------------------+-----+--------------------+
|            features|label|      scaledFeatures|
+--------------------+-----+--------------------+
|[14.23,1.71,2.43,...|    0|[1.44685785426384...|
|[13.2,1.78,2.14,1...|    0|[0.28795214937703...|
|[13.16,2.36,2.67,...|    0|[0.24294610258531...|
|[14.37,1.95,2.5,1...|    0|[1.60437901803486...|
|[13.24,2.59,2.87,...|    0|[0.33295819616875...|
+--------------------+-----+--------------------+
only showing top 5 rows



Let's check that the standardized features have a mean of 0 and a variance of 1:

In [17]:
wine.rdd.map(lambda row: row.scaledFeatures.values.tolist()).toDF().toPandas().describe().applymap(lambda x: round(x, 1))

Unnamed: 0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13
count,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0
mean,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.2,-1.4,-3.3,-2.4,-1.9,-2.6,-2.6,-1.8,-2.5,-1.8,-2.2,-2.9,-1.5
25%,-0.8,-0.5,-0.6,-0.7,-0.8,-0.7,-0.7,-0.6,-0.6,-0.8,-0.7,-0.5,-0.9
50%,0.1,-0.3,-0.1,-0.1,-0.1,0.1,0.1,-0.3,-0.1,-0.2,-0.1,0.1,-0.2
75%,0.9,0.1,0.6,0.6,0.5,0.8,0.7,0.6,0.5,0.7,0.6,0.7,0.8
max,2.1,4.4,3.0,3.3,4.0,2.5,3.5,3.0,3.4,2.9,3.9,2.2,2.5


Now that the wine dataFrame is properly formatted, we create a ML model:

In [18]:
train, test = wine.randomSplit([0.7, 0.3])

In [19]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='label', maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = Pipeline(stages=[lr])

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [1.0, 0.1, 0.01]).addGrid(lr.elasticNetParam, [1.0, 0.1, 0.01]).build()
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=5)
cvModel = crossval.fit(train)

The optimal regularization parameter was found to be

In [20]:
cvModel.avgMetrics

[0.5,
 0.99,
 0.99,
 0.988,
 0.9949999999999999,
 1.0,
 0.988,
 0.9949999999999999,
 1.0]

In [21]:
cvModel.bestModel.stages[0].coefficients

DenseVector([-1.5564, -0.2522, -0.9723, 1.0837, -0.2928, -0.027, -0.1493, 0.056, -0.1916, -0.7112, 0.1011, -0.577, -1.4946])

In [22]:
cvModel.bestModel.stages[0].intercept

0.4165712185722266

In [23]:
prediction = cvModel.transform(test)
prediction.show()

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|label|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[11.45,2.4,2.42,2...|    1|[-1.6810623977607...|[-3.4270997348184...|[0.03145918189962...|       1.0|
|[11.46,3.74,1.82,...|    1|[-1.6698108860628...|[-5.0807463353969...|[0.00617687765574...|       1.0|
|[11.79,2.13,2.78,...|    1|[-1.2985110000311...|[-6.8629194551069...|[0.00104476346891...|       1.0|
|[11.84,0.89,2.58,...|    1|[-1.2422534415414...|[-2.8733739339197...|[0.05348558945819...|       1.0|
|[12.0,0.92,2.0,19...|    1|[-1.0622292543745...|[-6.8714701596614...|[0.00103587730475...|       1.0|
|[12.0,1.51,2.42,2...|    1|[-1.0622292543745...|[-5.7308641027707...|[0.00323378136867...|       1.0|
|[12.0,3.43,2.0,19...|    1|[-1.0622292543745...|[-5.2954254864624...|[0.

In [24]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(prediction)
evaluator.evaluate(prediction, {evaluator.metricName: "areaUnderPR"})

0.9999999999999999