In [1]:
import findspark
# Point findspark at the Spark installation before pyspark is imported.
# The string below is a placeholder: replace it with the actual install path.
findspark.init('path_to_spark_installation_directory')

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# The CSV has a header row and uses ';' as the field separator;
# column types are inferred by Spark rather than declared here.
data = (
    spark.read
    .options(header=True, inferSchema=True, sep=';')
    .csv('winequality-red.csv')
)

In [2]:
# Check the column types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [3]:
# Preview the first rows of the raw data.
data.show()

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [4]:
# Summary statistics (count, mean, ...) for every column.
data.describe().show()

+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|summary|     fixed acidity|   volatile acidity|        citric acid|    residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|             density|                 pH|         sulphates|           alcohol|           quality|
+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|  count|              1599|               1599|               1599|              1599|                1599|               1599|                1599|                1599|               1599|              1599|              1599|              1599|
|   mean

In [5]:
# List the column names; all but 'quality' are used as model inputs below.
data.columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [6]:
from pyspark.ml.feature import VectorAssembler

# Every column except the label is a model input. Deriving the list from the
# DataFrame keeps it in step with the schema instead of repeating eleven
# column names by hand, where a typo or omission would go unnoticed.
label_col = 'quality'
feature_cols = [name for name in data.columns if name != label_col]

# Pack the input columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
output = assembler.transform(data)

# Keep only what the models need: the feature vector and the label.
final_data = output.select('features', label_col)
final_data.show()

+--------------------+-------+
|            features|quality|
+--------------------+-------+
|[7.4,0.7,0.0,1.9,...|      5|
|[7.8,0.88,0.0,2.6...|      5|
|[7.8,0.76,0.04,2....|      5|
|[11.2,0.28,0.56,1...|      6|
|[7.4,0.7,0.0,1.9,...|      5|
|[7.4,0.66,0.0,1.8...|      5|
|[7.9,0.6,0.06,1.6...|      5|
|[7.3,0.65,0.0,1.2...|      7|
|[7.8,0.58,0.02,2....|      7|
|[7.5,0.5,0.36,6.1...|      5|
|[6.7,0.58,0.08,1....|      5|
|[7.5,0.5,0.36,6.1...|      5|
|[5.6,0.615,0.0,1....|      5|
|[7.8,0.61,0.29,1....|      5|
|[8.9,0.62,0.18,3....|      5|
|[8.9,0.62,0.19,3....|      5|
|[8.5,0.28,0.56,1....|      7|
|[8.1,0.56,0.28,1....|      5|
|[7.4,0.59,0.08,4....|      4|
|[7.9,0.32,0.51,1....|      6|
+--------------------+-------+
only showing top 20 rows



In [7]:
# Split the data into training (80%) and test (20%) sets.
# A fixed seed keeps the split, and every metric computed from it,
# repeatable across kernel restarts.
train, test = final_data.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Create the regression models; both predict the 'quality' column.
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
lr = LinearRegression(labelCol = 'quality')
# Random forest training is randomized, so pin the seed to make the
# fitted model (and its reported metrics) repeatable between runs.
rfr = RandomForestRegressor(labelCol = 'quality', maxDepth = 10, seed = 42)

# Fit both models on the training data
lrModel = lr.fit(train)
rfrModel = rfr.fit(train)

### Evaluation and Prediction

In [9]:
# Score the linear regression model on the held-out test set.

print('---------------- Linear Regression Model ----------------')
result_lr = lrModel.evaluate(test)
for metric_label, metric_value in (
    ("RMSE", result_lr.rootMeanSquaredError),
    ("MSE", result_lr.meanSquaredError),
    ("R2", result_lr.r2),
):
    print("{}: {}".format(metric_label, metric_value))

---------------- Linear Regression Model ----------------
RMSE: 0.6756515932403583
MSE: 0.45650507544823454
R2: 0.34997807416261095


In [10]:
# Predict wine quality for the test set with the linear regression model.
# transform() returns the test rows with a 'prediction' column added;
# truncate = False prints each feature vector in full instead of abbreviating it.
prediction = lrModel.transform(test)
prediction.show(truncate = False)

+-----------------------------------------------------------+-------+------------------+
|features                                                   |quality|prediction        |
+-----------------------------------------------------------+-------+------------------+
|[5.0,0.42,0.24,2.0,0.06,19.0,50.0,0.9917,3.72,0.74,14.0]   |8      |6.697197514761346 |
|[5.0,0.74,0.0,1.2,0.041,16.0,46.0,0.99258,4.01,0.59,12.5]  |6      |5.7483453533740345|
|[5.1,0.585,0.0,1.7,0.044,14.0,86.0,0.99264,3.56,0.94,12.9] |7      |6.431535554961481 |
|[5.4,0.58,0.08,1.9,0.059,20.0,31.0,0.99484,3.5,0.64,10.2]  |6      |5.538217137536897 |
|[5.6,0.31,0.37,1.4,0.074,12.0,96.0,0.9954,3.32,0.58,9.2]   |5      |5.236398739076211 |
|[5.6,0.605,0.05,2.4,0.073,19.0,25.0,0.99258,3.56,0.55,12.9]|5      |6.17616922927802  |
|[5.6,0.615,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9]   |5      |5.130264783962634 |
|[5.6,0.66,0.0,2.2,0.087,3.0,11.0,0.99378,3.71,0.63,12.8]   |7      |6.050557086780039 |
|[5.6,0.66,0.0,2.5,0.

In [11]:
# Predict wine quality for the test set with the random forest model.
# The result is reused below when the forest is evaluated.
prediction_rfr = rfrModel.transform(test)
prediction_rfr.show()

+--------------------+-------+------------------+
|            features|quality|        prediction|
+--------------------+-------+------------------+
|[5.0,0.42,0.24,2....|      8| 6.611111111111112|
|[5.0,0.74,0.0,1.2...|      6|6.0526785714285705|
|[5.1,0.585,0.0,1....|      7| 6.914285714285714|
|[5.4,0.58,0.08,1....|      6| 5.612744822635267|
|[5.6,0.31,0.37,1....|      5| 5.207173710792132|
|[5.6,0.605,0.05,2...|      5| 5.878205128205129|
|[5.6,0.615,0.0,1....|      5| 4.981385116136996|
|[5.6,0.66,0.0,2.2...|      7| 6.361538461538461|
|[5.6,0.66,0.0,2.5...|      5|5.7666666666666675|
|[5.6,0.915,0.0,2....|      5|            5.9325|
|[5.7,0.6,0.0,1.4,...|      6|           6.08125|
|[5.8,0.29,0.26,1....|      6|              6.35|
|[5.9,0.44,0.0,1.6...|      6| 6.526666666666666|
|[5.9,0.46,0.0,1.9...|      5| 5.817948717948719|
|[6.0,0.31,0.47,3....|      6| 6.333295454545455|
|[6.0,0.54,0.06,1....|      6| 5.486606595159226|
|[6.1,0.38,0.15,1....|      5| 5.143589708285061|


In [12]:
# Score the random forest predictions with RegressionEvaluator,
# using one evaluator instance per metric.
from pyspark.ml.evaluation import RegressionEvaluator

# Same label/prediction columns for all three; only the metric differs.
rmse_eval, mse_eval, r2_eval = (
    RegressionEvaluator(labelCol = 'quality', predictionCol = 'prediction', metricName = metric)
    for metric in ('rmse', 'mse', 'r2')
)

rmse, mse, r2 = (
    evaluator.evaluate(prediction_rfr)
    for evaluator in (rmse_eval, mse_eval, r2_eval)
)

print('---------------- Random Forest Model ----------------')
for metric_label, metric_value in (("RMSE", rmse), ("MSE", mse), ("R2", r2)):
    print("{}: {}".format(metric_label, metric_value))

---------------- Random Forest Model ----------------
RMSE: 0.6152923469245164
MSE: 0.3785846721838795
R2: 0.4609296786813334
