In [1]:
# Attach SparkR from the R library that ships under SPARK_HOME,
# then open a Spark session for this notebook.
library(
  SparkR,
  lib.loc = file.path(Sys.getenv("SPARK_HOME"), "R", "lib")
)
sparkR.session()





Java ref type org.apache.spark.sql.SparkSession id 1 

In [2]:
# Load the semicolon-delimited wine-quality CSV into a Spark DataFrame.
# The first row supplies the column names; column types are inferred.
data <- read.df(
  path = 'winequality-red.csv',
  source = 'csv',
  sep = ';',
  header = TRUE,
  inferSchema = TRUE
)

In [3]:
# Print the column names and inferred types of `data`.
printSchema(data)

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)


In [4]:
# Print the leading rows of `data` as a text table.
showDF(data)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [5]:
# Print per-column summary statistics (count, mean, ...) for `data`.
showDF(describe(data))

+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|summary|     fixed acidity|   volatile acidity|        citric acid|    residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|             density|                 pH|         sulphates|           alcohol|           quality|
+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|  count|              1599|               1599|               1599|              1599|                1599|               1599|                1599|                1599|               1599|              1599|              1599|              1599|
|   mean

In [6]:
# Partition `data` at random into a training part and a test part.
# The weights 8 and 2 ask for roughly an 80/20 division; the seed is fixed at 2.
train_test <- randomSplit(data, weights = c(8, 2), seed = 2)
train <- train_test[[1]]
test <- train_test[[2]]

In [7]:
# Print the leading rows of the training split.
showDF(train)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+----------------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|         alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+----------------+-------+
|          4.6|            0.52|       0.15|           2.1|    0.054|                8.0|                65.0| 0.9934| 3.9|     0.56|            13.1|      4|
|          4.7|             0.6|       0.17|           2.3|    0.058|               17.0|               106.0| 0.9932|3.85|      0.6|            12.9|      6|
|          4.9|            0.42|        0.0|           2.1|    0.048|               16.0|                42.0|0.99154|3.71|     0.74|            14.0|      7|
|          5.0|            0.38|       0.01|  

In [8]:
# Fit two regression models that predict `quality` from every other column.

# Linear regression, expressed as a GLM with a gaussian family.
lr <- spark.glm(data = train, formula = quality ~ ., family = 'gaussian')

# Random forest regressor: 300 trees, each with maxDepth 10.
# A fixed seed (the same value used for the train/test split) makes the
# randomized tree construction repeatable across kernel restarts.
rfr <- spark.randomForest(
  data = train,
  formula = quality ~ .,
  type = 'regression',
  maxDepth = 10,
  numTrees = 300,
  seed = 2
)

### Evaluation and Prediction

In [9]:
# Show the fitted linear model: deviance residuals and coefficient table.
summary(lr)


Deviance Residuals: 
(Note: These are approximate quantiles with relative error <= 0.01)
     Min        1Q    Median        3Q       Max  
-2.68077  -0.36570  -0.04897   0.42443   2.06857  

Coefficients:
                         Estimate  Std. Error   t value    Pr(>|t|)
(Intercept)            26.1244602  2.3749e+01   1.10003  2.7153e-01
fixed acidity           0.0349074  2.9434e-02   1.18595  2.3586e-01
volatile acidity       -1.1328102  1.3748e-01  -8.23987  4.4409e-16
citric acid            -0.2039941  1.6357e-01  -1.24714  2.1258e-01
residual sugar          0.0209720  1.7111e-02   1.22568  2.2055e-01
chlorides              -1.7391009  4.6624e-01  -3.73004  1.9986e-04
free sulfur dioxide     0.0048271  2.4322e-03   1.98466  4.7396e-02
total sulfur dioxide   -0.0034002  8.3399e-04  -4.07706  4.8431e-05
density               -22.1148993  2.4257e+01  -0.91171  3.6209e-01
pH                     -0.4402051  2.1582e-01  -2.03966  4.1590e-02
sulphates               0.8441003  1.2549e-01

In [10]:
# Score the test split with the linear regression model and show the
# actual `quality` next to the model's `prediction`.
result_lr <- predict(lr, newData = test)
showDF(
  select(result_lr, c('quality', 'prediction'))
)

+-------+------------------+
|quality|        prediction|
+-------+------------------+
|      6| 5.699336900738054|
|      7|6.3538428814908094|
|      7| 6.639521439551206|
|      7| 5.966791188545535|
|      6| 5.648913204710599|
|      5| 6.197362471578224|
|      6| 6.069412116112822|
|      4|4.4169985026205225|
|      6| 6.333236267202281|
|      6| 5.825963129537879|
|      6| 6.214705079423943|
|      5| 5.311939625018791|
|      6| 6.032900208012123|
|      7| 5.735436362672072|
|      6| 6.084478118304229|
|      7| 6.152492547424231|
|      6| 5.900679936745661|
|      6| 5.846438530824212|
|      6|  5.91915943596554|
|      4| 5.189911375022479|
+-------+------------------+
only showing top 20 rows


In [11]:
# Score the test split with the random forest regressor (`rfr`) and show
# the actual `quality` next to the model's `prediction`.
result_rfr <- predict(rfr, newData = test)
showDF(
  select(result_rfr, c('quality', 'prediction'))
)

+-------+------------------+
|quality|        prediction|
+-------+------------------+
|      6| 5.896950878790396|
|      7|6.9011025071995755|
|      7| 6.982973787288246|
|      7| 7.147514467014466|
|      6| 5.504643711272518|
|      5|  5.63091650486391|
|      6| 5.591916771249731|
|      4| 4.325797019334792|
|      6|  6.37770539803229|
|      6|6.0045228230422225|
|      6| 5.981974523545433|
|      5|   5.0100671101453|
|      6|  6.39300638532546|
|      7| 5.835242189712889|
|      6| 6.154240868520335|
|      7| 6.334249663640181|
|      6| 5.760700024459329|
|      6| 5.830401585147474|
|      6| 5.944411245669022|
|      4| 5.224281681461895|
+-------+------------------+
only showing top 20 rows
