Create a spark session and load the Housing Data set

In [0]:
from pyspark.sql import SparkSession


In [0]:
# File location and type
file_location = "/FileStore/tables/red_or_white_wine-6.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
 
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)
 

Data pre-processing

In [0]:
# Import the required libraries
 
from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

In [0]:
import pyspark.sql.functions as f

In [0]:
df=df.withColumn('type',df.type.cast('integer'))

In [0]:
data = df.select(['fixed acidity',
'volatile acidity',
'citric acid',
'residual sugar',
'chlorides',
'free sulfur dioxide',
'total sulfur dioxide',
'density',
'pH',
'sulphates',
'alcohol',
'quality',
'type'
])

In [0]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- type: integer (nullable = true)



In [0]:
df=df.dropna()

In [0]:
print((df.count(),len(df.columns)))

(0, 13)


In [0]:
# Create a 70-30 train test split
 
train_data,test_data=data.randomSplit([0.7,0.3])

In [0]:
print((train_data.count(),len(train_data.columns)))

(4564, 13)


In [0]:
print((test_data.count(),len(test_data.columns)))

(1933, 13)


In [0]:
# Import the required libraries
 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,StringIndexer ,OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import RandomForestClassifier

Decision Tree Classifier

In [0]:
type_indexer = StringIndexer(inputCol='type',outputCol='type_index',handleInvalid='keep')

In [0]:
assembler= VectorAssembler(
                inputCols=['fixed acidity',
'volatile acidity',
'citric acid',
'residual sugar',
'chlorides',
'free sulfur dioxide',
'total sulfur dioxide',
'density',
'pH',
'sulphates',
'alcohol',
'type_index'
],
outputCol="features")

In [0]:
rf_model = RandomForestClassifier(labelCol="quality", featuresCol="features", numTrees=10)

In [0]:
pipe = Pipeline(stages= [type_indexer,assembler,rf_model])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
# Store the results in a dataframe
 
results = fit_model.transform(test_data)

In [0]:
results.select(['quality','prediction']).show()

+-------+----------+
|quality|prediction|
+-------+----------+
|      8|       7.0|
|      4|       6.0|
|      6|       6.0|
|      5|       6.0|
|      7|       6.0|
|      6|       6.0|
|      7|       6.0|
|      7|       6.0|
|      6|       6.0|
|      5|       6.0|
|      5|       6.0|
|      6|       6.0|
|      7|       6.0|
|      6|       6.0|
|      6|       6.0|
|      7|       7.0|
|      6|       6.0|
|      6|       6.0|
|      6|       6.0|
|      7|       6.0|
+-------+----------+
only showing top 20 rows



Evaluating the model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)

In [0]:
print("The accuracy of the decision tree classifier is {}".format(accuracy))

The accuracy of the decision tree classifier is 0.5344024831867563
