In [1]:
import findspark
findspark.init()

In [2]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
# Đọc data từ file Absenteeism_at_work.csv
data = spark.read.csv("Absenteeism_at_work.csv", header = True, sep=",")

In [4]:
data = data.withColumn("MOA", data["Month of absence"] - 0). \
            withColumn("label", data['Height'] - 0). \
            withColumn("ROA", data["Reason for absence"] - 0). \
            withColumn("distance", data["Distance from Residence to Work"] - 0). \
            withColumn("BMI", data["Body mass index"] - 0)

In [33]:
data.select('ROA').show(5)

+----+
| ROA|
+----+
|26.0|
| 0.0|
|23.0|
| 7.0|
|23.0|
+----+
only showing top 5 rows



In [34]:
data.select('BMI').show(5)

+----+
| BMI|
+----+
|30.0|
|31.0|
|31.0|
|24.0|
|30.0|
+----+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import VectorAssembler
assem = VectorAssembler(inputCols=["label", "distance"], outputCol='features')
assem

VectorAssembler_72280e10d1b8

In [6]:
data_assem = assem.transform(data)

In [10]:
data_assem[['features']].show()

+------------+
|    features|
+------------+
|[172.0,36.0]|
|[178.0,13.0]|
|[170.0,51.0]|
| [168.0,5.0]|
|[172.0,36.0]|
|[170.0,51.0]|
|[172.0,52.0]|
|[168.0,50.0]|
|[196.0,12.0]|
|[172.0,11.0]|
|[168.0,50.0]|
|[168.0,50.0]|
|[168.0,50.0]|
|[170.0,51.0]|
|[170.0,51.0]|
|[170.0,25.0]|
|[170.0,51.0]|
|[170.0,51.0]|
|[167.0,29.0]|
|[165.0,25.0]|
+------------+
only showing top 20 rows



In [11]:
from pyspark.ml.feature import StringIndexer
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data_assem)

In [13]:
from pyspark.ml.feature import VectorIndexer
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data_assem)

In [36]:
featureIndexer.categoryMaps

{}

In [14]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data_assem.randomSplit([0.7, 0.3])

In [15]:
from pyspark.ml.classification import RandomForestClassifier
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

In [16]:
from pyspark.ml.feature import IndexToString
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [17]:
y_true = data_assem.select("BMI").rdd.flatMap(lambda x: x).collect()
y_pred = data_assem.select("ROA").rdd.flatMap(lambda x: x).collect()

In [19]:
print(y_true[:5])
print()
print(y_pred[:5])

[30.0, 31.0, 31.0, 24.0, 30.0]

[26.0, 0.0, 23.0, 7.0, 23.0]


In [27]:
from pyspark.ml import Pipeline
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [28]:
# Make predictions.
predictions = model.transform(testData)

In [29]:
# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show()

+--------------+-----+------------+
|predictedLabel|label|    features|
+--------------+-----+------------+
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,11.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,52.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
|         172.0|172.0|[172.0,36.0]|
+--------------+-----+------------+
only showing top 20 rows



In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

accuracy = evaluator.evaluate(predictions)

confusionmatrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred, average='micro')

recall = recall_score(y_true, y_pred, average='micro')

rfModel = model.stages[2]

In [32]:
print(rfModel)  # summary only
print("Random Forest - Test Accuracy = %g" % (accuracy))
print("Random Forest - Test Error = %g" % (1.0 - accuracy))

print("The Confusion Matrix for Random Forest Model is :\n" + str(confusionmatrix))

print("The precision score for Random Forest Model is: " + str(precision))

print("The recall score for Random Forest Model is: " + str(recall))

RandomForestClassificationModel (uid=RandomForestClassifier_dde3ca3ea1c0) with 10 trees
Random Forest - Test Accuracy = 1
Random Forest - Test Error = 0
The Confusion Matrix for Random Forest Model is :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [5 0 0 ... 0 0 0]]
The precision score for Random Forest Model is: 0.02972972972972973
The recall score for Random Forest Model is: 0.02972972972972973
