In [1]:
import findspark
findspark.init()

In [2]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
# Đọc data từ file Absenteeism_at_work.csv
data = spark.read.csv("Absenteeism_at_work.csv", inferSchema=True, header = True, sep=",")

In [4]:
data.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Reason for absence: integer (nullable = true)
 |-- Month of absence: integer (nullable = true)
 |-- Day of the week: integer (nullable = true)
 |-- Seasons: integer (nullable = true)
 |-- Transportation expense: integer (nullable = true)
 |-- Distance from Residence to Work: integer (nullable = true)
 |-- Service time: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Work load Average/day : double (nullable = true)
 |-- Hit target: integer (nullable = true)
 |-- Disciplinary_failure: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- Son: integer (nullable = true)
 |-- Social drinker: integer (nullable = true)
 |-- Social smoker: integer (nullable = true)
 |-- Pet: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Body mass index: integer (nullable = true)
 |-- Absenteeism_time_in_hours: integer (nullable = true)



In [5]:
data = data.withColumn("MOA", data["Month of absence"]). \
            withColumn("label", data['Height']). \
            withColumn("ROA", data["Reason for absence"]). \
            withColumn("distance", data["Distance from Residence to Work"]). \
            withColumn("BMI", data["Body mass index"])

In [6]:
from pyspark.ml.feature import VectorAssembler
assem = VectorAssembler(inputCols=["label", "distance"], outputCol='features')
data = assem.transform(data)

In [7]:
from pyspark.ml.feature import StringIndexer
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")

In [8]:
from pyspark.ml.feature import VectorIndexer
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

In [11]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1)

In [12]:
from pyspark.ml.classification import RandomForestClassifier
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

In [38]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [39]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [40]:
# Make predictions.
predictions = model.transform(testData)

In [41]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show()

+----------+------------+------------+
|prediction|indexedLabel|    features|
+----------+------------+------------+
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       8.0|        12.0|[163.0,29.0]|
|       8.0|        12.0|[163.0,29.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
|       0.0|         0.0|[170.0,51.0]|
+----------+------------+------------+
only showing top 20 rows



In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",\
                                              predictionCol="prediction",\
                                              metricName="accuracy")

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

y_true = data.select("BMI").rdd.flatMap(lambda x: x).collect()
y_pred = data.select("ROA").rdd.flatMap(lambda x: x).collect()

accuracy = evaluator.evaluate(predictions)

confusionmatrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred, average='micro')

recall = recall_score(y_true, y_pred, average='micro')

rfModel = model.stages[2]

In [44]:
print(rfModel)  # summary only
print("Random Forest - Test Accuracy = %g" % (accuracy))
print("Random Forest - Test Error = %g" % (1.0 - accuracy))

print("The Confusion Matrix for Random Forest Model is :\n" + str(confusionmatrix))

print("The precision score for Random Forest Model is: " + str(precision))

print("The recall score for Random Forest Model is: " + str(recall))

RandomForestClassificationModel (uid=RandomForestClassifier_93cc6145b3c1) with 10 trees
Random Forest - Test Accuracy = 0.977679
Random Forest - Test Error = 0.0223214
The Confusion Matrix for Random Forest Model is :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [5 0 0 ... 0 0 0]]
The precision score for Random Forest Model is: 0.02972972972972973
The recall score for Random Forest Model is: 0.02972972972972973
