In [1]:
import findspark
findspark.init()

In [2]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
data = spark.read.csv("Absenteeism_at_work.csv", header = True, sep=",")

In [5]:
data.schema

StructType(List(StructField(ID,StringType,true),StructField(Reason for absence,StringType,true),StructField(Month of absence,StringType,true),StructField(Day of the week,StringType,true),StructField(Seasons,StringType,true),StructField(Transportation expense,StringType,true),StructField(Distance from Residence to Work,StringType,true),StructField(Service time,StringType,true),StructField(Age,StringType,true),StructField(Work load Average/day ,StringType,true),StructField(Hit target,StringType,true),StructField(Disciplinary_failure,StringType,true),StructField(Education,StringType,true),StructField(Son,StringType,true),StructField(Social drinker,StringType,true),StructField(Social smoker,StringType,true),StructField(Pet,StringType,true),StructField(Weight,StringType,true),StructField(Height,StringType,true),StructField(Body mass index,StringType,true),StructField(Absenteeism_time_in_hours,StringType,true)))

---

In [4]:
data = data.withColumn("MOA", data["Month of absence"] - 0). \
            withColumn("label", data['Height'] - 0). \
            withColumn("ROA", data["Reason for absence"] - 0). \
            withColumn("distance", data["Distance from Residence to Work"] - 0). \
            withColumn("BMI", data["Body mass index"] - 0)

In [8]:
data[["MOA", "label", "ROA", "distance", "BMI"]].show()

+---+-----+----+--------+----+
|MOA|label| ROA|distance| BMI|
+---+-----+----+--------+----+
|7.0|172.0|26.0|    36.0|30.0|
|7.0|178.0| 0.0|    13.0|31.0|
|7.0|170.0|23.0|    51.0|31.0|
|7.0|168.0| 7.0|     5.0|24.0|
|7.0|172.0|23.0|    36.0|30.0|
|7.0|170.0|23.0|    51.0|31.0|
|7.0|172.0|22.0|    52.0|27.0|
|7.0|168.0|23.0|    50.0|23.0|
|7.0|196.0|19.0|    12.0|25.0|
|7.0|172.0|22.0|    11.0|29.0|
|7.0|168.0| 1.0|    50.0|23.0|
|7.0|168.0| 1.0|    50.0|23.0|
|7.0|168.0|11.0|    50.0|23.0|
|7.0|170.0|11.0|    51.0|31.0|
|7.0|170.0|23.0|    51.0|31.0|
|7.0|170.0|14.0|    25.0|23.0|
|7.0|170.0|23.0|    51.0|31.0|
|7.0|170.0|21.0|    51.0|31.0|
|7.0|167.0|11.0|    29.0|25.0|
|8.0|165.0|23.0|    25.0|32.0|
+---+-----+----+--------+----+
only showing top 20 rows



In [7]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

In [8]:
assem = VectorAssembler(inputCols=['label', 'distance'], outputCol='features')
data = assem.transform(data)

In [9]:
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)

In [10]:
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

In [11]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [12]:
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [13]:
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

In [14]:
model = pipeline.fit(trainingData)

In [15]:
# Make predictions.
predictions = model.transform(testData)

In [16]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+------------+
|prediction|indexedLabel|    features|
+----------+------------+------------+
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
+----------+------------+------------+
only showing top 5 rows



In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

accuracy = evaluator.evaluate(predictions)

y_true = data.select("BMI").rdd.flatMap(lambda x: x).collect()
y_pred = data.select("ROA").rdd.flatMap(lambda x: x).collect()

confusionmatrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred, average='micro')

recall = recall_score(y_true, y_pred, average='micro')

In [21]:
treeModel = model.stages[2]

In [22]:
print(treeModel)
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

print("The Confusion Matrix for Decision Tree Model is :\n" + str(confusionmatrix))

print("The precision score for Decision Tree Model is: " + str(precision))

print("The recall score for Decision Tree Model is: " + str(recall))

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_853265e13f53) of depth 5 with 21 nodes
Decision Tree - Test Accuracy = 0.970588
Decision Tree - Test Error = 0.0294118
The Confusion Matrix for Decision Tree Model is :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [5 0 0 ... 0 0 0]]
The precision score for Decision Tree Model is: 0.02972972972972973
The recall score for Decision Tree Model is: 0.02972972972972973
