In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=97eb03dab6da60d9b3d5e0ca2b50d9807665aa506c8972b36430c88b9100e683
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [13]:
import pandas as pd
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.sql.types import *
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import matplotlib.pyplot as plt

In [3]:
import sys 
sys.path.append("/content/drive/MyDrive/Introduction to Business Analytics")

from machine_learning.testing import evaluateModel
from machine_learning.training import trainModel

# **1. Load data**

In [5]:
spark = SparkSession.builder.master("local[2]").appName("Employee_Attrition").getOrCreate()

data_path = '/content/drive/MyDrive/Introduction to Business Analytics/WA_Fn-UseC_-HR-Employee-Attrition.csv'
schema_fields = []

data = pd.read_csv(data_path)
for col_index in range(data.shape[1]):
  field_name = data.columns[col_index]
  field_type = data.dtypes[col_index]

  if field_type == 'int64':
    schema_fields.append(
        StructField(field_name, FloatType(), nullable = True)
    )
  elif field_type == 'object':
        schema_fields.append(
        StructField(field_name, StringType(), nullable = True)
    )

df = spark.read.csv(path=data_path, schema= StructType(schema_fields)).cache()

In [6]:
df = spark.read.csv(path=data_path, schema= StructType(schema_fields)).cache()
df = df.where(df.EmployeeNumber.isNotNull())

df = df.where(df.EmployeeNumber.isNotNull())
df.show()

+----+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
| Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBala

In [7]:
discrete_attributes = ["NumCompaniesWorked", "TrainingTimesLastYear", ]
continous_attributes = ["Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome", 
                        "MonthlyRate", "PercentSalaryHike", "TotalWorkingYears",
                        "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"]
categorical_attributes = ["Department", "EducationField", 
                          "Gender", "JobRole", "MaritalStatus"]
ordinal_attributes = ["BusinessTravel", "Education", "EnvironmentSatisfaction", "JobInvolvement", 
                      "JobLevel", "JobSatisfaction", "PerformanceRating",  "OverTime", 
                      "RelationshipSatisfaction", "StockOptionLevel",
                      "WorkLifeBalance"]
numeric_attributes = discrete_attributes + continous_attributes

target_name = "Attrition"

#**2. Decision Tree**

Split data to train and test, 80% is for training and the rest is for tesing.

In [11]:
train, test = df.randomSplit([0.8, 0.2])

Define decision tree and its hyper parameters for grid searching.

In [14]:
dt = DecisionTreeClassifier(featuresCol= 'features', labelCol = 'label')

dtParamGrid = ParamGridBuilder() \
    .addGrid(dt.minInstancesPerNode, [3, 5, 10, 15]) \
    .addGrid(dt.maxDepth, [3, 5, 10, 20]) \
    .addGrid(dt.maxBins, [8, 16, 32])\
    .build()

Using grid search to find best hyper parameter. The evaluation strategy is k-fold cross validation, in our case, k equals 5. In other words, each hyper parameters set is evaluated by 5-fold cross validation on traing data.  
The best model is tested on test data, results are following.

In [15]:
bestDT = trainModel(train, dt, dtParamGrid)

acc, confusion = evaluateModel(test, bestDT)
print("Accuracy:", acc)
print("Confusion matrix:", confusion)

Accuracy: 0.7675327083002278
Confusion matrix:
 [[197  39]
 [ 31  27]]
Accuracy: 0.7675327083002278
Confusion matrix: [[197  39]
 [ 31  27]]


#**3. Random forest**

Similar to Decision Tree, deinfe random foreset and hyper parameters grid to find best model by 5-fold cross validation on training data.

In [16]:
rf = RandomForestClassifier(featuresCol= 'features', labelCol = 'label')

rfParamGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 50, 100]) \
    .addGrid(rf.minInstancesPerNode, [3, 5, 10]) \
    .addGrid(dt.maxDepth, [3, 5, 10]) \
    .build()

Fit to training data and results are bellow. 

In [17]:
rfModel = trainModel(train_data = train, mlModel= rf, paramGrid= rfParamGrid)
acc, confusion = evaluateModel(test, rfModel)

Accuracy: 0.7436777885757478
Confusion matrix:
 [[235   1]
 [ 54   4]]
