# CART Notebook

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

## 1.2 Data Import and Pre-Processing

In [19]:
# Ordinal encoding for Education: a higher degree gets a larger code.
edlevel = {'Bachelors': 1, 'Masters': 2, 'PHD': 3}

# Load the employee records, drop exact duplicate rows, then derive:
#   Duration - 2020 minus JoiningYear
#   EduLevel - numeric code for Education (values not in edlevel become NaN)
d = (
    pd.read_csv("../data/Employee.csv")
    .drop_duplicates()
    .assign(
        Duration=lambda frame: 2020 - frame['JoiningYear'],
        EduLevel=lambda frame: frame['Education'].map(edlevel),
    )
)

## 1.3 Split Data into Train and Test

In [20]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
dtrain, dtest = train_test_split(d, random_state=7, test_size=.33)

# Bar chart of row counts as a visual sanity check on the split sizes.
x = ['Original Dataset', 'Training Data', 'Test Data']
y = [frame.shape[0] for frame in (d, dtrain, dtest)]
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(title_text='Confirming Split')
fig.show()

## 1.4 Check for/Fix Any Imbalance Issues

In [21]:
# Count how many training rows fall in each LeaveOrNot class, to gauge class imbalance.
dtrain['LeaveOrNot'].value_counts()

0    1131
1     720
Name: LeaveOrNot, dtype: int64

In [22]:
# Percentage of training rows whose LeaveOrNot label is 1.
ratio = (
    dtrain['LeaveOrNot'].value_counts().loc[1]  # rows labelled 1
    / dtrain.shape[0]                           # all training rows
    * 100
)
ratio

38.897893030794165

## 1.5 Prepare Data for CART

In [23]:
# Targets are kept as single-column DataFrames, taken before the frames are re-encoded.
ytrain = dtrain[['LeaveOrNot']]
ytest = dtest[['LeaveOrNot']]

# One-hot encode the categorical predictors (all levels kept, no reference level dropped).
dummy_cols = ["City", "Gender", "EverBenched"]
dtrain = pd.get_dummies(dtrain, prefix=None, columns=dummy_cols, drop_first=False)
dtest = pd.get_dummies(dtest, prefix=None, columns=dummy_cols, drop_first=False)

# get_dummies only creates columns for the categories present in the frame it is given,
# so the two splits can end up with different columns. Force the test frame onto the
# training layout: indicator columns missing from the test split are added as all-zero,
# and columns that exist only in the test split are dropped.
dtest = dtest.reindex(columns=dtrain.columns, fill_value=0)

## 1.6 Create Xtrain and Xtest

In [24]:
# List the column names of the training frame so the feature columns can be picked by name.
dtrain.columns

Index(['Education', 'JoiningYear', 'PaymentTier', 'Age',
       'ExperienceInCurrentDomain', 'LeaveOrNot', 'Duration', 'EduLevel',
       'City_Bangalore', 'City_New Delhi', 'City_Pune', 'Gender_Female',
       'Gender_Male', 'EverBenched_No', 'EverBenched_Yes'],
      dtype='object')

In [25]:
# Feature matrices for the Duration / PaymentTier / Gender / City model.
# The same column selection is applied to the training frame (Xdpgc) and the
# test frame (Xtdpgc) so both matrices share one column order.
Xdpgc, Xtdpgc = (
    frame[[
        "Duration",
        "PaymentTier",
        "Gender_Female",
        "Gender_Male",
        "City_Bangalore",
        "City_New Delhi",
        "City_Pune",
    ]]
    for frame in (dtrain, dtest)
)

In [26]:
# Human-readable names for the model's feature columns.
X_names = [
    "Duration",
    "PaymentTier",
    "Gender_Female",
    "Gender_Male",
    "City_Bangalore",
    "City_New Delhi",
    "City_Pune",
]

# Display labels for the two target classes
# (presumably ordered to match LeaveOrNot = 0, then 1 - confirm against where they are used).
# Fixed the "Wil" typo in the second label.
y_names = ["No, Won't Leave", "Yes, Will Leave"]

## 1.7 Create Model

In [27]:
# Fit a small CART classifier (Gini impurity, at most 5 leaves) on the training features.
# random_state pins the feature permutation scikit-learn applies at each split, so ties
# between equally good splits resolve the same way on every run; 7 matches the seed
# already used for train_test_split.
cart = DecisionTreeClassifier(
    criterion="gini",
    max_leaf_nodes=5,
    random_state=7,
).fit(Xdpgc, ytrain)

## 1.8 Metrics for Model

In [28]:
# Predict with the fitted tree on the test feature matrix.
predict = cart.predict(Xtdpgc)

In [29]:
# Confusion matrix of actual (rows) vs. predicted (columns) classes on the test split.
# labels=[0, 1] pins the matrix to a 2x2 layout in class order 0, 1 even when one class
# is absent from ytest/predict; without it the matrix would shrink and positional
# indexing such as cm[0][1] would fail.
cm = confusion_matrix(ytest, predict, labels=[0, 1])
cm

array([[485,  60],
       [202, 166]], dtype=int64)

In [30]:
# Unpack the 2x2 confusion matrix: row = actual class, column = predicted class.
TN, FP = cm[0, 0], cm[0, 1]  # actual 0: predicted 0 / predicted 1
FN, TP = cm[1, 0], cm[1, 1]  # actual 1: predicted 0 / predicted 1

In [31]:
def _f_beta(beta, precision, recall):
    """F-beta score: (1 + beta^2) * P * R / (beta^2 * P + R)."""
    beta_sq = beta ** 2
    return ((1 + beta_sq) * precision * recall) / ((beta_sq * precision) + recall)


# Grand total of classified test rows.
GT = TN + FP + FN + TP

# Overall hit rate and its complement.
Accuracy = (TN + TP) / GT
ErrorRate = 1 - Accuracy

# Per-class rates.
Sensitivity = TP / (FN + TP)   # share of actual positives that were caught
Recall = Sensitivity           # same quantity under its other name
Specificity = TN / (TN + FP)   # share of actual negatives that were caught
Precision = TP / (FP + TP)     # share of predicted positives that were right

# F-scores: beta = 1 balances precision and recall, beta = 2 favours recall,
# beta = 0.5 favours precision.
F1 = _f_beta(1, Precision, Recall)
F2 = _f_beta(2, Precision, Recall)
F0_5 = _f_beta(.5, Precision, Recall)

In [32]:
# Emit the headline metrics one per line, in the order:
# accuracy, error rate, sensitivity, specificity, precision, F1, F2.
print(
    Accuracy,
    ErrorRate,
    Sensitivity,
    Specificity,
    Precision,
    F1,
    F2,
    sep="\n",
)

0.7130339539978094
0.28696604600219056
0.45108695652173914
0.8899082568807339
0.7345132743362832
0.5589225589225588
0.48881036513545345
