<img src='logo/dsl-logo.png' width="500" align="center" />

# HR Competition

## Decision Tree

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression

In [2]:
# Definition einer Klasse für Text Styles
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [3]:
#Laden der Trainings- und Testdaten
X = np.load(file='exchange/hr_06_X.npy')
X_train = np.load(file='exchange/hr_06_X_train.npy')
X_train_scaled = np.load(file='exchange/hr_06_X_train_scaled.npy')
X_test = np.load(file='exchange/hr_06_X_test.npy')
X_test_scaled = np.load(file='exchange/hr_06_X_test_scaled.npy')
y = np.load(file='exchange/hr_06_y.npy')
y_train = np.load(file='exchange/hr_06_y_train.npy')
y_test = np.load(file='exchange/hr_06_y_test.npy')
#df = np.load(file='exchange/hr_06_premodel.npy')

In [4]:
#Decision Tree initialisieren
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)

In [5]:
clf.score(X_test_scaled, y_test)

0.97875000000000001

In [6]:
clf_predictions = clf.predict(X_test_scaled)

In [7]:
print(confusion_matrix(y_test, clf_predictions))

[[1814   33]
 [  18  535]]


In [8]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99      1847
          1       0.94      0.97      0.95       553

avg / total       0.98      0.98      0.98      2400



In [9]:
clf_reg = tree.DecisionTreeRegressor()
clf_reg = clf.fit(X_train_scaled, y_train)

In [10]:
clf_reg.score(X_test_scaled, y_test)

0.97875000000000001

In [11]:
clf_reg_predictions = clf.predict(X_test_scaled)
print(confusion_matrix(y_test, clf_reg_predictions))

[[1813   34]
 [  17  536]]


In [12]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99      1847
          1       0.94      0.97      0.95       553

avg / total       0.98      0.98      0.98      2400



#### Graphische Darstellung des Entscheidungsbaums

In [13]:
# Install graphviz on MacOS: http://macappstore.org/graphviz/
!pip install graphviz
import graphviz



In [14]:
dot_data = tree.export_graphviz(clf, out_file='temp/tree.dot')


In [15]:
graph = graphviz.Source('temp/tree.dot')


In [16]:
! pip install pydotplus



In [17]:
from sklearn.externals.six import StringIO  
import pydotplus
from IPython.display import Image


In [19]:
dotfile = StringIO() 
tree.export_graphviz(clf, out_file=dotfile) 
pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("temp/my_tree.png");
print("Decision Tree Graph:")

<img src='temp/my_tree.png' width="100%" align="center" />

#### w/o department

In [20]:
#Laden der Trainings- und Testdaten
X = np.load(file='exchange/hr_06_X_wodept.npy')
X_train = np.load(file='exchange/hr_06_X_wodept_train.npy')
X_train_scaled = np.load(file='exchange/hr_06_X_wodept_train_scaled.npy')
X_test = np.load(file='exchange/hr_06_X_wodept_test.npy')
X_test_scaled = np.load(file='exchange/hr_06_X_wodept_test_scaled.npy')
y = np.load(file='exchange/hr_06_y_wodept.npy')
y_train = np.load(file='exchange/hr_06_y_wodept_train.npy')
y_test = np.load(file='exchange/hr_06_y_wodept_test.npy')

In [21]:
#Decision Tree initialisieren
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)

In [22]:
clf.score(X_test_scaled, y_test)

0.98250000000000004

In [23]:
clf_predictions = clf.predict(X_test_scaled)

In [24]:
cm = confusion_matrix(y_test, clf_predictions)
print(cm)

[[1824   23]
 [  19  534]]


In [25]:
cm_dict = np.load("exchange/hr_20_cm_dict.npy")
print(cm_dict.item())

{'neural_net': {'cm': array([[1821,   26],
       [  57,  496]], dtype=int64), 'label': 'Neural Network'}, 'random_forest': {'cm': array([[1790,   57],
       [ 182,  371]], dtype=int64), 'label': 'Random Forest'}, 'logistic_regression': {'cm': array([[1736,  111],
       [ 345,  208]], dtype=int64), 'label': 'Logistic Regression'}, 'decision_tree': {'cm': array([[1822,   25],
       [  19,  534]], dtype=int64), 'label': 'Decision Tree'}, 'svm': {'cm': array([[1776,   71],
       [ 142,  411]], dtype=int64), 'label': 'Support Vector Machine'}}


In [26]:
cm_dict.item().update({'decision_tree':{'cm': cm, 'label': 'Decision Tree' }})
print(cm_dict)
np.save("exchange/hr_20_cm_dict.npy", cm_dict)

{'neural_net': {'cm': array([[1821,   26],
       [  57,  496]], dtype=int64), 'label': 'Neural Network'}, 'random_forest': {'cm': array([[1790,   57],
       [ 182,  371]], dtype=int64), 'label': 'Random Forest'}, 'logistic_regression': {'cm': array([[1736,  111],
       [ 345,  208]], dtype=int64), 'label': 'Logistic Regression'}, 'decision_tree': {'cm': array([[1824,   23],
       [  19,  534]], dtype=int64), 'label': 'Decision Tree'}, 'svm': {'cm': array([[1776,   71],
       [ 142,  411]], dtype=int64), 'label': 'Support Vector Machine'}}


In [27]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1847
          1       0.96      0.97      0.96       553

avg / total       0.98      0.98      0.98      2400



In [28]:
clf_reg = tree.DecisionTreeRegressor()
clf_reg = clf.fit(X_train_scaled, y_train)

In [29]:
clf_reg.score(X_test_scaled, y_test)

0.98208333333333331

In [30]:
clf_reg_predictions = clf.predict(X_test_scaled)
print(confusion_matrix(y_test, clf_reg_predictions))

[[1822   25]
 [  18  535]]


In [31]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1847
          1       0.96      0.97      0.96       553

avg / total       0.98      0.98      0.98      2400



#### Graphische Darstellung des Entscheidungsbaums

In [32]:
# Install graphviz on MacOS: http://macappstore.org/graphviz/
!pip install graphviz
import graphviz



In [33]:
dot_data = tree.export_graphviz(clf, out_file='temp/tree.dot')


In [34]:
graph = graphviz.Source('temp/tree.dot')


In [35]:
! pip install pydotplus



In [36]:
from sklearn.externals.six import StringIO  
import pydotplus
from IPython.display import Image


In [38]:
dotfile = StringIO() 
tree.export_graphviz(clf, out_file=dotfile) 
pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("temp/my_tree_wodept.png");
print("Decision Tree Graph:")

<img src='temp/my_tree_wodept.png' width="100%" align="center" />

#### w/o Department and Salary

In [39]:
#Laden der Trainings- und Testdaten
X = np.load(file='exchange/hr_06_X_wodeptsal.npy')
X_train = np.load(file='exchange/hr_06_X_wodeptsal_train.npy')
X_train_scaled = np.load(file='exchange/hr_06_X_wodeptsal_train_scaled.npy')
X_test = np.load(file='exchange/hr_06_X_wodeptsal_test.npy')
X_test_scaled = np.load(file='exchange/hr_06_X_wodeptsal_test_scaled.npy')
y = np.load(file='exchange/hr_06_y_wodeptsal.npy')
y_train = np.load(file='exchange/hr_06_y_wodeptsal_train.npy')
y_test = np.load(file='exchange/hr_06_y_wodeptsal_test.npy')

In [40]:
#Decision Tree initialisieren
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)

In [41]:
clf.score(X_test_scaled, y_test)

0.98124999999999996

In [42]:
clf_predictions = clf.predict(X_test_scaled)

In [43]:
cm = confusion_matrix(y_test, clf_predictions)
print(cm)

[[1822   25]
 [  20  533]]


In [44]:
cm_dict = np.load("exchange/hr_20_cm_dict.npy")
print(cm_dict.item())

{'neural_net': {'cm': array([[1821,   26],
       [  57,  496]], dtype=int64), 'label': 'Neural Network'}, 'random_forest': {'cm': array([[1790,   57],
       [ 182,  371]], dtype=int64), 'label': 'Random Forest'}, 'logistic_regression': {'cm': array([[1736,  111],
       [ 345,  208]], dtype=int64), 'label': 'Logistic Regression'}, 'decision_tree': {'cm': array([[1824,   23],
       [  19,  534]], dtype=int64), 'label': 'Decision Tree'}, 'svm': {'cm': array([[1776,   71],
       [ 142,  411]], dtype=int64), 'label': 'Support Vector Machine'}}


In [45]:
cm_dict.item().update({'decision_tree':{'cm': cm, 'label': 'Decision Tree' }})
print(cm_dict)
np.save("exchange/hr_20_cm_dict.npy", cm_dict)

{'neural_net': {'cm': array([[1821,   26],
       [  57,  496]], dtype=int64), 'label': 'Neural Network'}, 'random_forest': {'cm': array([[1790,   57],
       [ 182,  371]], dtype=int64), 'label': 'Random Forest'}, 'logistic_regression': {'cm': array([[1736,  111],
       [ 345,  208]], dtype=int64), 'label': 'Logistic Regression'}, 'decision_tree': {'cm': array([[1822,   25],
       [  20,  533]], dtype=int64), 'label': 'Decision Tree'}, 'svm': {'cm': array([[1776,   71],
       [ 142,  411]], dtype=int64), 'label': 'Support Vector Machine'}}


In [46]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1847
          1       0.96      0.96      0.96       553

avg / total       0.98      0.98      0.98      2400



In [47]:
clf_reg = tree.DecisionTreeRegressor()
clf_reg = clf.fit(X_train_scaled, y_train)

In [48]:
clf_reg.score(X_test_scaled, y_test)

0.98124999999999996

In [49]:
clf_reg_predictions = clf.predict(X_test_scaled)
print(confusion_matrix(y_test, clf_reg_predictions))

[[1820   27]
 [  18  535]]


In [50]:
print(classification_report(y_test, clf_predictions))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1847
          1       0.96      0.96      0.96       553

avg / total       0.98      0.98      0.98      2400



In [57]:
#keine Veränderung durch Rausnehmen der Salary-Daten - sogar geringfügig schlechter geworden 

#### Graphische Darstellung des Entscheidungsbaums

In [51]:
# Install graphviz on MacOS: http://macappstore.org/graphviz/
!pip install graphviz
import graphviz



In [53]:
dot_data = tree.export_graphviz(clf, out_file='temp/tree.dot')

In [54]:
graph = graphviz.Source('temp/tree.dot')

In [55]:
from sklearn.externals.six import StringIO  
import pydotplus
from IPython.display import Image

In [56]:
dotfile = StringIO() 
tree.export_graphviz(clf, out_file=dotfile) 
pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("temp/my_tree_wodept.png");
print("Decision Tree Graph:")