## Modellierung Expertensystem

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [2]:
df = pd.read_pickle('data/hr_train_clean.pkl')

In [3]:
df_ids = df['id']

#### Manuelle Datenauswahl (Feature Selection)

Die Spalten "promotion_last_5years" und "department" weisen eine geringe Abhängigkeit zu "left" auf und werden daher entfernt. Zudem die Spalte "id" entfernen.

In [4]:
df = df.drop(['promotion_last_5years'], axis=1)

In [5]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,department,salary
0,0,0.65,0.96,5,226,2,1,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,sales,low
4,4,0.87,0.76,5,254,2,1,0,hr,low


In [6]:
df = df.drop(['department'], axis=1)

In [7]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0,0.65,0.96,5,226,2,1,0,medium
1,1,0.88,0.8,3,166,2,0,0,low
2,2,0.69,0.98,3,214,2,0,0,low
3,3,0.41,0.47,2,154,3,0,1,low
4,4,0.87,0.76,5,254,2,1,0,low


In [8]:
df = df.drop(['id'],axis=1)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0.65,0.96,5,226,2,1,0,medium
1,0.88,0.8,3,166,2,0,0,low
2,0.69,0.98,3,214,2,0,0,low
3,0.41,0.47,2,154,3,0,1,low
4,0.87,0.76,5,254,2,1,0,low


In [9]:
df =df.drop(['salary'],axis=1)
df =df.drop(['work_accident'],axis=1)

#### Datenaufbereitung - Feature Engineering

In [10]:
df_num = pd.get_dummies(df.drop('left', axis=1)).join(df[['left']])

In [11]:
df_num.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left
0,0.65,0.96,5,226,2,0
1,0.88,0.8,3,166,2,0
2,0.69,0.98,3,214,2,0
3,0.41,0.47,2,154,3,1
4,0.87,0.76,5,254,2,0


### Verwendung von Scikit Learn (sklearn) 


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#### Daten aufbereiten

Zielgrößenvektor y setzen:

In [13]:
y = df_num['left'].values
y

array([0, 0, 0, ..., 0, 0, 1])

Klassenlabel setzen:

In [14]:
class_names = np.unique(y)
class_names

array([0, 1])

Merkmalsnamen setzen:

In [15]:
feature_names = np.array(['satisfaction_level'
    ,'last_evaluation'
    ,'number_project'
    ,'average_monthly_hours'
    , 'time_spend_company'
    #, 'work_accident'
    #, 'salary_high'
    #, 'salary_low'
    #, 'salary_medium'
                         ])

Merkmalsmatrix erstellen:

In [16]:
x = df_num[feature_names].values
x

array([[  6.50000000e-01,   9.60000000e-01,   5.00000000e+00,
          2.26000000e+02,   2.00000000e+00],
       [  8.80000000e-01,   8.00000000e-01,   3.00000000e+00,
          1.66000000e+02,   2.00000000e+00],
       [  6.90000000e-01,   9.80000000e-01,   3.00000000e+00,
          2.14000000e+02,   2.00000000e+00],
       ..., 
       [  8.30000000e-01,   8.60000000e-01,   4.00000000e+00,
          9.80000000e+01,   4.00000000e+00],
       [  7.40000000e-01,   5.60000000e-01,   4.00000000e+00,
          2.54000000e+02,   2.00000000e+00],
       [  1.10000000e-01,   8.80000000e-01,   7.00000000e+00,
          2.72000000e+02,   4.00000000e+00]])

Trainingsdaten in internes Test- und Trainingsdaten-Set aufsplitten:

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

### Decision Tree

In [18]:
tree = DecisionTreeClassifier()

In [19]:
cv_scores = cross_val_score(tree, x_train, y_train, cv=5)

In [20]:
np.mean(cv_scores), np.std(cv_scores)

(0.96933394459286415, 0.0033693697607630681)

### Random Forest Tree

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rfc = RandomForestClassifier()

In [23]:
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)

In [24]:
np.mean(cv_scores), np.std(cv_scores)

(0.98173279348124143, 0.0024490949007914737)

In [25]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
list(zip(feature_names, rfc.feature_importances_))

[('satisfaction_level', 0.36039092631778447),
 ('last_evaluation', 0.11436903841532627),
 ('number_project', 0.18166449127397535),
 ('average_monthly_hours', 0.1422717007262686),
 ('time_spend_company', 0.20130384326664547)]

### AdaBoost

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

In [286]:
clf = AdaBoostClassifier(n_estimators=50, learning_rate=1)
scores = cross_val_score(clf, x_tree_train, y_tree_train)
scores.mean()

0.95559159757512369

### Entscheidungsbaum

In [154]:
df_tree = df_num.copy()

In [155]:
df_tree.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left
0,0.65,0.96,5,226,2,0
1,0.88,0.8,3,166,2,0
2,0.69,0.98,3,214,2,0
3,0.41,0.47,2,154,3,1
4,0.87,0.76,5,254,2,0


In [156]:
feature_names_tree = df_tree.drop('left', axis=1).columns
feature_names_tree

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company'],
      dtype='object')

In [157]:
X_tree = df_tree[feature_names_tree].values
X_tree

array([[  6.50000000e-01,   9.60000000e-01,   5.00000000e+00,
          2.26000000e+02,   2.00000000e+00],
       [  8.80000000e-01,   8.00000000e-01,   3.00000000e+00,
          1.66000000e+02,   2.00000000e+00],
       [  6.90000000e-01,   9.80000000e-01,   3.00000000e+00,
          2.14000000e+02,   2.00000000e+00],
       ..., 
       [  8.30000000e-01,   8.60000000e-01,   4.00000000e+00,
          9.80000000e+01,   4.00000000e+00],
       [  7.40000000e-01,   5.60000000e-01,   4.00000000e+00,
          2.54000000e+02,   2.00000000e+00],
       [  1.10000000e-01,   8.80000000e-01,   7.00000000e+00,
          2.72000000e+02,   4.00000000e+00]])

In [250]:
x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(X_tree, y, test_size=2, random_state=11)

In [251]:
tree2 = DecisionTreeClassifier()

In [252]:
cv_scores = cross_val_score(tree2, x_tree_train, y_tree_train, cv=50)

In [253]:
np.mean(cv_scores), np.std(cv_scores)

(0.97429711742793557, 0.011479658563109453)

In [265]:
from sklearn.ensemble import RandomForestClassifier

In [266]:
rfc = RandomForestClassifier()

In [267]:
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)

In [268]:
np.mean(cv_scores), np.std(cv_scores)

(0.98306568302178499, 0.0033421835022085715)

In [270]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [278]:
rfc.fit(x_tree_train, y_tree_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [279]:
list(zip(feature_names, rfc.feature_importances_))

[('satisfaction_level', 0.36403581136912672),
 ('last_evaluation', 0.12391241163976194),
 ('number_project', 0.18751189695511938),
 ('average_monthly_hours', 0.15034518748861689),
 ('time_spend_company', 0.1741946925473751)]

In [280]:
cv_scores = cross_val_score(rfc, x_tree_train, y_tree_train, cv=5)

In [281]:
np.mean(cv_scores), np.std(cv_scores)

(0.98479749749749756, 0.0014650977907844729)

In [276]:
rfc.fit(x_tree_train, y_tree_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [277]:
list(zip(feature_names_tree, rfc.feature_importances_))

[('satisfaction_level', 0.33496951849268092),
 ('last_evaluation', 0.13700583009328007),
 ('number_project', 0.16249796747374101),
 ('average_monthly_hours', 0.19460662007971344),
 ('time_spend_company', 0.1709200638605847)]

### Neural Network: Multil-Layer-Perceptron

In [287]:
steps = [('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter=1000, random_state=42))]

In [288]:
pipeline = Pipeline(steps)

In [319]:
type(pipeline)

sklearn.pipeline.Pipeline

In [335]:
param_space = {
    'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'], 
    'mlp__hidden_layer_sizes': [(30, 30, 30), (12, 50), (10), (15)]
            }

In [336]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [337]:
gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rat...       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'], 'mlp__hidden_layer_sizes': [(30, 30, 30), (12, 50), 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [338]:
gs.grid_scores_



[mean: 0.75747, std: 0.01079, params: {'mlp__activation': 'identity', 'mlp__hidden_layer_sizes': (30, 30, 30)},
 mean: 0.76147, std: 0.00823, params: {'mlp__activation': 'identity', 'mlp__hidden_layer_sizes': (12, 50)},
 mean: 0.76120, std: 0.00763, params: {'mlp__activation': 'identity', 'mlp__hidden_layer_sizes': 10},
 mean: 0.76387, std: 0.00695, params: {'mlp__activation': 'identity', 'mlp__hidden_layer_sizes': 15},
 mean: 0.95960, std: 0.00346, params: {'mlp__activation': 'logistic', 'mlp__hidden_layer_sizes': (30, 30, 30)},
 mean: 0.95720, std: 0.00492, params: {'mlp__activation': 'logistic', 'mlp__hidden_layer_sizes': (12, 50)},
 mean: 0.95827, std: 0.00376, params: {'mlp__activation': 'logistic', 'mlp__hidden_layer_sizes': 10},
 mean: 0.95987, std: 0.00249, params: {'mlp__activation': 'logistic', 'mlp__hidden_layer_sizes': 15},
 mean: 0.96787, std: 0.00437, params: {'mlp__activation': 'tanh', 'mlp__hidden_layer_sizes': (30, 30, 30)},
 mean: 0.96187, std: 0.00431, params: {'mlp_

In [339]:
gs.best_score_, gs.best_params_

(0.96986666666666665,
 {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (30, 30, 30)})

### GradiantBoost

In [398]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

x, y = make_hastie_10_2(random_state=0)
x_train, x_test = x[:2000], x[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.9,
    max_depth=2, random_state=3).fit(x_train, y_train)
clf.score(x_test, y_test)

0.92769999999999997

### Voting Ensemble for Classification

In [409]:
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

In [410]:
array = df.values

In [411]:
array

array([[  6.50000000e-01,   9.60000000e-01,   5.00000000e+00,
          2.26000000e+02,   2.00000000e+00,   0.00000000e+00],
       [  8.80000000e-01,   8.00000000e-01,   3.00000000e+00,
          1.66000000e+02,   2.00000000e+00,   0.00000000e+00],
       [  6.90000000e-01,   9.80000000e-01,   3.00000000e+00,
          2.14000000e+02,   2.00000000e+00,   0.00000000e+00],
       ..., 
       [  8.30000000e-01,   8.60000000e-01,   4.00000000e+00,
          9.80000000e+01,   4.00000000e+00,   0.00000000e+00],
       [  7.40000000e-01,   5.60000000e-01,   4.00000000e+00,
          2.54000000e+02,   2.00000000e+00,   0.00000000e+00],
       [  1.10000000e-01,   8.80000000e-01,   7.00000000e+00,
          2.72000000e+02,   4.00000000e+00,   1.00000000e+00]])

In [412]:
kfold = model_selection.KFold(n_splits=10, random_state=7)

create the sub models

In [417]:
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = RandomForestClassifier()
estimators.append(('svm', model3))

create the ensemble model

In [422]:
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_tree_train, y_tree_train, cv=kfold)
results.mean()

0.76700000000000002