## Modellierung Expertensystem

In [4]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [5]:
df = pd.read_pickle('data/hr_train_clean.pkl')

In [6]:
df_ids = df['id']

#### Manuelle Datenauswahl

Die Spalten "promotion_last_5years" und "department" weisen eine geringe Abhängigkeit zu "left" auf und werden daher entfernt. Zudem die Spalte "id" entfernen.

In [7]:
df = df.drop(['promotion_last_5years'], axis=1)

In [8]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,department,salary
0,0,0.65,0.96,5,226,2,1,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,sales,low
4,4,0.87,0.76,5,254,2,1,0,hr,low


In [9]:
df = df.drop(['department'], axis=1)

In [10]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0,0.65,0.96,5,226,2,1,0,medium
1,1,0.88,0.8,3,166,2,0,0,low
2,2,0.69,0.98,3,214,2,0,0,low
3,3,0.41,0.47,2,154,3,0,1,low
4,4,0.87,0.76,5,254,2,1,0,low


In [11]:
df = df.drop(['id'],axis=1)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0.65,0.96,5,226,2,1,0,medium
1,0.88,0.8,3,166,2,0,0,low
2,0.69,0.98,3,214,2,0,0,low
3,0.41,0.47,2,154,3,0,1,low
4,0.87,0.76,5,254,2,1,0,low


#### Datenaufbereitung

In [12]:
df_num = pd.get_dummies(df.drop('left', axis=1)).join(df[['left']])

In [13]:
df_num.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,salary_high,salary_low,salary_medium,left
0,0.65,0.96,5,226,2,1,0,0,1,0
1,0.88,0.8,3,166,2,0,0,1,0,0
2,0.69,0.98,3,214,2,0,0,1,0,0
3,0.41,0.47,2,154,3,0,0,1,0,1
4,0.87,0.76,5,254,2,1,0,1,0,0


#### Verwendung von Scikit Learn (sklearn) 

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#### Daten aufbereiten
Merkmalsmatrix X und Zielgrößenvektor y aus dem DataFrame extrahieren, Klassenlabel und Merkmalsnamen für Beschritungen extrahieren.

In [15]:
y = df_num['left'].values
y

array([0, 0, 0, ..., 0, 0, 1])

In [16]:
class_names = np.unique(y)
class_names

array([0, 1])

In [17]:
feature_names = np.array(['satisfaction_level'
   , 'last_evaluation'
   , 'number_project'
   , 'average_monthly_hours'
   , 'time_spend_company'
   , 'work_accident'
   , 'salary_high'
   , 'salary_low'
   , 'salary_medium'])
feature_names

array(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'work_accident',
       'salary_high', 'salary_low', 'salary_medium'],
      dtype='<U21')

In [18]:
X = df_num[feature_names].values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  0.  ,  1.  ]])

Feste Aufteilung der verfügbaren Daten in Trainingsdaten für die Modellerstellung und Testdaten für die abschließende Evaluierung eines Modells.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Verwendung des k-Nearest-Neighbor-Klassifikators als sehr einfacher Ansatz (lazy learning)

In [20]:
knn = KNeighborsClassifier(n_neighbors=1)

In [21]:
cv_scores = cross_val_score(knn, X_train, y_train, cv=5)

In [22]:
np.mean(cv_scores), np.std(cv_scores)

(0.94514223381892393, 0.0070526472520814325)

#### Schlechte Klassifikationsleistung
Da alle Merkmale für die Abstandssberechnung verwendet werden, sollte die Merkmale vorher Skaliert werden, z.B. mit dem MinMaxScaler. Wir bauen eine Pipeline auf, die die Merkmale vor der Verwendung des Klassifikators skaliert.

In [23]:
steps = [('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=1))]

In [24]:
pipeline = Pipeline(steps)

Die Pipeline lässt sich wie ein Klassifikator verwenden:

In [25]:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

In [26]:
np.mean(cv_scores), np.std(cv_scores)

(0.96085611384787151, 0.0057826886478329256)

Die Klassifikationsleitung hat sich durch die Skalierung deutlich verbessert!

#### Automatisierte Betrachtung verschiedener Parameter mit Hilfe von GridSearchCV

In [27]:
steps = [('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())]

In [28]:
pipeline = Pipeline(steps)

In [29]:
param_space = {'knn__n_neighbors': np.arange(1,50)}

In [30]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [31]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
gs.best_params_, gs.best_score_

({'knn__n_neighbors': 2}, 0.96142857142857141)

#### Decision Tree Classifier

In [33]:
tree = DecisionTreeClassifier()

In [34]:
cv_scores = cross_val_score(tree, X_train, y_train, cv=5)

In [35]:
np.mean(cv_scores), np.std(cv_scores)

(0.96757019664660182, 0.0048463495558977243)

#### Visualisierung des Entscheidungsbaumes

** Achtung: ** Erfordert Installation von graphviz und pydotplus!

In [36]:
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

In [37]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [38]:
def create_dt_image(tree, class_names, feature_names):
    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data, class_names = class_names, feature_names = feature_names, filled=True, rounded=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    image = graph.create_png()
    return image

In [39]:
image = create_dt_image(tree, class_names, feature_names)
Image(image)

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('<U104') dtype('<U104') dtype('<U104')

Erkenntnisse der Baumvisualisierung:

    ...


In [40]:
df_tree = df_num.copy()

In [41]:
X_tree = df_tree[feature_names_tree].values
X_tree

NameError: name 'feature_names_tree' is not defined

In [None]:
X_tree_train, X_tree_test, y_tree_train, y_tree_test = train_test_split(X_tree, y, test_size=0.3, random_state=42)

#### Logistische Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
logreg = LogisticRegression()

In [None]:
steps = [('scaler', StandardScaler()), ('logreg', LogisticRegression())]

In [None]:
pipeline = Pipeline(steps)

In [None]:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

In [None]:
np.mean(cv_scores), np.std(cv_scores)

#### Support Vector Machine (SVM) als Klassifikator (SVC)

In [None]:
steps = [('scaler', StandardScaler()), ('svc', SVC())]

In [None]:
pipeline = Pipeline(steps)

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
pipeline.score(X_test, y_test)

In [None]:
param_space = {'svc__kernel' : ['linear', 'poly', 'rbf']}

In [None]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.grid_scores_

In [None]:
param_space = {'svc__kernel': ['linear'], 'svc__C': [0.1, 1, 2, 3, 5, 10, 20, 50, 100, 150, 200, 250, 500, 1000]}

In [None]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.grid_scores_

In [None]:
gs.best_score_, gs.best_params_

#### Neural Network: Multil-Layer-Perceptron

In [None]:
steps = [('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter=1000, random_state=42))]

In [None]:
pipeline = Pipeline(steps)

In [None]:
param_space = {
    'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'], 
    'mlp__hidden_layer_sizes': [(11), (12), (13), (14), (15), (20), (14,14), (20, 20)]
            }

In [None]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.grid_scores_

In [None]:
gs.best_score_, gs.best_params_

In [None]:
param_space = {
    'mlp__activation': ['tanh'], 
    'mlp__hidden_layer_sizes': [(20,20), (30,30), (20,20,20), (20, 40, 20)]
            }

In [None]:
gs = GridSearchCV(pipeline, param_space, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.grid_scores_

In [None]:
gs.best_score_, gs.best_params_

#### Random-Forest-Klassifikator

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier()

In [None]:
cv_scores = cross_val_score(rfc, X_train, y_train, cv=3)

In [None]:
np.mean(cv_scores), np.std(cv_scores)

In [None]:
rfc.fit(X_train, y_train)

In [None]:
list(zip(feature_names, rfc.feature_importances_))

Da Bäume Probleme mit Entscheidungsgrenzen haben, die nicht parallel zu den Achsen verlaufen

In [None]:
cv_scores = cross_val_score(rfc, X_tree_train, y_tree_train, cv=5)

In [None]:
np.mean(cv_scores), np.std(cv_scores)

In [None]:
rfc.fit(X_tree_train, y_tree_train)

In [None]:
list(zip(feature_names_tree, rfc.feature_importances_))