## Modellierung Expertensystem (weitere Varianten)

In [80]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [81]:
df = pd.read_pickle('data/hr_train_clean.pkl')

#### Manuelle Datenauswahl (Feature Selection)

In [82]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0,0.65,0.96,5,226,2,1,0,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,0,sales,low
4,4,0.87,0.76,5,254,2,1,0,0,hr,low


In [83]:
def varianten_salary(value):
    if value == "low" :
        return 1
    elif value == "medium":
        return 2
    else:
        return 3

df['salary_int'] = df.apply(lambda row: varianten_salary(row['salary']), axis=1)

In [84]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary,salary_int
0,0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2
1,1,0.88,0.8,3,166,2,0,0,0,IT,low,1
2,2,0.69,0.98,3,214,2,0,0,0,sales,low,1
3,3,0.41,0.47,2,154,3,0,1,0,sales,low,1
4,4,0.87,0.76,5,254,2,1,0,0,hr,low,1


In [85]:
df = df.drop(['id','department', 'work_accident', 'promotion_last_5years', 'salary'], axis=1)

In [86]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left,salary_int
0,0.65,0.96,5,226,2,0,2
1,0.88,0.8,3,166,2,0,1
2,0.69,0.98,3,214,2,0,1
3,0.41,0.47,2,154,3,1,1
4,0.87,0.76,5,254,2,0,1


#### Datenaufbereitung - Feature Engineering

In [87]:
df_num = pd.get_dummies(df.drop('left', axis=1)).join(df[['left']])

#### Verwendung von Scikit Learn (sklearn)

In [88]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

In [99]:
# Zielgrößenvektor y setzen:
y = df_num['left'].values

# Klassenlabel setzen:
class_names = np.unique(y)

# Merkmalsnamen setzen:
feature_names = np.array(['satisfaction_level'
    , 'last_evaluation'
    , 'number_project'
    , 'average_monthly_hours'
    , 'time_spend_company'
    , 'salary_int'
                         ])
# Merkmalsmatrix erstellen:
x = df_num[feature_names].values

Trainingsdaten in internes Test- und Trainingsdaten-Set aufsplitten:

In [90]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#### Decision Tree

In [91]:
tree = DecisionTreeClassifier()
cv_scores = cross_val_score(tree, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.97106674542225713, 0.0025161717034084415)

#### Random Forest

In [92]:
rfc = RandomForestClassifier()
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.98213350482970585, 0.0034370468965752893)

In [93]:
rfc2 = RandomForestClassifier(n_estimators= 100)
cv_scores = cross_val_score(rfc2, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.98413350554081735, 0.0038291897275117177)

#### GradiantBoost

In [94]:
gbc = GradientBoostingClassifier()
cv_scores = cross_val_score(gbc, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.9724003458371907, 0.0030856590716605136)

In [95]:
gbc2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
    max_depth=10, random_state=4)
cv_scores = cross_val_score(gbc2, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.97893377025204598, 0.0040795430662506731)

#### AdaBoost

In [96]:
abc = AdaBoostClassifier(n_estimators=100, base_estimator=rfc2)
scores = cross_val_score(abc, x_train, y_train)
scores.mean()

0.98253327818665781

#### Voting Ensemble for Classification

In [97]:
from sklearn import model_selection

In [98]:
kfold = model_selection.KFold(n_splits=10, random_state=7)
estimators = []
estimators.append(('tree', tree))
#estimators.append(('rfc2', rfc2))
estimators.append(('gbc2', gbc))
estimators.append(('abc', abc))
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train, y_train, cv=kfold)
results.mean()

0.98506666666666653