## Modellierung Expertensystem

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [2]:
df = pd.read_pickle('data/hr_train_clean.pkl')

In [3]:
df_ids = df['id']

#### Manuelle Datenauswahl (Feature Selection)

Die Spalten "promotion_last_5years", "department", "salary" und "work_accident" weisen eine geringe Abhängigkeit zu "left" auf und werden daher entfernt. Zudem die Spalte "id" entfernen.

In [4]:
df = df.drop(['promotion_last_5years'], axis=1)

In [5]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,department,salary
0,0,0.65,0.96,5,226,2,1,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,sales,low
4,4,0.87,0.76,5,254,2,1,0,hr,low


In [6]:
df = df.drop(['department'], axis=1)

In [7]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0,0.65,0.96,5,226,2,1,0,medium
1,1,0.88,0.8,3,166,2,0,0,low
2,2,0.69,0.98,3,214,2,0,0,low
3,3,0.41,0.47,2,154,3,0,1,low
4,4,0.87,0.76,5,254,2,1,0,low


In [8]:
df = df.drop(['id'],axis=1)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0.65,0.96,5,226,2,1,0,medium
1,0.88,0.8,3,166,2,0,0,low
2,0.69,0.98,3,214,2,0,0,low
3,0.41,0.47,2,154,3,0,1,low
4,0.87,0.76,5,254,2,1,0,low


In [9]:
df =df.drop(['salary'],axis=1)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left
0,0.65,0.96,5,226,2,1,0
1,0.88,0.8,3,166,2,0,0
2,0.69,0.98,3,214,2,0,0
3,0.41,0.47,2,154,3,0,1
4,0.87,0.76,5,254,2,1,0


In [10]:
df =df.drop(['work_accident'],axis=1)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left
0,0.65,0.96,5,226,2,0
1,0.88,0.8,3,166,2,0
2,0.69,0.98,3,214,2,0
3,0.41,0.47,2,154,3,1
4,0.87,0.76,5,254,2,0


#### Datenaufbereitung - Feature Engineering

In [11]:
df_num = pd.get_dummies(df.drop('left', axis=1)).join(df[['left']])

In [12]:
df_num.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left
0,0.65,0.96,5,226,2,0
1,0.88,0.8,3,166,2,0
2,0.69,0.98,3,214,2,0
3,0.41,0.47,2,154,3,1
4,0.87,0.76,5,254,2,0


### Verwendung von Scikit Learn (sklearn) 


In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#### Daten aufbereiten

Zielgrößenvektor y setzen:

In [14]:
y = df_num['left'].values
y

array([0, 0, 0, ..., 0, 0, 1])

Klassenlabel setzen:

In [15]:
class_names = np.unique(y)
class_names

array([0, 1])

Merkmalsnamen setzen:

In [16]:
feature_names = np.array(['satisfaction_level'
    ,'last_evaluation'
    ,'number_project'
    ,'average_monthly_hours'
    , 'time_spend_company'
    ])

Merkmalsmatrix erstellen:

In [17]:
x = df_num[feature_names].values
x

array([[  6.50000000e-01,   9.60000000e-01,   5.00000000e+00,
          2.26000000e+02,   2.00000000e+00],
       [  8.80000000e-01,   8.00000000e-01,   3.00000000e+00,
          1.66000000e+02,   2.00000000e+00],
       [  6.90000000e-01,   9.80000000e-01,   3.00000000e+00,
          2.14000000e+02,   2.00000000e+00],
       ..., 
       [  8.30000000e-01,   8.60000000e-01,   4.00000000e+00,
          9.80000000e+01,   4.00000000e+00],
       [  7.40000000e-01,   5.60000000e-01,   4.00000000e+00,
          2.54000000e+02,   2.00000000e+00],
       [  1.10000000e-01,   8.80000000e-01,   7.00000000e+00,
          2.72000000e+02,   4.00000000e+00]])

Trainingsdaten in internes Test- und Trainingsdaten-Set aufsplitten:

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

### Random Forest Tree

In [19]:
df_tree= df_num.copy()
x_tree = df_tree[feature_names].values
x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(x, y, test_size=2, random_state=11)
tree2 = DecisionTreeClassifier()
cv_scores = cross_val_score(tree2, x_tree_train, y_tree_train, cv=50)
np.mean(cv_scores), np.std(cv_scores)

(0.97479813745343624, 0.012476986890032869)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rfc = RandomForestClassifier(n_estimators=100)

In [22]:
rfc.fit(x_tree_train, y_tree_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
cv_scores = cross_val_score(rfc, x_tree_train, y_tree_train, cv=5)

In [24]:
np.mean(cv_scores), np.std(cv_scores)

(0.98749809809809808, 0.0017574421917980671)

##### TESTING
Für Model-Staging: Parameter auslesen

In [25]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Für Model-Staging: Bedeutung der Faktoren für die Entscheidung aufzeigen

In [26]:
list(zip(feature_names, rfc.feature_importances_))

[('satisfaction_level', 0.36848925224023277),
 ('last_evaluation', 0.11894597685347152),
 ('number_project', 0.17580283484354775),
 ('average_monthly_hours', 0.1574381847626731),
 ('time_spend_company', 0.17932375130007522)]

## tatsächliche Testdaten einspielen

In [27]:
input = pd.read_csv('data/hr_test.csv')

In [28]:
#Spaltenköpfe anpassen
input.rename(columns={'average_montly_hours':'average_monthly_hours','Work_accident':'work_accident'},inplace=True)

In [29]:
#ID abspalten
input_ID = input['id']
input_ID_list = input_ID.tolist()
input = input.drop(['id'], axis=1)

In [30]:
# für Lösungsansatzüberflüssige Spalten entfernen
input = input.drop(['department'], axis=1)
input = input.drop(['salary'], axis=1)
input = input.drop(['work_accident'], axis=1)
input = input.drop(['promotion_last_5years'], axis=1)
input.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company
0,0.81,0.96,4,219,2
1,0.86,0.84,4,246,6
2,0.9,0.66,4,242,3
3,0.37,0.54,2,131,3
4,0.52,0.96,3,271,3


In [31]:
prediction = rfc.predict(input)

In [32]:
result = df = pd.DataFrame({'id':input_ID, 'left':prediction})

In [33]:
result.to_csv('data/result_v03.csv', index=False)