## Modellierung Expertensystem

In [4]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [5]:
# Load the cleaned training data produced by the EDA notebook.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('data/hr_train_clean.pkl')

In [6]:
# Keep the 'id' column as its own Series; it is dropped from df further below.
df_ids = df.loc[:, 'id']

#### Manuelle Datenauswahl

Die Spalten "promotion_last_5years" und "department" weisen nur einen geringen Zusammenhang mit "left" auf und werden daher entfernt. Zusätzlich wird die Spalte "id" entfernt.

In [7]:
# Remove 'promotion_last_5years' from the working frame.
# errors='ignore' makes the cell idempotent: because the result is assigned
# back to df, re-running the cell would otherwise raise once the column is gone.
df = df.drop(['promotion_last_5years'], axis=1, errors='ignore')

In [8]:
# Preview the first five rows of the current working frame.
df.head(n=5)

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,department,salary
0,0,0.65,0.96,5,226,2,1,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,sales,low
4,4,0.87,0.76,5,254,2,1,0,hr,low


In [9]:
# Remove 'department' from the working frame.
# errors='ignore' makes the cell idempotent: because the result is assigned
# back to df, re-running the cell would otherwise raise once the column is gone.
df = df.drop(['department'], axis=1, errors='ignore')

In [10]:
# Preview the first five rows of the current working frame.
df.head(n=5)

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0,0.65,0.96,5,226,2,1,0,medium
1,1,0.88,0.8,3,166,2,0,0,low
2,2,0.69,0.98,3,214,2,0,0,low
3,3,0.41,0.47,2,154,3,0,1,low
4,4,0.87,0.76,5,254,2,1,0,low


In [11]:
# Remove the 'id' column from the working frame (the ids were stored in df_ids
# earlier in the notebook).
# errors='ignore' makes the cell idempotent: because the result is assigned
# back to df, re-running the cell would otherwise raise once the column is gone.
df = df.drop(['id'], axis=1, errors='ignore')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,salary
0,0.65,0.96,5,226,2,1,0,medium
1,0.88,0.8,3,166,2,0,0,low
2,0.69,0.98,3,214,2,0,0,low
3,0.41,0.47,2,154,3,0,1,low
4,0.87,0.76,5,254,2,1,0,low


#### Datenaufbereitung

In [12]:
# One-hot encode the non-numeric feature columns and re-attach the target
# 'left' as the last column.
# Newer pandas releases emit boolean dummy columns. Boolean columns are cast to
# int here so the encoding stays 0/1 and `.values` on the mixed frame yields a
# numeric array instead of falling back to dtype=object. Columns of any other
# dtype (including the uint8 dummies of older pandas) pass through unchanged.
df_num = (
    pd.get_dummies(df.drop('left', axis=1))
    .apply(lambda col: col.astype(int) if col.dtype == bool else col)
    .join(df[['left']])
)

In [13]:
# Preview the first five rows of the encoded frame.
df_num.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,salary_high,salary_low,salary_medium,left
0,0.65,0.96,5,226,2,1,0,0,1,0
1,0.88,0.8,3,166,2,0,0,1,0,0
2,0.69,0.98,3,214,2,0,0,1,0,0
3,0.41,0.47,2,154,3,0,0,1,0,1
4,0.87,0.76,5,254,2,1,0,1,0,0


#### Verwendung von Scikit Learn (sklearn) 

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#### Daten aufbereiten
Merkmalsmatrix X und Zielgrößenvektor y aus dem DataFrame extrahieren sowie Klassenlabels und Merkmalsnamen für Beschriftungen bestimmen.

In [15]:
# Target vector: the 'left' column of the encoded frame as a NumPy array.
y = df_num.loc[:, 'left'].values
y

array([0, 0, 0, ..., 0, 0, 1])

In [16]:
# Sorted distinct values occurring in y; these serve as the class labels.
class_names = np.unique(y)
class_names

array([0, 1])

In [19]:
# Names of the model input columns of df_num, i.e. every column except the
# target 'left': first the numeric features, then the three salary dummies.
feature_names = np.array(
    [
        'satisfaction_level',
        'last_evaluation',
        'number_project',
        'average_monthly_hours',
        'time_spend_company',
        'work_accident',
    ]
    + ['salary_' + level for level in ('high', 'low', 'medium')]
)
feature_names

array(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'work_accident',
       'salary_high', 'salary_low', 'salary_medium'],
      dtype='<U21')

In [18]:
# Feature matrix: the df_num columns listed in feature_names as a 2-D NumPy array.
# feature_names is maintained by hand, so guard against target leakage: the
# target column 'left' must never be selected into X.
assert 'left' not in list(feature_names), "target column 'left' must not be part of feature_names"
X = df_num[feature_names].values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.88,  0.8 ,  3.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  1.  ,  0.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  1.  ,  1.  ]])