## Modellierung Expertensystem (weitere Varianten)

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Bereinigte Daten aus dem Notebook EDA einlesen

In [2]:
df = pd.read_pickle('data/hr_train_clean.pkl')

#### Manuelle Datenauswahl (Feature Selection)

In [3]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0,0.65,0.96,5,226,2,1,0,0,marketing,medium
1,1,0.88,0.8,3,166,2,0,0,0,IT,low
2,2,0.69,0.98,3,214,2,0,0,0,sales,low
3,3,0.41,0.47,2,154,3,0,1,0,sales,low
4,4,0.87,0.76,5,254,2,1,0,0,hr,low


In [4]:
def varianten_salary(value):
    if value == "low" :
        return 1
    elif value == "medium":
        return 2
    else:
        return 3

df['salary_int'] = df.apply(lambda row: varianten_salary(row['salary']), axis=1)

In [5]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary,salary_int
0,0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2
1,1,0.88,0.8,3,166,2,0,0,0,IT,low,1
2,2,0.69,0.98,3,214,2,0,0,0,sales,low,1
3,3,0.41,0.47,2,154,3,0,1,0,sales,low,1
4,4,0.87,0.76,5,254,2,1,0,0,hr,low,1


In [6]:
df = df.drop(['id','department', 'work_accident', 'promotion_last_5years', 'salary'], axis=1)

In [7]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,left,salary_int
0,0.65,0.96,5,226,2,0,2
1,0.88,0.8,3,166,2,0,1
2,0.69,0.98,3,214,2,0,1
3,0.41,0.47,2,154,3,1,1
4,0.87,0.76,5,254,2,0,1


#### Datenaufbereitung - Feature Engineering

In [8]:
df_num = pd.get_dummies(df.drop('left', axis=1)).join(df[['left']])

#### Verwendung von Scikit Learn (sklearn)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

In [10]:
# Zielgrößenvektor y setzen:
y = df_num['left'].values

# Klassenlabel setzen:
class_names = np.unique(y)

# Merkmalsnamen setzen:
feature_names = np.array(['satisfaction_level'
    , 'last_evaluation'
    , 'number_project'
    , 'average_monthly_hours'
    , 'time_spend_company'
    #, 'salary_int'
                         ])
# Merkmalsmatrix erstellen:
x = df_num[feature_names].values

Trainingsdaten in internes Test- und Trainingsdaten-Set aufsplitten:

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#### Decision Tree

In [12]:
tree = DecisionTreeClassifier()
cv_scores = cross_val_score(tree, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.97013252248852844, 0.0028817438434428689)

#### Random Forest

In [13]:
rfc = RandomForestClassifier()
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.98226621588128116, 0.0030624718766567517)

In [14]:
rfc2 = RandomForestClassifier(n_estimators= 100)
cv_scores = cross_val_score(rfc2, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.98399981665177039, 0.0038675687518404775)

In [15]:
rfc3 = RandomForestClassifier(n_estimators= 250,min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0)
cv_scores = cross_val_score(rfc3, x_train, y_train, cv=10)
np.mean(cv_scores), np.std(cv_scores)

(0.98613172266380322, 0.0037885371994826299)

#### Daten skalieren

In [16]:
scaler = MinMaxScaler()
x_scaled_train = scaler.fit_transform(x_train)
knn = KNeighborsClassifier(n_neighbors=3)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [None]:
knn.fit(x_scaled_train, y_train)

In [17]:
knn.score(x_scaled_train, y_train)

0.98119999999999996

In [18]:
x_scaled_test = scaler.transform(x_test)

In [19]:
knn.score(x_scaled_test, y_test)

0.96799999999999997

#### Automatisierte Betrachtung verschiedener Parameter mit Hilfe von GridSearchCV

In [20]:
steps = [('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
param_space = {'knn__n_neighbors': np.arange(1,50)}
gs = GridSearchCV(pipeline, param_space, cv=5)

In [22]:
gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
gs.best_params_, gs.best_score_

({'knn__n_neighbors': 1}, 0.96986666666666665)

#### GradiantBoost

In [16]:
gbc = GradientBoostingClassifier()
cv_scores = cross_val_score(gbc, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.97240016805933394, 0.0032828012491911956)

In [17]:
gbc2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
    max_depth=10, random_state=4)
cv_scores = cross_val_score(gbc2, x_train, y_train, cv=5)
np.mean(cv_scores), np.std(cv_scores)

(0.97893377025204598, 0.0040795430662506731)

#### AdaBoost

In [18]:
abc = AdaBoostClassifier(base_estimator=rfc2)
scores = cross_val_score(abc, x_train, y_train)
scores.mean()

0.9817333846826749

#### Voting Ensemble for Classification

In [19]:
from sklearn import model_selection

In [20]:
kfold = model_selection.KFold(n_splits=10, random_state=7)
estimators = []
estimators.append(('tree', tree))
estimators.append(('rfc3', rfc3))
estimators.append(('gbc2', gbc))
estimators.append(('abc', abc))
ensemble = VotingClassifier(estimators)
ensemble.fit(x_train, y_train)

VotingClassifier(estimators=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_le...0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [21]:
ensemble.score(x_test, y_test)

0.98680000000000001

In [22]:
x_complete_train, x_compelte_test, y_complete_train, y_complete_test = train_test_split(x, y, test_size=0, random_state=42)
ensemble.fit(x_complete_train, y_complete_train)

VotingClassifier(estimators=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_le...0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

#### Tatsächliche Testdaten einspielen

In [23]:
input = pd.read_csv('data/hr_test.csv')

In [24]:
#Spaltenköpfe anpassen
input.rename(columns={'average_montly_hours':'average_monthly_hours','Work_accident':'work_accident'},inplace=True)

In [25]:
#ID abspalten
input_ID = input['id']
input_ID_list = input_ID.tolist()
input = input.drop(['id'], axis=1)

In [26]:
# für Lösungsansatzüberflüssige Spalten entfernen
input = input.drop(['department'], axis=1)
input = input.drop(['work_accident'], axis=1)
input = input.drop(['promotion_last_5years'], axis=1)
input.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,salary
0,0.81,0.96,4,219,2,low
1,0.86,0.84,4,246,6,low
2,0.9,0.66,4,242,3,high
3,0.37,0.54,2,131,3,medium
4,0.52,0.96,3,271,3,medium


In [27]:
input['salary_int'] = input.apply(lambda row: varianten_salary(row['salary']), axis=1)

In [28]:
input = input.drop(['salary'], axis=1)
input.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,salary_int
0,0.81,0.96,4,219,2,1
1,0.86,0.84,4,246,6,1
2,0.9,0.66,4,242,3,3
3,0.37,0.54,2,131,3,2
4,0.52,0.96,3,271,3,2


In [29]:
#Formel zu Berechnung des Salary_int
#prediction = ensemble.predict(input)
#result = df = pd.DataFrame({'id':input_ID, 'left':prediction})
#result.to_csv('data/result_v05.csv', index=False)