<img src='logo/dsl-logo.png' width="500" align="center" />

# HR Competition

## Gradient Boost for Kaggle

### Initializations

In [27]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

### Load Data

In [28]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 439.7 KB


In [29]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [30]:
df = pd.get_dummies(df.drop(['workAccident','gotPromotion', 'salary', 'hasLeftCompany'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,hasLeftCompany
0,0.65,0.96,5,226,2,0,0,0,0,0,1,0,0,0,0,0
1,0.88,0.8,3,166,2,1,0,0,0,0,0,0,0,0,0,0
2,0.69,0.98,3,214,2,0,0,0,0,0,0,0,1,0,0,0
3,0.41,0.47,2,154,3,0,0,0,0,0,0,0,1,0,0,1
4,0.87,0.76,5,254,2,0,0,0,1,0,0,0,0,0,0,0


In [31]:
y_train = df['hasLeftCompany'].values
y_train

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [32]:
X_train = df.drop(['hasLeftCompany'], axis=1).values
X_train

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  1.  ,  0.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  1.  ,  0.  ,  0.  ]])

In [33]:
scaler = MinMaxScaler()

In [34]:
X_train_scaled = scaler.fit_transform(X_train)

In [35]:
df = pd.read_pickle('exchange/hr_01_cleaned_test.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 10 columns):
id                      4999 non-null int64
satisfactionLevel       4999 non-null float64
yearsSinceEvaluation    4999 non-null float64
numberOfProjects        4999 non-null int64
averageMonthlyHours     4999 non-null int64
yearsAtCompany          4999 non-null int64
workAccident            4999 non-null category
gotPromotion            4999 non-null category
department              4999 non-null category
salary                  4999 non-null category
dtypes: category(4), float64(2), int64(4)
memory usage: 254.1 KB


In [36]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming gotPromotion
transforming department
transforming salary


In [37]:
df = pd.get_dummies(df.drop(['workAccident','gotPromotion', 'salary', 'id'], axis=1)).join(df[['id']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,id
0,0.81,0.96,4,219,2,0,0,0,0,0,0,0,0,0,1,10000
1,0.86,0.84,4,246,6,1,0,0,0,0,0,0,0,0,0,10001
2,0.9,0.66,4,242,3,0,0,0,0,0,0,0,0,1,0,10002
3,0.37,0.54,2,131,3,0,0,0,1,0,0,0,0,0,0,10003
4,0.52,0.96,3,271,3,0,0,0,0,0,0,0,0,0,1,10004


In [38]:
ids = df['id']
ids.head()

0    10000
1    10001
2    10002
3    10003
4    10004
Name: id, dtype: int64

In [39]:
X_test = df.drop(['id'], axis=1).values
X_test

array([[ 0.81,  0.96,  4.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.86,  0.84,  4.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.9 ,  0.66,  4.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.66,  0.73,  5.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.79,  1.  ,  4.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.98,  0.86,  2.  , ...,  1.  ,  0.  ,  0.  ]])

In [40]:
X_test_scaled = scaler.transform(X_test)

### Predict Kaggle Data

Beim zweiten Anlauf wurde der Gradient Boost Classifier mit den zuvor bestimmten Werten verwendet. Damit ließ sich das bereits sehr gute Ergebnis auf Kaggle nochmals steigern.

In [41]:
clf = GradientBoostingClassifier(max_depth=9, n_estimators=1500, learning_rate=0.05)

In [42]:
scores = cross_val_score(clf, X_train_scaled, y_train, cv=10, n_jobs=-1)
scores.mean()

0.98799978799978783

In [43]:
clf.fit(X_train_scaled, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=9,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [44]:
predictions = clf.predict(X_test_scaled)
list(predictions);

In [45]:
list(ids);

In [46]:
df = pd.DataFrame(
    {'id': ids,
     'left': predictions
    })
df.head()

Unnamed: 0,id,left
0,10000,0
1,10001,1
2,10002,0
3,10003,1
4,10004,0


In [32]:
df.to_csv('kaggle/gradient_boost.csv', index=False)

**Ergebnis in Kaggle:** 99.132%