<img src='logo/dsl-logo.png' width="500" align="center" />

# HR Competition

## Preperation for sklearn Models

### Initializations

In [47]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [48]:
# Definition einer Klasse für Text Styles
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### Import Dataset with All Features

In [49]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11999 entries, 0 to 11998
Data columns (total 10 columns):
satisfactionLevel       11999 non-null float64
yearsSinceEvaluation    11999 non-null float64
numberOfProjects        11999 non-null int64
averageMonthlyHours     11999 non-null int64
yearsAtCompany          11999 non-null int64
workAccident            11999 non-null category
hasLeftCompany          11999 non-null category
gotPromotion            11999 non-null category
department              11999 non-null category
salary                  11999 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 527.5 KB


In [50]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [51]:
df = pd.get_dummies(df.drop('hasLeftCompany', axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,department_IT,...,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.55,0.86,4,169,6,1,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.66,0.48,4,229,4,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0.56,0.67,5,165,3,0,1,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,0.59,1.0,2,155,5,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0.87,0.49,4,149,2,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


### Split Train and Test Set with All Features

In [52]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '1', '1', '0'], dtype=object)

In [53]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.55,  0.86,  4.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.66,  0.48,  4.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.56,  0.67,  5.  , ...,  0.  ,  0.  ,  1.  ],
       ..., 
       [ 0.77,  0.99,  5.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.43,  0.55,  2.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.9 ,  0.57,  4.  , ...,  0.  ,  0.  ,  1.  ]])

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
X.shape

(11999, 22)

In [56]:
X_train.shape

(9599, 22)

In [57]:
X_test.shape

(2400, 22)

### Scale X Values with All Features

In [58]:
scaler = MinMaxScaler()

In [59]:
X_train_scaled = scaler.fit_transform(X_train)

In [60]:
X_test_scaled = scaler.transform(X_test)

### Export Data Frames for Next Steps with All Features

In [61]:
np.save(file='exchange/hr_06_X.npy', arr=X)
np.save(file='exchange/hr_06_X_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y.npy', arr=y)
np.save(file='exchange/hr_06_y_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_test.npy', arr=y_test)

### Import Dataset w/o Department

In [62]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11999 entries, 0 to 11998
Data columns (total 10 columns):
satisfactionLevel       11999 non-null float64
yearsSinceEvaluation    11999 non-null float64
numberOfProjects        11999 non-null int64
averageMonthlyHours     11999 non-null int64
yearsAtCompany          11999 non-null int64
workAccident            11999 non-null category
hasLeftCompany          11999 non-null category
gotPromotion            11999 non-null category
department              11999 non-null category
salary                  11999 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 527.5 KB


In [63]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [64]:
df = pd.get_dummies(df.drop(['hasLeftCompany','department'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.55,0.86,4,169,6,1,0,1,0,0,0,1,0
1,0.66,0.48,4,229,4,1,0,1,0,0,0,1,0
2,0.56,0.67,5,165,3,0,1,1,0,0,0,1,0
3,0.59,1.0,2,155,5,1,0,1,0,0,1,0,1
4,0.87,0.49,4,149,2,1,0,1,0,0,1,0,0


### Split Train and Test Set w/o Department

In [65]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '1', '1', '0'], dtype=object)

In [66]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.55,  0.86,  4.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.66,  0.48,  4.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.56,  0.67,  5.  , ...,  0.  ,  0.  ,  1.  ],
       ..., 
       [ 0.77,  0.99,  5.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.43,  0.55,  2.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.9 ,  0.57,  4.  , ...,  0.  ,  0.  ,  1.  ]])

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
X.shape

(11999, 12)

In [69]:
X_train.shape

(9599, 12)

In [70]:
X_test.shape

(2400, 12)

### Scale X Values w/o Department

In [71]:
scaler = MinMaxScaler()

In [72]:
X_train_scaled = scaler.fit_transform(X_train)

In [73]:
X_test_scaled = scaler.transform(X_test)

### Export Data Frames for Next Steps w/o Department

In [74]:
np.save(file='exchange/hr_06_X_wodept.npy', arr=X)
np.save(file='exchange/hr_06_X_wodept_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_wodept_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_wodept_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_wodept_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y_wodept.npy', arr=y)
np.save(file='exchange/hr_06_y_wodept_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_wodept_test.npy', arr=y_test)