<img src='logo/dsl-logo.png' width="500" align="center" />

# HR Competition

## Preperation for sklearn Models

### Initializations

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Definition einer Klasse für Text Styles
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### Import Dataset with All Features

In [3]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 439.7 KB


In [4]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [5]:
df = pd.get_dummies(df.drop('hasLeftCompany', axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,department_IT,...,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0.88,0.8,3,166,2,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


### Split Train and Test Set with All Features

In [6]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [7]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  0.  ,  1.  ]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X.shape

(10000, 22)

In [10]:
X_train.shape

(8000, 22)

In [11]:
X_test.shape

(2000, 22)

### Scale X Values with All Features

In [12]:
scaler = MinMaxScaler()

In [13]:
X_train_scaled = scaler.fit_transform(X_train)

In [14]:
X_test_scaled = scaler.transform(X_test)

### Export Data Frames for Next Steps with All Features

In [15]:
np.save(file='exchange/hr_06_X.npy', arr=X)
np.save(file='exchange/hr_06_X_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y.npy', arr=y)
np.save(file='exchange/hr_06_y_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_test.npy', arr=y_test)

### Import Dataset w/o Department

In [16]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 439.7 KB


In [17]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [18]:
df = pd.get_dummies(df.drop(['hasLeftCompany','department'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0,0,1,0
1,0.88,0.8,3,166,2,1,0,1,0,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0,1,0,0


### Split Train and Test Set w/o Department

In [19]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [20]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  0.  ,  1.  ]])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X.shape

(10000, 12)

In [23]:
X_train.shape

(8000, 12)

In [24]:
X_test.shape

(2000, 12)

### Scale X Values w/o Department

In [25]:
scaler = MinMaxScaler()

In [26]:
X_train_scaled = scaler.fit_transform(X_train)

In [27]:
X_test_scaled = scaler.transform(X_test)

### Export Data Frames for Next Steps w/o Department

In [28]:
np.save(file='exchange/hr_06_X_wodept.npy', arr=X)
np.save(file='exchange/hr_06_X_wodept_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_wodept_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_wodept_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_wodept_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y_wodept.npy', arr=y)
np.save(file='exchange/hr_06_y_wodept_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_wodept_test.npy', arr=y_test)

### Import Dataset w/o Salary and Department

In [29]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 439.7 KB


In [30]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [31]:
df = pd.get_dummies(df.drop(['hasLeftCompany','department', 'salary'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0
1,0.88,0.8,3,166,2,1,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0


### Split Train and Test Set w/o Department and Salary

In [32]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [33]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  1.  ,  1.  ,  0.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  1.  ,  0.  ]])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
X.shape

(10000, 9)

In [36]:
X_train.shape

(8000, 9)

In [37]:
X_test.shape

(2000, 9)

### Scale X Values w/o Department and Salary

In [38]:
scaler = MinMaxScaler()

In [39]:
X_train_scaled = scaler.fit_transform(X_train)

In [40]:
X_test_scaled = scaler.transform(X_test)

### Export Data Frames for Next Steps w/o Department and Salary

In [41]:
np.save(file='exchange/hr_06_X_wodeptsal.npy', arr=X)
np.save(file='exchange/hr_06_X_wodeptsal_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_wodeptsal_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_wodeptsal_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_wodeptsal_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y_wodeptsal.npy', arr=y)
np.save(file='exchange/hr_06_y_wodeptsal_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_wodeptsal_test.npy', arr=y_test)