<img src='logo/dsl-logo.png' width="500" align="center" />

# HR Competition

## Preperation for sklearn Models

### Initializations

Es werden die benötigten Bibliotheken eingebunden, für Variationsmöglichkeiten bei der Ausgabe wird class color definiert und die benötigten Daten geladen.

In [1]:
# Bibliotheken einbinden
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Definition einer Klasse für Text Styles
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### Vorbereitung der Daten
In den folgenden Schritten werden die Daten für die sklearn Modelle aufbereitet und aufgeteilt.

#### Import Dataset with All Features

In [4]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 440.3 KB


In [5]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [6]:
df = pd.get_dummies(df.drop('hasLeftCompany', axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,department_IT,...,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0.88,0.8,3,166,2,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Split Train and Test Set with All Features

In [7]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [8]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  0.  ,  1.  ]])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X.shape

(10000, 22)

In [11]:
X_train.shape

(8000, 22)

In [12]:
X_test.shape

(2000, 22)

#### Scale X Values with All Features

In [13]:
scaler = MinMaxScaler()

In [14]:
X_train_scaled = scaler.fit_transform(X_train)

In [15]:
X_test_scaled = scaler.transform(X_test)

#### Export Data Frames for Next Steps with All Features

In [16]:
np.save(file='exchange/hr_06_X.npy', arr=X)
np.save(file='exchange/hr_06_X_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y.npy', arr=y)
np.save(file='exchange/hr_06_y_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_test.npy', arr=y_test)

#### Import Dataset w/o Department

In [17]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 440.3 KB


In [18]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [19]:
df = pd.get_dummies(df.drop(['hasLeftCompany','department'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,salary_high,salary_low,salary_medium,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0,0,1,0
1,0.88,0.8,3,166,2,1,0,1,0,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0,1,0,0


#### Split Train and Test Set w/o Department

In [20]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [21]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  0.  ,  1.  ]])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
X.shape

(10000, 12)

In [24]:
X_train.shape

(8000, 12)

In [25]:
X_test.shape

(2000, 12)

#### Scale X Values w/o Department

In [26]:
scaler = MinMaxScaler()

In [27]:
X_train_scaled = scaler.fit_transform(X_train)

In [28]:
X_test_scaled = scaler.transform(X_test)

#### Export Data Frames for Next Steps w/o Department

In [29]:
np.save(file='exchange/hr_06_X_wodept.npy', arr=X)
np.save(file='exchange/hr_06_X_wodept_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_wodept_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_wodept_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_wodept_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y_wodept.npy', arr=y)
np.save(file='exchange/hr_06_y_wodept_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_wodept_test.npy', arr=y_test)

#### Import Dataset w/o Salary and Department

In [30]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 440.3 KB


In [31]:
# Datentyp von Category in Object umwandeln
for col in df.select_dtypes(['category']):
    print('transforming', col)
    df[col] = df[col].astype('str')

transforming workAccident
transforming hasLeftCompany
transforming gotPromotion
transforming department
transforming salary


In [32]:
df = pd.get_dummies(df.drop(['hasLeftCompany','department', 'salary'], axis=1)).join(df[['hasLeftCompany']])
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident_0,workAccident_1,gotPromotion_0,gotPromotion_1,hasLeftCompany
0,0.65,0.96,5,226,2,0,1,1,0,0
1,0.88,0.8,3,166,2,1,0,1,0,0
2,0.69,0.98,3,214,2,1,0,1,0,0
3,0.41,0.47,2,154,3,1,0,1,0,1
4,0.87,0.76,5,254,2,0,1,1,0,0


#### Split Train and Test Set w/o Department and Salary

In [33]:
y = df['hasLeftCompany'].values
y

array(['0', '0', '0', ..., '0', '0', '1'], dtype=object)

In [34]:
X = df.drop(['hasLeftCompany'], axis=1).values
X

array([[ 0.65,  0.96,  5.  , ...,  1.  ,  1.  ,  0.  ],
       [ 0.88,  0.8 ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.69,  0.98,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ..., 
       [ 0.83,  0.86,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.74,  0.56,  4.  , ...,  0.  ,  1.  ,  0.  ],
       [ 0.11,  0.88,  7.  , ...,  0.  ,  1.  ,  0.  ]])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
X.shape

(10000, 9)

In [37]:
X_train.shape

(8000, 9)

In [38]:
X_test.shape

(2000, 9)

#### Scale X Values w/o Department and Salary

In [39]:
scaler = MinMaxScaler()

In [40]:
X_train_scaled = scaler.fit_transform(X_train)

In [41]:
X_test_scaled = scaler.transform(X_test)

#### Export Data Frames for Next Steps w/o Department and Salary

In [42]:
np.save(file='exchange/hr_06_X_wodeptsal.npy', arr=X)
np.save(file='exchange/hr_06_X_wodeptsal_train.npy', arr=X_train)
np.save(file='exchange/hr_06_X_wodeptsal_train_scaled.npy', arr=X_train_scaled)
np.save(file='exchange/hr_06_X_wodeptsal_test.npy', arr=X_test)
np.save(file='exchange/hr_06_X_wodeptsal_test_scaled.npy', arr=X_test_scaled)
np.save(file='exchange/hr_06_y_wodeptsal.npy', arr=y)
np.save(file='exchange/hr_06_y_wodeptsal_train.npy', arr=y_train)
np.save(file='exchange/hr_06_y_wodeptsal_test.npy', arr=y_test)

### Create Additional Attributes
- Für die Beantwortung der Aufgabenstellung müssen höherwertige Attribute erzeugt werden, um die High-Potential MA in den folgenden Analysen zu identifizieren. 
- Höherwertige Attribute setzen sich aus mehreren Attributen zusammen, um eine definierte Fragestellung zu beatworten.
- Neben der Aufgabenstellung warum die High-Potential MA das Unternehmen verlassen, muss aus den Daten die Frage beantwortet werden ob überhaupt die High-Potential MA das Unternehmen verlassen. 


#### Import Dataset und entfernen der nicht relevanten Attribute

In [43]:
df = pd.read_pickle('exchange/hr_01_cleaned_train.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
satisfactionLevel       10000 non-null float64
yearsSinceEvaluation    10000 non-null float64
numberOfProjects        10000 non-null int64
averageMonthlyHours     10000 non-null int64
yearsAtCompany          10000 non-null int64
workAccident            10000 non-null category
hasLeftCompany          10000 non-null category
gotPromotion            10000 non-null category
department              10000 non-null category
salary                  10000 non-null category
dtypes: category(5), float64(2), int64(3)
memory usage: 440.3 KB


In [44]:
df_test = pd.read_pickle('exchange/hr_01_cleaned_test.pkl')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 10 columns):
id                      4999 non-null int64
satisfactionLevel       4999 non-null float64
yearsSinceEvaluation    4999 non-null float64
numberOfProjects        4999 non-null int64
averageMonthlyHours     4999 non-null int64
yearsAtCompany          4999 non-null int64
workAccident            4999 non-null category
gotPromotion            4999 non-null category
department              4999 non-null category
salary                  4999 non-null category
dtypes: category(4), float64(2), int64(4)
memory usage: 254.6 KB


#### 1. Zusätzliches Attribut: Number of Projects per Year

In [45]:
# Number of Projects per Year
df['projectsPerYear'] = df['numberOfProjects'] / df['yearsAtCompany']
df_test['projectsPerYear'] = df_test['numberOfProjects'] / df_test['yearsAtCompany']
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5


#### 2. Zusätzliches Attribut: Workinghours per Project

In [46]:
# Workinghours per Project
df['hoursPerProject'] = ( df['yearsAtCompany'] * df['averageMonthlyHours'] * 12 ) / df['numberOfProjects']
df_test['hoursPerProject'] = ( df_test['yearsAtCompany'] * df_test['averageMonthlyHours'] * 12 ) / df_test['numberOfProjects']
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2


#### 3. Zusätzliches Attribut: Stundenzufriedenheit

In [47]:
# Zusätzliches Attribut: Stundenzufriedenheit
df['satisfactionHours'] = df['satisfactionLevel'] * df['averageMonthlyHours']
df_test['satisfactionHours'] = df_test['satisfactionLevel'] * df_test['averageMonthlyHours']
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject,satisfactionHours
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8,146.9
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0,146.08
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0,147.66
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0,63.14
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2,220.98


#### 4. Zusätzliches Attribut: Workinghours seit letzter Evaluation

In [48]:
# Workinghours seit letzter Evaluation
df['workingHoursSinceLastEvaluation'] = df['yearsSinceEvaluation'] * df['averageMonthlyHours'] * 12
df_test['workingHoursSinceLastEvaluation'] = df_test['yearsSinceEvaluation'] * df_test['averageMonthlyHours'] * 12
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8,146.9,2603.52
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0,146.08,1593.6
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0,147.66,2516.64
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0,63.14,868.56
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2,220.98,2316.48


#### 5. Zusätzliches Attribut: Evaluationszufriedenheit

In [49]:
# Evaluationszufriedenheit
df['evaluationSatisfaction'] = df['satisfactionLevel'] * df['yearsSinceEvaluation']
df_test['evaluationSatisfaction'] = df_test['satisfactionLevel'] * df_test['yearsSinceEvaluation']
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation,evaluationSatisfaction
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8,146.9,2603.52,0.624
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0,146.08,1593.6,0.704
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0,147.66,2516.64,0.6762
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0,63.14,868.56,0.1927
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2,220.98,2316.48,0.6612


#### 6. Zusätzliches Attribut: Projects per Workinghour 

In [50]:
# Projects per Workinghour
df['projectsPerWorkingHour'] = df['numberOfProjects'] / ( df['yearsSinceEvaluation'] * df['averageMonthlyHours'] * 12 )
df_test['projectsPerWorkingHour'] = df_test['numberOfProjects'] / ( df_test['yearsSinceEvaluation'] * df_test['averageMonthlyHours'] * 12 )
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation,evaluationSatisfaction,projectsPerWorkingHour
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8,146.9,2603.52,0.624,0.00192
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0,146.08,1593.6,0.704,0.001883
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0,147.66,2516.64,0.6762,0.001192
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0,63.14,868.56,0.1927,0.002303
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2,220.98,2316.48,0.6612,0.002158


In [54]:
# Zusätzliches Attribut: Wertvoller Mitarbeiter / High Potential
#
# Kriterien: 
#    - hat mit mehr als 1.5 Projekten pro Jahr eine hohe Projektwiederholungsrate
#    - ODER hat eine Beförderung erhalten
#    - UND ist mit einer Stundenzahl von weniger als 230 h nicht Burn Out gefährdet
#    - UND zeigt sich mit 0.5 Satisfaction Level zufrieden mit seiner Arbeit
#
df['valuableEmployee'] = (((df['projectsPerYear'] > 1.5) | (df['gotPromotion'] == 1)) & (df['satisfactionLevel'] > 0.5) & (df['averageMonthlyHours'] < 230)).astype(int)
df_test['valuableEmployee'] = (((df['projectsPerYear'] > 1.5) | (df['gotPromotion'] == 1)) & (df['satisfactionLevel'] > 0.5) & (df['averageMonthlyHours'] < 230)).astype(int)
df.head()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,workAccident,hasLeftCompany,gotPromotion,department,salary,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation,evaluationSatisfaction,projectsPerWorkingHour,valuableEmployee
0,0.65,0.96,5,226,2,1,0,0,marketing,medium,2.5,1084.8,146.9,2603.52,0.624,0.00192,1
1,0.88,0.8,3,166,2,0,0,0,IT,low,1.5,1328.0,146.08,1593.6,0.704,0.001883,0
2,0.69,0.98,3,214,2,0,0,0,sales,low,1.5,1712.0,147.66,2516.64,0.6762,0.001192,0
3,0.41,0.47,2,154,3,0,1,0,sales,low,0.666667,2772.0,63.14,868.56,0.1927,0.002303,0
4,0.87,0.76,5,254,2,1,0,0,hr,low,2.5,1219.2,220.98,2316.48,0.6612,0.002158,0


In [52]:
df['valuableEmployee'].sum()

1124

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
satisfactionLevel                  10000 non-null float64
yearsSinceEvaluation               10000 non-null float64
numberOfProjects                   10000 non-null int64
averageMonthlyHours                10000 non-null int64
yearsAtCompany                     10000 non-null int64
workAccident                       10000 non-null category
hasLeftCompany                     10000 non-null category
gotPromotion                       10000 non-null category
department                         10000 non-null category
salary                             10000 non-null category
projectsPerYear                    10000 non-null float64
hoursPerProject                    10000 non-null float64
satisfactionHours                  10000 non-null float64
workingHoursSinceLastEvaluation    10000 non-null float64
evaluationSatisfaction             10000 non-null float64
projectsPerWorkingHour 

In [81]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 17 columns):
id                                 4999 non-null int64
satisfactionLevel                  4999 non-null float64
yearsSinceEvaluation               4999 non-null float64
numberOfProjects                   4999 non-null int64
averageMonthlyHours                4999 non-null int64
yearsAtCompany                     4999 non-null int64
workAccident                       4999 non-null category
gotPromotion                       4999 non-null category
department                         4999 non-null category
salary                             4999 non-null category
projectsPerYear                    4999 non-null float64
hoursPerProject                    4999 non-null float64
satisfactionHours                  4999 non-null float64
workingHoursSinceLastEvaluation    4999 non-null float64
evaluationSatisfaction             4999 non-null float64
projectsPerWorkingHour             4999 no

In [82]:
df.describe()

Unnamed: 0,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation,evaluationSatisfaction,projectsPerWorkingHour,valuableEmployee
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.611204,0.717084,3.8007,200.6882,3.4832,1.216497,2352.710734,122.545712,1761.516456,0.442899,0.002406,0.1124
std,0.248888,0.171446,1.234719,50.060579,1.448768,0.517149,1234.181366,61.553629,692.002979,0.225156,0.001093,0.315874
min,0.09,0.36,2.0,96.0,2.0,0.2,428.0,13.2,448.44,0.0468,0.000608,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.8,1548.0,66.0,1185.84,0.236575,0.001742,0.0
50%,0.64,0.72,4.0,199.0,3.0,1.0,2160.0,122.4,1674.0,0.4457,0.002179,0.0
75%,0.82,0.87,5.0,245.0,4.0,1.5,2790.0,170.1825,2302.11,0.615,0.002762,0.0
max,1.0,1.0,7.0,310.0,10.0,3.5,13500.0,282.44,3608.4,0.99,0.012531,1.0


In [83]:
df_test.describe()

Unnamed: 0,id,satisfactionLevel,yearsSinceEvaluation,numberOfProjects,averageMonthlyHours,yearsAtCompany,projectsPerYear,hoursPerProject,satisfactionHours,workingHoursSinceLastEvaluation,evaluationSatisfaction,projectsPerWorkingHour,valuableEmployee
count,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0
mean,12499.0,0.616093,0.714137,3.807762,201.774755,3.528306,1.203355,2392.669482,123.793115,1764.527193,0.444163,0.002401,0.112222
std,1443.231328,0.248107,0.170614,1.228436,49.704337,1.482305,0.508555,1312.868627,60.66549,689.825244,0.224937,0.001058,0.315672
min,10000.0,0.09,0.36,2.0,96.0,2.0,0.2,452.0,14.7,419.04,0.0532,0.000578,0.0
25%,11249.5,0.44,0.56,3.0,156.0,3.0,0.8,1593.0,67.83,1190.16,0.2405,0.001744,0.0
50%,12499.0,0.65,0.72,4.0,202.0,3.0,1.0,2172.0,125.02,1679.04,0.44,0.002181,0.0
75%,13748.5,0.82,0.86,5.0,245.0,4.0,1.5,2796.0,170.64,2313.48,0.6156,0.002756,0.0
max,14998.0,1.0,1.0,7.0,310.0,10.0,3.0,16440.0,275.0,3608.4,1.0,0.01221,1.0


### Exports Dataframes for Next Steps

In [84]:
df.to_pickle('exchange/hr_01_enriched_train.pkl')
df_test.to_pickle('exchange/hr_01_enriched_test.pkl')