# Feature engineering
There are so many features and this means high dimensions of computation. Some features can be compounded.


In [20]:
import pandas as pd
import numpy as np
import os

In [21]:
df = pd.read_csv("../data/clean-dataset.csv")

df.shape

(5000, 18)

### Combined performance metric
Create a single index combining the three most important, scaled performance metrics.
They can be averaged since they are already scaled/standardized.


In [22]:
# create performance index
performance_cols = ['previous_rating', 'avg_training_score', 'kpi_met']
df['weighted_performance_index'] = df[performance_cols].mean(axis=1)

df['weighted_performance_index']

0       0.946241
1      -0.905009
2       1.560968
3       0.994941
4       0.389876
          ...   
4995    0.023599
4996    1.082951
4997    1.276931
4998    0.131967
4999    0.755224
Name: weighted_performance_index, Length: 5000, dtype: float64

In [23]:
# drop features involved
df2 = df.drop(columns=performance_cols)
df2.head()

Unnamed: 0,age,gender,department,education,tenure_years,trainings_attended,awards_won,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted,weighted_performance_index
0,0.851187,1.0,2.0,2.0,0.854043,3.528853,-0.707591,3.0,2.0,2.0,0.0,-0.903567,1.0,0.0,0,0.946241
1,-0.430142,0.0,3.0,0.0,-0.439257,-1.405423,-0.707591,2.0,2.0,1.0,1.0,-0.903567,1.0,0.0,0,-0.905009
2,-1.070807,1.0,1.0,0.0,-0.751797,-1.405423,0.679298,0.0,3.0,0.0,0.0,1.859634,0.0,0.0,1,1.560968
3,0.118999,1.0,2.0,0.0,-0.916359,-1.405423,0.679298,1.0,1.0,1.0,0.0,0.478034,1.0,0.0,1,0.994941
4,-0.064048,0.0,3.0,2.0,-0.455064,-1.405423,0.679298,1.0,1.0,0.0,0.0,0.478034,0.0,1.0,0,0.389876


### Add high achiever flag
Create a flag for employees who have won awards (which is rare, per data analysis) and have a top rating.
Since awards won and previous rating were processed, a simple threshold can be used.

Assumption: awards_won will be > 0 (post-scaling) for employees with awards.

In [24]:
# rated on 0-5 scale 
df['previous_rating'].max()

np.float64(4.0)

In [25]:
df2['high_achiever_flag'] = np.where(
    (df['awards_won'] > 0) & (df['previous_rating'] >= df['previous_rating'].max()-1), 
    1,
    0
)

df2['high_achiever_flag'].value_counts()

high_achiever_flag
0    4095
1     905
Name: count, dtype: int64

In [26]:
# drop features involved
df3 = df2.drop(['awards_won'], axis=1)
df3.head()

Unnamed: 0,age,gender,department,education,tenure_years,trainings_attended,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted,weighted_performance_index,high_achiever_flag
0,0.851187,1.0,2.0,2.0,0.854043,3.528853,3.0,2.0,2.0,0.0,-0.903567,1.0,0.0,0,0.946241,0
1,-0.430142,0.0,3.0,0.0,-0.439257,-1.405423,2.0,2.0,1.0,1.0,-0.903567,1.0,0.0,0,-0.905009,0
2,-1.070807,1.0,1.0,0.0,-0.751797,-1.405423,0.0,3.0,0.0,0.0,1.859634,0.0,0.0,1,1.560968,1
3,0.118999,1.0,2.0,0.0,-0.916359,-1.405423,1.0,1.0,1.0,0.0,0.478034,1.0,0.0,1,0.994941,1
4,-0.064048,0.0,3.0,2.0,-0.455064,-1.405423,1.0,1.0,0.0,0.0,0.478034,0.0,1.0,0,0.389876,0


### Add training efficiency feature
How effective were the trainings? (Average score per training session attended).

Since is it is a ratio, a small constant (1) is added to the denominator to avoid division by zero.

In [27]:
df2['training_efficiency'] = df['avg_training_score'] / (df2['trainings_attended'] + 1)

df2['training_efficiency'].value_counts()

training_efficiency
 6.771732    36
 2.019132    24
 1.186448    22
-5.002077    20
 0.840024    10
             ..
-0.472627     1
 0.353039     1
-2.352375     1
 1.797339     1
-0.004617     1
Name: count, Length: 4890, dtype: int64

### Double check missing values

In [28]:
if df2.isnull().sum().sum() > 0:
    df2 = df2.fillna(0)

### Export final dataset

In [29]:
df2.to_csv("../data/final-dataset.csv", index=None)

df2.shape

(5000, 18)

In [30]:
df2.columns

Index(['age', 'gender', 'department', 'education', 'tenure_years',
       'trainings_attended', 'awards_won', 'job_level', 'region',
       'recruitment_channel', 'work_type', 'language_count', 'multilingual',
       'international_hire', 'is_promoted', 'weighted_performance_index',
       'high_achiever_flag', 'training_efficiency'],
      dtype='object')