# Data preprocessing
In this phase, the data is being cleaned and prepated for processing.

The dataset features include: 
- employee_id              - string/int
- age                      - int (20-65)
- gender                   - categorical
- department               - categorical
- job_level                - int (1-5)
- region                   - categorical
- recruitment_channel      - categorical
- tenure_years             - float
- previous_rating          - int (1-5)
- kpi_met                  - int (0-100)
- awards_won               - int
- trainings_attended       - int
- avg_training_score       - float (0-100)
- education                - categorical/ordinal
- work_type                - categorical
- language_count           - int (1, 2, 3+)
- international_hire       - binary (0/1)
- is_promoted              - binary (0/1) (TARGET)

## import required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

## load dataset

In [3]:
df = pd.read_csv("../data/employee-dataset.csv")
df.head()

Unnamed: 0,age,gender,department,education,tenure_years,previous_rating,kpi_met,trainings_attended,avg_training_score,awards_won,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted
0,50,M,Operations,Master's,5.438794,4,60,7,75.802386,0,4,South,Referral,Full-Time,1,1,0,0
1,36,F,Sales,Bachelor's,1.6634,1,42,0,53.57937,0,3,South,Direct,Part-Time Day,1,1,0,0
2,29,M,HR,Bachelor's,0.751038,4,83,0,83.267675,1,1,West,Agency,Full-Time,3,0,0,1
3,42,M,Operations,Bachelor's,0.270648,4,67,0,71.950245,1,2,North,Direct,Full-Time,2,1,0,1
4,40,F,Sales,Master's,1.617259,1,64,0,92.001845,1,2,North,Agency,Full-Time,2,0,1,0


In [4]:
df.shape

(5000, 18)

In [5]:
df.columns

Index(['age', 'gender', 'department', 'education', 'tenure_years',
       'previous_rating', 'kpi_met', 'trainings_attended',
       'avg_training_score', 'awards_won', 'job_level', 'region',
       'recruitment_channel', 'work_type', 'language_count', 'multilingual',
       'international_hire', 'is_promoted'],
      dtype='object')

In [6]:
# separate features (X) and target (y)
X = df.drop(columns=['is_promoted'])
y = df['is_promoted']

## Organize features in the dataset
some features are numerical or categorical;
- numerical features are scaled down
- categorical are represented with numbers using label encoding

In [7]:
num_cols = [
    'age', 'tenure_years', 'kpi_met', 'awards_won', 'trainings_attended', 'avg_training_score', 'language_count'
]

cat_cols = [
    'gender', 'department', 'region', 'recruitment_channel', 'work_type', 'international_hire',
    'job_level', 'previous_rating', 'education', 'multilingual'
]

## Transform fatures using ColumnTransformer and pipelines

In [8]:
# replacing NaNs with a constant/median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
df[cat_cols] = df[cat_cols].fillna('missing')

# scaling
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# label encoding
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df[cat_cols] = encoder.fit_transform(df[cat_cols])

In [14]:
label_mapping = {
    col: {cat: i for i, cat in enumerate(cats)}
    for col, cats in zip(cat_cols, encoder.categories_)
}

label_mapping

{'gender': {'F': 0, 'M': 1},
 'department': {'Finance': 0,
  'HR': 1,
  'Operations': 2,
  'Sales': 3,
  'Technology': 4},
 'region': {'East': 0, 'North': 1, 'South': 2, 'West': 3},
 'recruitment_channel': {'Agency': 0, 'Direct': 1, 'Referral': 2},
 'work_type': {'Full-Time': 0, 'Part-Time Day': 1, 'Part-Time Night': 2},
 'international_hire': {np.int64(0): 0, np.int64(1): 1},
 'job_level': {np.int64(1): 0,
  np.int64(2): 1,
  np.int64(3): 2,
  np.int64(4): 3,
  np.int64(5): 4},
 'previous_rating': {np.int64(1): 0,
  np.int64(2): 1,
  np.int64(3): 2,
  np.int64(4): 3,
  np.int64(5): 4},
 'education': {"Bachelor's": 0, 'High School': 1, "Master's": 2, 'PhD': 3},
 'multilingual': {np.int64(0): 0, np.int64(1): 1}}

In [11]:
df.head()

Unnamed: 0,age,gender,department,education,tenure_years,previous_rating,kpi_met,trainings_attended,avg_training_score,awards_won,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted
0,0.851187,1.0,2.0,2.0,0.854043,3.0,-0.548828,3.528853,0.38755,-0.707591,3.0,2.0,2.0,0.0,-0.903567,1.0,0.0,0
1,-0.430142,0.0,3.0,0.0,-0.439257,0.0,-1.596033,-1.405423,-1.118994,-0.707591,2.0,2.0,1.0,1.0,-0.903567,1.0,0.0,0
2,-1.070807,1.0,1.0,0.0,-0.751797,3.0,0.789267,-1.405423,0.893637,0.679298,0.0,3.0,0.0,0.0,1.859634,0.0,0.0,1
3,0.118999,1.0,2.0,0.0,-0.916359,3.0,-0.141582,-1.405423,0.126405,0.679298,1.0,1.0,1.0,0.0,0.478034,1.0,0.0,1
4,-0.064048,0.0,3.0,2.0,-0.455064,0.0,-0.316116,-1.405423,1.485745,0.679298,1.0,1.0,0.0,0.0,0.478034,0.0,1.0,0


## Save processed data

In [14]:
df.head()

Unnamed: 0,age,gender,department,education,tenure_years,previous_rating,kpi_met,trainings_attended,avg_training_score,awards_won,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted
0,0.851187,1.0,2.0,2.0,0.854043,3.0,-0.548828,3.528853,0.38755,-0.707591,3.0,2.0,2.0,0.0,-0.903567,1.0,0.0,0
1,-0.430142,0.0,3.0,0.0,-0.439257,0.0,-1.596033,-1.405423,-1.118994,-0.707591,2.0,2.0,1.0,1.0,-0.903567,1.0,0.0,0
2,-1.070807,1.0,1.0,0.0,-0.751797,3.0,0.789267,-1.405423,0.893637,0.679298,0.0,3.0,0.0,0.0,1.859634,0.0,0.0,1
3,0.118999,1.0,2.0,0.0,-0.916359,3.0,-0.141582,-1.405423,0.126405,0.679298,1.0,1.0,1.0,0.0,0.478034,1.0,0.0,1
4,-0.064048,0.0,3.0,2.0,-0.455064,0.0,-0.316116,-1.405423,1.485745,0.679298,1.0,1.0,0.0,0.0,0.478034,0.0,1.0,0


In [16]:
df.shape

(5000, 18)

In [17]:
# combine features and target into the final clean csv
df['is_promoted'] = y

df

Unnamed: 0,age,gender,department,education,tenure_years,previous_rating,kpi_met,trainings_attended,avg_training_score,awards_won,job_level,region,recruitment_channel,work_type,language_count,multilingual,international_hire,is_promoted
0,0.851187,1.0,2.0,2.0,0.854043,3.0,-0.548828,3.528853,0.387550,-0.707591,3.0,2.0,2.0,0.0,-0.903567,1.0,0.0,0
1,-0.430142,0.0,3.0,0.0,-0.439257,0.0,-1.596033,-1.405423,-1.118994,-0.707591,2.0,2.0,1.0,1.0,-0.903567,1.0,0.0,0
2,-1.070807,1.0,1.0,0.0,-0.751797,3.0,0.789267,-1.405423,0.893637,0.679298,0.0,3.0,0.0,0.0,1.859634,0.0,0.0,1
3,0.118999,1.0,2.0,0.0,-0.916359,3.0,-0.141582,-1.405423,0.126405,0.679298,1.0,1.0,1.0,0.0,0.478034,1.0,0.0,1
4,-0.064048,0.0,3.0,2.0,-0.455064,0.0,-0.316116,-1.405423,1.485745,0.679298,1.0,1.0,0.0,0.0,0.478034,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.393569,0.0,3.0,0.0,0.265901,0.0,1.254691,1.414164,-1.183893,0.679298,1.0,1.0,1.0,0.0,0.478034,0.0,0.0,0
4996,-0.338619,0.0,4.0,2.0,0.082687,3.0,1.312869,0.004370,-1.064015,0.679298,3.0,0.0,0.0,2.0,-0.903567,0.0,0.0,0
4997,-0.613189,0.0,0.0,2.0,1.152481,3.0,0.032952,0.004370,0.797840,-0.707591,2.0,0.0,2.0,0.0,0.478034,1.0,0.0,0
4998,0.485093,1.0,2.0,0.0,1.370361,2.0,-1.363321,0.709267,-0.240778,0.679298,1.0,3.0,0.0,0.0,1.859634,0.0,1.0,0


In [18]:
df.to_csv("../data/clean-dataset.csv", index=None, index_label=None)