In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#!wget https://www.kaggle.com/api/v1/datasets/download/rohitgrewal/hr-data-mnc

In [3]:
#!unzip hr-data-mnc -d hr_data

In [None]:
# Tunable constants for this preparation step.
INR_TO_VND = 296.77   # multiplier applied to salary_inr to express it in VND
SAMPLE_FRAC = 0.3     # fraction of the raw rows that is kept
SAMPLE_SEED = 42      # fixed seed so the sample is reproducible

df = pd.read_csv('hr_data/HR_Data_MNC_Data Science Lovers.csv')

# Keep a reproducible 30% random sample so later processing stays manageable
# (the original note puts this at roughly 500k rows -- TODO confirm).
df = df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

# Drop the columns that are not important for the analysis.
df = df.drop(columns=['Unnamed: 0', 'Employee_ID', 'Full_Name'])

# Normalise the column names: lower case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Parse hire_date BEFORE the text normalisation below. Replacing spaces with
# underscores inside a date string can make it unparseable, and
# errors='coerce' would then silently turn that value into NaT.
df['hire_date'] = pd.to_datetime(df['hire_date'], errors='coerce')

# Normalise every remaining text column the same way as the column names.
# select_dtypes with both 'object' and 'string' also catches columns stored
# with a dedicated string dtype, which `df.dtypes == 'object'` would miss.
strings = df.select_dtypes(include=['object', 'string']).columns.tolist()
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Keep only the last comma-separated part of location (the country, per the
# author's intent). The separator is ',_' because spaces are now underscores.
df['location'] = df['location'].str.split(',_').str[-1]

# The author is Vietnamese and wants salaries expressed in VND instead of INR.
df['salary_vnd'] = (df['salary_inr'] * INR_TO_VND).round(0)

# Replace the full hire date with just its year.
df['hire_year'] = df['hire_date'].dt.year

# The source columns are no longer needed once the derived ones exist.
df = df.drop(columns=['salary_inr', 'hire_date'])

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Names of the text (categorical) columns. select_dtypes with both 'object'
# and 'string' also picks up columns stored with a dedicated string dtype,
# which a plain `df.dtypes == 'object'` comparison would skip.
categories = df.select_dtypes(include=['object', 'string']).columns.tolist()

In [None]:
# Print the frequency table of every categorical column, each followed by a
# blank line so the tables are visually separated.
for column in categories:
    print(df[column].value_counts(), end='\n\n')

In [None]:
# How many sampled rows fall into each hire year.
df.hire_year.value_counts()

In [None]:
# Pearson correlation between years of experience and hire year.
df.loc[:, ['experience_years', 'hire_year']].corr(method='pearson')

In [None]:
# The author judged experience_years and hire_year to be similar (see the
# correlation cell above), so hire_year is dropped.
# errors='ignore' keeps this cell re-runnable: `del df['hire_year']` raises
# KeyError the second time it is executed.
df = df.drop(columns=['hire_year'], errors='ignore')

In [None]:
%matplotlib inline
sns.histplot(df.salary_vnd, bins=50)

The salary distribution appears to be long-tailed: most values are concentrated in a narrow range, with a long tail of much larger salaries.

In [None]:
# Compress the range of the target column with log1p (log(1 + x)) and plot
# the transformed values, again with 50 bins.
sns.histplot(df['salary_vnd'].pipe(np.log1p), bins=50)

The distribution looks better after the log1p transform.

In [None]:
# Frequency of each performance rating value.
df.performance_rating.value_counts()

In [None]:
# performance_rating behaves more like a category than a number, so each
# rating 1..5 gets a string label ('rating1' ... 'rating5') that can replace
# the numeric value in the column.
performance_rating_values = {score: f'rating{score}' for score in range(1, 6)}

In [None]:
# Replace the numeric ratings with their categorical labels.
# The dtype guard makes the cell safe to re-run: once the column holds the
# string labels, mapping it again would find no matching keys and turn every
# value into NaN.
if pd.api.types.is_numeric_dtype(df['performance_rating']):
    df['performance_rating'] = df['performance_rating'].map(performance_rating_values)

# Show only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Everything looks good now, so export the prepared data as a CSV file for
# training. The row index is not written; the file is encoded as 'utf-8-sig'.
df.to_csv(path_or_buf="data_prepared.csv", encoding='utf-8-sig', index=False)