In [23]:
import pickle

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
# Name of the CSV file to read from the Kaggle dataset
file_path = "HR_Data_MNC_Data Science Lovers.csv"

# Load the latest version, keep a reproducible 30% sample, renumber the rows
# and discard the index / identifier columns.
df = (
    kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        "rohitgrewal/hr-data-mnc",
        file_path,
    )
    .sample(frac=0.3, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0', 'Employee_ID', 'Full_Name'])
)

# Normalise column names: lower case, spaces -> underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
strings = list(df.columns[df.dtypes == 'object'])
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Keep only the last ",_"-separated component of `location`
# (", " became ",_" in the normalisation step above).
df['location'] = df['location'].str.split(',_').str[-1]

# Re-express the salary in VND, rounded to whole units.
# NOTE(review): 296.77 is presumably the INR -> VND exchange rate - confirm.
df['salary_vnd'] = (df['salary_inr'] * 296.77).round(0)

# The original salary column and the hire date are not used any further.
df = df.drop(columns=['salary_inr', 'hire_date'])

# Turn the numeric rating 1..5 into the labels 'rating1'..'rating5' so it is
# treated as a categorical value; ratings outside 1..5 map to NaN.
performance_rating_values = {score: f'rating{score}' for score in range(1, 6)}
df['performance_rating'] = df['performance_rating'].map(performance_rating_values)

In [3]:
# Inspect the preprocessed frame; the bare expression is the cell's displayed output.
df

Unnamed: 0,department,job_title,location,performance_rating,experience_years,status,work_mode,salary_vnd
0,it,devops_engineer,south_georgia_and_the_south_sandwich_islands,rating2,3,active,remote,140650580.0
1,it,software_engineer,bosnia_and_herzegovina,rating2,4,active,on-site,316452083.0
2,it,software_engineer,greenland,rating4,7,active,on-site,184822124.0
3,hr,talent_acquisition_specialist,samoa,rating3,13,active,remote,281166724.0
4,sales,account_manager,denmark,rating5,3,active,remote,129407152.0
...,...,...,...,...,...,...,...,...
599995,finance,accountant,jordan,rating1,9,terminated,on-site,224015944.0
599996,hr,hr_manager,liberia,rating3,7,active,remote,421909896.0
599997,r&d,research_scientist,barbados,rating3,9,resigned,remote,296434650.0
599998,it,software_engineer,guatemala,rating4,6,resigned,on-site,217386993.0


In [4]:
# Regression target: log1p of the VND salary. `pop` returns the column and
# removes it from `df` in one step, so only feature columns remain.
y_train = np.log1p(df.pop('salary_vnd'))

In [5]:
# One dict per row, mapping column name -> value.
train_dicts = df.to_dict(orient='records')

# Learn the feature vocabulary and build the dense design matrix in one call.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

# Column names of X_train, handed to XGBoost so the model records them.
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_names=features,
)

In [6]:
# Booster hyper-parameters: squared-error regression on the log1p target,
# with a fixed seed.
xgb_params = dict(
    eta=0.3,
    max_depth=10,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)

# Train for a fixed 81 boosting rounds on the full training matrix.
# No `evals` are supplied, so `verbose_eval` has no evaluation results to print.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=81,
    verbose_eval=5,
)

In [20]:
# Persist the fitted vectorizer and the trained booster together, as one
# (dv, model) tuple, so they can be restored as a pair.
with open('xgboost.bin', mode='wb') as f_out:
    pickle.dump(obj=(dv, model), file=f_out)

In [7]:
# A single example record to score; the keys are the feature columns left in
# `df` after preprocessing.
cv = dict(
    department='it',
    job_title='software_engineer',
    location='bosnia_and_herzegovina',
    performance_rating='rating2',
    experience_years=4,
    status='active',
    **{'work_mode': 'on-site'},
)

In [8]:
# Encode the single record with the fitted vectorizer; wrapping it in a list
# makes the one-sample batch explicit (result has one row).
X = dv.transform([cv])

In [9]:
# Wrap the encoded record in a DMatrix using the same feature names as training.
d = xgb.DMatrix(data=X, feature_names=features)

In [10]:
# Score the record; the model was trained on log1p(salary_vnd), so the
# prediction is on that scale.
result = model.predict(data=d)

In [11]:
# Extract the single prediction as a scalar.
# `np.ravel` accepts both the prediction array and an already-extracted
# NumPy scalar, so re-running this cell no longer fails with
# "invalid index to scalar variable" after `result` has been rebound.
result = np.ravel(result)[0]

In [14]:
# Show the scalar prediction; it is still on the log1p scale used for y_train.
result

np.float32(19.63543)

In [19]:
# Undo the log1p transform applied to the training target to get the salary
# back in VND, then print it rounded to a whole number without decimals.
print(f"{round(np.expm1(result), 0):.0f}")

336944704
