# Salary Prediction

This notebook presents a method for predicting salaries based on features like age, years of experience, and department, among others. We'll train two models, XGBoost and LightGBM, and create an ensemble by averaging their predictions.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import numpy as np



## Data Loading

First, let's load the training and test datasets.

In [2]:
# Load the datasets
train_data = pd.read_csv('/kaggle/input/thapar-summer-school-employee-salary-prediction/train.csv')
test_data = pd.read_csv('/kaggle/input/thapar-summer-school-employee-salary-prediction/test.csv')


## Data Preprocessing

We split the training data into training and validation sets. We then separate the target variable (salary) from the features. Categorical variables are one-hot encoded.


In [3]:
# Split the training data into training and validation sets
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Separate the target variable (salary) from the features
train_features = train_data.drop('salary', axis=1)
train_target = train_data['salary']
validation_features = validation_data.drop('salary', axis=1)
validation_target = validation_data['salary']

# One-hot encode the categorical variables
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(train_features[['company', 'department']])

train_encoded_features = encoder.transform(train_features[['company', 'department']])
validation_encoded_features = encoder.transform(validation_features[['company', 'department']])
test_encoded_features = encoder.transform(test_data[['company', 'department']])

# Get the feature names
feature_names = encoder.categories_[0][1:].tolist() + encoder.categories_[1][1:].tolist()

# Convert the encoded features to DataFrames
train_encoded_df = pd.DataFrame(train_encoded_features, columns=feature_names)
validation_encoded_df = pd.DataFrame(validation_encoded_features, columns=feature_names)
test_encoded_df = pd.DataFrame(test_encoded_features, columns=feature_names)

# Reset the indices
train_features.reset_index(drop=True, inplace=True)
validation_features.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Add the encoded features to the original DataFrames
train_features = pd.concat([train_features.drop(['company', 'department'], axis=1), train_encoded_df], axis=1)
validation_features = pd.concat([validation_features.drop(['company', 'department'], axis=1), validation_encoded_df], axis=1)
test_features = pd.concat([test_data.drop(['company', 'department'], axis=1), test_encoded_df], axis=1)



In [15]:
# Initialize the base models with GPU parameters
base_models = [
    ('xgb', XGBRegressor(tree_method='gpu_hist', gpu_id=0, n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)),
    ('lgbm', LGBMRegressor(device='gpu', gpu_platform_id=0, gpu_device_id=0, n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42))
]

In [16]:
# Initialize the meta-model
meta_model = LinearRegression()


In [17]:
# Initialize the stacking regressor
stacking_reg = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)


In [18]:
# Train the stacking regressor
stacking_reg.fit(train_features, train_target)

In [19]:
# Make predictions on the validation set
validation_pred_stacking = stacking_reg.predict(validation_features)

In [20]:
# Calculate the MAE of the ensemble predictions
mae_stacking = mean_absolute_error(validation_target, validation_pred_stacking)

print(f'MAE of ensemble: {mae_stacking}')

MAE of ensemble: 11699.669342069104


In [21]:
# Make predictions on the test set
test_pred_stacking = stacking_reg.predict(test_features)

In [22]:
# Prepare the submission file
submission = pd.DataFrame({'id': test_data['id'], 'salary': test_pred_stacking})


In [23]:
# Save the submission file
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [14]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 7, 9],
}

# Initialize a new XGBoost model
xgb_model = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=42)

# Initialize the grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)

# Perform the grid search
grid_search.fit(train_features, train_target)

# Print the best hyperparameters
print(grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits




[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time=   4.3s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time=   3.3s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1000; total time=   4.7s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1500; total time=   7.9s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1500; total time=   6.7s
[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=500; total time=   4.6s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1000; total time=   9.9s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1500; total time=  13.1s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1500; total time=  11.4s
[CV] END ..learning_rate=0.01, max_depth=9, n_estimators=500; total time=   9.2s
[CV] END .learning_rate=0.01, max_depth=9, n_estimators=1000; total time=  19.0s
[CV] END .learning_rate=0.01, max_depth=9, n_estimators=1000; total time=  17.9s
[CV] END .learning_rate=0.01



{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500}
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time=   4.3s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1000; total time=   5.5s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1000; total time=   4.6s
[CV] END .learning_rate=0.01, max_depth=5, n_estimators=1500; total time=   7.1s
[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=500; total time=   4.8s
[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=500; total time=   4.6s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1000; total time=  10.0s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1000; total time=  10.0s
[CV] END .learning_rate=0.01, max_depth=7, n_estimators=1500; total time=  11.6s
[CV] END ..learning_rate=0.01, max_depth=9, n_estimators=500; total time=   7.9s
[CV] END ..learning_rate=0.01, max_depth=9, n_estimators=500; total time=   9.1s
[CV] END .learning_rate=0.01, max_depth=9, n_est