In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Read the CSV into a DataFrame; the path is relative to the notebook's working directory.
df_encoded = pd.read_csv(filepath_or_buffer='../data/encoded_data.csv')

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=123,
)

# Every column except the target 'salary_in_usd' is used as a model input.
features = train_df.columns[train_df.columns != 'salary_in_usd'].tolist()

In [4]:
# current best params from hyperparameter tuning:
# {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 75, 'subsample': 0.8}
xbg = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=75, subsample=0.8)
xbg.fit(train_df[features], train_df['salary_in_usd'])

# Evaluate on the held-out rows only.
predictions = xbg.predict(test_df[features])

print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
# For a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such; it should match r2_score above.
print(f"XGBoost R-squared (via model.score): {xbg.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance: pair each input column with the fitted model's importance
# score and show the ten largest.
feature_importance = pd.DataFrame({'Feature': features, 'Importance': xbg.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))

XGBoost Mean Squared Error: 2024085760.0
XGBoost R-squared: 0.44016188383102417
XGBoost Accuracy: 0.44016188383102417
Mean Absolute Error: 34731.91015625
                                Feature  Importance
18                  experience_level_SE    0.186654
6                job_title_Data Analyst    0.130248
1                    employee_residence    0.089373
19                  experience_level_EX    0.065821
3                      company_location    0.056195
15         job_title_Research Scientist    0.040793
17                  experience_level_MI    0.039507
8               job_title_Data Engineer    0.038368
12  job_title_Machine Learning Engineer    0.038310
4                job_title_AI Scientist    0.028228


In [5]:

def _default_xgb_regressor():
    """Build the XGBoost regressor used for each bootstrap refit (tuned params, fixed seed)."""
    return xgb.XGBRegressor(
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=5,
        min_child_weight=1,
        n_estimators=75,
        subsample=0.8,
        random_state=123
    )


def bootstrap_predictions(n_bootstrap, train_df, test_df, features,
                          target='salary_in_usd', lower_pct=2.5, upper_pct=97.5,
                          model_factory=None):
    """Refit a model on bootstrap resamples and summarise its test-set predictions.

    For each of ``n_bootstrap`` rounds the training frame is resampled with
    replacement (same size as the original, seeded with the round index so the
    run is repeatable), a fresh model is fitted, and predictions for every row
    of ``test_df`` are stored. Per test row, the lower/upper percentiles and the
    median across rounds are returned.

    Note: the resulting band reflects how much the fitted model's prediction
    moves when the training data is resampled. It does not add residual noise,
    so it is generally narrower than a true prediction interval for an
    individual observation.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap refits; must be at least 1.
    train_df : pandas.DataFrame
        Training data containing ``features`` and ``target`` columns.
    test_df : pandas.DataFrame
        Rows to predict; only the ``features`` columns are read.
    features : list
        Column names used as model inputs.
    target : str, default 'salary_in_usd'
        Name of the column in ``train_df`` to fit against.
    lower_pct, upper_pct : float, default 2.5 and 97.5
        Percentiles (0-100) reported as the lower and upper bounds.
    model_factory : callable, optional
        Zero-argument callable returning an unfitted estimator with
        ``fit(X, y)`` and ``predict(X)``. Defaults to the tuned XGBRegressor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(lower_bounds, upper_bounds, median_preds)``, each of length
        ``len(test_df)``.

    Raises
    ------
    ValueError
        If ``n_bootstrap`` is less than 1.
    """
    if n_bootstrap < 1:
        # Percentiles over zero refits are undefined; fail loudly instead.
        raise ValueError(f"n_bootstrap must be at least 1, got {n_bootstrap}")
    if model_factory is None:
        model_factory = _default_xgb_regressor

    # The test inputs never change between rounds, so select them once.
    X_test = test_df[features]
    n_test = test_df.shape[0]
    # One row per bootstrap round, one column per test point.
    bootstrap_preds = np.zeros((n_bootstrap, n_test))

    for i in range(n_bootstrap):
        # Sample training data with replacement; seed = round index for repeatability
        train_sample = train_df.sample(frac=1, replace=True, random_state=i)

        model = model_factory()
        model.fit(train_sample[features], train_sample[target])

        # Predict on the test set
        bootstrap_preds[i, :] = model.predict(X_test)

    # For each test point, summarise the spread of predictions across refits
    lower_bounds = np.percentile(bootstrap_preds, lower_pct, axis=0)
    upper_bounds = np.percentile(bootstrap_preds, upper_pct, axis=0)
    median_preds = np.median(bootstrap_preds, axis=0)
    return lower_bounds, upper_bounds, median_preds

n_bootstrap = 100

lb, ub, mp = bootstrap_predictions(n_bootstrap, train_df, test_df, features)
# The header reports the number of bootstrap refits, so take it from the variable
# rather than a hand-typed literal that can drift out of sync.
print(f"Sample predictions with 95% prediction intervals(n={n_bootstrap}):")
# Show at most the first ten test points; the bound avoids an IndexError on small test sets.
for i in range(min(10, len(mp))):
    print(f"Test Point {i}: Median Prediction = {mp[i]:.2f}, "
          f"Lower Bound = {lb[i]:.2f}, Upper Bound = {ub[i]:.2f}")

Sample predictions with 95% prediction intervals(n=10):
Test Point 0: Median Prediction = 164770.82, Lower Bound = 160386.24, Upper Bound = 170301.63
Test Point 1: Median Prediction = 214022.90, Lower Bound = 157507.99, Upper Bound = 253314.00
Test Point 2: Median Prediction = 132148.15, Lower Bound = 123603.54, Upper Bound = 139543.41
Test Point 3: Median Prediction = 180548.66, Lower Bound = 169107.11, Upper Bound = 194125.44
Test Point 4: Median Prediction = 35444.63, Lower Bound = 13341.52, Upper Bound = 63167.55
Test Point 5: Median Prediction = 69505.85, Lower Bound = 49725.53, Upper Bound = 91555.53
Test Point 6: Median Prediction = 132148.15, Lower Bound = 123603.54, Upper Bound = 139543.41
Test Point 7: Median Prediction = 101102.52, Lower Bound = 88229.04, Upper Bound = 120702.42
Test Point 8: Median Prediction = 73798.82, Lower Bound = 67159.64, Upper Bound = 79850.45
Test Point 9: Median Prediction = 98904.74, Lower Bound = 81657.74, Upper Bound = 115770.49


In [6]:
n_bootstrap = 200

lb, ub, mp = bootstrap_predictions(n_bootstrap, train_df, test_df, features)
# Build the header from n_bootstrap so the printed count always matches the run.
print(f"Sample predictions with 95% prediction intervals(n={n_bootstrap}):")
# Show at most the first ten test points; the bound avoids an IndexError on small test sets.
for i in range(min(10, len(mp))):
    print(f"Test Point {i}: Median Prediction = {mp[i]:.2f}, "
          f"Lower Bound = {lb[i]:.2f}, Upper Bound = {ub[i]:.2f}")

Sample predictions with 95% prediction intervals(n=200):
Test Point 0: Median Prediction = 164981.39, Lower Bound = 160134.59, Upper Bound = 170268.42
Test Point 1: Median Prediction = 217269.36, Lower Bound = 157434.42, Upper Bound = 253222.79
Test Point 2: Median Prediction = 131459.73, Lower Bound = 123213.11, Upper Bound = 140212.94
Test Point 3: Median Prediction = 180258.56, Lower Bound = 169015.77, Upper Bound = 193376.33
Test Point 4: Median Prediction = 37127.55, Lower Bound = 8867.46, Upper Bound = 63867.34
Test Point 5: Median Prediction = 69534.13, Lower Bound = 50980.96, Upper Bound = 90188.41
Test Point 6: Median Prediction = 131459.73, Lower Bound = 123213.11, Upper Bound = 140212.94
Test Point 7: Median Prediction = 102085.18, Lower Bound = 87544.72, Upper Bound = 120380.70
Test Point 8: Median Prediction = 73648.38, Lower Bound = 67299.25, Upper Bound = 79629.63
Test Point 9: Median Prediction = 97997.09, Lower Bound = 79318.91, Upper Bound = 116376.80
