# Testing out models

In [102]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [103]:
df_encoded = pd.read_csv('../data/encoded_data.csv')

train_df, test_df = train_test_split(df_encoded, test_size=0.2, random_state=123)
features = [col for col in train_df.columns if col != 'salary_in_usd']

## Decision Tree Model

In [104]:
model = DecisionTreeRegressor()

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2592595269.09228
R-squared: 0.28291893764299014
Accuracy: 0.28291893764299014
Mean Absolute Error: 36936.08781460268
                         Feature  Importance
1             employee_residence    0.441546
6         job_title_Data Analyst    0.101479
18           experience_level_SE    0.100752
0                      work_year    0.047055
3               company_location    0.045312
2                   remote_ratio    0.035187
17           experience_level_MI    0.029659
19           experience_level_EX    0.029068
8        job_title_Data Engineer    0.024400
15  job_title_Research Scientist    0.022687


## Linear Regression Model

In [105]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.coef_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2283713481.2406917
R-squared: 0.3683519719526217
Accuracy: 0.3683519719526217
Mean Absolute Error: 37522.98449692578
                                Feature    Importance
19                  experience_level_EX  50712.180068
4                job_title_AI Scientist  42496.804665
15         job_title_Research Scientist  31875.343499
18                  experience_level_SE  30601.538601
17                  experience_level_MI  21944.172989
24                         same_country  20818.796428
22                   employment_type_FT  16252.835242
12  job_title_Machine Learning Engineer  16005.519658
0                             work_year   7106.917780
25                       company_size_L   6679.352218


## Random Forest Model

In [106]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=123)
rf_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = rf_model.predict(test_df[features])

# Evaluate the model
print(f"Random Forest Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"Random Forest R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Random Forest Accuracy: {rf_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Random Forest Mean Squared Error: 2150090450.169163
Random Forest R-squared: 0.4053105154701694
Random Forest Accuracy: 0.4053105154701694
Mean Absolute Error: 35351.083007521185
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## Gradient Boosting Model

In [107]:
# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=123)
gb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = gb_model.predict(test_df[features])

# Evaluate the model
print(f"Gradient Boosting Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"Gradient Boosting R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Gradient Boosting Accuracy: {gb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Gradient Boosting Mean Squared Error: 2018327106.6027784
Gradient Boosting R-squared: 0.4417546915090219
Gradient Boosting Accuracy: 0.4417546915090219
Mean Absolute Error: 34964.1222155299
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## Support Vector Regression Model

In [108]:
# Build a pipeline: scale features then train SVR
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Train the SVR model
svr_pipeline.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = svr_pipeline.predict(test_df[features])

# Evaluate the model
print(f"SVR Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"SVR R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"SVR Accuracy: {svr_pipeline.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

SVR Mean Squared Error: 3612827156.126812
SVR R-squared: 0.0007349137319370147
SVR Accuracy: 0.0007349137319370147
Mean Absolute Error: 47588.93857393002
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## XGBoost Model

In [109]:
# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = xgb_model.predict(test_df[features])

# Evaluate the model
print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost Accuracy: {xgb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

XGBoost Mean Squared Error: 2186288896.0
XGBoost R-squared: 0.39529842138290405
XGBoost Accuracy: 0.39529842138290405
Mean Absolute Error: 35730.01953125
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


Gradient Boosting is working the best so we can use that model for the final model.

## Hyperparameter Tuning Gradient Boosting Model

In [122]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [20, 50, 75, 100],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [6, 7, 8, 9, 10],
    'min_samples_split': [15, 20, 25, 30],
    'min_samples_leaf': [1, 2, 3]
}

gb_model = GradientBoostingRegressor()

grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(train_df[features], train_df['salary_in_usd'])

print(grid_search.best_params_)

{'learning_rate': 0.2, 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 50}
Default GB Mean Absolute Error: 34995.92699863281


In [123]:
# make model with {'learning_rate': 0.2, 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 50}
gb_model = GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=1, min_samples_split=20, n_estimators=50)
gb_model.fit(train_df[features], train_df['salary_in_usd'])

predictions = gb_model.predict(test_df[features])

print(f"Gradient Boosting Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"Gradient Boosting R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Gradient Boosting Accuracy: {gb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': gb_model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))

Gradient Boosting Mean Squared Error: 2100895439.0099707
Gradient Boosting R-squared: 0.4189172713279976
Gradient Boosting Accuracy: 0.4189172713279976
Mean Absolute Error: 35313.8965762087
                         Feature  Importance
1             employee_residence    0.456948
18           experience_level_SE    0.111983
6         job_title_Data Analyst    0.104777
0                      work_year    0.044372
3               company_location    0.043272
4         job_title_AI Scientist    0.032693
19           experience_level_EX    0.031664
17           experience_level_MI    0.024375
15  job_title_Research Scientist    0.023456
2                   remote_ratio    0.022434


In [131]:
# Define the parameter grid
param_grid = {
    'n_estimators': [25, 50, 75],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [4, 5, 6, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xbg = xgb.XGBRegressor()

grid_search = GridSearchCV(estimator=xbg, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(train_df[features], train_df['salary_in_usd'])

print(grid_search.best_params_)

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 75, 'subsample': 0.8}


In [130]:
# xbg model with {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 50}
xbg = xgb.XGBRegressor(learning_rate=0.1, max_depth=6, n_estimators=50)
xbg.fit(train_df[features], train_df['salary_in_usd'])

predictions = xbg.predict(test_df[features])

print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost Accuracy: {xbg.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': xbg.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))


XGBoost Mean Squared Error: 2068845312.0
XGBoost R-squared: 0.42778193950653076
XGBoost Accuracy: 0.42778193950653076
Mean Absolute Error: 34996.12890625
                                Feature  Importance
6                job_title_Data Analyst    0.215807
1                    employee_residence    0.147742
18                  experience_level_SE    0.141537
19                  experience_level_EX    0.067708
8               job_title_Data Engineer    0.042165
17                  experience_level_MI    0.038047
15         job_title_Research Scientist    0.036798
12  job_title_Machine Learning Engineer    0.028912
4                job_title_AI Scientist    0.026230
3                      company_location    0.025925


In [None]:
# {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 75, 'subsample': 0.8}
xbg = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=75, subsample=0.8)
xbg.fit(train_df[features], train_df['salary_in_usd'])

predictions = xbg.predict(test_df[features])

print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"XGBoost Accuracy: {xbg.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': xbg.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))

XGBoost Mean Squared Error: 2024085760.0
XGBoost R-squared: 0.44016188383102417
XGBoost Accuracy: 0.44016188383102417
Mean Absolute Error: 34731.91015625
                                Feature  Importance
18                  experience_level_SE    0.186654
6                job_title_Data Analyst    0.130248
1                    employee_residence    0.089373
19                  experience_level_EX    0.065821
3                      company_location    0.056195
15         job_title_Research Scientist    0.040793
17                  experience_level_MI    0.039507
8               job_title_Data Engineer    0.038368
12  job_title_Machine Learning Engineer    0.038310
4                job_title_AI Scientist    0.028228
