# Testing out models

In [86]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [87]:
df_encoded = pd.read_csv('../data/encoded_data.csv')

train_df, test_df = train_test_split(df_encoded, test_size=0.2, random_state=123)
features = [col for col in train_df.columns if col != 'salary_in_usd']

## Decision Tree Model

In [88]:
model = DecisionTreeRegressor()

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2611286783.7520714
R-squared: 0.2777490866642842
Accuracy: 0.2777490866642842
Mean Absolute Error: 37022.53084494818
                    Feature  Importance
1        employee_residence    0.434328
6    job_title_Data Analyst    0.101558
18      experience_level_SE    0.100752
0                 work_year    0.046325
3          company_location    0.041772
2              remote_ratio    0.037934
17      experience_level_MI    0.029557
19      experience_level_EX    0.028949
25           company_size_L    0.027059
8   job_title_Data Engineer    0.024719


## Linear Regression Model

In [89]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.coef_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2283713481.2406917
R-squared: 0.3683519719526217
Accuracy: 0.3683519719526217
Mean Absolute Error: 37522.98449692578
                                Feature    Importance
19                  experience_level_EX  50712.180068
4                job_title_AI Scientist  42496.804665
15         job_title_Research Scientist  31875.343499
18                  experience_level_SE  30601.538601
17                  experience_level_MI  21944.172989
24                         same_country  20818.796428
22                   employment_type_FT  16252.835242
12  job_title_Machine Learning Engineer  16005.519658
0                             work_year   7106.917780
25                       company_size_L   6679.352218


## Random Forest Model

In [90]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=123)
rf_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
rf_predictions = rf_model.predict(test_df[features])

# Evaluate the model
print(f"Random Forest Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], rf_predictions)}")
print(f"Random Forest R-squared: {r2_score(test_df['salary_in_usd'], rf_predictions)}")
print(f"Random Forest Accuracy: {rf_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Random Forest Mean Squared Error: 2150090450.169163
Random Forest R-squared: 0.4053105154701694
Random Forest Accuracy: 0.4053105154701694
Mean Absolute Error: 37522.98449692578
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## Gradient Boosting Model

In [91]:
# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=123)
gb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
gb_predictions = gb_model.predict(test_df[features])

# Evaluate the model
print(f"Gradient Boosting Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], gb_predictions)}")
print(f"Gradient Boosting R-squared: {r2_score(test_df['salary_in_usd'], gb_predictions)}")
print(f"Gradient Boosting Accuracy: {gb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Gradient Boosting Mean Squared Error: 2018327106.6027784
Gradient Boosting R-squared: 0.4417546915090219
Gradient Boosting Accuracy: 0.4417546915090219
Mean Absolute Error: 37522.98449692578
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## Support Vector Regression Model

In [92]:
# Build a pipeline: scale features then train SVR
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Train the SVR model
svr_pipeline.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
svr_predictions = svr_pipeline.predict(test_df[features])

# Evaluate the model
print(f"SVR Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], svr_predictions)}")
print(f"SVR R-squared: {r2_score(test_df['salary_in_usd'], svr_predictions)}")
print(f"SVR Accuracy: {svr_pipeline.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

SVR Mean Squared Error: 3612827156.126812
SVR R-squared: 0.0007349137319370147
SVR Accuracy: 0.0007349137319370147
Mean Absolute Error: 37522.98449692578
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160


## XGBoost Model

In [93]:


# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
xgb_predictions = xgb_model.predict(test_df[features])

# Evaluate the model
print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], xgb_predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], xgb_predictions)}")
print(f"XGBoost Accuracy: {xgb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

XGBoost Mean Squared Error: 2186288896.0
XGBoost R-squared: 0.39529842138290405
XGBoost Accuracy: 0.39529842138290405
Mean Absolute Error: 37522.98449692578
                         Feature  Importance
1             employee_residence    0.416690
18           experience_level_SE    0.111187
6         job_title_Data Analyst    0.088055
0                      work_year    0.066094
2                   remote_ratio    0.048984
3               company_location    0.038211
19           experience_level_EX    0.028777
15  job_title_Research Scientist    0.024493
25                company_size_L    0.022394
17           experience_level_MI    0.021160
