In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [2]:
# Read the dataset
df = pd.read_csv('../data/ds_salaries.csv')

In [3]:
df['job_title'].value_counts()

job_title
Data Engineer                          1040
Data Scientist                          840
Data Analyst                            612
Machine Learning Engineer               289
Analytics Engineer                      103
                                       ... 
Principal Machine Learning Engineer       1
Azure Data Engineer                       1
Manager Data Management                   1
Marketing Data Engineer                   1
Finance Data Analyst                      1
Name: count, Length: 93, dtype: int64

In [4]:
import re

# Define a function to map job titles to broader categories
def map_job_title(job_title):
    # Normalize the job title: lowercase and strip whitespace
    title = job_title.lower().strip()
    
    # Define a mapping dictionary for grouping similar titles
    mapping = {
        # Data Engineer group
        "data engineer": "Data Engineer",
        "etl developer": "Data Engineer",
        "etl engineer": "Data Engineer",
        "big data engineer": "Data Engineer",
        "cloud data engineer": "Data Engineer",
        "azure data engineer": "Data Engineer",
        "data devops engineer": "Data Engineer",
        "cloud database engineer": "Data Engineer",
        "data operations engineer": "Data Engineer",
        "data infrastructure engineer": "Data Engineer",
        "bi data engineer": "Data Engineer",
        
        # Data Scientist group
        "data scientist": "Data Scientist",
        "applied data scientist": "Data Scientist",
        "lead data scientist": "Data Scientist",
        "staff data scientist": "Data Scientist",
        "data scientist lead": "Data Scientist",
        "product data scientist": "Data Scientist",
        "data science manager": "Data Scientist",
        "director of data science": "Data Scientist",
        "data science consultant": "Data Scientist",
        "data science lead": "Data Scientist",
        "data science engineer": "Data Scientist",
        "data science tech lead": "Data Scientist",
        
        # Data Analyst group
        "data analyst": "Data Analyst",
        "lead data analyst": "Data Analyst",
        "staff data analyst": "Data Analyst",
        "business data analyst": "Data Analyst",
        "bi data analyst": "Data Analyst",
        "financial data analyst": "Data Analyst",
        "product data analyst": "Data Analyst",
        "compliance data analyst": "Data Analyst",
        "marketing data analyst": "Data Analyst",
        "data quality analyst": "Data Analyst",
        "bi analyst": "Data Analyst",
        "analytics engineer": "Data Analyst",
        "data analytics manager": "Data Analyst",
        "data analytics engineer": "Data Analyst",
        "data analytics specialist": "Data Analyst",
        "data analytics consultant": "Data Analyst",
        "data analytics lead": "Data Analyst",
        
        # Machine Learning Engineer group
        "machine learning engineer": "Machine Learning Engineer",
        "ml engineer": "Machine Learning Engineer",
        "machine learning scientist": "Machine Learning Engineer",
        "machine learning developer": "Machine Learning Engineer",
        "machine learning software engineer": "Machine Learning Engineer",
        "applied machine learning scientist": "Machine Learning Engineer",
        "applied machine learning engineer": "Machine Learning Engineer",
        "machine learning researcher": "Machine Learning Engineer",
        "machine learning research engineer": "Machine Learning Engineer",
        "lead machine learning engineer": "Machine Learning Engineer",
        "machine learning manager": "Machine Learning Engineer",
        "principal machine learning engineer": "Machine Learning Engineer",
        "machine learning infrastructure engineer": "Machine Learning Engineer",
        "mlops engineer": "Machine Learning Engineer",
        
        # Other groups
        "data architect": "Data Architect",
        "big data architect": "Data Architect",
        "research scientist": "Research Scientist",
        "applied scientist": "Research Scientist",
        "research engineer": "Research Scientist",
        "computer vision engineer": "Computer Vision Engineer",
        "computer vision software engineer": "Computer Vision Engineer",
        "3d computer vision researcher": "Computer Vision Engineer",
        "nlp engineer": "NLP Engineer",
        "ai scientist": "AI Scientist",
        "ai developer": "AI Scientist",
        "ai programmer": "AI Scientist",
        "bi developer": "Data Analyst",
        "data manager": "Data Manager",
        "data specialist": "Data Specialist",
        "data modeler": "Data Engineer",
        "data strategist": "Data Scientist",
        "business intelligence engineer": "Data Engineer",
        "power bi developer": "Data Engineer",
        "data management specialist": "Data Manager",
        "manager data management": "Data Manager"
    }
    
    # Loop through the mapping keys and check if the key is in the title string
    for key in mapping:
        if key in title:
            return mapping[key]
    # If none of the keys match, assign a default group (e.g., "Other")
    return "Other"

# Create a new column with the cleaned and grouped job titles
df['job_title_clean'] = df['job_title'].apply(map_job_title)

# Check the value counts of the new grouped job titles
print(df['job_title_clean'].value_counts())

job_title_clean
Data Engineer                1107
Data Scientist                980
Data Analyst                  829
Machine Learning Engineer     412
Research Scientist            177
Data Architect                105
Other                          37
Data Manager                   31
AI Scientist                   29
Computer Vision Engineer       27
Data Specialist                14
NLP Engineer                    7
Name: count, dtype: int64


In [5]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_title_clean
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L,Data Scientist
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S,Machine Learning Engineer
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S,Machine Learning Engineer
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M,Data Scientist
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M,Data Scientist


In [6]:
# one-hot encode this new column:
df = pd.get_dummies(df, columns=['job_title_clean'], prefix='job_title')

In [7]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,...,job_title_Data Analyst,job_title_Data Architect,job_title_Data Engineer,job_title_Data Manager,job_title_Data Scientist,job_title_Data Specialist,job_title_Machine Learning Engineer,job_title_NLP Engineer,job_title_Other,job_title_Research Scientist
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,...,False,False,False,False,True,False,False,False,False,False
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,...,False,False,False,False,False,False,True,False,False,False
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,...,False,False,False,False,False,False,True,False,False,False
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,...,False,False,False,False,True,False,False,False,False,False
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,...,False,False,False,False,True,False,False,False,False,False


In [8]:
# Drop the original 'job_title' column 
df.drop(columns=['job_title'], inplace=True)

In [9]:
# make experence cumulative
def cumulative_experience_encoding(df):
    level_order = ['EN', 'MI', 'SE', 'EX'] # Define the order of levels
    for level in level_order:
        df[f'experience_level_{level}'] = False # Initialize all to False

    for index, row in df.iterrows():
        level = row['experience_level']
        if level == 'EN':
            df.at[index, 'experience_level_EN'] = True
        elif level == 'MI':
            df.at[index, 'experience_level_EN'] = True
            df.at[index, 'experience_level_MI'] = True
        elif level == 'SE':
            df.at[index, 'experience_level_EN'] = True
            df.at[index, 'experience_level_MI'] = True
            df.at[index, 'experience_level_SE'] = True
        elif level == 'EX':
            for l in level_order: # For EX, all levels are true
                df.at[index, f'experience_level_{l}'] = True
    return df

df = cumulative_experience_encoding(df)
df.drop(columns=['experience_level'], inplace=True)
df.head()

Unnamed: 0,work_year,employment_type,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_title_AI Scientist,...,job_title_Data Scientist,job_title_Data Specialist,job_title_Machine Learning Engineer,job_title_NLP Engineer,job_title_Other,job_title_Research Scientist,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX
0,2023,FT,80000,EUR,85847,ES,100,ES,L,False,...,True,False,False,False,False,False,True,True,True,False
1,2023,CT,30000,USD,30000,US,100,US,S,False,...,False,False,True,False,False,False,True,True,False,False
2,2023,CT,25500,USD,25500,US,100,US,S,False,...,False,False,True,False,False,False,True,True,False,False
3,2023,FT,175000,USD,175000,CA,100,CA,M,False,...,True,False,False,False,False,False,True,True,True,False
4,2023,FT,120000,USD,120000,CA,100,CA,M,False,...,True,False,False,False,False,False,True,True,True,False


In [10]:
# one hot encode emploment type
df = pd.get_dummies(df, columns=['employment_type'])
df.head()

Unnamed: 0,work_year,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_title_AI Scientist,job_title_Computer Vision Engineer,...,job_title_Other,job_title_Research Scientist,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT
0,2023,80000,EUR,85847,ES,100,ES,L,False,False,...,False,False,True,True,True,False,False,False,True,False
1,2023,30000,USD,30000,US,100,US,S,False,False,...,False,False,True,True,False,False,True,False,False,False
2,2023,25500,USD,25500,US,100,US,S,False,False,...,False,False,True,True,False,False,True,False,False,False
3,2023,175000,USD,175000,CA,100,CA,M,False,False,...,False,False,True,True,True,False,False,False,True,False
4,2023,120000,USD,120000,CA,100,CA,M,False,False,...,False,False,True,True,True,False,False,False,True,False


In [11]:
# new column for if employee_location is in the same country as company_location
df['same_country'] = df['employee_residence'] == df['company_location']
df.head()

Unnamed: 0,work_year,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_title_AI Scientist,job_title_Computer Vision Engineer,...,job_title_Research Scientist,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,same_country
0,2023,80000,EUR,85847,ES,100,ES,L,False,False,...,False,True,True,True,False,False,False,True,False,True
1,2023,30000,USD,30000,US,100,US,S,False,False,...,False,True,True,False,False,True,False,False,False,True
2,2023,25500,USD,25500,US,100,US,S,False,False,...,False,True,True,False,False,True,False,False,False,True
3,2023,175000,USD,175000,CA,100,CA,M,False,False,...,False,True,True,True,False,False,False,True,False,True
4,2023,120000,USD,120000,CA,100,CA,M,False,False,...,False,True,True,True,False,False,False,True,False,True


In [12]:
#one hot encode company size
df = pd.get_dummies(df, columns=['company_size'])
df.head()

Unnamed: 0,work_year,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,job_title_AI Scientist,job_title_Computer Vision Engineer,job_title_Data Analyst,...,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,same_country,company_size_L,company_size_M,company_size_S
0,2023,80000,EUR,85847,ES,100,ES,False,False,False,...,True,False,False,False,True,False,True,True,False,False
1,2023,30000,USD,30000,US,100,US,False,False,False,...,False,False,True,False,False,False,True,False,False,True
2,2023,25500,USD,25500,US,100,US,False,False,False,...,False,False,True,False,False,False,True,False,False,True
3,2023,175000,USD,175000,CA,100,CA,False,False,False,...,True,False,False,False,True,False,True,False,True,False
4,2023,120000,USD,120000,CA,100,CA,False,False,False,...,True,False,False,False,True,False,True,False,True,False


In [13]:
# make encoded columns
df_encoded = df.drop(columns=['salary', 'salary_currency'])
#df_encoded = pd.get_dummies(df_encoded, columns=['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size'])

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_encoded['employee_residence'] = le.fit_transform(df_encoded['employee_residence'])
df_encoded['company_location'] = le.fit_transform(df_encoded['company_location'])

df_encoded.head()

Unnamed: 0,work_year,salary_in_usd,employee_residence,remote_ratio,company_location,job_title_AI Scientist,job_title_Computer Vision Engineer,job_title_Data Analyst,job_title_Data Architect,job_title_Data Engineer,...,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,same_country,company_size_L,company_size_M,company_size_S
0,2023,85847,26,100,25,False,False,False,False,False,...,True,False,False,False,True,False,True,True,False,False
1,2023,30000,75,100,70,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True
2,2023,25500,75,100,70,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True
3,2023,175000,11,100,12,False,False,False,False,False,...,True,False,False,False,True,False,True,False,True,False
4,2023,120000,11,100,12,False,False,False,False,False,...,True,False,False,False,True,False,True,False,True,False


In [14]:
# Simple decision tree model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Split the data into training and testing sets
train_df, test_df = train_test_split(df_encoded, test_size=0.2, random_state=42)

# Initialize the model
model = DecisionTreeRegressor()

# Get all feature columns except salary_in_usd (our target variable)
features = [col for col in train_df.columns if col != 'salary_in_usd']

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2720937146.60234
R-squared: 0.3107695039448297
Accuracy: 0.3107695039448297
Mean Absolute Error: 38534.03282168316
                         Feature  Importance
1             employee_residence    0.455060
6         job_title_Data Analyst    0.101127
18           experience_level_SE    0.098026
0                      work_year    0.056979
25                company_size_L    0.041208
15  job_title_Research Scientist    0.029123
2                   remote_ratio    0.028974
19           experience_level_EX    0.027357
17           experience_level_MI    0.026956
3               company_location    0.025025


In [15]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
predictions = model.predict(test_df[features])

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], predictions)}")
print(f"R-squared: {r2_score(test_df['salary_in_usd'], predictions)}")
print(f"Accuracy: {model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.coef_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  # Show top 10 most important features

Mean Squared Error: 2575304311.335931
R-squared: 0.3476592172621428
Accuracy: 0.3476592172621428
Mean Absolute Error: 39439.31863525878
                                Feature    Importance
19                  experience_level_EX  50261.853061
4                job_title_AI Scientist  41840.497850
18                  experience_level_SE  32053.028156
15         job_title_Research Scientist  31865.716263
24                         same_country  25547.226957
17                  experience_level_MI  21020.746992
12  job_title_Machine Learning Engineer  16140.265219
20                   employment_type_CT  13803.064567
22                   employment_type_FT  10556.122527
26                       company_size_M   7065.773255


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
rf_predictions = rf_model.predict(test_df[features])

# Evaluate the model
print(f"Random Forest Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], rf_predictions)}")
print(f"Random Forest R-squared: {r2_score(test_df['salary_in_usd'], rf_predictions)}")
print(f"Random Forest Accuracy: {rf_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Random Forest Mean Squared Error: 2426354949.961866
Random Forest R-squared: 0.38538902750606585
Random Forest Accuracy: 0.38538902750606585
Mean Absolute Error: 39439.31863525878
                         Feature  Importance
1             employee_residence    0.434119
18           experience_level_SE    0.103927
6         job_title_Data Analyst    0.089020
0                      work_year    0.067163
2                   remote_ratio    0.042191
3               company_location    0.035658
19           experience_level_EX    0.027787
25                company_size_L    0.026080
15  job_title_Research Scientist    0.024314
17           experience_level_MI    0.023212


In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
gb_predictions = gb_model.predict(test_df[features])

# Evaluate the model
print(f"Gradient Boosting Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], gb_predictions)}")
print(f"Gradient Boosting R-squared: {r2_score(test_df['salary_in_usd'], gb_predictions)}")
print(f"Gradient Boosting Accuracy: {gb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

Gradient Boosting Mean Squared Error: 2384828405.7983727
Gradient Boosting R-squared: 0.39590796237709036
Gradient Boosting Accuracy: 0.39590796237709036
Mean Absolute Error: 39439.31863525878
                         Feature  Importance
1             employee_residence    0.434119
18           experience_level_SE    0.103927
6         job_title_Data Analyst    0.089020
0                      work_year    0.067163
2                   remote_ratio    0.042191
3               company_location    0.035658
19           experience_level_EX    0.027787
25                company_size_L    0.026080
15  job_title_Research Scientist    0.024314
17           experience_level_MI    0.023212


In [18]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Build a pipeline: scale features then train SVR
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Train the SVR model
svr_pipeline.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
svr_predictions = svr_pipeline.predict(test_df[features])

# Evaluate the model
print(f"SVR Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], svr_predictions)}")
print(f"SVR R-squared: {r2_score(test_df['salary_in_usd'], svr_predictions)}")
print(f"SVR Accuracy: {svr_pipeline.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

SVR Mean Squared Error: 3943070791.575289
SVR R-squared: 0.0011953634975774285
SVR Accuracy: 0.0011953634975774285
Mean Absolute Error: 39439.31863525878
                         Feature  Importance
1             employee_residence    0.434119
18           experience_level_SE    0.103927
6         job_title_Data Analyst    0.089020
0                      work_year    0.067163
2                   remote_ratio    0.042191
3               company_location    0.035658
19           experience_level_EX    0.027787
25                company_size_L    0.026080
15  job_title_Research Scientist    0.024314
17           experience_level_MI    0.023212


In [19]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(train_df[features], train_df['salary_in_usd'])

# Make predictions on the test set
xgb_predictions = xgb_model.predict(test_df[features])

# Evaluate the model
print(f"XGBoost Mean Squared Error: {mean_squared_error(test_df['salary_in_usd'], xgb_predictions)}")
print(f"XGBoost R-squared: {r2_score(test_df['salary_in_usd'], xgb_predictions)}")
print(f"XGBoost Accuracy: {xgb_model.score(test_df[features], test_df['salary_in_usd'])}")
print(f"Mean Absolute Error: {mean_absolute_error(test_df['salary_in_usd'], predictions)}")

# Feature importance
rf_feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
rf_feature_importance = rf_feature_importance.sort_values(by='Importance', ascending=False)
print(rf_feature_importance.head(10))

XGBoost Mean Squared Error: 2448243200.0
XGBoost R-squared: 0.379844605922699
XGBoost Accuracy: 0.379844605922699
Mean Absolute Error: 39439.31863525878
                         Feature  Importance
1             employee_residence    0.434119
18           experience_level_SE    0.103927
6         job_title_Data Analyst    0.089020
0                      work_year    0.067163
2                   remote_ratio    0.042191
3               company_location    0.035658
19           experience_level_EX    0.027787
25                company_size_L    0.026080
15  job_title_Research Scientist    0.024314
17           experience_level_MI    0.023212
