In [77]:
import pandas as pd

In [78]:
# Load the cleaned salary table.
salaries_data = pd.read_csv("data/ds_salaries_cleaned.csv")
# Shuffle all rows (frac=1). The fixed seed makes the shuffled order, and
# therefore every downstream result, reproducible after a kernel restart.
salaries_data = salaries_data.sample(frac=1, random_state=42)
salaries_data.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,remote_ratio,residence_gdp_per_capita,company_gdp_per_capita,company_size,salary_in_usd
543,2022,SE,FT,ETL Developer,100,77979.858,77979.858,M,149411.666667
326,2022,EN,FT,Machine Learning Research Engineer,50,49725.343,49725.343,L,66192.0
481,2022,SE,FT,Applied Machine Learning Scientist,100,42581.012,46103.227,L,77119.0
451,2022,MI,FT,Machine Learning Engineer,100,42581.012,49725.343,M,84053.0
295,2022,EN,FT,Data Science Consultant,50,35653.833,35653.833,M,24165.0


In [79]:
def normalize(df):
    """Min-max scale ``df`` so its values span the range [0, 1].

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data. For a DataFrame, ``min``/``max`` are taken per column,
        so each column is scaled independently.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``. Constant data (``max == min``) is mapped
        to 0.0 instead of the NaN a 0/0 division would produce.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; patch only the constant ones.
        span = span.mask(span == 0, 1)
    elif not pd.isna(span) and span == 0:
        # Series input with a single distinct value.
        span = 1
    return (df - lowest) / span

In [80]:
# Ordinal code per survey year, oldest first ("2020" -> 1 ... "2023" -> 4).
years_mapping = {
    year: code
    for code, year in enumerate(("2020", "2021", "2022", "2023"), start=1)
}
# Raw values are stringified before the lookup; an unlisted year raises KeyError.
years_encoded = salaries_data["work_year"].map(lambda year: years_mapping[str(year)])
salaries_data["work_year"] = years_encoded

In [81]:
# Ordinal code per experience-level label, in the order EN < MI < SE < EX.
experience_levels_mapping = {
    level: code
    for code, level in enumerate(("EN", "MI", "SE", "EX"), start=1)
}
# Labels are stringified before the lookup; an unlisted label raises KeyError.
experience_levels_encoded = salaries_data["experience_level"].map(
    lambda level: experience_levels_mapping[str(level)]
)
salaries_data["experience_level"] = experience_levels_encoded

In [82]:
# Integer code per employment-type label, assigned in the order PT, FL, CT, FT.
employment_type_mapping = {
    contract: code
    for code, contract in enumerate(("PT", "FL", "CT", "FT"), start=1)
}
# Labels are stringified before the lookup; an unlisted label raises KeyError.
employment_type_encoded = salaries_data["employment_type"].map(
    lambda contract: employment_type_mapping[str(contract)]
)
salaries_data["employment_type"] = employment_type_encoded

In [83]:
# Ordinal code per remote-ratio value ("0" -> 1, "50" -> 2, "100" -> 3).
remote_ratio_mapping = {
    ratio: code
    for code, ratio in enumerate(("0", "50", "100"), start=1)
}
# Values are stringified before the lookup; an unlisted ratio raises KeyError.
remote_ratio_encoded = salaries_data["remote_ratio"].map(
    lambda ratio: remote_ratio_mapping[str(ratio)]
)
salaries_data["remote_ratio"] = remote_ratio_encoded

In [84]:
# Ordinal code per company-size label, in the order S < M < L.
company_size_mapping = {
    size: code
    for code, size in enumerate(("S", "M", "L"), start=1)
}
# Labels are stringified before the lookup; an unlisted label raises KeyError.
company_size_encoded = salaries_data["company_size"].map(
    lambda size: company_size_mapping[str(size)]
)
salaries_data["company_size"] = company_size_encoded

In [85]:
# Rescale each listed feature column with normalize(), one column at a time.
# The target column "salary_in_usd" is not in the list and stays untouched.
columns_to_scale = (
    "residence_gdp_per_capita",
    "company_gdp_per_capita",
    "experience_level",
    "employment_type",
    "work_year",
    "remote_ratio",
    "company_size",
)
for column_name in columns_to_scale:
    salaries_data[column_name] = normalize(salaries_data[column_name])

In [86]:
# Vocabulary of distinct job-title words, kept in first-seen order.
# A dict serves as an ordered set, so the "already seen?" check is O(1)
# instead of scanning a growing list for every word.
seen_words = {}
for job_title in salaries_data["job_title"]:
    for word in job_title.lower().split():
        # Two strip/replace passes: removing a possessive "'s" can expose more
        # punctuation at the edge of the word (e.g. "(a)'s" -> "a)" -> "a").
        for _cleanup_pass in range(2):
            word = word.strip('.,!;()[]').replace("'s", '')
        # A token made only of punctuation cleans down to "", which would
        # otherwise become a meaningless vocabulary entry.
        if word:
            seen_words.setdefault(word, None)

bag_of_words = list(seen_words)

In [87]:
def _clean_title_word(word):
    """Strip edge punctuation and possessive "'s" from one job-title word.

    Applies the same two strip/replace passes that are used when the
    vocabulary is built, so title tokens and vocabulary entries are comparable.
    """
    for _cleanup_pass in range(2):
        word = word.strip('.,!;()[]').replace("'s", '')
    return word


# Cleaned token set per row; sets make each "word in title" test O(1).
title_tokens = [
    {_clean_title_word(word) for word in job_title.lower().split()}
    for job_title in salaries_data["job_title"]
]

# Build every "contains_<word>" indicator in one frame (bool dtype) rather than
# writing single cells through .loc inside a nested loop.
indicator_columns = pd.DataFrame(
    {
        f"contains_{word}": [word in tokens for tokens in title_tokens]
        for word in bag_of_words
    },
    index=salaries_data.index,
)

# Drop any indicators left from a previous run of this cell so re-running
# overwrites them instead of duplicating columns.
salaries_data = pd.concat(
    [
        salaries_data.drop(columns=indicator_columns.columns, errors="ignore"),
        indicator_columns,
    ],
    axis=1,
)

In [88]:
# Display the frame to inspect the encoded features and the contains_<word> columns.
salaries_data

Unnamed: 0,work_year,experience_level,employment_type,job_title,remote_ratio,residence_gdp_per_capita,company_gdp_per_capita,company_size,salary_in_usd,contains_etl,...,contains_financial,contains_devops,contains_insight,contains_azure,contains_quality,contains_autonomous,contains_vehicle,contains_technician,contains_power,contains_compliance
543,0.666667,0.666667,1.0,ETL Developer,1.0,0.576347,0.576347,0.5,149411.666667,True,...,False,False,False,False,False,False,False,False,False,False
326,0.666667,0.000000,1.0,Machine Learning Research Engineer,0.5,0.366142,0.366142,1.0,66192.000000,False,...,False,False,False,False,False,False,False,False,False,False
481,0.666667,0.666667,1.0,Applied Machine Learning Scientist,1.0,0.312991,0.339195,1.0,77119.000000,False,...,False,False,False,False,False,False,False,False,False,False
451,0.666667,0.333333,1.0,Machine Learning Engineer,1.0,0.312991,0.366142,0.5,84053.000000,False,...,False,False,False,False,False,False,False,False,False,False
295,0.666667,0.000000,1.0,Data Science Consultant,0.5,0.261455,0.261455,0.5,24165.000000,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,0.666667,1.000000,1.0,Data Manager,0.5,0.409944,0.409944,1.0,125976.000000,False,...,False,False,False,False,False,False,False,False,False,False
51,0.000000,0.333333,1.0,Product Data Analyst,1.0,0.010453,0.010453,1.0,6072.000000,False,...,False,False,False,False,False,False,False,False,False,False
362,0.666667,0.333333,1.0,Applied Data Scientist,1.0,0.576347,0.576347,1.0,157000.000000,False,...,False,False,False,False,False,False,False,False,False,False
708,1.000000,0.333333,1.0,Research Scientist,0.0,0.611575,0.611575,0.5,116250.000000,False,...,False,False,False,False,False,False,False,False,False,False


In [95]:
# Remove the raw job-title text column from the frame.
salaries_data = salaries_data.drop(columns=["job_title"])

In [108]:
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

def k_fold_cross_validation(model_builder, df, epochs, batch_size, k=5,
                            target="salary_in_usd", random_state=None):
    """Estimate the validation RMSE of a random forest with k-fold CV.

    Parameters
    ----------
    model_builder, epochs, batch_size
        Unused by this implementation; kept so existing positional calls
        keep working.
    df : pandas.DataFrame
        Feature columns plus the ``target`` column. Every column other than
        ``target`` is passed to the model as a feature.
    k : int, default 5
        Number of folds.
    target : str, default "salary_in_usd"
        Name of the column to predict.
    random_state : int or None, default None
        Seed forwarded to both the fold shuffling and the forest. ``None``
        leaves the run unseeded, so results vary between runs.

    Returns
    -------
    float
        Mean of the per-fold RMSE values (also printed).
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    validation_errors = []
    for fold, (train_index, val_index) in enumerate(kf.split(df), start=1):
        print(f"Fold {fold}/{k}")

        train_df = df.iloc[train_index]
        val_df = df.iloc[val_index]

        # A fresh model per fold so nothing learned on one split leaks into the next.
        model = RandomForestRegressor(
            n_estimators=50, max_depth=100, random_state=random_state
        )
        model.fit(train_df.drop(columns=target), train_df[target])

        val_pred = model.predict(val_df.drop(columns=target))
        rmse = float(np.sqrt(mean_squared_error(val_df[target], val_pred)))
        print(f"RMSE: {rmse:.4f}")
        validation_errors.append(rmse)

    average_error = float(np.mean(validation_errors))
    print(f"Validation error:{average_error:.4f}")
    return average_error


In [109]:
# model_builder, epochs and batch_size are not used by k_fold_cross_validation,
# so pass explicit None placeholders instead of IPython's `_` (the last cell
# output), which depends on whatever cell happened to run before this one.
k_fold_cross_validation(None, salaries_data, None, None)

Fold 1/5
RMSE: 44324.6169
Fold 2/5
RMSE: 39165.5759
Fold 3/5
RMSE: 41631.8068
Fold 4/5
RMSE: 35254.9775
Fold 5/5
RMSE: 31923.3636
Validation error:38460.0681
