# Training
Our purpose is to predict a tech professional salary.

---

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

---

## Loading Data
The loaded data has been cleaned in `notebooks/attributes-engineering.ipynb`, stored in `data/training/dataset.csv`.

In [4]:
# Load to DataFrame
df = pd.read_csv("../data/training/dataset.csv")

In [5]:
df.head()

Unnamed: 0,Age,MainBranch,RemoteWork,Industry,OrgSize,AIAgents,Country,DevType,EdLevel,YearsCode,...,WebframeHaveWorkedWith__Nuxt.js,WebframeHaveWorkedWith__Phoenix,WebframeHaveWorkedWith__React,WebframeHaveWorkedWith__Ruby on Rails,WebframeHaveWorkedWith__Spring Boot,WebframeHaveWorkedWith__Svelte,WebframeHaveWorkedWith__Symfony,WebframeHaveWorkedWith__Vue.js,WebframeHaveWorkedWith__WordPress,WebframeHaveWorkedWith__jQuery
0,25-34 years old,I am a developer by profession,Remote,Fintech,20 to 99 employees,"Yes, I use AI agents at work monthly or infreq...",Ukraine,"Developer, mobile",Master’s degree,14.0,...,False,False,False,False,False,False,False,False,False,False
1,25-34 years old,I am a developer by profession,"Hybrid (some in-person, leans heavy to flexibi...",Retail and Consumer Services,500 to 999 employees,"No, and I don't plan to",Netherlands,"Developer, back-end",Less than Bachelor’s,10.0,...,False,False,False,False,True,False,False,False,False,False
2,35-44 years old,I am a developer by profession,Remote,Retail and Consumer Services,"10,000 or more employees","Yes, I use AI agents at work monthly or infreq...",Ukraine,"Developer, back-end",Bachelor’s degree,5.0,...,False,False,False,False,True,False,False,False,False,False
3,35-44 years old,I am a developer by profession,Remote,Software Development,Less than 20 employees,"No, but I plan to",Ukraine,"Architect, software or solutions",Doctor’s degree,30.0,...,False,False,False,False,False,False,False,False,False,False
4,25-34 years old,I am a developer by profession,Remote,Software Development,20 to 99 employees,"No, but I plan to",Ukraine,"Developer, mobile",Master’s degree,10.0,...,False,False,False,False,False,False,False,False,False,False


---

## Preprocessing

#### Split train/test datasets

Our proportions are going to be 80/20

In [7]:
# Define features (X) and labels (y)
X = df.drop(["ConvertedCompYearly"], axis=1)
y = np.log1p(df["ConvertedCompYearly"])

# Split train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---

## Train and statistical eval

In [11]:
# Indices of categorical columns (object or category)
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Build CatBoost pools
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Model
model = CatBoostRegressor(
    loss_function="RMSE",
    depth=8,
    learning_rate=0.5,
    iterations=2000,
    random_seed=42,
    verbose=False
)

# Train
model.fit(train_pool, eval_set=test_pool)

# Predict log salaries
prediction_log = model.predict(test_pool)

# Convert back to salary
y_true = np.expm1(y_test)
prediction = np.expm1(prediction_log)

# Evaluate in real salary scale
mae = mean_absolute_error(y_true, prediction)
mse = mean_squared_error(y_true, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, prediction)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 26265.907698969088
MSE: 1421569315.442772
RMSE: 37703.70426685914
R2: 0.5758109840507829


---

## Export model

Since `CatBoostRegressor` performed better, we'll select it as our model.

In [10]:
model.save_model("../models/catboost-salary-prediction-1.cbm")