# Tech Professional Salary Prediction Model
Our purpose is to predict a tech professional salary.

## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## Loading Data
The loaded data has been cleaned in `notebooks/attributes-engineering.ipynb`, stored in `data/training/dataset.csv`.

In [3]:
# Load to DataFrame
df = pd.read_csv("../data/training/dataset.csv")

In [4]:
df.head()

Unnamed: 0,Country,DevType,EdLevel,YearsCode,Employment,ConvertedCompYearly
0,Ukraine,"Developer, mobile",Master’s degree,14.0,Employed,61256.0
1,Netherlands,"Developer, back-end",Less than Bachelor’s,10.0,Employed,104413.0
2,Ukraine,"Developer, front-end",Bachelor’s degree,12.0,"Independent contractor, freelancer, or self-em...",53061.0
3,Ukraine,"Developer, back-end",Bachelor’s degree,5.0,Employed,36197.0
4,Ukraine,Engineering manager,Master’s degree,22.0,"Independent contractor, freelancer, or self-em...",60000.0


## Preprocessing

#### Feature encoding

We'll apply `OrdinalEncoder()` for categorical attributes and `StandardScaler()` for numeric ones, using the `ColumnTransformer()`.

In [5]:
# Defining categorical and numeric columns
target = "ConvertedCompYearly"
categorical_cols = ["Country", "DevType", "EdLevel", "Employment"]
numeric_cols = ["YearsCode"]

# Encode DataFrame with ColumTransformer()
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ],
    remainder="drop"  # or "passthrough"
)

# Build encoded DataFrame
X_encoded = preprocess.fit_transform(df)
feature_names = preprocess.get_feature_names_out()
df_encoded = pd.DataFrame(X_encoded, columns=feature_names)

# Show results
df_encoded.head()

Unnamed: 0,cat__Country,cat__DevType,cat__EdLevel,cat__Employment,num__YearsCode
0,39.0,17.0,3.0,0.0,-0.3484
1,23.0,11.0,2.0,0.0,-0.731922
2,39.0,14.0,0.0,1.0,-0.540161
3,39.0,11.0,0.0,0.0,-1.211324
4,39.0,18.0,3.0,1.0,0.418643


#### Split train/test datasets

Our proportions are going to be 80/20

In [6]:
# Define features (X) and labels (y)
X = df.drop(["ConvertedCompYearly"], axis=1)
y = np.log1p(df["ConvertedCompYearly"])

# Split train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train and statistical eval

We'll test a two different models: RandomForest and CatBoost.

#### Model 1: RandomForest

In [7]:
# Initialize model
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("regressor", RandomForestRegressor(
            random_state=42,
            min_samples_split=40
        ))
    ]
)

# Train model
model.fit(X_train, y_train)

# Predict log salaries
prediction_log = model.predict(X_test)

# Convert back to salary
y_true = np.expm1(y_test)
prediction = np.expm1(prediction_log)

# Evaluate in real salary scale
mae = mean_absolute_error(y_true, prediction)
mse = mean_squared_error(y_true, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, prediction)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 30488.535520218276
MSE: 1820621606.0432098
RMSE: 42668.74272864399
R2: 0.444690451715109


#### Model 2: CatBoost

In [8]:
# Indices of categorical columns (object or category)
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Build CatBoost pools
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Model
model = CatBoostRegressor(
    loss_function="RMSE",
    depth=8,
    learning_rate=0.05,
    iterations=2000,
    random_seed=42,
    verbose=False
)

# Train
model.fit(train_pool, eval_set=test_pool)

# Predict log salaries
prediction_log = model.predict(test_pool)

# Convert back to salary
y_true = np.expm1(y_test)
prediction = np.expm1(prediction_log)

# Evaluate in real salary scale
mae = mean_absolute_error(y_true, prediction)
mse = mean_squared_error(y_true, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, prediction)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 28872.99099530982
MSE: 1646537306.6481717
RMSE: 40577.546828858096
R2: 0.49778807141800074


### Export model

Since `CatBoostRegressor` performed better, we'll select it as our model.

In [10]:
model.save_model("../models/catboost-salary-prediction-1.cbm")