In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures

In [None]:
# Read the assignment CSV (path relative to the notebook's working directory)
# into the DataFrame that the following cells explore and clean.
dataset_path = "datasets/FOAI-assignment2-1.csv"
raw_data = pd.read_csv(dataset_path)

In [None]:
# Textual overview of the raw dataset: schema, first rows and summary statistics.
print("\n____________ Dataset info ____________")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
raw_data.info()
print("\n____________ Some first data examples ____________")
print(raw_data.head())
print("\n____________ Statistics of numeric features ____________")
# Only float64 columns are summarised as numeric features here.
num_data = raw_data.select_dtypes(include=["float64"])
print(num_data.describe())
print("\n____________ Statistics of categorical features ____________")
# int64 and object columns are cast to "category" so describe() reports
# count / unique / top / freq instead of numeric moments.
cat_data = raw_data.select_dtypes(include=["int64", "object"]).astype("category")
print(cat_data.describe())

In [None]:
# Salary (USD) against experience level; the low alpha makes overlapping
# points visible as darker regions. The figure is saved before it is shown.
raw_data.plot.scatter(x="experience_level", y="salary_in_usd", alpha=0.2)
plt.savefig("figures/scatter_1_feat.png", dpi=300, format="png")
plt.show()

In [None]:
# Salary (USD) against employment type, drawn with the same transparency as
# the experience-level plot and written to its own PNG.
raw_data.plot.scatter(x="employment_type", y="salary_in_usd", alpha=0.2)
plt.savefig("figures/scatter_2_feat.png", dpi=300, format="png")
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter-plot grid over the whole raw frame, saved to disk and shown.
matrix_figsize = (12, 8)
scatter_matrix(raw_data, figsize=matrix_figsize)
plt.savefig("figures/scatter_mat_all_feat.png", dpi=300, format="png")
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Scatter-matrix restricted to a hand-picked feature list (currently only the
# target column); extend `features_to_plot` to compare more columns.
features_to_plot = ["salary_in_usd"]
selected_features = raw_data[features_to_plot]
scatter_matrix(selected_features, figsize=(12, 8))
plt.show()

In [None]:
# Histograms of the raw frame's columns, saved to disk and shown.
# Tick-label sizes are read from rcParams when the axes are created, so they
# have to be set BEFORE the plot call to affect this figure.
plt.rcParams["xtick.labelsize"] = 10
plt.rcParams["ytick.labelsize"] = 10
raw_data.hist(figsize=(10, 5))
plt.tight_layout()
plt.savefig("figures/hist_raw_data.png", format="png", dpi=300)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
correlations = raw_data.corr(numeric_only=True)

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", center=0, square=True)
plt.title("Correlation Matrix of Numerical Variables")
plt.tight_layout()
plt.show()

In [None]:
# Mean salary (USD) per company location, highest first, as a bar chart.
avg_salaries_by_location = (
    raw_data.groupby("company_location")["salary_in_usd"]
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
plt.bar(x=avg_salaries_by_location.index, height=avg_salaries_by_location.values)
plt.title("Average Salary by Company Location")
plt.xlabel("Company Location")
plt.ylabel("Average Salary in USD")
plt.xticks(rotation=45)  # tilt the location labels so they are less likely to overlap
plt.tight_layout()
plt.show()

In [None]:
# Mean salary (USD) per company size, highest first, as a bar chart.
avg_salaries_by_size = (
    raw_data.groupby("company_size")["salary_in_usd"]
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
plt.bar(x=avg_salaries_by_size.index, height=avg_salaries_by_size.values)
plt.title("Average Salary by Company Size")
plt.xlabel("Company Size")
plt.ylabel("Average Salary in USD")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Cleaning: drop the columns not used for modelling, then rows with missing
# values, then exact duplicate rows.
# NOTE(review): this cell rebinds `raw_data`; running it a second time fails
# because the dropped columns no longer exist.
unused_columns = ["salary", "salary_currency", "job_title", "employee_residence"]
raw_data = raw_data.drop(columns=unused_columns).dropna().drop_duplicates()

# Outlier filter: keep only rows whose target lies within 3 standard
# deviations of the mean salary (absolute z-score below 3).
z_scores = np.abs(zscore(raw_data["salary_in_usd"]))
within_three_sigma = z_scores < 3
raw_data = raw_data[within_three_sigma]

raw_data.info()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the features, then hold out 20% of the
# rows as a test set. The fixed random_state keeps the split reproducible.
target_column = "salary_in_usd"
y = raw_data[target_column]
X = raw_data.drop(columns=target_column)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\n____________ Split training and test set ____________")
print(X_train.shape[0], "training +", X_test.shape[0], "test examples")

In [None]:
class RareCategoryGrouper(BaseEstimator, TransformerMixin):
    """Replace infrequent category values with the single label "Other".

    During ``fit`` the values of ``X`` are counted (pooled over all columns if
    more than one is passed); every value seen at least ``threshold`` times is
    remembered as frequent. ``transform`` keeps frequent values and maps
    everything else, including values never seen in ``fit``, to "Other".

    Parameters
    ----------
    threshold : int, default=10
        Minimum number of occurrences for a value to be kept as-is.

    Attributes
    ----------
    frequent_categories_ : ndarray or None
        Values that met the threshold during ``fit``; ``None`` before fitting.
    """

    def __init__(self, threshold=10):
        self.threshold = threshold
        self.frequent_categories_ = None

    def fit(self, X, y=None):
        """Learn which values occur at least ``threshold`` times.

        ``y`` is accepted only for scikit-learn API compatibility and is
        ignored; it defaults to ``None`` so that ``fit(X)`` and
        ``fit_transform(X)`` work without a target.

        Returns
        -------
        self
        """
        values, counts = np.unique(np.asarray(X), return_counts=True)
        self.frequent_categories_ = values[counts >= self.threshold]
        return self

    def transform(self, X):
        """Return ``X`` as a 2-D array with rare values replaced by "Other".

        1-D input is returned as a single column of shape ``(n, 1)``. 2-D input
        keeps its shape, so the number of rows never changes.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.frequent_categories_ is None:
            # Without this guard an unfitted grouper would silently turn every
            # value into "Other".
            raise NotFittedError(
                "This RareCategoryGrouper instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return np.where(np.isin(X, self.frequent_categories_), X, "Other")

In [None]:
# Names of the columns treated as categorical features.
cat_feat_names = [
    "work_year",
    "remote_ratio",
    "experience_level",
    "employment_type",
    "company_location",
    "company_size",
]

# Keyword arguments shared by the ordinal encoders that map unseen categories
# to -1 instead of raising.
unknown_as_minus_one = dict(handle_unknown="use_encoded_value", unknown_value=-1)

# Ordered features: the category lists fix the integer order of the encoding.
work_year_encoder = OrdinalEncoder(categories=[[2020, 2021, 2022, 2023]])
remote_ratio_encoder = OrdinalEncoder(categories=[[0, 50, 100]])
experience_level_encoder = OrdinalEncoder(
    categories=[["EN", "MI", "SE", "EX"]], **unknown_as_minus_one
)
company_size_encoder = OrdinalEncoder(categories=[["S", "M", "L"]], **unknown_as_minus_one)

# One-hot encoded features. Unknown values produce an all-zero row.
employment_type_encoder = OneHotEncoder(
    categories=[["FT", "PT", "CT", "FL"]],
    handle_unknown="ignore",
    sparse_output=False,
)
# Locations seen fewer than 5 times are merged into "Other" before one-hot
# encoding.
company_location_encoder = Pipeline(
    [
        ("rare_grouper", RareCategoryGrouper(threshold=5)),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Each entry is (name, transformer, columns); columns not listed are dropped
# by ColumnTransformer's default `remainder`.
cat_pipeline = ColumnTransformer(
    [
        ("ord_work_year", work_year_encoder, ["work_year"]),
        ("ord_remote_ratio", remote_ratio_encoder, ["remote_ratio"]),
        ("ord_experience_level", experience_level_encoder, ["experience_level"]),
        ("onehot_employment_type", employment_type_encoder, ["employment_type"]),
        ("rare_onehot_company_location", company_location_encoder, ["company_location"]),
        ("ord_company_size", company_size_encoder, ["company_size"]),
    ]
)

In [None]:
# Preprocessing pipeline. Only the categorical branch exists today; the
# FeatureUnion wrapper leaves room to add further branches later.
full_pipeline = FeatureUnion(
    [
        ("cat_pipeline", cat_pipeline),
    ]
)

# Fit the encoders on the training split only and transform it in one step.
processed_X_train = full_pipeline.fit_transform(X_train, y_train)
print("\n____________ Processed feature values ____________")
print(processed_X_train)
print(processed_X_train.shape)

# Persist the fitted pipeline so the test cells can reload it. The directory
# is created first so a fresh checkout does not fail on a missing folder.
os.makedirs("models", exist_ok=True)
joblib.dump(full_pipeline, r"models/full_pipeline.pkl")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def calculate_metrics(model, train_data, labels):
    """Score a fitted model on a feature matrix and its true labels.

    Despite the parameter name, `train_data` can be any feature set the model
    can predict on; the notebook passes both training and test features.

    Returns a 4-tuple ``(r2score, mae, mse, rmse)``:
    - r2score: whatever ``model.score`` returns (reported as R2 by the callers),
    - mae / mse: mean absolute / mean squared error of the predictions,
    - rmse: square root of mse.
    """
    r2score = model.score(train_data, labels)
    predicted = model.predict(train_data)

    squared_error = mean_squared_error(labels, predicted)
    absolute_error = mean_absolute_error(labels, predicted)
    return r2score, absolute_error, squared_error, np.sqrt(squared_error)

def store_model(model, model_name=""):
    """Serialize `model` to ``models/<model_name>_model.pkl`` with joblib.

    Parameters
    ----------
    model : object
        The fitted estimator to persist.
    model_name : str, default=""
        File-name stem. When empty (or None) the model's class name is used.
    """
    if not model_name:
        model_name = type(model).__name__
    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/" + model_name + "_model.pkl")


def load_model(model_name):
    """Load and return the object stored at ``models/<model_name>_model.pkl``.

    The file is read with ``joblib.load``, so only load files you trust.
    """
    return joblib.load("models/" + model_name + "_model.pkl")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Seed the estimator as well as the search: without it each forest draws its
# own random bootstrap/feature samples and results differ between runs.
model = RandomForestRegressor(random_state=42)
# 20 sampled configurations, 5-fold CV, scored by negative MSE on all cores.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(processed_X_train, y_train)

print("Best parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

In [None]:
# Training-set metrics for the model currently bound to `best_model` (the
# random-forest search result when cells run in order), then persist it.
print("\n____________ RandomForestRegressor ____________")
r2score, mae, mse, rmse = calculate_metrics(best_model, processed_X_train, y_train)
for caption, metric in (
    ("\nR2 score (on training data, best=1):", r2score),
    ("Mean Absolute Error: ", mae),
    ("Mean Squared Error: ", mse),
    ("Root Mean Square Error: ", rmse),
):
    print(caption, metric)
store_model(best_model)  # saved under the estimator's class name

# First nine training rows: raw inputs, predictions and true labels.
preview = slice(0, 9)
print("Input data: \n", X_train.iloc[preview])
print("\nPredictions: ", best_model.predict(processed_X_train[preview]).round(decimals=1))
print("Labels:      ", list(y_train.iloc[preview]))

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2.0]
}

# Seed the estimator too: row/column subsampling is random, so without a
# seed the tuned model is not reproducible between runs.
model = XGBRegressor(random_state=42)
# 20 sampled configurations, 5-fold CV, scored by negative MSE on all cores.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(processed_X_train, y_train)

print("Best parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

In [None]:
# Training-set metrics for the model currently bound to `best_model` (the
# XGBoost search result when cells run in order), then persist it.
print("\n____________ XGBRegressor ____________")
r2score, mae, mse, rmse = calculate_metrics(best_model, processed_X_train, y_train)
train_report = {
    "\nR2 score (on training data, best=1):": r2score,
    "Mean Absolute Error: ": mae,
    "Mean Squared Error: ": mse,
    "Root Mean Square Error: ": rmse,
}
for caption, metric in train_report.items():
    print(caption, metric)
store_model(best_model)  # saved under the estimator's class name

# First nine training rows: raw inputs, predictions and true labels.
sample_predictions = best_model.predict(processed_X_train[:9]).round(decimals=1)
print("Input data: \n", X_train.iloc[:9])
print("\nPredictions: ", sample_predictions)
print("Labels:      ", list(y_train.iloc[:9]))

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates sampled by the randomized search.
param_grid_lgbm = {
    'num_leaves': [30, 50, 70],
    'max_depth': [5, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# random_state makes the tuned model reproducible between runs.
# subsample_freq=1 enables bagging at every iteration; with LightGBM's default
# of 0 the `subsample` values in the grid above would have no effect.
model = LGBMRegressor(random_state=42, subsample_freq=1)

# LightGBM gets the raw string columns as pandas "category" dtype rather than
# the encoded matrix. A single astype() on the selected frame replaces the
# column-by-column assignment.
# NOTE(review): only object-dtype columns are kept, so integer columns of
# X_train are not given to this model; confirm that this is intended.
cat_X_train = X_train.select_dtypes(include=["object"]).astype("category")
cat_features = cat_X_train.columns.tolist()

random_search_lgbm = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid_lgbm,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
# Extra fit keyword arguments are forwarded to LGBMRegressor.fit.
random_search_lgbm.fit(cat_X_train, y_train, categorical_feature=cat_features)

print("Best parameters:", random_search_lgbm.best_params_)
best_model = random_search_lgbm.best_estimator_

In [None]:
# Training-set metrics for the model currently bound to `best_model` (the
# LightGBM search result when cells run in order), then persist it.
# This model is evaluated on the categorical frame, not the encoded matrix.
print("\n____________ LGBMRegressor ____________")
r2score, mae, mse, rmse = calculate_metrics(best_model, cat_X_train, y_train)
captions = (
    "\nR2 score (on training data, best=1):",
    "Mean Absolute Error: ",
    "Mean Squared Error: ",
    "Root Mean Square Error: ",
)
for caption, metric in zip(captions, (r2score, mae, mse, rmse)):
    print(caption, metric)
store_model(best_model)  # saved under the estimator's class name

# First nine training rows: raw inputs, predictions and true labels.
first_rows = slice(0, 9)
print("Input data: \n", X_train.iloc[first_rows])
print("\nPredictions: ", best_model.predict(cat_X_train.iloc[first_rows]).round(decimals=1))
print("Labels:      ", list(y_train.iloc[first_rows]))

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates sampled by the randomized search.
param_grid_catboost = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [100, 200, 500],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128]
}

# random_seed makes the tuned model reproducible; verbose=0 silences the
# per-iteration training log.
model = CatBoostRegressor(verbose=0, random_seed=42)
# CatBoost consumes the raw X_train and only needs the NAMES of the string
# columns. The names are computed directly, without rebinding `cat_X_train`,
# which the LightGBM cells use as their category-typed training frame.
cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()

random_search_catboost = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid_catboost,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
# Extra fit keyword arguments are forwarded to CatBoostRegressor.fit.
random_search_catboost.fit(X_train, y_train, cat_features=cat_features)

print("Best parameters:", random_search_catboost.best_params_)
best_model = random_search_catboost.best_estimator_

In [None]:
# Training-set metrics for the model currently bound to `best_model` (the
# CatBoost search result when cells run in order), then persist it.
# CatBoost is evaluated on the raw, unencoded X_train.
print("\n____________ CatBoostRegressor ____________")
r2score, mae, mse, rmse = calculate_metrics(best_model, X_train, y_train)
for caption, metric in [
    ("\nR2 score (on training data, best=1):", r2score),
    ("Mean Absolute Error: ", mae),
    ("Mean Squared Error: ", mse),
    ("Root Mean Square Error: ", rmse),
]:
    print(caption, metric)
store_model(best_model)  # saved under the estimator's class name

# First nine training rows: raw inputs, predictions and true labels.
head_rows = X_train.iloc[:9]
print("Input data: \n", head_rows)
print("\nPredictions: ", best_model.predict(head_rows).round(decimals=1))
print("Labels:      ", list(y_train.iloc[:9]))

In [None]:
# Reload the fitted preprocessing pipeline from disk and apply it to the test
# split. Only transform() is called, so nothing is re-fitted on test data.
pipeline_path = r"models/full_pipeline.pkl"
full_pipeline = joblib.load(pipeline_path)
processed_X_test = full_pipeline.transform(X_test)

In [None]:
# Evaluate the persisted random forest on the encoded test split.
model = joblib.load(r"models/RandomForestRegressor_model.pkl")

r2score, mae, mse, rmse = calculate_metrics(model, processed_X_test, y_test)
print("\nPerformance on test data:")
for caption, metric in (
    ("R2 score (on test data, best=1):", r2score),
    ("Mean Absolute Error: ", mae),
    ("Mean Squared Error: ", mse),
    ("Root Mean Square Error: ", rmse),
):
    print(caption, metric)

# First nine test rows: raw inputs, predictions and true labels.
preview = slice(0, 9)
print("\nTest data: \n", X_test.iloc[preview])
print("\nPredictions: ", model.predict(processed_X_test[preview]).round(decimals=1))
print("Labels:      ", list(y_test.iloc[preview]), "\n")

In [None]:
# Evaluate the persisted XGBoost model on the encoded test split.
model = joblib.load(r"models/XGBRegressor_model.pkl")

r2score, mae, mse, rmse = calculate_metrics(model, processed_X_test, y_test)
print("\nPerformance on test data:")
test_report = {
    "R2 score (on test data, best=1):": r2score,
    "Mean Absolute Error: ": mae,
    "Mean Squared Error: ": mse,
    "Root Mean Square Error: ": rmse,
}
for caption, metric in test_report.items():
    print(caption, metric)

# First five test rows: raw inputs, predictions and true labels.
sample_predictions = model.predict(processed_X_test[:5]).round(decimals=1)
print("\nTest data: \n", X_test.iloc[:5])
print("\nPredictions: ", sample_predictions)
print("Labels:      ", list(y_test.iloc[:5]), "\n")

In [None]:
# Evaluate the persisted LightGBM model on the test split.
model = joblib.load(r"models/LGBMRegressor_model.pkl")

# Build the test frame the same way the training cell does: string columns
# only, cast to "category". The columns come from the test frame itself, so
# this cell does not depend on the global `cat_features` left behind by
# whichever training cell ran last.
cat_X_test = X_test.select_dtypes(include=["object"]).astype("category")

r2score, mae, mse, rmse = calculate_metrics(model, cat_X_test, y_test)
print("\nPerformance on test data:")
print("R2 score (on test data, best=1):", r2score)
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Square Error: ", rmse)

# First nine test rows: raw inputs, predictions and true labels.
print("\nTest data: \n", X_test.iloc[0:9])
print("\nPredictions: ", model.predict(cat_X_test[0:9]).round(decimals=1))
print("Labels:      ", list(y_test[0:9]), "\n")

In [None]:
# Evaluate the persisted CatBoost model on the raw, unencoded test split.
model = joblib.load(r"models/CatBoostRegressor_model.pkl")

r2score, mae, mse, rmse = calculate_metrics(model, X_test, y_test)
print("\nPerformance on test data:")
for caption, metric in [
    ("R2 score (on test data, best=1):", r2score),
    ("Mean Absolute Error: ", mae),
    ("Mean Squared Error: ", mse),
    ("Root Mean Square Error: ", rmse),
]:
    print(caption, metric)

# First nine test rows: raw inputs, predictions and true labels.
head_rows = X_test.iloc[:9]
print("\nTest data: \n", head_rows)
print("\nPredictions: ", model.predict(head_rows).round(decimals=1))
print("Labels:      ", list(y_test.iloc[:9]), "\n")