# Testing out models

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# MLflow configuration: the tracking server the client talks to and the
# experiment under which every run started in this notebook is recorded.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "Exploring Salary Prediction Models"

mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

In [2]:
# Read the encoded dataset from disk.
df_encoded = pd.read_csv('../data/encoded_data.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between executions.
train_df, test_df = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=123,
)

# Every column except the target is used as a model input.
is_feature_column = train_df.columns != 'salary_in_usd'
features = train_df.columns[is_feature_column].tolist()

## Decision Tree Model

In [None]:
# Baseline run: a single decision tree with default hyper-parameters.
X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="DecisionTreeRegressor"):
    # Seed the tree so that re-running the notebook reproduces the same model
    # and metrics (same seed as the random-forest / gradient-boosting runs).
    model = DecisionTreeRegressor(random_state=123)

    mlflow.log_param("model_type", "DecisionTreeRegressor")
    # Log the parameters of the estimator that is actually trained, so the
    # recorded values (including random_state) match the fitted model.
    mlflow.log_params(model.get_params())

    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    mlflow.log_metrics({"mse": mse, "r2": r2, "mae": mae})

    # The signature is inferred from the training inputs and the model's
    # predictions on them; three training rows serve as the input example.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "decision_tree_model",
        signature=signature,
        input_example=input_example
    )

## Linear Regression Model

In [None]:
# Linear regression run using the estimator's default settings.
X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="LinearRegression"):
    model = LinearRegression()

    # Record the model family and every constructor parameter.
    mlflow.log_param("model_type", "LinearRegression")
    for param_name, param_value in model.get_params().items():
        mlflow.log_param(param_name, param_value)

    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    # NOTE: for scikit-learn regressors `score` is the R² coefficient, so the
    # value logged as "accuracy" duplicates "r2".
    acc = model.score(X_test, y_test)

    logged_metrics = (("mse", mse), ("r2", r2), ("mae", mae), ("accuracy", acc))
    for metric_name, metric_value in logged_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "linear_regression_model",
        signature=signature,
        input_example=input_example
    )

## Random Forest Model

In [None]:
# Random forest run: 100 trees, seeded for reproducibility.
X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="RandomForestRegressor"):
    model = RandomForestRegressor(n_estimators=100, random_state=123)

    mlflow.log_param("model_type", "RandomForestRegressor")
    # Log the parameters of the estimator that is actually trained. Logging
    # from a fresh default instance would record random_state=None even
    # though the fitted model uses random_state=123.
    mlflow.log_params(model.get_params())

    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    # NOTE: for scikit-learn regressors `score` is the R² coefficient, so the
    # value reported as "accuracy" duplicates "r2".
    acc = model.score(X_test, y_test)

    print(f"Random Forest MSE: {mse}")
    print(f"Random Forest R²: {r2}")
    print(f"Random Forest Accuracy: {acc}")
    print(f"Random Forest MAE: {mae}")

    mlflow.log_metrics({"mse": mse, "r2": r2, "mae": mae, "accuracy": acc})

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "random_forest_model",
        signature=signature,
        input_example=input_example
    )

## Gradient Boosting Model

In [None]:
# Gradient boosting run: 100 boosting stages, seeded for reproducibility.
X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="GradientBoostingRegressor"):
    model = GradientBoostingRegressor(n_estimators=100, random_state=123)

    mlflow.log_param("model_type", "GradientBoostingRegressor")
    # Log the parameters of the estimator that is actually trained. Logging
    # from a fresh default instance would record random_state=None even
    # though the fitted model uses random_state=123.
    mlflow.log_params(model.get_params())

    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    # NOTE: for scikit-learn regressors `score` is the R² coefficient, so the
    # value logged as "accuracy" duplicates "r2".
    acc = model.score(X_test, y_test)

    mlflow.log_metrics({"mse": mse, "r2": r2, "mae": mae, "accuracy": acc})

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "gradient_boosting_model",
        signature=signature,
        input_example=input_example
    )

## Support Vector Regression Model

In [None]:
# Two-step pipeline: standardise the features, then fit an SVR with its
# default settings.
svr_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svr', SVR()),
    ]
)

X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="SVR"):
    # Record the model family and every parameter exposed by the pipeline.
    mlflow.log_param("model_type", "SVR")
    for param_name, param_value in svr_pipeline.get_params().items():
        mlflow.log_param(param_name, param_value)

    svr_pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = svr_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    acc = svr_pipeline.score(X_test, y_test)

    logged_metrics = (("mse", mse), ("r2", r2), ("mae", mae), ("accuracy", acc))
    for metric_name, metric_value in logged_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, svr_pipeline.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        svr_pipeline,
        "svr_model",
        signature=signature,
        input_example=input_example
    )

## XGBoost Model

In [None]:
# XGBoost run: 100 trees with the squared-error objective.
X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="XGBRegressor"):
    mlflow.log_param("model_type", "XGBRegressor")

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        random_state=42,
    )
    # Parameters are read from the instance that gets trained.
    for param_name, param_value in model.get_params().items():
        mlflow.log_param(param_name, param_value)

    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    acc = model.score(X_test, y_test)

    logged_metrics = (("mse", mse), ("r2", r2), ("mae", mae), ("accuracy", acc))
    for metric_name, metric_value in logged_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "xgb_model",
        signature=signature,
        input_example=input_example
    )

Gradient Boosting performs best among the models above, so we will use it for the final model.

## Hyperparameter Tuning Gradient Boosting Model

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over GradientBoostingRegressor hyper-parameters.
# The grid holds 4 * 4 * 5 * 4 * 3 = 960 candidates; with 5-fold CV that is
# 4,800 model fits, so this cell is slow.
param_grid = {
    'n_estimators': [20, 50, 75, 100],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [6, 7, 8, 9, 10],
    'min_samples_split': [15, 20, 25, 30],
    'min_samples_leaf': [1, 2, 3]
}

# Seed the estimator so the search result is reproducible between executions
# (same seed as the GradientBoostingRegressor run logged to MLflow).
gb_model = GradientBoostingRegressor(random_state=123)

grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=5,
    # Candidates are ranked by mean absolute error (negated so higher is better).
    scoring='neg_mean_absolute_error',
    # Fit candidates in parallel on all available cores.
    n_jobs=-1,
)
grid_search.fit(train_df[features], train_df['salary_in_usd'])

print(grid_search.best_params_)

In [None]:
# XGBoost run with a fixed set of tuned hyper-parameters.
# NOTE(review): these values are hard-coded, not read from
# `grid_search.best_params_`. The keys and values match the XGBoost grid
# defined in a later cell, so they were presumably copied from an earlier
# execution of that search; confirm before relying on them.
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    n_estimators=75,
    subsample=0.8,
)

X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="Best_Tuned_XGB"):
    mlflow.log_param("model_type", "XGBRegressor")
    mlflow.log_params(best_params)

    model = xgb.XGBRegressor(**best_params)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    acc = model.score(X_test, y_test)

    logged_metrics = (("mse", mse), ("r2", r2), ("mae", mae), ("accuracy", acc))
    for metric_name, metric_value in logged_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        model,
        "best_tuned_xgb_model",
        signature=signature,
        input_example=input_example
    )

In [None]:
# Exhaustive search over XGBRegressor hyper-parameters.
# The grid holds 3 * 3 * 4 * 3 * 3 * 3 = 972 candidates; with 5-fold CV that
# is 4,860 model fits.
param_grid = dict(
    n_estimators=[25, 50, 75],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[4, 5, 6, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

xbg = xgb.XGBRegressor()

# Candidates are ranked by mean absolute error (negated so higher is better).
grid_search = GridSearchCV(
    estimator=xbg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(train_df[features], train_df['salary_in_usd'])

print(grid_search.best_params_)

In [None]:
# Final XGBoost run: the tuned hyper-parameters plus an explicit
# squared-error objective. The values are hard-coded in this cell rather
# than read from `grid_search.best_params_`.
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    n_estimators=75,
    subsample=0.8,
    objective="reg:squarederror",
)

X_train, y_train = train_df[features], train_df["salary_in_usd"]
X_test, y_test = test_df[features], test_df["salary_in_usd"]

with mlflow.start_run(run_name="Final_XGBModel"):
    mlflow.log_param("model_type", "XGBRegressor_Final")
    mlflow.log_params(best_params)

    xbg = xgb.XGBRegressor(**best_params)
    xbg.fit(X_train, y_train)

    # Evaluate on the held-out split.
    preds = xbg.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    acc = xbg.score(X_test, y_test)

    logged_metrics = (("mse", mse), ("r2", r2), ("mae", mae), ("accuracy", acc))
    for metric_name, metric_value in logged_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Signature comes from the training inputs and the predictions on them.
    signature = infer_signature(X_train, xbg.predict(X_train))
    input_example = X_train.head(3)

    mlflow.sklearn.log_model(
        xbg,
        "final_xgb_model",
        signature=signature,
        input_example=input_example
    )