In [15]:
import pandas as pd
import itertools
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
)
from sklearn.exceptions import ConvergenceWarning
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# ===== LOAD DATA =====
df = pd.read_csv("data/jobs_salaries_2023.csv")  # Replace with your CSV file
df.dropna(inplace=True)  # Drop missing values

# ===== FILTER OUT JOB TITLES WITH FEWER THAN 100 RECORDS =====
job_counts = df["job_title"].value_counts()
df = df[df["job_title"].isin(job_counts[job_counts >= 100].index)]

# ===== DEFINE FEATURES & TARGET =====
target_column = "salary_in_usd"  # Replace with your target
feature_columns = [col for col in df.columns if col != target_column]

# ===== DEFINE MODELS =====
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "SVR": SVR(),
    "XGB": XGBRegressor(random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42, verbose=-1),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
}

# ===== CALCULATE TOTAL ITERATIONS FOR tqdm =====
total_iterations = sum(
    len(list(itertools.combinations(feature_columns, r))) * len(models)
    for r in range(1, len(feature_columns) + 1)
)

# ===== TRY ALL FEATURE COMBINATIONS =====
results = []

with tqdm(total=total_iterations, desc="Training models", unit="run") as pbar:
    for r in range(1, len(feature_columns) + 1):
        for subset in itertools.combinations(feature_columns, r):
            X = pd.get_dummies(df[list(subset)], drop_first=True)  # Handle categorical
            y = df[target_column]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            for model_name, model in models.items():
                try:
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    r2 = r2_score(y_test, y_pred)
                    mae = mean_absolute_error(y_test, y_pred)
                    results.append(
                        {
                            "features": subset,
                            "model": model_name,
                            "r2_score": r2,
                            "mae": mae,
                        }
                    )
                except Exception as e:
                    results.append(
                        {
                            "features": subset,
                            "model": model_name,
                            "r2_score": None,
                            "mae": None,
                            "error": str(e),
                        }
                    )
                pbar.update(1)  # Progress bar update after each model run

# ===== SHOW TOP RESULTS =====
results_df = pd.DataFrame(results)
results_df = results_df.dropna(subset=["r2_score"])  # Keep only successful runs
results_df = results_df.sort_values(by="r2_score", ascending=False)

print("\n=== Top 10 Models by R² ===")
print(results_df.head(10))

# Save to CSV
results_df.to_csv("model_results.csv", index=False)
print("\nResults saved to model_results.csv")

Training models: 100%|██████████| 13299/13299 [17:26<00:00, 12.71run/s]


=== Top 10 Models by R² ===
                                               features       model  r2_score  \
4101  (employment_type, job_title, salary, salary_cu...  ExtraTrees  0.997017   
1826               (job_title, salary, salary_currency)  ExtraTrees  0.996931   
7572  (employment_type, job_title, salary, salary_cu...  ExtraTrees  0.996864   
4569  (job_title, salary, salary_currency, remote_ra...  ExtraTrees  0.996565   
6649  (experience_level, employment_type, job_title,...  ExtraTrees  0.996542   
9938  (experience_level, employment_type, job_title,...  ExtraTrees  0.996324   
4309  (employment_type, salary, salary_currency, rem...  ExtraTrees  0.996308   
526                           (salary, salary_currency)  ExtraTrees  0.996208   
2034            (salary, salary_currency, remote_ratio)  ExtraTrees  0.996156   
1631         (employment_type, salary, salary_currency)  ExtraTrees  0.996081   

             mae  
4101  802.414183  
1826  793.298325  
7572  806.664095  
456


