In [13]:
# classification_models_comparison.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# --------------------
# 1. Load your dataset
# --------------------
# Replace with your dataset load
df = pd.read_csv("data/jobs_salaries_2023.csv")
df = df.dropna(subset=["salary_in_usd"])  # Drop rows with missing target

# =====================================
# 2. Prepare classification target
# Here we turn salary_in_usd into bins for classification
# Example: Low / Medium / High
# =====================================
df["salary_class"] = pd.qcut(df["salary_in_usd"], q=3, labels=["Low", "Medium", "High"])

# Remove rare classes (fewer than 2 samples)
class_counts = df["salary_class"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["salary_class"].isin(rare_classes)]

# Separate features and target
X = df.drop(columns=['salary_in_usd', 'salary', 'salary_class'])
y = df["salary_in_usd"]  # ensure this is categorical

# --------------------
# 2. Train-test split
# --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------
# 3. Preprocessing
# --------------------
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# --------------------
# 4. Models to compare
# --------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}

# --------------------
# 5. Train, predict, and evaluate
# --------------------
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    probs = (
        pipe.predict_proba(X_test)[:, 1]
        if hasattr(pipe.named_steps["model"], "predict_proba")
        else None
    )

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average="weighted", zero_division=0)
    rec = recall_score(y_test, preds, average="weighted", zero_division=0)
    f1 = f1_score(y_test, preds, average="weighted", zero_division=0)
    auc = (
        roc_auc_score(y_test, probs, multi_class="ovr")
        if probs is not None and len(np.unique(y_test)) > 2
        else None
    )

    results.append(
        {
            "Model": name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-score": f1,
            "ROC AUC": auc,
        }
    )

# --------------------
# 6. Show results
# --------------------
results_df = pd.DataFrame(results)
print(results_df)

# Optional: Detailed classification report for the best model
best_model_name = results_df.sort_values(by="F1-score", ascending=False).iloc[0][
    "Model"
]
print(f"\nBest model based on F1-score: {best_model_name}")
best_model = models[best_model_name]
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, preds))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.