In [None]:
# Baseline model: logistic regression on top of the shared preprocessing step.
logreg = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression(max_iter=1000, random_state=42))
])

# The "model__" prefix addresses the sample weights to the final "model" step.
# The fitted pipeline lives in `logreg`; fit()'s return value is not needed,
# so it is no longer bound to a throwaway name.
logreg.fit(X_train, y_train, model__sample_weight=w_train)

# Holdout predictions: hard labels and column 1 of predict_proba.
y_pred_lr = logreg.predict(X_test)
y_prob_lr = logreg.predict_proba(X_test)[:, 1]

# Holdout metrics, each weighted by w_test.
print("=== Logistic Regression (baseline) ===")
print(classification_report(y_test, y_pred_lr, sample_weight=w_test))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_lr, sample_weight=w_test))
print("PR-AUC:", average_precision_score(y_test, y_prob_lr, sample_weight=w_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr, sample_weight=w_test))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, average_precision_score

# === 3. Logistic Regression (with weights) ===
# Import the CV utilities used further down so this cell does not rely on
# names that happen to be left in the kernel by other cells.
from sklearn.model_selection import StratifiedKFold, cross_validate

logreg = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression(max_iter=1000, random_state=42))
])

# Fit on training data; the "model__" prefix addresses the weights to the
# final "model" step of the pipeline.
logreg.fit(X_train, y_train, model__sample_weight=w_train)

# --- Holdout evaluation ---
# Hard labels and column 1 of predict_proba for the test split.
y_pred_lr = logreg.predict(X_test)
y_prob_lr = logreg.predict_proba(X_test)[:, 1]

# Every holdout metric is weighted by w_test.
print("=== Logistic Regression (weighted, Holdout) ===")
print(classification_report(y_test, y_pred_lr, sample_weight=w_test))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_lr, sample_weight=w_test))
print("PR-AUC:", average_precision_score(y_test, y_prob_lr, sample_weight=w_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr, sample_weight=w_test))

# --- Cross-validation evaluation ---
# Shuffled, seeded 5-fold stratified split of the training data.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Keys become the "test_<key>" entries of the cross_validate result.
scoring = {
    "roc_auc": "roc_auc",
    "pr_auc": "average_precision"
}

# The training weights are handed to cross_validate via `params`.
# NOTE(review): the weights are addressed to the model's fit only; the CV
# scorers appear to be unweighted, unlike the w_test-weighted holdout metrics
# above - confirm this difference is intended before comparing the numbers.
cv_results_lr = cross_validate(
    logreg,
    X_train, y_train,
    cv=cv,
    scoring=scoring,
    params={"model__sample_weight": w_train.to_numpy()}
)


# Report mean and standard deviation across the folds.
print("\n\n=== Logistic Regression (weighted, Cross Validation) ===")
print("ROC-AUC: %.3f ± %.3f" % (
    cv_results_lr["test_roc_auc"].mean(),
    cv_results_lr["test_roc_auc"].std()
))
print("PR-AUC : %.3f ± %.3f" % (
    cv_results_lr["test_pr_auc"].mean(),
    cv_results_lr["test_pr_auc"].std()
))


In [None]:
# Decision tree with default hyperparameters, reusing the shared preprocessing step.
dt_default = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42)),
])

# Train; the "model__" prefix addresses the weights to the tree step.
dt_default.fit(X=X_train, y=y_train, model__sample_weight=w_train)

# Holdout outputs: column 1 of predict_proba and the hard labels.
y_prob_dt = dt_default.predict_proba(X_test)[:, 1]
y_pred_dt = dt_default.predict(X_test)

# Weighted holdout report; each metric goes out through its own print call,
# in the same order and with the same labels as before.
print("=== Decision Tree (Default Parameters, Holdout) ===")
print(
    classification_report(y_test, y_pred_dt, sample_weight=w_test)
)
print(
    "ROC-AUC:",
    roc_auc_score(y_test, y_prob_dt, sample_weight=w_test),
)
print(
    "PR-AUC:",
    average_precision_score(y_test, y_prob_dt, sample_weight=w_test),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_dt, sample_weight=w_test),
)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, average_precision_score

# Pipeline to tune: shared preprocessing followed by a seeded decision tree.
dt = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter candidates; the "model__" prefix targets the tree step.
param_grid = dict(
    model__max_depth=[3, 5, 10, 15, None],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 5, 10],
)

# Two scorers; their keys name the "mean_test_<key>" columns of cv_results_.
scoring = dict(roc_auc="roc_auc", pr_auc="average_precision")

# Shuffled, seeded 5-fold stratified split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid; the final model is refit on the
# candidate with the best ROC-AUC.
grid_dt = GridSearchCV(
    dt,
    param_grid,
    scoring=scoring,
    refit="roc_auc",
    cv=cv,
    n_jobs=-1,
)

# Run the search, handing the training weights to the tree step.
grid_dt.fit(X_train, y_train, model__sample_weight=w_train.to_numpy())

# Cross-validation summary for the selected candidate.
print("\n=== Decision Tree CV Results ===")
print("Best Params:", grid_dt.best_params_)
print(
    "Best CV ROC-AUC:",
    grid_dt.cv_results_["mean_test_roc_auc"][grid_dt.best_index_],
)
print(
    "Best CV PR-AUC :",
    grid_dt.cv_results_["mean_test_pr_auc"][grid_dt.best_index_],
)

# Score the refit pipeline on the holdout split.
best_dt = grid_dt.best_estimator_

y_prob_dt = best_dt.predict_proba(X_test)[:, 1]
y_pred_dt = best_dt.predict(X_test)

# Weighted holdout report, same labels and order as the other model cells.
print("\n=== Decision Tree (Tuned, Holdout) ===")
print(
    classification_report(y_test, y_pred_dt, sample_weight=w_test)
)
print(
    "ROC-AUC:",
    roc_auc_score(y_test, y_prob_dt, sample_weight=w_test),
)
print(
    "PR-AUC:",
    average_precision_score(y_test, y_prob_dt, sample_weight=w_test),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_dt, sample_weight=w_test),
)


## Decision Boundary

# The two columns that span the 2D plot (x axis, y axis).
feat_x = "Age"
feat_y = "Weekly_Working_Time"

# Subset data: every row of df_labeled, two feature columns plus the integer target.
X_2d = df_labeled[[feat_x, feat_y]]
y_2d = df_labeled["Income"].astype(int)

# Train a simple decision tree on just 2 features (no sample weights).
# Fit on the plain array rather than the DataFrame: the mesh grid passed to
# predict() below is a plain array too, and mixing the two makes sklearn warn
# that X "does not have valid feature names". Column order is (feat_x, feat_y)
# in both places.
dt_2d = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_2d.fit(X_2d.to_numpy(), y_2d)

# Create grid for plotting decision surface: 200 x 200 points covering the
# observed range of each feature, padded by 5 units on every side.
x_min, x_max = X_2d[feat_x].min()-5, X_2d[feat_x].max()+5
y_min, y_max = X_2d[feat_y].min()-5, X_2d[feat_y].max()+5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))

# Predict a class for every grid point, then fold back into the grid's shape.
Z = dt_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot decision boundary: shaded predicted regions with the data points on top,
# both coloured by class using the same colormap.
plt.figure(figsize=(8,6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="coolwarm")
plt.scatter(X_2d[feat_x], X_2d[feat_y], c=y_2d, cmap="coolwarm", edgecolor="k", alpha=0.7)
plt.xlabel(feat_x)
plt.ylabel(feat_y)
plt.title("Decision Boundary (Decision Tree, 2D projection)")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest behind the tree-specific preprocessing step; seeded, and
# configured with n_jobs=-1.
rf = Pipeline(steps=[
    ("prep", tree_preprocessor),
    ("model", RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Train; the "model__" prefix addresses the weights to the forest step.
rf.fit(X=X_train, y=y_train, model__sample_weight=w_train)

# Holdout outputs: column 1 of predict_proba and the hard labels.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

# Weighted holdout report, same labels and order as the other model cells.
print("=== Random Forest (Tree-Specific Preprocessing, Holdout) ===")
print(
    classification_report(y_test, y_pred_rf, sample_weight=w_test)
)
print(
    "ROC-AUC:",
    roc_auc_score(y_test, y_prob_rf, sample_weight=w_test),
)
print(
    "PR-AUC:",
    average_precision_score(y_test, y_prob_rf, sample_weight=w_test),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_rf, sample_weight=w_test),
)


In [None]:
# =====================================================
# Handle Missing Values with DecisionTreeClassifier Imputer
# =====================================================

# --- Step 4: Rebuild preprocessing pipeline (no imputers needed now) ---

# Numeric columns in `num_log`: apply log1p, then standardize.
num_log_tf = Pipeline(steps=[
    ("log", FunctionTransformer(func=np.log1p, validate=False)),
    ("scaler", StandardScaler()),
])

# Numeric columns in `num_plain`: standardize only.
num_plain_tf = StandardScaler()

# Categorical columns: one-hot encode, with handle_unknown="ignore" so
# categories not seen during fit do not raise at transform time.
regular_cat_tf = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer; the output blocks follow the
# order listed here (cat, num_plain, num_log).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", regular_cat_tf, cat_cols),
        ("num_plain", num_plain_tf, num_plain),
        ("num_log", num_log_tf, num_log),
    ]
)
