In [2]:
# MODEL COMPARISON & ANALYSIS
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


In [3]:
# Plot defaults for the notebook: seaborn "whitegrid" theme and a 14x8 figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})


In [4]:
# PART 1: LOAD ALL MODELS & TEST DATA
# Test features stay as a DataFrame.
X_test = pd.read_csv('data/X_test.csv')

# Test labels are read as a frame, then flattened into a 1-D array.
y_test_frame = pd.read_csv('data/y_test.csv')
y_test = y_test_frame.values.ravel()


In [5]:
# Load the four previously saved models from disk.
# NOTE(security): pickle.load can execute arbitrary code stored in the file,
# so these .pkl files must come from a trusted source.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed even if
    unpickling raises, instead of being left open for the kernel's lifetime.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


model_lr = _load_pickle('models/logistic_model.pkl')
model_rf = _load_pickle('models/random_forest_model.pkl')
model_gb = _load_pickle('models/gradient_boosting_model.pkl')
ensemble = _load_pickle('models/ensemble_model.pkl')


In [6]:
# PART 2: COMPREHENSIVE MODEL COMPARISON
# Display name -> loaded model; insertion order is the order used in every
# report below.
models = {}
models['Logistic Regression'] = model_lr
models['Random Forest'] = model_rf
models['Gradient Boosting'] = model_gb
models['Ensemble (RF+GB)'] = ensemble

In [8]:
# Score every model on the test set and collect the metrics per model name.
results = {}

for name, model in models.items():
    print(f"{name}...", end=" ")

    y_pred = model.predict(X_test)

    # ROC-AUC needs scores rather than hard labels. Column 1 of predict_proba
    # is used (presumably the positive class -- confirm against model.classes_).
    # A model without predict_proba gets NaN: the metric is unavailable, and a
    # literal 0 would read as a real, worst-possible score in results_df.
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        roc_auc = np.nan

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    }

    print(f"({accuracy:.4f})")


Logistic Regression... (0.9425)
Random Forest... (0.9345)
Gradient Boosting... (0.9415)
Ensemble (RF+GB)... (0.9410)


In [9]:
# Display results
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
results_table = results_df.round(4).to_string()
print("\n" + results_table)


                     Accuracy  Precision  Recall  F1-Score  ROC-AUC
Logistic Regression    0.9425     0.9314  0.9636    0.9472   0.9905
Random Forest          0.9345     0.9304  0.9486    0.9394   0.9856
Gradient Boosting      0.9415     0.9297  0.9636    0.9464   0.9897
Ensemble (RF+GB)       0.9410     0.9344  0.9570    0.9456   0.9889


In [10]:
# Save results
# Written with the index so the model names end up in the first CSV column.
results_df.to_csv(path_or_buf='outputs/model_comparison_results.csv')


In [13]:
# PART 3: DETAILED ANALYSIS BY METRIC
# The loop variables are deliberately not called `model` or `f1`: those names
# are bound by the evaluation cell, and rebinding them here to a string / a
# ranking value would leave misleading state behind when cells are re-run out
# of order.
print("\nAccuracy:")
accuracy_ranking = results_df['Accuracy'].sort_values(ascending=False)
for rank, (model_name, acc_value) in enumerate(accuracy_ranking.items(), 1):
    print(f"  {rank}. {model_name}: {acc_value:.4f} ({acc_value*100:.2f}%)")

print("\nF1-Score:")
f1_ranking = results_df['F1-Score'].sort_values(ascending=False)
for rank, (model_name, f1_value) in enumerate(f1_ranking.items(), 1):
    print(f"  {rank}. {model_name}: {f1_value:.4f}")

# "Best" is decided by accuracy alone; idxmax returns the first row holding
# the maximum, so ties resolve in the insertion order of `models`.
best_model_name = results_df['Accuracy'].idxmax()
best_accuracy = results_df.loc[best_model_name, 'Accuracy']

print(f"\nBEST MODEL: {best_model_name}")
print(f"Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")



Accuracy:
  1. Logistic Regression: 0.9425 (94.25%)
  2. Gradient Boosting: 0.9415 (94.15%)
  3. Ensemble (RF+GB): 0.9410 (94.10%)
  4. Random Forest: 0.9345 (93.45%)

F1-Score:
  1. Logistic Regression: 0.9472
  2. Gradient Boosting: 0.9464
  3. Ensemble (RF+GB): 0.9456
  4. Random Forest: 0.9394

BEST MODEL: Logistic Regression
Accuracy: 0.9425 (94.25%)


In [14]:
# PART 4: CONFUSION MATRICES
# Cell label -> (row, column) position in the matrix returned by
# confusion_matrix (rows follow y_test, columns follow the predictions).
cm_cells = {
    'True Negatives': (0, 0),
    'False Positives': (0, 1),
    'False Negatives': (1, 0),
    'True Positives': (1, 1),
}

cm_data = {}
for model_name, estimator in models.items():
    matrix = confusion_matrix(y_test, estimator.predict(X_test))
    cm_data[model_name] = {
        cell_label: matrix[position] for cell_label, position in cm_cells.items()
    }

# One row per model, one column per matrix cell.
cm_df = pd.DataFrame(cm_data).T
print("\n" + cm_df.to_string())


                     True Negatives  False Positives  False Negatives  True Positives
Logistic Regression             853               76               39            1032
Random Forest                   853               76               55            1016
Gradient Boosting               851               78               39            1032
Ensemble (RF+GB)                857               72               46            1025


In [16]:
# PART 5: FEATURE IMPORTANCE (for tree-based models)
# Display names for the model inputs.
# NOTE(review): assumed to be in the same order as the columns the models were
# trained on -- confirm against X_test.columns.
feature_names = ['Gender', 'Education', 'Experience', 'Actual Wage']

rf_importance = model_rf.feature_importances_
gb_importance = model_gb.feature_importances_
lr_coef = model_lr.coef_[0]

# zip() stops at the shortest input, so a feature_names list that is out of
# step with the models would silently drop entries. Fail loudly instead.
for source_label, source_values in (
    ('Random Forest', rf_importance),
    ('Gradient Boosting', gb_importance),
    ('Logistic Regression', lr_coef),
):
    if len(source_values) != len(feature_names):
        raise ValueError(
            f"{source_label} reports {len(source_values)} features but "
            f"feature_names lists {len(feature_names)}"
        )

print("\nRandom Forest Feature Importance:")
for name, imp in sorted(zip(feature_names, rf_importance), key=lambda x: x[1], reverse=True):
    print(f"  {name}: {imp:.4f}")

print("\nGradient Boosting Feature Importance:")
for name, imp in sorted(zip(feature_names, gb_importance), key=lambda x: x[1], reverse=True):
    print(f"  {name}: {imp:.4f}")

# Coefficients are ranked by absolute value.
# NOTE(review): magnitudes are only comparable across features if the inputs
# were on a common scale when the model was fitted -- confirm before reading
# this as an importance ranking.
print("\nLogistic Regression Coefficients:")
for name, coef in sorted(zip(feature_names, lr_coef), key=lambda x: abs(x[1]), reverse=True):
    direction = "Increases" if coef > 0 else "Decreases"
    print(f"  {name}: {coef:.4f} {direction}")



Random Forest Feature Importance:
  Actual Wage: 0.8293
  Experience: 0.0860
  Education: 0.0763
  Gender: 0.0084

Gradient Boosting Feature Importance:
  Actual Wage: 0.9971
  Education: 0.0015
  Experience: 0.0012
  Gender: 0.0002

Logistic Regression Coefficients:
  Education: -0.0684 Decreases
  Gender: -0.0415 Decreases
  Experience: 0.0032 Increases
  Actual Wage: -0.0014 Decreases


In [17]:
# Save feature importance
# One row per feature: each tree model's importance plus their plain mean,
# ordered from most to least important on average.
importance_df = (
    pd.DataFrame({
        'Feature': feature_names,
        'Random Forest': rf_importance,
        'Gradient Boosting': gb_importance,
    })
    .assign(**{
        'Avg Importance': lambda frame: (frame['Random Forest'] + frame['Gradient Boosting']) / 2
    })
    .sort_values('Avg Importance', ascending=False)
)
importance_df.to_csv('outputs/feature_importance.csv', index=False)
