In [1]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.metrics import roc_curve, auc
import os

In [8]:
current_dir = os.getcwd()


fnn_output = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../results/', 'fnn_output.csv')), sep='\t')
hnn_output = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../results/', 'hmm_output.csv')), sep='\t')
hlr_output = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../results/', 'hlr_output.csv')), sep='\t')
lr_output = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../results/', 'linear_regression_output.csv')), sep='\t')
xgboost_output = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../results/', 'xgboost.csv')), sep='\t')



In [18]:
xgboost_output

Unnamed: 0,p_recall,p_recall_pred,outcome
0,1.000000,0.889664,1.0
1,0.857143,0.879037,1.0
2,0.857143,0.879037,1.0
3,0.857143,0.879037,1.0
4,0.857143,0.879037,1.0
...,...,...,...
4535498,0.666667,0.881564,1.0
4535499,0.666667,0.881564,1.0
4535500,0.666667,0.881564,0.0
4535501,1.000000,0.940124,1.0


In [10]:
xgboost_output = xgboost_output.rename(columns={"y_test": "p_recall", "y_pred": "p_recall_pred"}, errors="raise")


In [24]:
lr_output = lr_output.rename(columns={"predictions":"p_recall_pred"})

In [25]:
lr_output

Unnamed: 0,p_recall_pred,p_recall,outcome,decile
0,0.938091,1.0,1.0,2
1,0.938091,1.0,1.0,2
2,0.885309,1.0,1.0,7
3,0.885309,1.0,1.0,7
4,0.885309,1.0,1.0,7
...,...,...,...,...
4614509,0.867557,1.0,1.0,8
4614510,0.927240,1.0,1.0,3
4614511,0.927240,1.0,1.0,3
4614512,0.927240,1.0,1.0,3


In [31]:
datasets = [fnn_output, hnn_output, hlr_output, lr_output, xgboost_output]
labels = ["fnn", "n-hlr", "hlr", "lr", "xgboost"] 


In [27]:
def AUC_ROC(datasets, labels, filename):
    
    plt.figure(figsize=(8, 6))
    
    for df, label in zip(datasets, labels):

        outcome = df['outcome']
        outcome_prob = df['p_recall_pred']
        
        # Calculate ROC curve and AUC
        fpr, tpr, _ = roc_curve(outcome, outcome_prob)
        roc_auc = auc(fpr, tpr)
        
        # Plot ROC curve
        plt.plot(fpr, tpr, label=f"{label} (AUC = {roc_auc:.3f})", linewidth=2)
        print(f"{label} - AUC: {roc_auc:.4f}")
    
    # Plot random guess line
    plt.plot([0, 1], [0, 1], 'k--', label="Random (AUC = 0.500)")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve Comparison')
    plt.legend(loc='lower right')
    plt.grid(True)
    
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close()

In [28]:
AUC_ROC(datasets, labels, "roc_comparison.png")

fnn - AUC: 0.6014
hnn - AUC: 0.5320
hlr - AUC: 0.5290
lr - AUC: 0.6159
xgboost - AUC: 0.6548


In [37]:
def plot_lift(datasets, outcome_col, outcome_prob_col, labels, filename):
    """
    Parameters:
    - datasets: List of DataFrames, each containing outcome and predicted probability columns.
    - outcome_col: Name of the column with true outcomes (e.g., 'outcome').
    - outcome_prob_col: Name of the column with predicted probabilities (e.g., 'p_recall_pred').
    - labels: List of labels for each dataset (for legend).
    - filename: Output filename for the plot.
    """
    plt.figure(figsize=(8, 6))
    colors = plt.cm.tab10.colors  # Use a color palette for clarity

    for idx, (df, label) in enumerate(zip(datasets, labels)):
        # Calculate deciles (1 = highest predictions)
        df["decile"] = pd.qcut(
            df[outcome_prob_col].rank(method="first", ascending=False),
            10,
            labels=False,
        ) + 1

        # Group by decile and calculate lift
        lift_df = df.groupby("decile")[outcome_col].agg(["mean", "count"])
        baseline_rate = df[outcome_col].mean()
        lift_df["lift"] = lift_df["mean"] / baseline_rate

        # Plot lift curve
        plt.plot(
            lift_df.index,
            lift_df["lift"],
            marker="o",
            linestyle="-",
            color=colors[idx],
            label=f"{label}",
        )

    # Add baseline and styling
    plt.axhline(y=1, color="gray", linestyle="--", label="Overall Baseline (1x)")
    plt.xlabel("Decile (1 = Highest Predictions)")
    plt.ylabel("Lift")
    plt.title("Lift Chart Comparison")
    plt.legend(loc="upper right")
    plt.grid(True)
    plt.xticks(range(1, 11))  # Ensure all deciles are shown

    plt.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close()

In [38]:
plot_lift(datasets, "outcome", "p_recall_pred", labels, "lift_comparison.png")

In [41]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

def plot_calibration(datasets, outcome_col, outcome_prob_col, labels, filename, n_bins=10):
    """
    Parameters:
    - datasets: List of DataFrames, each containing outcome and predicted probability columns.
    - outcome_col: Name of the column with true outcomes (e.g., 'outcome').
    - outcome_prob_col: Name of the column with predicted probabilities (e.g., 'p_recall_pred').
    - labels: List of labels for each dataset (for legend).
    - filename: Output filename for the plot.
    - n_bins: Number of bins for calibration curve (default: 10).
    """
    plt.figure(figsize=(8, 6))
    colors = plt.cm.tab10.colors  # Use a color palette for clarity

    for idx, (df, label) in enumerate(zip(datasets, labels)):
        # Extract true labels and predicted probabilities
        outcome = df[outcome_col]
        outcome_prob = df[outcome_prob_col]

        # Calculate calibration curve
        prob_true, prob_pred = calibration_curve(outcome, outcome_prob, n_bins=n_bins)

        # Plot calibration curve
        plt.plot(
            prob_pred,
            prob_true,
            "s-",  # Square markers with solid lines
            color=colors[idx],
            label=f"{label}",
            markersize=6,
        )

    # Plot perfect calibration line
    plt.plot([0, 1], [0, 1], "k--", label="Perfectly Calibrated")
    plt.xlabel("Mean Predicted Probability")
    plt.ylabel("Observed Frequency")
    plt.title("Calibration Plot Comparison")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plt.savefig(filename, bbox_inches="tight", dpi=300)
    plt.close()


In [42]:
# Generate the plot
plot_calibration(datasets, "outcome", "p_recall_pred", labels, "calibration_comparison.png")