# Create Visualizations — Documented Notebook
This notebook converts `create_visualizations_clean.py` into a documented, cell-based workflow.
It loads the CSV outputs from the analysis pipeline (`loan_optimization_results.csv`, `recommended_increases.csv`, `simulation_results.csv`) and produces four publication-ready figures, saved as PNG files in `WORKDIR` (the directory of this file when run as a script, otherwise the current working directory).
Use this notebook to iterate on plots interactively or run headless to regenerate figure files.

## Setup: imports and plotting style
We import pandas, matplotlib and seaborn and set a consistent style for the figures.

In [None]:
import os
from typing import Tuple
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Apply the seaborn grid theme to every figure created below.
sns.set_style('whitegrid')

# Base directory for the input CSVs and the output PNGs: the directory of this
# file when `__file__` is defined (script execution), otherwise the current
# working directory (e.g. when the cells are run in a notebook kernel).
try:
    WORKDIR = os.path.abspath(os.path.dirname(__file__))
except NameError:
    WORKDIR = os.getcwd()

print('Workdir:', WORKDIR)

## Load results
A small utility to load the three CSV outputs produced by the analysis pipeline. The function returns (df, recommendations, simulations).

In [None]:
def load_results(workdir: str = WORKDIR) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the three pipeline CSV files found in ``workdir``.

    Args:
        workdir: Directory containing ``loan_optimization_results.csv``,
            ``recommended_increases.csv`` and ``simulation_results.csv``.

    Returns:
        A 3-tuple of DataFrames in the order
        (loan dataset, recommendations, simulations).
    """
    filenames = (
        'loan_optimization_results.csv',
        'recommended_increases.csv',
        'simulation_results.csv',
    )
    # Files are read in the order listed so the unpacking below lines up.
    df, recommendations, simulations = (
        pd.read_csv(os.path.join(workdir, filename)) for filename in filenames
    )
    return df, recommendations, simulations

## Figure 1 — Dataset overview
Multiple small plots showing distributions and relationships in the base dataset. This helps validate data shape and key signals (payments, loan size, risk buckets).

In [None]:
def fig1_dataset_overview(df: pd.DataFrame, outpath: str):
    """Render a 3x3 grid of exploratory plots and save it to ``outpath``.

    Panels, row by row:
      1. histograms of 'On-time Payments (%)', 'Initial Loan ($)' and
         'Days Since Last Loan';
      2. bar charts of 'Risk_Category' counts and 'No. of Increases in 2023'
         counts, plus a histogram of 'Total Profit Contribution ($)' when
         that column exists;
      3. two scatter plots on a random sample of at most 300 rows, plus the
         mean 'Uptake_Probability' per 'Risk_Category' when that column
         exists.

    Args:
        df: Loan dataset; must contain the non-optional columns named above.
        outpath: Destination path of the saved figure.

    Raises:
        KeyError: If a non-optional column is missing from ``df``.
    """
    fig, axes = plt.subplots(3, 3, figsize=(16, 10))

    # Row 1: distributions of the three core numeric features
    df['On-time Payments (%)'].hist(bins=50, ax=axes[0, 0])
    axes[0, 0].set_title('On-time Payments (%)')
    df['Initial Loan ($)'].hist(bins=50, ax=axes[0, 1])
    axes[0, 1].set_title('Initial Loan ($)')
    df['Days Since Last Loan'].hist(bins=50, ax=axes[0, 2])
    axes[0, 2].set_title('Days Since Last Loan')

    # Row 2: category counts and (optionally) profit contribution
    df['Risk_Category'].value_counts().plot(kind='bar', ax=axes[1, 0])
    axes[1, 0].set_title('Risk Category Counts')
    df['No. of Increases in 2023'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('No. of Increases in 2023')
    if 'Total Profit Contribution ($)' in df.columns:
        df['Total Profit Contribution ($)'].hist(bins=30, ax=axes[1, 2])
    axes[1, 2].set_title('Total Profit Contribution ($)')

    # Row 3: pairwise relationships on a small sample to keep plotting light.
    # pandas.plotting.scatter_matrix cannot be used here: it needs an n x n
    # grid of axes, and when handed a single Axes it clears the whole figure,
    # wiping every panel drawn above. Plain scatter plots are drawn instead.
    # A fixed seed keeps regenerated figures identical between runs.
    sample = df[['Initial Loan ($)', 'Days Since Last Loan', 'On-time Payments (%)']].sample(
        n=min(300, len(df)), random_state=0
    )
    sample.plot.scatter(x='Initial Loan ($)', y='On-time Payments (%)', ax=axes[2, 0], alpha=0.5)
    axes[2, 0].set_title('Initial Loan vs On-time Payments (sample)')
    sample.plot.scatter(x='Days Since Last Loan', y='On-time Payments (%)', ax=axes[2, 1], alpha=0.5)
    axes[2, 1].set_title('Days Since Last Loan vs On-time Payments (sample)')

    # Mean uptake probability per risk bucket (optional column)
    if 'Uptake_Probability' in df.columns:
        uptake_by_risk = df.groupby('Risk_Category')['Uptake_Probability'].mean()
        sns.barplot(x=uptake_by_risk.index, y=uptake_by_risk.values, ax=axes[2, 2])
        axes[2, 2].set_title('Mean Uptake Probability by Risk')
    else:
        # Hide the panel rather than saving an empty, untitled frame
        axes[2, 2].axis('off')

    # Title the figure explicitly instead of relying on pyplot's current figure
    fig.suptitle('Dataset Overview')
    fig.savefig(outpath, bbox_inches='tight')
    plt.close(fig)

## Figure 2 — Model performance and risk analysis
Visualizations focusing on model outputs (default probabilities, expected value) and approvals breakdown.

In [None]:
def fig2_model_performance(df: pd.DataFrame, recommendations: pd.DataFrame, outpath: str):
    """Draw the 2x3 model-performance grid and write it to ``outpath``.

    Top row: mean 'Default_Probability' per 'Risk_Category', histogram of the
    strictly positive 'Expected_Value' entries, and a scatter of
    'Default_Probability' against 'Expected_Value' on a random sample of at
    most 2000 rows. Bottom row: mean 'Customer_NPV' and mean
    'Credit_Score_Proxy' per 'Risk_Category' (each drawn only when the column
    exists), and a three-bar approval summary.

    Args:
        df: Loan dataset with the model output columns named above and an
            'Eligible' column that is summed for the first summary bar.
        recommendations: Approved rows; needs a 'Risk_Category' column.
        outpath: Destination path of the saved figure.
    """
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
    (ax_default, ax_ev_hist, ax_ev_scatter), (ax_npv, ax_score, ax_summary) = axes

    # Mean default probability for each risk bucket
    default_by_risk = df.groupby('Risk_Category')['Default_Probability'].mean()
    default_by_risk.plot(kind='bar', ax=ax_default)
    ax_default.set_title('Mean Default Probability by Risk')

    # Only the strictly positive expected values go into the histogram
    positive_ev = df.loc[df['Expected_Value'] > 0, 'Expected_Value']
    positive_ev.hist(bins=50, ax=ax_ev_hist)
    ax_ev_hist.set_title('Positive Expected Value Distribution')

    # Random subset (capped at 2000 rows) for the scatter plot
    subset = df.sample(n=min(2000, len(df)))
    subset.plot.scatter(x='Default_Probability', y='Expected_Value', ax=ax_ev_scatter)
    ax_ev_scatter.set_title('Default Probability vs Expected Value (sample)')

    # Optional per-risk means; a panel is left untouched when its column is absent
    optional_panels = (
        ('Customer_NPV', 'bar', ax_npv, 'Mean Customer NPV by Risk'),
        ('Credit_Score_Proxy', 'line', ax_score, 'Credit Score Proxy by Risk'),
    )
    for column, kind, panel, title in optional_panels:
        if column not in df.columns:
            continue
        df.groupby('Risk_Category')[column].mean().plot(kind=kind, ax=panel)
        panel.set_title(title)

    # Approval summary: sum of 'Eligible', number of recommendation rows, and
    # number of recommendation rows whose risk category is 'Sub-Prime'
    bar_labels = ['Total Eligible', 'Approved for Increase', 'High Risk Approved']
    sub_prime_rows = recommendations[recommendations['Risk_Category'] == 'Sub-Prime']
    bar_heights = [df['Eligible'].sum(), len(recommendations), len(sub_prime_rows)]
    ax_summary.bar(bar_labels, bar_heights)
    ax_summary.set_title('Approval Summary')

    plt.suptitle('Model Performance and Risk Analysis')
    fig.savefig(outpath, bbox_inches='tight')
    plt.close(fig)

## Figure 3 — Monte Carlo simulation results
Summaries of the simulation outputs: net value distributions, defaults, and profit/loss balances by risk bucket.

In [None]:
def fig3_simulation_results(simulations: pd.DataFrame, outpath: str):
    """Render the simulation summary grid (2x3) and save it to ``outpath``.

    Panels drawn:
      * [0, 0] overlaid 'net_value' histograms for the first three distinct
        'risk_category' values found;
      * [0, 2] mean of 'defaults' per 'risk_category';
      * [1, 0] scatter of mean 'total_profit' vs mean 'total_losses' per
        'customer_id';
      * [1, 1] mean 'increases_granted' per 'risk_category';
      * [1, 2] mean / median / std of 'net_value' per 'risk_category'.
    Panel [0, 1] carries no plot and is hidden.

    Args:
        simulations: Simulation output containing the columns named above.
        outpath: Destination path of the saved figure.

    Raises:
        KeyError: If a required column is missing from ``simulations``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))

    # Overlay the net-value histograms of the first three categories found
    cats = simulations['risk_category'].unique()[:3]
    for cat in cats:
        simulations.loc[simulations['risk_category'] == cat, 'net_value'].hist(
            bins=50, ax=axes[0, 0], alpha=0.6, label=str(cat)
        )
    if len(cats) > 0:
        # legend() warns when there are no labelled artists, so skip it then
        axes[0, 0].legend()
    axes[0, 0].set_title('Net Value Distribution by Risk Category')

    # Nothing is plotted in this slot; hide it so the saved figure does not
    # contain an empty framed panel.
    axes[0, 1].axis('off')

    # Mean of the 'defaults' column per risk category
    simulations.groupby('risk_category')['defaults'].mean().plot(kind='bar', ax=axes[0, 2])
    axes[0, 2].set_title('Mean Defaults by Risk Category')

    # Profit vs losses, averaged over all rows sharing a customer_id
    customer_summary = simulations.groupby('customer_id')[['total_profit', 'total_losses']].mean()
    customer_summary.plot(kind='scatter', x='total_profit', y='total_losses', ax=axes[1, 0])
    axes[1, 0].set_title('Avg Profit vs Losses (customer level)')

    # Average increases granted per risk category
    simulations.groupby('risk_category')['increases_granted'].mean().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Avg Increases Granted by Risk')

    # Mean, median ('50%') and standard deviation of net value per category
    sims_desc = simulations.groupby('risk_category')['net_value'].describe()[['mean', '50%', 'std']]
    sims_desc.plot(kind='bar', ax=axes[1, 2])
    axes[1, 2].set_title('Net Value Summary by Risk')

    # Title the figure explicitly instead of relying on pyplot's current figure
    fig.suptitle('Monte Carlo Simulation Results')
    fig.savefig(outpath, bbox_inches='tight')
    plt.close(fig)

## Figure 4 — Optimization strategy results
Plots summarizing the distribution of recommended increases, their expected value, and characteristics of approved customers.

In [None]:
def fig4_optimization_strategy(recommendations: pd.DataFrame, outpath: str):
    """Draw the 2x3 optimization-strategy grid and write it to ``outpath``.

    Top row: counts per 'Recommended_Increases' value, mean
    'Total_Expected_Value' per 'Recommended_Increases' value, and a pie chart
    of 'Risk_Category' counts. Bottom row: 30-bin histograms of
    'On-time Payments (%)', 'Initial Loan ($)' and 'Default_Probability'.

    Args:
        recommendations: Recommendation rows containing the columns above.
        outpath: Destination path of the saved figure.
    """
    fig, axes = plt.subplots(2, 3, figsize=(16, 8))
    (ax_counts, ax_value, ax_share), (ax_ontime, ax_loan, ax_default) = axes

    # Number of rows for each recommended-increase count, in ascending order
    increase_counts = recommendations['Recommended_Increases'].value_counts().sort_index()
    increase_counts.plot(kind='bar', ax=ax_counts)
    ax_counts.set_title('Recommended Increases Counts')

    # Mean total expected value for each recommended-increase count
    mean_value = recommendations.groupby('Recommended_Increases')['Total_Expected_Value'].mean()
    mean_value.plot(ax=ax_value)
    ax_value.set_title('Mean Total Expected Value by #Increases')

    # Share of each risk category among the recommendation rows
    risk_share = recommendations['Risk_Category'].value_counts()
    risk_share.plot(kind='pie', ax=ax_share)
    ax_share.set_title('Risk Category Share (approved)')

    # Bottom row: one histogram per column, all with the same bin count
    histogram_panels = (
        (ax_ontime, 'On-time Payments (%)', 'On-time Payments (%) — Approved'),
        (ax_loan, 'Initial Loan ($)', 'Initial Loan ($) — Approved'),
        (ax_default, 'Default_Probability', 'Default Probability — Approved'),
    )
    for panel, column, title in histogram_panels:
        recommendations[column].hist(bins=30, ax=panel)
        panel.set_title(title)

    plt.suptitle('Optimization Strategy Results')
    fig.savefig(outpath, bbox_inches='tight')
    plt.close(fig)

## Run and save figures
This cell loads results and runs the four figure functions. Adjust the output names if you want to version figures.

In [None]:
# Read the three pipeline outputs from WORKDIR
df, recommendations, simulations = load_results()

# Output paths for the four figures, all placed in WORKDIR
out1, out2, out3, out4 = (
    os.path.join(WORKDIR, filename)
    for filename in (
        'fig1_dataset_overview.png',
        'fig2_model_performance.png',
        'fig3_monte_carlo.png',
        'fig4_optimization_strategy.png',
    )
)

# Render and save each figure in turn
fig1_dataset_overview(df, out1)
fig2_model_performance(df, recommendations, out2)
fig3_simulation_results(simulations, out3)
fig4_optimization_strategy(recommendations, out4)
print('Saved figures:', out1, out2, out3, out4)

---
Tips:
- If a plot fails with a `KeyError` because a column is missing, check that the analysis pipeline successfully created `loan_optimization_results.csv`, `recommended_increases.csv`, and `simulation_results.csv`, and that each file contains the columns the figure functions read.
- To preview plots inline in the notebook, run the cells interactively in Jupyter or VS Code.
- To regenerate figures headlessly, use `jupyter nbconvert --to notebook --execute` as shown in the analysis notebook.