# Balanced Data Deep Analysis

This notebook provides a graphical analysis of the balanced dataset `data/calls_data_balanced_v5.json`.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

FILE_PATH = '../data/calls_data_balanced_v5.json'

# Parse the whole JSON file into memory; the code below iterates over it as a
# sequence of samples, each with 'input' and 'output' entries.
print(f"Loading {FILE_PATH}...")
with open(FILE_PATH, 'r', encoding='utf-8') as fh:
    data = json.load(fh)

print(f"Total samples: {len(data)}")


def _to_row(sample):
    """Flatten one sample into a single-level record for the DataFrame."""
    output = sample['output']
    return {
        'input': sample['input'],
        'disposition': output['disposition'],
        # Any falsy payment disposition is bucketed under the literal string
        # 'None' so that it shows up as its own category in the plots.
        'payment_disposition': output['payment_disposition'] or 'None',
        'remarks': output['remarks'],
    }


# One DataFrame row per sample.
df = pd.DataFrame([_to_row(sample) for sample in data])

sns.set_theme(style="whitegrid")

## 1. Top 20 Overall Dispositions

This shows the most common outcomes across the entire dataset.

In [None]:
# Horizontal bar chart of the 20 most frequent values in the 'disposition' column.
fig, ax = plt.subplots(figsize=(12, 8))
counts = df['disposition'].value_counts().head(20)
sns.barplot(y=counts.index, x=counts.values, palette="mako", ax=ax)
ax.set_title("Top 20 Overall Dispositions", fontsize=16)
ax.set_xlabel("Count")
ax.set_ylabel("Disposition")

# Annotate every bar with its count. bar_label pads in points rather than in
# data units, so the label spacing stays the same whatever the count scale is.
# Looping over all containers works whether the bars were drawn as one
# container or as several.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3)
# Extra room on the value axis so the longest bar's label stays inside the axes.
ax.margins(x=0.1)

fig.tight_layout()
plt.show()

## 2. Payment Disposition of 'ANSWERED' Calls

For calls where the disposition was explicitly **ANSWERED** (16k+ samples), what was the payment outcome?
This helps verify the diversity of outcomes within successful contacts.

In [None]:
# Keep only the rows whose disposition is exactly 'ANSWERED', then count how
# often each payment disposition occurs among them.
answered_df = df[df['disposition'] == 'ANSWERED']
counts = answered_df['payment_disposition'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=counts.index, y=counts.values, palette="viridis", ax=ax)
ax.set_title("Payment Disposition Distribution for 'ANSWERED' Calls", fontsize=16)
ax.set_xlabel("Payment Disposition")
ax.set_ylabel("Count")
# Slant the category names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Annotate every bar with its count. bar_label pads in points rather than in
# data units, so labels sit just above the bars for small and large counts alike.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3)
# Extra headroom so the tallest bar's label stays inside the axes.
ax.margins(y=0.1)

fig.tight_layout()
plt.show()

## 3. None Payment Disposition Breakdown

This drills down into the "None" category (calls where no payment disposition exists) to show the underlying failure reasons, i.e. the 20 most common dispositions among those calls.

In [None]:
# Keep only the rows whose payment disposition is the 'None' bucket, then take
# the 20 most frequent call dispositions among them.
none_df = df[df['payment_disposition'] == 'None']
counts = none_df['disposition'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(y=counts.index, x=counts.values, palette="rocket", ax=ax)
ax.set_title("Top 20 Dispositions for 'None' Payment Calls", fontsize=16)
ax.set_xlabel("Count")
ax.set_ylabel("Disposition")

# Annotate every bar with its count. bar_label pads in points rather than in
# data units, so the label spacing stays the same whatever the count scale is.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3)
# Extra room on the value axis so the longest bar's label stays inside the axes.
ax.margins(x=0.1)

fig.tight_layout()
plt.show()

## 4. Payment Share (Pie Chart)

In [None]:
# Pie chart of each payment-disposition bucket's share of the whole dataset.
counts = df['payment_disposition'].value_counts()
pastel_colors = sns.color_palette("pastel")

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',  # percentage with one decimal place on each wedge
    startangle=140,
    colors=pastel_colors,
)
ax.set_title("Overall Payment Disposition Share (v5 - Synthetic Augmented)")
plt.show()