# Data Balance Verification (Updated)
Analysis of `data/processed/balanced_data.jsonl`, focusing on the **RPC Schema Update**, under which the RPC status takes one of four values: `true`, `false`, `partial`, or `insufficient_data`.

In [None]:
import pandas as pd
import json

# Load the dataset: one JSON object per line. The assistant's analysis is
# stored as a JSON string in the third message (index 2) of each entry.
data = []
skipped = 0  # entries whose analysis payload could not be parsed
file_path = 'data/processed/balanced_data.jsonl'

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        entry = json.loads(line)
        # Extract the assistant's analysis
        analysis_str = entry['messages'][2]['content']
        try:
            analysis = json.loads(analysis_str)
        except (json.JSONDecodeError, TypeError):
            # Skip only malformed or non-string payloads, and count them so a
            # shortfall in the bucket totals below can be explained. A bare
            # `except:` would also hide KeyboardInterrupt and real bugs.
            skipped += 1
            continue
        data.append(analysis)

df = pd.DataFrame(data)
print(f"Total Samples: {len(df)}")
if skipped:
    print(f"Skipped (unparseable analysis): {skipped}")
df.head()

## 6 Balancing Buckets Verification
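We expect exactly **400 samples** in each of the 6 balancing buckets. A bucket is the `DISPOSITION` value, with `ANSWERED` further split by `RPC_STATUS` (e.g. `ANSWERED_true`).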

In [None]:
def get_bucket(row):
    """Return the balancing-bucket label for one sample row.

    Rows whose ``DISPOSITION`` is not ``ANSWERED`` are bucketed by the
    disposition alone. ``ANSWERED`` rows are further split by ``RPC_STATUS``,
    producing labels of the form ``ANSWERED_<rpc>``.

    A missing ``RPC_STATUS`` (absent key, ``None`` or NaN) is labelled
    ``unknown``.

    Args:
        row: A mapping-like row (e.g. a ``pandas.Series``) that has a
            ``DISPOSITION`` entry and supports ``.get``.

    Returns:
        The bucket label.

    Raises:
        KeyError: If ``row`` has no ``DISPOSITION`` entry.
    """
    disp = row['DISPOSITION']
    if disp != 'ANSWERED':
        return disp

    # Combine Dispo + RPC for Answered
    rpc = row.get('RPC_STATUS', 'unknown')
    # When rows come from a DataFrame, a sample that lacks RPC_STATUS holds
    # NaN/None rather than being absent, so the .get default never applies.
    # Normalise those to 'unknown' instead of emitting 'ANSWERED_nan'.
    if pd.isna(rpc):
        rpc = 'unknown'
    return f"{disp}_{rpc}"

# Label every row with its bucket, then tally how many samples fall in each.
bucket_labels = df.apply(get_bucket, axis=1)
df['Bucket'] = bucket_labels
bucket_counts = df['Bucket'].value_counts()
print(bucket_counts)

## Deep Dive: "ANSWERED" Dispositions

In [None]:
# Keep only the rows whose disposition is ANSWERED for the deep dive below.
is_answered = df['DISPOSITION'] == 'ANSWERED'
answered_df = df[is_answered]
answered_total = len(answered_df)
print(f"Total Answered Samples: {answered_total}")

In [None]:
# Frequency of each RPC_STATUS value among answered calls.
print("RPC_STATUS Distribution (Answered):\n")
rpc_status_counts = answered_df['RPC_STATUS'].value_counts()
print(rpc_status_counts)

In [None]:
# Frequency of each NAME_VERIFIED value among answered calls.
print("NAME_VERIFIED Distribution:")
name_verified_counts = answered_df['NAME_VERIFIED'].value_counts()
print(name_verified_counts)

In [None]:
# Frequency of each LOAN_NUMBER_VERIFIED value among answered calls.
print("LOAN_NUMBER_VERIFIED Distribution:")
loan_verified_counts = answered_df['LOAN_NUMBER_VERIFIED'].value_counts()
print(loan_verified_counts)

### Verification Crosstab (RPC vs Name Verified)
Check how the two fields relate: does an `RPC_STATUS` of `true` always coincide with the name being verified (`NAME_VERIFIED`)?

In [None]:
# Cross-tabulate RPC_STATUS against NAME_VERIFIED for answered calls.
# The row label names the field rather than listing "true/false": this
# notebook targets the updated schema, where RPC status can also be
# "partial" or "insufficient_data".
pd.crosstab(
    answered_df['RPC_STATUS'],
    answered_df['NAME_VERIFIED'],
    rownames=['RPC Status'],
    colnames=['Name Verified'],
)