# Eligibility Scoring & 360° Profiles - Data Exploration

**Use Case:** AI-PLATFORM-02 - Eligibility Scoring & 360° Profiles with Network Benefit Analytics  
**Objective:** Explore profile data to understand distributions, patterns, and coverage  
**MLflow Experiment:** `smart/eligibility_scoring_360_profile/*`

## Overview

This notebook explores:
- Golden Records distribution
- Relationship networks
- Benefit patterns
- Income band inference results
- Cluster analysis
- Anomaly flags
- Profile coverage metrics



In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import warnings
warnings.filterwarnings('ignore')

# Resolve the directories this notebook depends on, relative to the current
# working directory: the config lives two levels up, the shared utilities
# three levels up.
working_dir = Path.cwd()
usecase_root = working_dir.parent.parent
project_root = usecase_root.parent

# Expose shared/utils on the import path so db_connector can be imported.
sys.path.append(str(project_root.joinpath('shared', 'utils')))
from db_connector import DBConnector

# Plot defaults used by every chart in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the database settings from the YAML config file.
config_path = usecase_root.joinpath("config", "db_config.yaml")
with config_path.open('r') as f:
    config = yaml.safe_load(f)

# Open the database connection from the 'database' section of the config.
db_settings = config['database']
db = DBConnector(
    host=db_settings['host'],
    port=db_settings['port'],
    database=db_settings['dbname'],
    user=db_settings['user'],
    password=db_settings['password'],
)
db.connect()

print("✅ Connected to database")
print(f"   Database: {db_settings['dbname']} at {db_settings['host']}:{db_settings['port']}")



## 1. Golden Records Overview



In [None]:
# Headline counts for golden_records: all rows, distinct family_id values,
# and rows whose status is 'active'.
query = """
SELECT 
    COUNT(*) as total_records,
    COUNT(DISTINCT family_id) as total_families,
    COUNT(*) FILTER (WHERE status = 'active') as active_records
FROM golden_records
"""
summary = db.execute_query(query)

# Heading and result on consecutive lines.
print("Golden Records Summary:", summary, sep="\n")


In [None]:
# Ten districts with the most active golden records.
query = """
SELECT district_id, COUNT(*) as count
FROM golden_records
WHERE status = 'active'
GROUP BY district_id
ORDER BY count DESC
LIMIT 10
"""
district_dist = db.execute_query(query)
print("Top 10 Districts:", district_dist, sep="\n")

# Bar chart of the per-district counts on an explicitly created figure/axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=district_dist, x='district_id', y='count', ax=ax)
ax.set_title('Golden Records by District')
ax.set_xlabel('District ID')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


## 2. Relationship Networks



In [None]:
# Number of gr_relationships rows per relationship_type, most frequent first.
query = """
SELECT relationship_type, COUNT(*) as count
FROM gr_relationships
GROUP BY relationship_type
ORDER BY count DESC
"""
rel_dist = db.execute_query(query)
print("Relationship Types:", rel_dist, sep="\n")

# Bar chart of the counts; barplot draws on (and returns) the current axes
# of the figure created just before it.
plt.figure(figsize=(12, 6))
rel_ax = sns.barplot(x='relationship_type', y='count', data=rel_dist)
rel_ax.set_title('Relationship Types Distribution')
rel_ax.set_xlabel('Relationship Type')
rel_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 3. Benefit Patterns



In [None]:
# Per scheme category: distinct beneficiaries, total amount and average
# amount of benefit events, largest total first.
query = """
SELECT 
    sm.category,
    COUNT(DISTINCT be.gr_id) as beneficiaries,
    SUM(be.amount) as total_amount,
    AVG(be.amount) as avg_amount
FROM benefit_events be
JOIN scheme_master sm ON be.scheme_id = sm.scheme_id
GROUP BY sm.category
ORDER BY total_amount DESC
"""
benefit_by_category = db.execute_query(query)
print("Benefits by Category:", benefit_by_category, sep="\n")

# Two side-by-side bar charts sharing the same x column; each panel is
# described as (y column, title, y-axis label).
panels = [
    ('total_amount', 'Total Benefits by Category', 'Total Amount'),
    ('beneficiaries', 'Beneficiaries by Category', 'Beneficiaries'),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for panel_ax, (y_column, panel_title, y_label) in zip(axes, panels):
    sns.barplot(data=benefit_by_category, x='category', y=y_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Category')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()


## 4. Income Band Inference



In [None]:
# Per inferred income band: number of profiles and mean inference
# confidence, skipping profiles that have no inferred band.
query = """
SELECT 
    inferred_income_band,
    COUNT(*) as count,
    AVG(income_band_confidence) as avg_confidence
FROM profile_360
WHERE inferred_income_band IS NOT NULL
GROUP BY inferred_income_band
ORDER BY count DESC
"""
income_dist = db.execute_query(query)
print("Income Band Distribution:", income_dist, sep="\n")

# Left panel: counts per band. Right panel: average confidence per band,
# with the y-axis pinned to 0..1. Each panel is described as
# (y column, title, y-axis label, optional y-limits).
panels = [
    ('count', 'Income Band Distribution', 'Count', None),
    ('avg_confidence', 'Average Confidence by Income Band', 'Average Confidence', (0, 1)),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for panel_ax, (y_column, panel_title, y_label, y_limits) in zip(axes, panels):
    sns.barplot(data=income_dist, x='inferred_income_band', y=y_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Income Band')
    panel_ax.set_ylabel(y_label)
    if y_limits is not None:
        panel_ax.set_ylim(*y_limits)
plt.tight_layout()
plt.show()


## 5. Cluster Analysis



In [None]:
# Cluster sizes: the 20 largest clusters in profile_360 (profiles without a
# cluster_id are excluded).
query = """
SELECT 
    cluster_id,
    COUNT(*) as cluster_size
FROM profile_360
WHERE cluster_id IS NOT NULL
GROUP BY cluster_id
ORDER BY cluster_size DESC
LIMIT 20
"""
cluster_sizes = db.execute_query(query)
print("Top 20 Clusters by Size:")
print(cluster_sizes)

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(data=cluster_sizes, x='cluster_id', y='cluster_size')
plt.title('Cluster Sizes')
plt.xlabel('Cluster ID')
plt.ylabel('Size')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Cluster statistics
# The listing above is capped at 20 rows, so deriving the statistics from it
# would report at most 20 clusters and average only the largest ones.
# Aggregate over every cluster in the database instead.
stats_query = """
SELECT
    COUNT(*) as total_clusters,
    MAX(cluster_size) as largest_cluster,
    AVG(cluster_size) as avg_cluster_size
FROM (
    SELECT cluster_id, COUNT(*) as cluster_size
    FROM profile_360
    WHERE cluster_id IS NOT NULL
    GROUP BY cluster_id
) AS cluster_counts
"""
cluster_stats = db.execute_query(stats_query).iloc[0]
total_clusters = int(cluster_stats['total_clusters'])

print("\nCluster Statistics:")
print(f"Total clusters: {total_clusters}")
if total_clusters > 0:
    # MAX/AVG are only meaningful (non-NULL) when at least one cluster exists.
    print(f"Largest cluster: {int(cluster_stats['largest_cluster'])} members")
    print(f"Average cluster size: {float(cluster_stats['avg_cluster_size']):.2f}")
else:
    print("No clusters found in profile_360.")


## 6. Anomaly Flags



In [None]:
# Active analytics flags grouped by type and severity, with the number of
# flags and the mean flag_score in each group.
query = """
SELECT 
    flag_type,
    flag_severity,
    COUNT(*) as count,
    AVG(flag_score) as avg_score
FROM analytics_flags
WHERE flag_status = 'ACTIVE'
GROUP BY flag_type, flag_severity
ORDER BY flag_type, flag_severity
"""
flag_dist = db.execute_query(query)
print("Active Flags:", flag_dist, sep="\n")

# Only draw the chart when the query returned rows; otherwise point the
# reader at the script named in the message below.
if len(flag_dist) == 0:
    print("No active flags found. Run anomaly_detection.py first.")
else:
    plt.figure(figsize=(14, 6))
    flag_ax = sns.barplot(x='flag_type', y='count', hue='flag_severity', data=flag_dist)
    flag_ax.set_title('Anomaly Flags Distribution')
    flag_ax.set_xlabel('Flag Type')
    flag_ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.legend(title='Severity')
    plt.tight_layout()
    plt.show()


## 7. Profile Coverage



In [None]:
# Profile coverage statistics: for active golden records, count how many
# have a profile_360 row, an inferred income band, a cluster assignment, and
# a risk_flags array with at least one element. The LEFT JOIN keeps golden
# records that have no profile so they still count towards the total.
query = """
SELECT 
    COUNT(DISTINCT gr.gr_id) as total_golden_records,
    COUNT(DISTINCT p.gr_id) as profiles_with_360,
    COUNT(DISTINCT CASE WHEN p.inferred_income_band IS NOT NULL THEN p.gr_id END) as with_income_band,
    COUNT(DISTINCT CASE WHEN p.cluster_id IS NOT NULL THEN p.gr_id END) as with_cluster,
    COUNT(DISTINCT CASE WHEN array_length(p.risk_flags, 1) > 0 THEN p.gr_id END) as with_flags
FROM golden_records gr
LEFT JOIN profile_360 p ON gr.gr_id = p.gr_id
WHERE gr.status = 'active'
"""
coverage = db.execute_query(query)
print("Profile Coverage:")
print(coverage)

# Calculate percentages (the aggregate query yields a single row)
counts = coverage.iloc[0]
total = counts['total_golden_records']
profiles = counts['profiles_with_360']
income = counts['with_income_band']
cluster = counts['with_cluster']
flags = counts['with_flags']

if total > 0:
    print("\nCoverage Percentages:")
    print(f"360° Profiles: {profiles/total*100:.2f}%")
    print(f"Income Band Inference: {income/total*100:.2f}%")
    print(f"Cluster Assignment: {cluster/total*100:.2f}%")
    print(f"Risk Flags: {flags/total*100:.2f}%")
else:
    # With no active golden records every ratio would divide by zero.
    print("\nNo active golden records found; coverage percentages are undefined.")


In [None]:
# Close connection
# Disconnect the `db` connector created in the setup cell now that every
# exploration query has run, then report completion.
db.disconnect()
print("✅ Exploration complete")
