# Golden Record - Data Exploration

**Use Case:** AI-PLATFORM-01 - Golden Record Creation & Maintenance  
**Objective:** Explore citizen data to understand data quality, duplicates, and patterns  
**MLflow Experiment:** `smart/golden_record/baseline_v1`


In [None]:
# Import libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import warnings
# Silence every warning category for the rest of the notebook session
warnings.filterwarnings('ignore')

# Make the shared utilities and this use case's sources importable.
# The project root is resolved as three levels above the working directory.
project_root = Path.cwd().parent.parent.parent
sys.path.extend(
    str(project_root.joinpath(*parts))
    for parts in (
        ('shared', 'utils'),
        ('use-cases', 'golden_record', 'src'),
    )
)

# Plot defaults applied to every figure created below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
print('✅ Libraries imported successfully')


In [None]:
# Initialize MLflow
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('smart/golden_record/baseline_v1')

# Re-running this cell while a run is still active would make start_run()
# raise, so close any leftover run first.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Start the run without a `with` block: a context manager would end the run
# as soon as this cell finishes, and the log_param/log_metric/log_artifact
# calls in later cells would then go to a separate, implicitly created run
# instead of 'data_exploration'. The run stays active until mlflow.end_run()
# is called.
mlflow.start_run(run_name='data_exploration')
print("✅ MLflow experiment started")


In [None]:
# Load the citizen records through the project's data loader
from data_loader import GoldenRecordDataLoader

loader = GoldenRecordDataLoader()
print("Loading citizens data...")
citizens = loader.load_all_citizens()

record_total = len(citizens)
column_names = list(citizens.columns)
print(f"✅ Loaded {record_total} citizens")
print(f"\nColumns: {column_names}")

# Record the dataset dimensions as MLflow parameters
for param_name, param_value in (
    ('dataset_size', record_total),
    ('num_features', len(column_names)),
):
    mlflow.log_param(param_name, param_value)


In [None]:
# Basic statistics: size, memory footprint and column dtypes
banner = "=" * 60
row_count, column_count = citizens.shape

print(banner, "DATASET OVERVIEW", banner, sep="\n")
print(f"Total records: {row_count:,}")
print(f"Total columns: {column_count}")

# deep=True also counts the contents of object columns, not just the pointers
megabytes = citizens.memory_usage(deep=True).sum() / 1024**2
print(f"\nMemory usage: {megabytes:.2f} MB")

# The leading empty string yields the blank line before the second banner
print("", banner, "DATA TYPES", banner, sep="\n")
print(citizens.dtypes)


In [None]:
# Missing values analysis: per-column null counts and percentages
missing_data = citizens.isna().sum()
missing_pct = missing_data.div(len(citizens)).mul(100)

# Sort by percentage (highest first), then keep only columns that
# actually have missing values.
missing_df = (
    pd.DataFrame({'Missing Count': missing_data, 'Missing %': missing_pct})
    .sort_values('Missing %', ascending=False)
    .loc[lambda frame: frame['Missing Count'] > 0]
)

print("Missing Values Analysis", "=" * 60, sep="\n")
print(missing_df)

# Log the overall null count and the number of affected columns to MLflow
mlflow.log_metric('missing_values_total', missing_data.sum())
mlflow.log_metric('columns_with_missing', len(missing_df))

# Chart the percentages only when at least one column has gaps
if not missing_df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_df['Missing %'].plot(kind='barh', ax=ax)
    ax.set_title('Missing Values Percentage by Column')
    ax.set_xlabel('Missing Percentage (%)')
    fig.tight_layout()
    # Save to the working directory so the image can be attached to the run
    fig.savefig('missing_values.png')
    mlflow.log_artifact('missing_values.png')
    plt.show()


In [None]:
# Duplicate detection - Jan Aadhaar
print("="*60)
print("DUPLICATE ANALYSIS - Jan Aadhaar")
print("="*60)

jan_aadhaar = citizens['jan_aadhaar']

# duplicated() treats missing values as equal to one another, so without the
# notna() mask every record lacking a Jan Aadhaar would be reported as a
# duplicate of every other such record. Missing IDs are a separate data
# quality issue and are excluded here.
duplicate_aadhaar = jan_aadhaar.notna() & jan_aadhaar.duplicated(keep=False)
duplicate_count = int(duplicate_aadhaar.sum())

# Guard the share against an empty dataset and compute it once so the
# printed value and the logged metric cannot diverge.
total_records = len(citizens)
duplicate_pct = duplicate_count / total_records * 100 if total_records else 0.0

print(f"Duplicate Jan Aadhaar records: {duplicate_count}")
print(f"Percentage: {duplicate_pct:.2f}%")

if duplicate_count > 0:
    # Sort so records sharing an ID appear next to each other in the sample
    duplicate_records = citizens[duplicate_aadhaar].sort_values('jan_aadhaar')
    print("\nSample duplicate records:")
    print(duplicate_records[['jan_aadhaar', 'full_name', 'date_of_birth', 'district_id']].head(10))

# Log to MLflow
mlflow.log_metric('duplicate_jan_aadhaar', duplicate_count)
mlflow.log_metric('duplicate_percentage', duplicate_pct)


In [None]:
# Close loader and finish the MLflow run
try:
    loader.close()
finally:
    # End whichever MLflow run is currently active so it is not left open;
    # this is a no-op when no run is active, and it runs even if closing
    # the loader fails.
    mlflow.end_run()
print("\n✅ Data exploration completed")
