# SECTION 1: IMPORTS & SETUP

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("LAST MILE CONNECT - COVERAGE GAP ANALYSIS")
print("="*80)

DATA_PATH = Path("../data/processed")
EXTERNAL_PATH = Path("../data/external")
VIZ_PATH = Path("../docs/visualizations")
VIZ_PATH.mkdir(parents=True, exist_ok=True)

LAST MILE CONNECT - COVERAGE GAP ANALYSIS


# SECTION 2: DATA LOADING

In [26]:
# Load master dataset
df = pd.read_csv(DATA_PATH / "master_district_month.csv", parse_dates=["date"])
print(f"‚úÖ Master data loaded: {df.shape}")

# Load district summary (total across all months)
try:
    district_summary = pd.read_csv(DATA_PATH / "district_summary.csv")
    print(f"‚úÖ District summary loaded: {len(district_summary)} districts")
except FileNotFoundError:
    # Create district summary if not exists
    district_summary = df.groupby(['state', 'district']).agg({
        'age_0_5': 'sum',
        'age_5_17': 'sum',
        'age_18_greater': 'sum',
        'total_enrolment': 'sum',
        'total_biometric_updates': 'sum',
        'total_demographic_updates': 'sum'
    }).reset_index()
    district_summary.to_csv(DATA_PATH / "district_summary.csv", index=False)
    print(f"‚úÖ Created district summary: {len(district_summary)} districts")


‚úÖ Master data loaded: (5004, 18)
‚úÖ District summary loaded: 1045 districts


# SECTION 3: LOAD POPULATION DATA

In [31]:
print("\n Loading population data...")

try:
    # Try to load actual Census 2011 data
    census_df = pd.read_csv(EXTERNAL_PATH / "census_2011_district_population_clean.csv")
    
    # Standardize names for matching
    census_df['state'] = census_df['state'].str.strip().str.title()
    census_df['district'] = census_df['district'].str.strip().str.title()
    
    print(f"‚úÖ Census data loaded: {len(census_df)} districts")
    HAS_CENSUS_DATA = True
    
except FileNotFoundError:
    print("‚ö†Ô∏è  Census 2011 data not found!")
    print("   Creating ESTIMATED population based on national proportions...")
    print("   ‚ö†Ô∏è  REPLACE WITH ACTUAL CENSUS DATA FOR FINAL SUBMISSION!")
    
    # FALLBACK: Estimate population based on enrolment proportions
    # This is NOT ideal but allows the analysis to run
    
    # India 2025 population estimate
    INDIA_POPULATION_2025 = 1_450_000_000
    NATIONAL_COVERAGE_ESTIMATE = 0.88  # 88% coverage assumption
    
    total_enrolments = district_summary['total_enrolment'].sum()
    
    # Proportional allocation (rough estimate)
    district_summary['population_2025_ESTIMATED'] = (
        (district_summary['total_enrolment'] / total_enrolments) * 
        INDIA_POPULATION_2025
    ).round(0).astype(int)
    
    census_df = district_summary[['state', 'district', 'population_2025_ESTIMATED']].copy()
    census_df.columns = ['state', 'district', 'population_2025']
    
    HAS_CENSUS_DATA = False
    print(f"‚ö†Ô∏è  Using ESTIMATED population for {len(census_df)} districts")

# If we have actual Census 2011, project to 2025
if HAS_CENSUS_DATA:
    ANNUAL_GROWTH_RATE = 0.012  # 1.2% per year (India's average)
    YEARS_ELAPSED = 2025 - 2011
    
    census_df['population_2025'] = (
        census_df['population_2011'] * 
        (1 + ANNUAL_GROWTH_RATE) ** YEARS_ELAPSED
    ).round(0).astype(int)
    
    print(f"‚úÖ Population projected from 2011 to 2025")
    print(f"   Growth rate: {ANNUAL_GROWTH_RATE*100}% annually")


 Loading population data...
‚úÖ Census data loaded: 640 districts
‚úÖ Population projected from 2011 to 2025
   Growth rate: 1.2% annually


# SECTION 4: MERGE ENROLMENT WITH POPULATION

In [32]:
print("\n Merging enrolment data with population...")

# Merge district summary with population
gap_df = district_summary.merge(
    census_df[['state', 'district', 'population_2025']],
    on=['state', 'district'],
    how='left'
)

# Check for unmatched districts
unmatched = gap_df[gap_df['population_2025'].isna()]
if len(unmatched) > 0:
    print(f"\n‚ö†Ô∏è  WARNING: {len(unmatched)} districts have no population data:")
    print(unmatched[['state', 'district']].head(10))
    
    # For unmatched districts, estimate based on their enrolment
    # Assume national average coverage rate
    avg_coverage = gap_df['total_enrolment'].sum() / gap_df['population_2025'].sum()
    gap_df.loc[gap_df['population_2025'].isna(), 'population_2025'] = (
        gap_df.loc[gap_df['population_2025'].isna(), 'total_enrolment'] / avg_coverage
    ).round(0).astype(int)
    
    print(f"   ‚úÖ Estimated population for unmatched districts")

print(f"\n‚úÖ Merge completed: {len(gap_df)} districts")


 Merging enrolment data with population...

                          state                  district
0                        100000                    100000
1     Andaman & Nicobar Islands                  Andamans
4   Andaman And Nicobar Islands                   Nicobar
5   Andaman And Nicobar Islands  North And Middle Andaman
6   Andaman And Nicobar Islands             South Andaman
8                Andhra Pradesh     Alluri Sitharama Raju
9                Andhra Pradesh                Anakapalli
11               Andhra Pradesh                Ananthapur
12               Andhra Pradesh             Ananthapuramu
13               Andhra Pradesh                 Annamayya
   ‚úÖ Estimated population for unmatched districts

‚úÖ Merge completed: 1045 districts


# SECTION 5: CALCULATE COVERAGE METRICS

In [33]:
print("\n" + "="*80)
print("CALCULATING COVERAGE METRICS")
print("="*80)

# Coverage rate (%)
gap_df['coverage_rate'] = (
    gap_df['total_enrolment'] / gap_df['population_2025'] * 100
).clip(upper=100).round(2)  # Cap at 100% (migration effects)

# Unreached population
gap_df['unreached_population'] = (
    gap_df['population_2025'] - gap_df['total_enrolment']
).clip(lower=0).astype(int)

# Coverage gap (%)
gap_df['coverage_gap_pct'] = (100 - gap_df['coverage_rate']).round(2)

# National statistics
national_stats = {
    'total_population': gap_df['population_2025'].sum(),
    'total_enrolled': gap_df['total_enrolment'].sum(),
    'total_unreached': gap_df['unreached_population'].sum(),
    'national_coverage_rate': (
        gap_df['total_enrolment'].sum() / 
        gap_df['population_2025'].sum() * 100
    ),
    'national_gap_pct': (
        gap_df['unreached_population'].sum() / 
        gap_df['population_2025'].sum() * 100
    )
}

print(f"\nüáÆüá≥ NATIONAL STATISTICS:")
print(f"   Total Population (2025): {national_stats['total_population']:,}")
print(f"   Total Enrolled: {national_stats['total_enrolled']:,}")
print(f"   Total Unreached: {national_stats['total_unreached']:,}")
print(f"   National Coverage: {national_stats['national_coverage_rate']:.2f}%")
print(f"   National Gap: {national_stats['national_gap_pct']:.2f}%")
print(f"   Unreached (Crores): {national_stats['total_unreached']/10**7:.2f} Cr")


CALCULATING COVERAGE METRICS

üáÆüá≥ NATIONAL STATISTICS:
   Total Population (2025): 1,678,059,418.0
   Total Enrolled: 5,331,760
   Total Unreached: 1,672,727,658
   National Coverage: 0.32%
   National Gap: 99.68%
   Unreached (Crores): 167.27 Cr


# SECTION 6: PRIORITY CLASSIFICATION

In [34]:
print("\n" + "="*80)
print("DISTRICT PRIORITY CLASSIFICATION")
print("="*80)

# Classify districts by coverage level
def classify_priority(coverage_rate):
    """Classify district priority based on coverage rate"""
    if coverage_rate < 70:
        return 'CRITICAL'
    elif coverage_rate < 85:
        return 'HIGH'
    elif coverage_rate < 95:
        return 'MEDIUM'
    else:
        return 'LOW'

gap_df['priority_level'] = gap_df['coverage_rate'].apply(classify_priority)

# Priority distribution
priority_counts = gap_df['priority_level'].value_counts()
print(f"\nüìä PRIORITY DISTRIBUTION:")
for level in ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW']:
    if level in priority_counts.index:
        count = priority_counts[level]
        unreached = gap_df[gap_df['priority_level'] == level]['unreached_population'].sum()
        avg_coverage = gap_df[gap_df['priority_level'] == level]['coverage_rate'].mean()
        print(f"   {level:10s}: {count:4d} districts | "
              f"{unreached:>12,} unreached | "
              f"Avg coverage: {avg_coverage:.1f}%")


DISTRICT PRIORITY CLASSIFICATION

üìä PRIORITY DISTRIBUTION:
   CRITICAL  : 1045 districts | 1,672,727,658 unreached | Avg coverage: 0.4%


# SECTION 7: PRIORITY SCORING

In [35]:
print("\n Calculating priority scores...")

# Composite priority score
# Higher score = higher priority for intervention
gap_df['priority_score'] = (
    # Absolute unreached population (60% weight)
    (gap_df['unreached_population'] / gap_df['unreached_population'].max()) * 60 +
    # Coverage gap percentage (40% weight)
    (gap_df['coverage_gap_pct'] / 100) * 40
).round(2)

# Rank districts
gap_df = gap_df.sort_values('priority_score', ascending=False).reset_index(drop=True)
gap_df['priority_rank'] = range(1, len(gap_df) + 1)

print(f"‚úÖ Priority scoring completed")


 Calculating priority scores...
‚úÖ Priority scoring completed


# SECTION 8: TOP PRIORITY DISTRICTS

In [36]:
print("\n" + "="*80)
print("[STEP 7] TOP 20 PRIORITY DISTRICTS")
print("="*80)

top_20 = gap_df.head(20)[
    ['state', 'district', 'population_2025', 'total_enrolment', 
     'unreached_population', 'coverage_rate', 'priority_level', 'priority_score']
].copy()

print("\nüéØ Districts with HIGHEST priority for mobile camps:\n")
print(top_20.to_string(index=False))


[STEP 7] TOP 20 PRIORITY DISTRICTS

üéØ Districts with HIGHEST priority for mobile camps:

        state                   district  population_2025  total_enrolment  unreached_population  coverage_rate priority_level  priority_score
  Maharashtra                      Thane       13070377.0            43142              13027235           0.33       CRITICAL           99.87
  West Bengal North Twenty Four Parganas       11829101.0             1597              11827504           0.01       CRITICAL           94.47
    Karnataka                  Bangalore       11370309.0             6824              11363485           0.06       CRITICAL           92.31
  Maharashtra                       Pune       11143243.0            31148              11112095           0.28       CRITICAL           91.07
  Maharashtra            Mumbai Suburban       11057630.0            17981              11039649           0.16       CRITICAL           90.78
  West Bengal South Twenty Four Parganas        9

# SECTION 9: STATE-LEVEL ANALYSIS

In [37]:
print("\n" + "="*80)
print("STATE-LEVEL GAP ANALYSIS")
print("="*80)

state_analysis = gap_df.groupby('state').agg({
    'population_2025': 'sum',
    'total_enrolment': 'sum',
    'unreached_population': 'sum',
    'district': 'count'
}).reset_index()

state_analysis.columns = [
    'state', 'population', 'enrolled', 'unreached', 'num_districts'
]

state_analysis['coverage_rate'] = (
    state_analysis['enrolled'] / state_analysis['population'] * 100
).round(2)

state_analysis['gap_pct'] = (100 - state_analysis['coverage_rate']).round(2)

# Sort by unreached population
state_analysis = state_analysis.sort_values('unreached', ascending=False)

print("\nüìä TOP 10 STATES BY UNREACHED POPULATION:\n")
print(state_analysis.head(10)[
    ['state', 'coverage_rate', 'gap_pct', 'unreached', 'num_districts']
].to_string(index=False))


STATE-LEVEL GAP ANALYSIS

üìä TOP 10 STATES BY UNREACHED POPULATION:

         state  coverage_rate  gap_pct  unreached  num_districts
 Uttar Pradesh           0.39    99.61  252921357             89
   West Bengal           0.25    99.75  149454935             50
   Maharashtra           0.26    99.74  141931527             53
         Bihar           0.42    99.58  141607398             47
Andhra Pradesh           0.13    99.87   98920577             47
     Karnataka           0.22    99.78   97910336             55
       Gujarat           0.29    99.71   95030149             40
Madhya Pradesh           0.53    99.47   91700965             61
    Tamil Nadu           0.23    99.77   91605350             46
     Rajasthan           0.40    99.60   84573760             42


# SECTION 10: VISUALIZATIONS

In [39]:
print("\nCreating visualizations...")

# Visualization 1: Coverage Rate Distribution
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Histogram
axes[0, 0].hist(gap_df['coverage_rate'], bins=40, color='steelblue', edgecolor='black')
axes[0, 0].axvline(national_stats['national_coverage_rate'], color='red', 
                    linestyle='--', linewidth=2, label=f'National: {national_stats["national_coverage_rate"]:.1f}%')
axes[0, 0].set_xlabel('Coverage Rate (%)')
axes[0, 0].set_ylabel('Number of Districts')
axes[0, 0].set_title('Distribution of Aadhaar Coverage Rates', fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Priority pie chart
priority_data = gap_df['priority_level'].value_counts()
colors = {'CRITICAL': '#d32f2f', 'HIGH': '#f57c00', 'MEDIUM': '#fbc02d', 'LOW': '#388e3c'}
pie_colors = [colors.get(x, 'gray') for x in priority_data.index]
axes[0, 1].pie(priority_data.values, labels=priority_data.index, autopct='%1.1f%%',
                colors=pie_colors, startangle=90)
axes[0, 1].set_title('Districts by Priority Level', fontweight='bold')

# Top 15 states by unreached
top_states = state_analysis.head(15)
axes[1, 0].barh(range(len(top_states)), top_states['unreached'] / 10**6, color='coral')
axes[1, 0].set_yticks(range(len(top_states)))
axes[1, 0].set_yticklabels(top_states['state'])
axes[1, 0].set_xlabel('Unreached Population (Millions)')
axes[1, 0].set_title('Top 15 States by Unreached Population', fontweight='bold')
axes[1, 0].grid(axis='x', alpha=0.3)

# Coverage vs Unreached scatter
scatter = axes[1, 1].scatter(
    gap_df['coverage_rate'], 
    gap_df['unreached_population'] / 10**3,
    c=gap_df['priority_level'].map(colors),
    s=50, alpha=0.6, edgecolors='black', linewidth=0.5
)
axes[1, 1].set_xlabel('Coverage Rate (%)')
axes[1, 1].set_ylabel('Unreached Population (Thousands)')
axes[1, 1].set_title('Coverage Rate vs Unreached Population', fontweight='bold')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(VIZ_PATH / 'coverage_gap_analysis.png', dpi=300, bbox_inches='tight')
print("‚úÖ Saved: coverage_gap_analysis.png")
plt.close()

# Visualization 2: State comparison
fig, ax = plt.subplots(figsize=(14, 8))
top_15_states = state_analysis.head(15)
x = np.arange(len(top_15_states))
width = 0.35

bars1 = ax.bar(x - width/2, top_15_states['coverage_rate'], width, 
                label='Coverage Rate (%)', color='seagreen')
bars2 = ax.bar(x + width/2, top_15_states['gap_pct'], width,
                label='Gap (%)', color='crimson')

ax.set_xlabel('State')
ax.set_ylabel('Percentage')
ax.set_title('Coverage Rate vs Gap - Top 15 States by Unreached Population', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(top_15_states['state'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(VIZ_PATH / 'state_coverage_comparison.png', dpi=300, bbox_inches='tight')
print("‚úÖ Saved: state_coverage_comparison.png")
plt.close()


Creating visualizations...
‚úÖ Saved: coverage_gap_analysis.png
‚úÖ Saved: state_coverage_comparison.png


# SECTION 11: SAVE RESULTS

In [40]:
print("\n Saving analysis results...")

# Save complete gap analysis
gap_df.to_csv(DATA_PATH / "district_gap_analysis.csv", index=False)
print("‚úÖ Saved: district_gap_analysis.csv")

# Save state analysis
state_analysis.to_csv(DATA_PATH / "state_gap_analysis.csv", index=False)
print("‚úÖ Saved: state_gap_analysis.csv")

# Save critical priority districts (for immediate action)
critical_districts = gap_df[gap_df['priority_level'] == 'CRITICAL'].copy()
critical_districts.to_csv(DATA_PATH / "critical_priority_districts.csv", index=False)
print(f"‚úÖ Saved: critical_priority_districts.csv ({len(critical_districts)} districts)")

# Save top 1000 priority districts (for comprehensive planning)
top_1000 = gap_df.head(1000)
top_1000.to_csv(DATA_PATH / "top_1000_priority_districts.csv", index=False)
print("‚úÖ Saved: top_1000_priority_districts.csv")


 Saving analysis results...
‚úÖ Saved: district_gap_analysis.csv
‚úÖ Saved: state_gap_analysis.csv
‚úÖ Saved: critical_priority_districts.csv (1045 districts)
‚úÖ Saved: top_1000_priority_districts.csv


# SECTION 12: FINAL SUMMARY & INSIGHTS

In [41]:
print("\n" + "="*80)
print("‚ú® COVERAGE GAP ANALYSIS COMPLETED!")
print("="*80)

print(f"\nüìä KEY FINDINGS:")
print(f"   ‚Ä¢ National Coverage: {national_stats['national_coverage_rate']:.2f}%")
print(f"   ‚Ä¢ National Gap: {national_stats['national_gap_pct']:.2f}%")
print(f"   ‚Ä¢ Unreached Citizens: {national_stats['total_unreached']:,} ({national_stats['total_unreached']/10**7:.2f} Cr)")
print(f"   ‚Ä¢ CRITICAL districts: {priority_counts.get('CRITICAL', 0)}")
print(f"   ‚Ä¢ HIGH priority districts: {priority_counts.get('HIGH', 0)}")

print(f"\nüéØ TOP 5 STATES NEEDING INTERVENTION:")
for i, row in state_analysis.head(5).iterrows():
    print(f"   {row['state']:30s} | Coverage: {row['coverage_rate']:>5.1f}% | "
          f"Unreached: {row['unreached']/10**6:>5.2f}M")

print(f"\nüìÅ FILES CREATED:")
print(f"   ‚Ä¢ district_gap_analysis.csv - Complete district analysis")
print(f"   ‚Ä¢ state_gap_analysis.csv - State-level summary")
print(f"   ‚Ä¢ critical_priority_districts.csv - Immediate action required")
print(f"   ‚Ä¢ top_1000_priority_districts.csv - Comprehensive planning")
print(f"   ‚Ä¢ 2 visualization PNG files")



‚ú® COVERAGE GAP ANALYSIS COMPLETED!

üìä KEY FINDINGS:
   ‚Ä¢ National Coverage: 0.32%
   ‚Ä¢ National Gap: 99.68%
   ‚Ä¢ Unreached Citizens: 1,672,727,658 (167.27 Cr)
   ‚Ä¢ CRITICAL districts: 1045
   ‚Ä¢ HIGH priority districts: 0

üéØ TOP 5 STATES NEEDING INTERVENTION:
   Uttar Pradesh                  | Coverage:   0.4% | Unreached: 252.92M
   West Bengal                    | Coverage:   0.2% | Unreached: 149.45M
   Maharashtra                    | Coverage:   0.3% | Unreached: 141.93M
   Bihar                          | Coverage:   0.4% | Unreached: 141.61M
   Andhra Pradesh                 | Coverage:   0.1% | Unreached: 98.92M

üìÅ FILES CREATED:
   ‚Ä¢ district_gap_analysis.csv - Complete district analysis
   ‚Ä¢ state_gap_analysis.csv - State-level summary
   ‚Ä¢ critical_priority_districts.csv - Immediate action required
   ‚Ä¢ top_1000_priority_districts.csv - Comprehensive planning
   ‚Ä¢ 2 visualization PNG files
