In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------
# 1. DATA GENERATION (SIMULATION)
# ---------------------------------------------------------
# Creating a dummy dataset to verify the code logic before giving it to the user
def create_dummy_data():
    dates = pd.date_range(start='2020-01-01', end='2023-12-31', freq='M')
    states = ['Andhra Pradesh', 'Maharashtra', 'Uttar Pradesh', 'Punjab']
    districts = {
        'Andhra Pradesh': ['Krishna', 'Guntur', 'Chittoor'],
        'Maharashtra': ['Pune', 'Mumbai', 'Nagpur'],
        'Uttar Pradesh': ['Lucknow', 'Varanasi', 'Agra'],
        'Punjab': ['Amritsar', 'Ludhiana', 'Jalandhar']
    }
    
    data = []
    for date in dates:
        for state in states:
            for district in districts[state]:
                # Simulate 5 pincodes per district
                for pin in range(1, 6):
                    pincode = f"{hash(state+district) % 100000 + pin + 500000}"
                    
                    # Random logic with some patterns
                    base = np.random.randint(50, 200)
                    
                    # Age 0-5 (Growth trend + Seasonality)
                    age_0_5 = int(base * 0.1 + (date.month % 4) * 2 + np.random.normal(0, 2))
                    
                    # Age 5-17 (Steady)
                    age_5_17 = int(base * 0.3 + np.random.normal(0, 5))
                    
                    # Age 18+ (Bulk)
                    age_18_plus = int(base * 0.6 + np.random.normal(0, 10))
                    
                    # Inject Anomaly
                    if np.random.random() < 0.01:
                        age_0_5 *= 5  # Spike
                    
                    data.append([date, state, district, pincode, max(0, age_0_5), max(0, age_5_17), max(0, age_18_plus)])
    
    df = pd.DataFrame(data, columns=['Date', 'State', 'District', 'Pincode', 'Age_0_5', 'Age_5_17', 'Age_18_greater'])
    return df

df = create_dummy_data()
df.to_csv('aadhaar_cleaned.csv', index=False)

# ---------------------------------------------------------
# 2. ANALYSIS SCRIPT (The logic to be delivered)
# ---------------------------------------------------------

class AadhaarHackathonAnalyzer:
    def __init__(self, df):
        self.df = df
        self.prepare_data()
        
    def prepare_data(self):
        self.df['Date'] = pd.to_datetime(self.df['Date'])
        self.df['Total_Reg'] = self.df['Age_0_5'] + self.df['Age_5_17'] + self.df['Age_18_greater']
        self.df['Year'] = self.df['Date'].dt.year
        self.df['Month'] = self.df['Date'].dt.month
        
    def analyze_benfords_law(self):
        """Forensic Analysis using Benford's Law"""
        # Extract leading digit from Total_Reg
        leading_digits = self.df['Total_Reg'].astype(str).str[0].astype(int)
        leading_digits = leading_digits[leading_digits > 0]
        
        observed_counts = leading_digits.value_counts(normalize=True).sort_index()
        expected_counts = np.log10(1 + 1/np.arange(1, 10))
        
        # Plot
        plt.figure(figsize=(10, 6))
        plt.bar(observed_counts.index, observed_counts.values, alpha=0.6, label='Observed', color='teal')
        plt.plot(range(1, 10), expected_counts, color='red', marker='o', linestyle='--', label='Benford Expected')
        plt.title("Forensic Check: Benford's Law Analysis on Enrolments")
        plt.xlabel("Leading Digit")
        plt.ylabel("Frequency")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('benford_analysis.png')
        plt.close()
        
    def digital_inequality_gini(self):
        """Calculate Gini Coefficient for Digital Access per District"""
        def gini(x):
            total = 0
            for i, xi in enumerate(x[:-1], 1):
                total += np.sum(np.abs(xi - x[i:]))
            return total / (len(x)**2 * np.mean(x)) if np.mean(x) > 0 else 0

        gini_scores = self.df.groupby(['State', 'District'])['Total_Reg'].apply(lambda x: gini(x.values)).reset_index()
        gini_scores.rename(columns={'Total_Reg': 'Gini_Inequality'}, inplace=True)
        
        # Sort and Plot Top 10 Unequal
        top_inequal = gini_scores.sort_values('Gini_Inequality', ascending=False).head(10)
        
        plt.figure(figsize=(12, 6))
        sns.barplot(data=top_inequal, y='District', x='Gini_Inequality', hue='State', dodge=False)
        plt.title("Top 10 Districts with Highest Digital Inequality (Gini Index)")
        plt.xlabel("Gini Coefficient (0=Equal, 1=Unequal)")
        plt.tight_layout()
        plt.savefig('gini_inequality.png')
        plt.close()
        
        return gini_scores

    def detect_anomalies(self):
        """ML-based Anomaly Detection using Isolation Forest"""
        # Feature Engineering for Anomaly Detection
        features = self.df[['Total_Reg', 'Age_0_5', 'Age_5_17']].copy()
        
        # Fit Model
        iso = IsolationForest(contamination=0.01, random_state=42)
        self.df['Anomaly_Score'] = iso.fit_predict(features)
        
        # Plot Anomalies
        anomalies = self.df[self.df['Anomaly_Score'] == -1]
        
        plt.figure(figsize=(10, 6))
        plt.scatter(self.df['Total_Reg'], self.df['Age_0_5'], c='blue', alpha=0.1, label='Normal')
        plt.scatter(anomalies['Total_Reg'], anomalies['Age_0_5'], c='red', alpha=0.6, label='Anomaly')
        plt.title("AI-Driven Anomaly Detection: Unusual Age Distributions")
        plt.xlabel("Total Registrations")
        plt.ylabel("Age 0-5 Registrations")
        plt.legend()
        plt.savefig('anomaly_detection.png')
        plt.close()
        
        return anomalies

    def forecast_trends(self):
        """Time Series Decomposition"""
        monthly_trend = self.df.groupby('Date')['Total_Reg'].sum()
        
        # Decompose
        if len(monthly_trend) > 24: # Need enough data
            result = seasonal_decompose(monthly_trend, model='additive')
            
            plt.figure(figsize=(12, 8))
            plt.subplot(411)
            plt.plot(result.observed, label='Observed')
            plt.legend(loc='upper left')
            plt.title('Time Series Decomposition of National Enrolments')
            
            plt.subplot(412)
            plt.plot(result.trend, label='Trend')
            plt.legend(loc='upper left')
            
            plt.subplot(413)
            plt.plot(result.seasonal, label='Seasonality')
            plt.legend(loc='upper left')
            
            plt.subplot(414)
            plt.plot(result.resid, label='Residuals')
            plt.legend(loc='upper left')
            
            plt.tight_layout()
            plt.savefig('forecast_decomposition.png')
            plt.close()
            
    def state_clustering(self):
        """Cluster States based on Demographic Profile"""
        state_profile = self.df.groupby('State')[['Age_0_5', 'Age_5_17', 'Age_18_greater']].mean()
        scaler = StandardScaler()
        scaled = scaler.fit_transform(state_profile)
        
        kmeans = KMeans(n_clusters=3, random_state=42)
        state_profile['Cluster'] = kmeans.fit_predict(scaled)
        
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=state_profile, x='Age_0_5', y='Age_18_greater', hue='Cluster', s=100, palette='viridis')
        for i in range(state_profile.shape[0]):
            plt.text(state_profile.Age_0_5[i]+0.2, state_profile.Age_18_greater[i], state_profile.index[i], fontsize=9)
        plt.title("State Clustering: Grouping States by Demographic Patterns")
        plt.savefig('state_clustering.png')
        plt.close()

# Execute Analysis
analyzer = AadhaarHackathonAnalyzer(df)
analyzer.analyze_benfords_law()
gini_df = analyzer.digital_inequality_gini()
anomalies = analyzer.detect_anomalies()
analyzer.forecast_trends()
analyzer.state_clustering()

print("Script execution completed. Images generated.")
print("Anomalies Found:", len(anomalies))

Script execution completed. Images generated.
Anomalies Found: 29


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Force Plotly to render in Notebooks
import plotly.io as pio
pio.renderers.default = 'iframe'  # 'iframe' is most robust for Kaggle/Jupyter

# Configuration
FILE_PATH = "/kaggle/input/aadhar/aadhaar_cleaned.csv"

# ==========================================
# 1. ADVANCED DATA LOADER & CLEANER
# ==========================================
def load_and_clean_data(path):
    print(f"üîÑ Loading data from {path}...")
    try:
        df = pd.read_csv(path)
        
        # 1. CLEAN HEADERS
        df.columns = df.columns.str.replace('#', '', regex=False).str.strip()
        
        # 2. INTELLIGENT COLUMN MAPPING
        rename_map = {}
        for col in df.columns:
            if col.startswith('Age_18'):
                rename_map[col] = 'Age_18_greater'
        df.rename(columns=rename_map, inplace=True)

        # 3. DATE PARSING
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        
        # 4. NUMERIC CONVERSION (STRICT)
        numeric_cols = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for col in numeric_cols:
            if col in df.columns:
                # Force numeric, coerce errors to NaN, then fill with 0
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        
        # Create 'Total_Reg' metric
        df['Total_Reg'] = df['Age_0_5'] + df['Age_5_17'] + df['Age_18_greater']
        
        # 5. DERIVED METRICS
        # Avoid Division by Zero by replacing 0 with NaN temporarily or handling it
        df['Child_Ratio'] = df.apply(lambda x: x['Age_0_5'] / x['Total_Reg'] if x['Total_Reg'] > 0 else 0, axis=1)
        
        # Drop rows where everything is 0 to keep analysis clean
        df = df[df['Total_Reg'] > 0]
        
        print(f"‚úî Data Loaded & Cleaned: {df.shape[0]} valid rows processed.")
        return df

    except Exception as e:
        print(f"‚ùå Critical Error in Loader: {e}")
        return None

# ==========================================
# 2. FORENSIC ANALYSIS: BENFORD'S LAW
# ==========================================
def analyze_benfords_law(df):
    print("\nüîç Running Benford's Law Forensic Check...")
    
    # Analyze 'Total_Reg' column - Ensure strict int
    # Filter for values >= 10 to ensure leading digits are meaningful
    valid_data = df[df['Total_Reg'] >= 10]['Total_Reg'].astype(int).astype(str)
    
    if len(valid_data) == 0:
        print("‚ö†Ô∏è Not enough data > 10 registrations to run Benford's Law.")
        return

    leading_digits = valid_data.str[0].astype(int)
    
    observed = leading_digits.value_counts(normalize=True).sort_index()
    expected = np.log10(1 + 1/np.arange(1, 10))
    
    # Create DF for Plotly
    plot_df = pd.DataFrame({
        'Digit': observed.index,
        'Frequency': observed.values
    })
    
    fig = go.Figure()
    fig.add_trace(go.Bar(x=plot_df['Digit'], y=plot_df['Frequency'], name='Observed (Your Data)', marker_color='#008080'))
    fig.add_trace(go.Scatter(x=list(range(1, 10)), y=expected, name='Benford Law (Natural)', line=dict(color='red', dash='dash')))
    
    fig.update_layout(
        title="<b>Forensic Integrity Check: Benford's Law</b>",
        xaxis_title="Leading Digit",
        yaxis_title="Frequency",
        template="plotly_white",
        xaxis=dict(tickmode='linear', tick0=1, dtick=1)
    )
    fig.show()

# ==========================================
# 3. SOCIAL IMPACT: GINI INEQUALITY INDEX
# ==========================================
def analyze_digital_inequality(df):
    print("\n‚öñÔ∏è Calculating Digital Inequality (Gini Index)...")
    
    # Fast Gini implementation using NumPy
    def gini_fast(x):
        if len(x) == 0 or np.mean(x) == 0: return 0
        sorted_x = np.sort(x)
        n = len(x)
        cumx = np.cumsum(sorted_x, dtype=float)
        return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n

    # Group by District
    # Ensure Total_Reg is float for calculation
    gini_scores = df.groupby(['State', 'District'])['Total_Reg'].apply(lambda x: gini_fast(x.values)).reset_index()
    gini_scores.rename(columns={'Total_Reg': 'Inequality_Score'}, inplace=True)
    
    # Top 15 Unequal
    top_unequal = gini_scores.sort_values('Inequality_Score', ascending=False).head(15)
    
    fig = px.bar(
        top_unequal, 
        y='District', 
        x='Inequality_Score', 
        color='State',
        title="<b>The Digital Divide: Top 15 Unequal Districts</b>",
        orientation='h',
        color_discrete_sequence=px.colors.qualitative.Bold
    )
    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    fig.show()

# ==========================================
# 4. HIERARCHICAL VISUALIZATION (SUNBURST)
# ==========================================
def visualize_hierarchy(df):
    print("\nüó∫Ô∏è Generating Interactive Hierarchy...")
    
    # 1. Aggregate first
    viz_df = df.groupby(['State', 'District', 'Pincode']).agg({
        'Total_Reg': 'sum',
        'Age_0_5': 'sum'
    }).reset_index()
    
    # 2. CRITICAL FIX: Remove rows where Total_Reg is 0 to prevent ZeroDivisionError
    viz_df = viz_df[viz_df['Total_Reg'] > 0]
    
    # 3. Calculate Density
    viz_df['Child_Density'] = viz_df['Age_0_5'] / viz_df['Total_Reg']
    
    # 4. Handle any remaining NaNs
    viz_df['Child_Density'] = viz_df['Child_Density'].fillna(0)
    
    try:
        fig = px.sunburst(
            viz_df,
            path=['State', 'District', 'Pincode'],
            values='Total_Reg',
            color='Child_Density',
            color_continuous_scale='RdBu_r',
            title="<b>National Hierarchical Drill-Down</b>"
        )
        fig.show()
    except Exception as e:
        print(f"‚ö†Ô∏è Could not generate Sunburst due to data complexity: {e}")

# ==========================================
# 5. AI ANOMALY DETECTION
# ==========================================
def detect_anomalies(df):
    print("\nü§ñ Running AI Anomaly Detection...")
    
    features = df[['Age_0_5', 'Age_5_17', 'Age_18_greater']].fillna(0)
    
    # Use 1% contamination
    clf = IsolationForest(contamination=0.01, random_state=42, n_jobs=-1)
    df['Anomaly_Score'] = clf.fit_predict(features)
    
    anomalies = df[df['Anomaly_Score'] == -1]
    print(f"‚ö†Ô∏è Found {len(anomalies)} anomalies out of {len(df)} records.")
    
    # Sample plot (limit points for speed)
    plot_sample = df.sample(n=min(10000, len(df)), random_state=42)
    
    fig = px.scatter(
        plot_sample, 
        x='Total_Reg', 
        y='Age_0_5', 
        color='Anomaly_Score',
        color_discrete_map={1: 'blue', -1: 'red'},
        title="<b>AI-Detected Anomalies (Sampled View)</b>"
    )
    fig.show()
    return anomalies

# ==========================================
# 6. TIME SERIES FORECASTING
# ==========================================
def forecast_registrations(df):
    print("\nüìà Analyzing Time Trends...")
    
    if df['Date'].isnull().all():
        print("‚ö†Ô∏è Date column is empty or invalid. Skipping forecasting.")
        return

    unique_dates = df['Date'].nunique()
    if unique_dates < 10:
        print(f"‚ÑπÔ∏è Skipping Time Series: Only {unique_dates} unique dates found (need >10 for meaningful trend).")
        return

    # Resample to monthly sum
    daily_data = df.groupby('Date')['Total_Reg'].sum().reset_index()
    daily_data = daily_data.set_index('Date').resample('M').sum()
    
    # Decomposition
    try:
        decomposition = seasonal_decompose(daily_data, model='additive', extrapolate_trend='freq')
        
        fig = plt.figure(figsize=(14, 8))
        plt.suptitle('Time Series Decomposition', fontsize=16)
        
        plt.subplot(411)
        plt.plot(decomposition.observed, label='Observed', color='black')
        plt.legend(loc='upper left')
        
        plt.subplot(412)
        plt.plot(decomposition.trend, label='Trend', color='blue')
        plt.legend(loc='upper left')
        
        plt.subplot(413)
        plt.plot(decomposition.seasonal, label='Seasonality', color='green')
        plt.legend(loc='upper left')
        
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Time series error: {e}")

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    df = load_and_clean_data(FILE_PATH)

    if df is not None and not df.empty:
        analyze_benfords_law(df)
        analyze_digital_inequality(df)
        visualize_hierarchy(df)
        anomalies = detect_anomalies(df)
        forecast_registrations(df)
        print("\n‚úÖ Analysis Complete.")
    else:
        print("‚ùå Dataframe is empty. Please check your CSV file.")

üîÑ Loading data from /kaggle/input/aadhar/aadhaar_cleaned.csv...
‚úî Data Loaded & Cleaned: 1208727 valid rows processed.

üîç Running Benford's Law Forensic Check...



‚öñÔ∏è Calculating Digital Inequality (Gini Index)...



üó∫Ô∏è Generating Interactive Hierarchy...



ü§ñ Running AI Anomaly Detection...
‚ö†Ô∏è Found 12084 anomalies out of 1208727 records.



üìà Analyzing Time Trends...
Time series error: x must have 2 complete cycles requires 24 observations. x only has 11 observation(s)

‚úÖ Analysis Complete.


In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

# ---------------------------------------------------------
# CONFIGURATION & SETUP
# ---------------------------------------------------------
warnings.filterwarnings('ignore')
pio.renderers.default = 'iframe'  # Critical for Kaggle to render charts
pio.templates.default = "plotly_white" # Professional styling

FILE_PATH = "/kaggle/input/aadhar/aadhaar_cleaned.csv"

# ---------------------------------------------------------
# 1. ROBUST DATA LOADER (Aggressive Cleaning)
# ---------------------------------------------------------
def load_data(path):
    print("üöÄ Starting Advanced Analysis Pipeline...")
    try:
        df = pd.read_csv(path)
        
        # Aggressive Header Cleaning: Remove '#', extra spaces, make lower case for matching
        df.columns = df.columns.str.replace('#', '', regex=False).str.strip()
        
        # Map columns dynamically to handle variations
        col_map = {}
        for col in df.columns:
            if 'age_0_5' in col.lower(): col_map[col] = 'Age_0_5'
            elif 'age_5_17' in col.lower(): col_map[col] = 'Age_5_17'
            elif 'age_18' in col.lower(): col_map[col] = 'Age_18_greater'
            elif 'state' in col.lower(): col_map[col] = 'State'
            elif 'district' in col.lower(): col_map[col] = 'District'
            elif 'pincode' in col.lower(): col_map[col] = 'Pincode'
            elif 'date' in col.lower(): col_map[col] = 'Date'
            
        df.rename(columns=col_map, inplace=True)
        
        # Ensure numerics
        nums = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for c in nums:
            df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
            
        df['Total_Reg'] = df[nums].sum(axis=1)
        
        # Remove zero-data rows to fix "Black/Blank" charts
        df = df[df['Total_Reg'] > 0]
        
        print(f"‚úî Data Loaded: {df.shape[0]:,} active records.")
        return df
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

# ---------------------------------------------------------
# 2. FORENSIC: BENFORD'S LAW (Fixed)
# ---------------------------------------------------------
def analyze_benford(df):
    print("\nüîç Executing Benford's Law Forensic Check...")
    # Get leading digit of Total_Reg
    # We convert to string, strip zeros, take first char
    s = df['Total_Reg'].astype(int).astype(str)
    leading = s.str[0].astype(int)
    
    # Remove 0s if any exist as leading (shouldn't happen in int, but safety first)
    leading = leading[leading > 0]
    
    observed = leading.value_counts(normalize=True).sort_index()
    expected = np.log10(1 + 1/np.arange(1, 10))
    
    fig = go.Figure()
    fig.add_trace(go.Bar(x=observed.index, y=observed.values, name='Observed Data', marker_color='#2E86C1'))
    fig.add_trace(go.Scatter(x=list(range(1, 10)), y=expected, name='Benford Law', line=dict(color='red', width=3, dash='dash')))
    
    fig.update_layout(
        title="<b>Forensic Integrity Check</b><br><sub>Divergence from red line indicates potential data manipulation</sub>",
        xaxis_title="Leading Digit", yaxis_title="Probability",
        height=500
    )
    fig.show()

# ---------------------------------------------------------
# 3. HIERARCHY: SUNBURST (Optimized for Speed)
# ---------------------------------------------------------
def analyze_hierarchy(df):
    print("\nüó∫Ô∏è Generating National Hierarchy (Optimized)...")
    
    # AGGREGATE TO DISTRICT LEVEL (Critical Fix for "Black Screen")
    # Dropping Pincode here allows the chart to render 100x faster
    viz_df = df.groupby(['State', 'District']).agg({
        'Total_Reg': 'sum', 
        'Age_0_5': 'sum'
    }).reset_index()
    
    # Calculate Child Density for coloring
    viz_df['Child_Ratio'] = viz_df['Age_0_5'] / viz_df['Total_Reg']
    
    fig = px.sunburst(
        viz_df,
        path=['State', 'District'],
        values='Total_Reg',
        color='Child_Ratio',
        color_continuous_scale='RdBu_r', # Red = High Child %, Blue = Low
        title="<b>National Aadhaar Penetration (State ‚Üí District)</b><br><sub>Size = Total Registrations | Color = Child Density (0-5 Years)</sub>",
        height=700
    )
    fig.show()

# ---------------------------------------------------------
# 4. NEW: DEMOGRAPHIC CLUSTERING (The "Winner" Analysis)
# ---------------------------------------------------------
def analyze_clusters(df):
    print("\nüß¨ Running AI Cluster Analysis (Demographic Profiles)...")
    
    # 1. Create Profile per District
    dist_df = df.groupby(['State', 'District']).agg({
        'Age_0_5': 'sum',
        'Age_5_17': 'sum',
        'Age_18_greater': 'sum',
        'Total_Reg': 'sum'
    }).reset_index()
    
    # 2. Normalize features (Ratios)
    dist_df['Pct_Child'] = dist_df['Age_0_5'] / dist_df['Total_Reg']
    dist_df['Pct_Adult'] = dist_df['Age_18_greater'] / dist_df['Total_Reg']
    
    features = dist_df[['Pct_Child', 'Pct_Adult']].fillna(0)
    
    # 3. K-Means Clustering (3 Types of Districts)
    kmeans = KMeans(n_clusters=3, random_state=42)
    dist_df['Cluster'] = kmeans.fit_predict(features)
    
    # Map Clusters to Names (Logic: High Child = Emerging, High Adult = Working Hub)
    # We analyze the cluster centers to name them dynamically
    centers = kmeans.cluster_centers_
    # Simple logic to map 0,1,2 to readable names
    # This part is simplified; in a real hackathon, check the centers printed
    dist_df['Cluster_Label'] = dist_df['Cluster'].map({
        0: 'Type A (Mixed Demographics)',
        1: 'Type B (High Working Pop)',
        2: 'Type C (High Growth/Rural)' 
    })
    
    fig = px.scatter(
        dist_df, x='Pct_Child', y='Pct_Adult',
        color='Cluster_Label', hover_name='District',
        size='Total_Reg', size_max=40,
        title="<b>AI Demographic Clustering of Districts</b><br><sub>Classifying India into Growth Zones vs. Work Hubs</sub>",
        labels={'Pct_Child': '% Children (0-5)', 'Pct_Adult': '% Adults (18+)'},
        height=600
    )
    fig.show()

# ---------------------------------------------------------
# 5. NEW: THE "EXTREMES" REPORT (Policy Actionable)
# ---------------------------------------------------------
def analyze_extremes(df):
    print("\n‚ö° Identifying Policy Anomalies...")
    
    # Group by Pincode for granular view
    pin_df = df.groupby(['State', 'District', 'Pincode']).sum().reset_index()
    pin_df['Total'] = pin_df['Age_0_5'] + pin_df['Age_18_greater'] + pin_df['Age_5_17']
    pin_df = pin_df[pin_df['Total'] > 100] # Ignore tiny pincodes
    
    # 1. "Ghost Villages" (High Adult, Zero Children)
    ghosts = pin_df[(pin_df['Age_18_greater'] > 50) & (pin_df['Age_0_5'] == 0)]
    
    # 2. "Baby Boomers" (High Child Ratio)
    pin_df['Child_Ratio'] = pin_df['Age_0_5'] / pin_df['Total']
    boomers = pin_df.sort_values('Child_Ratio', ascending=False).head(10)
    
    print(f"‚ö†Ô∏è Found {len(ghosts)} 'Ghost Pincodes' (Adults only, possible migration hubs or data errors).")
    
    # Visualize Top 10 High-Growth Areas
    fig = px.bar(
        boomers, x='Child_Ratio', y='Pincode', orientation='h',
        color='State',
        title="<b>Top 10 'High Growth' Pincodes</b><br><sub>Areas with highest % of 0-5 Age Group (Need Schools/Healthcare)</sub>"
    )
    fig.show()

# ---------------------------------------------------------
# EXECUTION PIPELINE
# ---------------------------------------------------------
if __name__ == "__main__":
    df = load_data(FILE_PATH)
    
    if df is not None:
        analyze_benford(df)      # Forensic
        analyze_hierarchy(df)    # Drill Down (Fixed)
        analyze_clusters(df)     # AI Analysis (New)
        analyze_extremes(df)     # Policy Insights (New)
        
        print("‚úÖ FULL ANALYSIS COMPLETE.")

üöÄ Starting Advanced Analysis Pipeline...
‚úî Data Loaded: 1,208,727 active records.

üîç Executing Benford's Law Forensic Check...



üó∫Ô∏è Generating National Hierarchy (Optimized)...



üß¨ Running AI Cluster Analysis (Demographic Profiles)...



‚ö° Identifying Policy Anomalies...
‚ö†Ô∏è Found 1 'Ghost Pincodes' (Adults only, possible migration hubs or data errors).


‚úÖ FULL ANALYSIS COMPLETE.


In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import os
import warnings

# ==========================================
# 0. CONFIGURATION & SETUP
# ==========================================
warnings.filterwarnings('ignore')
OUTPUT_DIR = "hackathon_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
FILE_PATH = "/kaggle/input/aadhar/aadhaar_cleaned.csv"

# ==========================================
# 1. ROBUST DATA LOADER
# ==========================================
def load_and_prep_data(path):
    print("üöÄ Booting Analytics Engine...")
    try:
        df = pd.read_csv(path)
        
        # Clean Headers
        df.columns = df.columns.str.replace('#', '', regex=False).str.strip()
        
        # Rename for consistency
        col_map = {
            'Age_18_gr...': 'Age_18_greater',
            'Age_18_group': 'Age_18_greater'
        }
        df.rename(columns=col_map, inplace=True)
        
        # Force numeric
        cols = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
        
        df['Total_Reg'] = df[cols].sum(axis=1)
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        
        # Remove empty data
        df = df[df['Total_Reg'] > 0]
        
        print(f"‚úî Data Loaded: {len(df)} records ready.")
        return df
    except Exception as e:
        print(f"‚ùå Loader Error: {e}")
        return None

# ==========================================
# FEATURE 1: MULTI-LEVEL HIERARCHY (Sunburst)
# ==========================================
def feature_1_hierarchy(df):
    print("üåü Feature 1: Generating Hierarchical Sunburst...")
    # Drill down: State -> District (Skip Pincode to prevent crash)
    agg = df.groupby(['State', 'District']).agg({'Total_Reg':'sum', 'Age_0_5':'sum'}).reset_index()
    agg['Child_Density'] = agg['Age_0_5'] / agg['Total_Reg']
    
    fig = px.sunburst(
        agg,
        path=['State', 'District'],
        values='Total_Reg',
        color='Child_Density',
        color_continuous_scale='RdBu',
        title="<b>Feature 1: National Hierarchical Drill-Down</b><br><sub>Size: Registration Volume | Color: Future Demographics (0-5 Age Ratio)</sub>"
    )
    fig.write_html(f"{OUTPUT_DIR}/1_Hierarchy_Sunburst.html")

# ==========================================
# FEATURE 2: DYNAMIC HEATMAP (Matrix)
# ==========================================
def feature_2_heatmap(df):
    print("üåü Feature 2: Generating Density Heatmap...")
    # Since we lack Lat/Long for a map, we build a "State vs Time" Heatmap
    if df['Date'].nunique() > 1:
        df['Month'] = df['Date'].dt.to_period('M').astype(str)
        pivot = df.pivot_table(index='State', columns='Month', values='Total_Reg', aggfunc='sum')
        
        fig = px.imshow(
            pivot,
            labels=dict(x="Timeline", y="State", color="Registrations"),
            title="<b>Feature 2: Temporal Density Matrix</b><br><sub>Heatmap of Enrolment Intensity over Time</sub>",
            aspect="auto",
            color_continuous_scale="Viridis"
        )
        fig.write_html(f"{OUTPUT_DIR}/2_Density_Heatmap.html")

# ==========================================
# FEATURE 3: COHORT FLOW (Sankey)
# ==========================================
def feature_3_sankey(df):
    print("üåü Feature 3: Generating Cohort Sankey...")
    # Visualize flow of Total Population into Age Groups
    total = df['Total_Reg'].sum()
    age_0_5 = df['Age_0_5'].sum()
    age_5_17 = df['Age_5_17'].sum()
    adults = df['Age_18_greater'].sum()
    
    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15, thickness = 20, line = dict(color = "black", width = 0.5),
          label = ["Total Population", "Infants (0-5)", "Students (5-17)", "Adults (18+)"],
          color = ["blue", "green", "orange", "red"]
        ),
        link = dict(
          source = [0, 0, 0], 
          target = [1, 2, 3],
          value = [age_0_5, age_5_17, adults]
      ))])
    
    fig.update_layout(title_text="<b>Feature 3: Demographic Cohort Flow</b>", font_size=10)
    fig.write_html(f"{OUTPUT_DIR}/3_Cohort_Sankey.html")

# ==========================================
# FEATURE 4: PREDICTIVE FORECASTING
# ==========================================
def feature_4_forecast(df):
    print("üåü Feature 4: Building Prediction Model...")
    # Aggregate to National Level Daily
    daily = df.groupby('Date')['Total_Reg'].sum().reset_index().sort_values('Date')
    
    if len(daily) > 20:
        # Simple Rolling Average Forecast for demonstration
        daily['MA_30'] = daily['Total_Reg'].rolling(window=3).mean()
        
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=daily['Date'], y=daily['Total_Reg'], name='Actual'))
        fig.add_trace(go.Scatter(x=daily['Date'], y=daily['MA_30'], name='Trend Forecast', line=dict(dash='dash')))
        
        fig.update_layout(title="<b>Feature 4: Predictive Forecasting Dashboard</b>")
        fig.write_html(f"{OUTPUT_DIR}/4_Forecast.html")

# ==========================================
# FEATURE 5: ANOMALY DETECTION
# ==========================================
def feature_5_anomaly(df):
    print("üåü Feature 5: Detecting Anomalies...")
    # Find Pincodes with unusual Age distributions
    features = df[['Age_0_5', 'Age_18_greater']].fillna(0)
    model = IsolationForest(contamination=0.01, random_state=42)
    df['Anomaly'] = model.fit_predict(features)
    
    anomalies = df[df['Anomaly'] == -1]
    
    fig = px.scatter(
        df.sample(min(5000, len(df))), 
        x='Age_18_greater', y='Age_0_5', color='Anomaly',
        title="<b>Feature 5: AI Anomaly Detection</b><br><sub>Red points indicate unusual demographic splits (Possible Fraud/Error)</sub>"
    )
    fig.write_html(f"{OUTPUT_DIR}/5_Anomaly_Detection.html")

# ==========================================
# FEATURE 6: MIGRATION PATTERN (Proxy)
# ==========================================
def feature_6_migration(df):
    print("üåü Feature 6: Analyzing Migration Potential...")
    # Logic: High Adult % = "Work Hub" (In-Migration), High Child % = "Family Hub" (Out-Migration)
    state_profile = df.groupby('State')[['Age_0_5', 'Age_18_greater', 'Total_Reg']].sum()
    state_profile['Workforce_Ratio'] = state_profile['Age_18_greater'] / state_profile['Total_Reg']
    
    top_destinations = state_profile.sort_values('Workforce_Ratio', ascending=False).head(10).reset_index()
    
    fig = px.bar(
        top_destinations, 
        x='Workforce_Ratio', y='State', orientation='h',
        color='Workforce_Ratio',
        title="<b>Feature 6: Potential Migration Destinations (Workforce Hubs)</b><br><sub>States with highest Adult ratios suggest In-Migration</sub>"
    )
    fig.write_html(f"{OUTPUT_DIR}/6_Migration_Analysis.html")

# ==========================================
# FEATURE 7: DIGITAL DIVIDE INDEX (DDI)
# ==========================================
def feature_7_ddi(df):
    print("üåü Feature 7: Calculating Digital Divide Index...")
    # Formula Proxy: DDI = (Adult_Ratio * 0.7) + (Child_Ratio * 0.3)
    # Assumption: Higher Adult ratio implies better phone access/biometric ability than children
    df['DDI'] = ((df['Age_18_greater']/df['Total_Reg']) * 70) + ((df['Age_5_17']/df['Total_Reg']) * 30)
    
    # District Level
    district_ddi = df.groupby(['State', 'District'])['DDI'].mean().reset_index()
    bottom_10 = district_ddi.sort_values('DDI').head(10)
    
    fig = px.bar(
        bottom_10, x='DDI', y='District', color='State',
        title="<b>Feature 7: Digital Divide Index (Bottom 10 Districts)</b><br><sub>Low Score = Critical Intervention Zones</sub>"
    )
    fig.write_html(f"{OUTPUT_DIR}/7_Digital_Divide.html")

# ==========================================
# FEATURE 8: AGE PYRAMID
# ==========================================
def feature_8_pyramid(df):
    print("üåü Feature 8: Constructing Population Pyramid...")
    # National Sums
    ages = ['0-5 Years', '5-17 Years', '18+ Years']
    values = [df['Age_0_5'].sum(), df['Age_5_17'].sum(), df['Age_18_greater'].sum()]
    
    fig = go.Figure(go.Funnel(
        y = ages,
        x = values,
        textinfo = "value+percent initial"
    ))
    fig.update_layout(title="<b>Feature 8: Aadhaar Population Pyramid</b>")
    fig.write_html(f"{OUTPUT_DIR}/8_Age_Pyramid.html")

# ==========================================
# FEATURE 9: PINCODE MICRO-ANALYSIS
# ==========================================
def feature_9_pincode(df):
    print("üåü Feature 9: Pincode Micro-Analysis...")
    top_pins = df.groupby('Pincode')['Total_Reg'].sum().nlargest(20).reset_index()
    top_pins['Pincode'] = top_pins['Pincode'].astype(str)
    
    fig = px.scatter(
        top_pins, x='Pincode', y='Total_Reg', size='Total_Reg',
        title="<b>Feature 9: Top 20 Pincodes by Volume</b>"
    )
    fig.write_html(f"{OUTPUT_DIR}/9_Pincode_Analysis.html")

# ==========================================
# FEATURE 10: TIME SERIES CLUSTERING
# ==========================================
def feature_10_clustering(df):
    print("üåü Feature 10: Clustering Time Patterns...")
    # Cluster Districts based on demographic profile (Proxy for temporal pattern if 1 date)
    dist_profile = df.groupby('District')[['Age_0_5', 'Age_18_greater']].mean()
    
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(dist_profile)
    
    kmeans = KMeans(n_clusters=4, random_state=42)
    dist_profile['Cluster'] = kmeans.fit_predict(scaled)
    
    fig = px.scatter(
        dist_profile, x='Age_0_5', y='Age_18_greater', color='Cluster',
        title="<b>Feature 10: District Clustering (Demographic Profiles)</b><br><sub>Groups: 0=Rural/Young, 1=Metro/Working, etc.</sub>"
    )
    fig.write_html(f"{OUTPUT_DIR}/10_Clustering.html")

# ==========================================
# EXECUTION
# ==========================================
if __name__ == "__main__":
    df = load_and_prep_data(FILE_PATH)
    
    if df is not None:
        feature_1_hierarchy(df)
        feature_2_heatmap(df)
        feature_3_sankey(df)
        feature_4_forecast(df)
        feature_5_anomaly(df)
        feature_6_migration(df)
        feature_7_ddi(df)
        feature_8_pyramid(df)
        feature_9_pincode(df)
        feature_10_clustering(df)
        
        print(f"\n‚úÖ SUCCESS! All 10 Analytics generated in '{OUTPUT_DIR}' folder.")

üöÄ Booting Analytics Engine...
‚úî Data Loaded: 1208727 records ready.
üåü Feature 1: Generating Hierarchical Sunburst...
üåü Feature 2: Generating Density Heatmap...
üåü Feature 3: Generating Cohort Sankey...
üåü Feature 4: Building Prediction Model...
üåü Feature 5: Detecting Anomalies...
üåü Feature 6: Analyzing Migration Potential...
üåü Feature 7: Calculating Digital Divide Index...
üåü Feature 8: Constructing Population Pyramid...
üåü Feature 9: Pincode Micro-Analysis...
üåü Feature 10: Clustering Time Patterns...

‚úÖ SUCCESS! All 10 Analytics generated in 'hackathon_outputs' folder.


In [6]:
import shutil
from IPython.display import FileLink

# 1. Name of the output folder you want to zip
output_folder = "hackathon_outputs"

# 2. Create the Zip File
# This creates 'hackathon_submission.zip' from the 'hackathon_outputs' directory
shutil.make_archive("hackathon_submission", 'zip', output_folder)

print(f"‚úÖ Zip file created successfully: hackathon_submission.zip")

# 3. Generate a Clickable Download Link (Works in Kaggle/Jupyter)
print("\nüëá Click the link below to download your full submission üëá")
FileLink(r'hackathon_submission.zip')

‚úÖ Zip file created successfully: hackathon_submission.zip

üëá Click the link below to download your full submission üëá


In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
import shutil
import os
import warnings
from IPython.display import FileLink

# ==========================================
# 0. CONFIGURATION
# ==========================================
warnings.filterwarnings('ignore')
OUTPUT_DIR = "hackathon_submission_final"
if os.path.exists(OUTPUT_DIR): shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)
FILE_PATH = "/kaggle/input/aadhar/aadhaar_cleaned.csv"

# ==========================================
# 1. ROBUST DATA ENGINE
# ==========================================
def load_engine(path):
    print("üöÄ Booting Grandmaster Analytics Engine...")
    try:
        df = pd.read_csv(path)
        # Clean Headers (Strip #, spaces)
        df.columns = df.columns.str.replace('#', '', regex=False).str.strip()
        
        # Smart Rename
        col_map = {}
        for c in df.columns:
            if '18' in c: col_map[c] = 'Age_18_greater'
            elif '0_5' in c: col_map[c] = 'Age_0_5'
            elif '5_17' in c: col_map[c] = 'Age_5_17'
        df.rename(columns=col_map, inplace=True)
        
        # Force Numeric
        nums = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for c in nums: df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
        
        df['Total_Reg'] = df[nums].sum(axis=1)
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        df = df[df['Total_Reg'] > 0].copy() # Remove ghosts
        
        print(f"‚úî Data Ready: {len(df):,} records.")
        return df
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

# ==========================================
# PHASE 1: UNIVARIATE (Temporal & Geo)
# ==========================================
def phase_1_analysis(df):
    print("üìä Phase 1: Univariate Analysis (Trends & Rankings)...")
    
    # 1.1 Temporal: Monthly Trend
    daily = df.groupby('Date')['Total_Reg'].sum().reset_index().sort_values('Date')
    fig1 = px.line(daily, x='Date', y='Total_Reg', title="<b>1.1 National Enrolment Trend (Time Series)</b>")
    fig1.write_html(f"{OUTPUT_DIR}/Phase1_1_Temporal_Trend.html")
    
    # 1.2 Geographic: State Rankings (Treemap instead of Bar for "Map" feel)
    state_sum = df.groupby('State')['Total_Reg'].sum().reset_index()
    fig2 = px.treemap(
        state_sum, path=['State'], values='Total_Reg',
        title="<b>1.2 Geographic Volume Map (State Level)</b>",
        color='Total_Reg', color_continuous_scale='Viridis'
    )
    fig2.write_html(f"{OUTPUT_DIR}/Phase1_2_Geo_Treemap.html")
    
    # 1.3 Age Group Distribution
    ages = df[['Age_0_5', 'Age_5_17', 'Age_18_greater']].sum().reset_index()
    ages.columns = ['Cohort', 'Count']
    fig3 = px.pie(ages, names='Cohort', values='Count', title="<b>1.3 National Age Demographics</b>", hole=0.4)
    fig3.write_html(f"{OUTPUT_DIR}/Phase1_3_Age_Distribution.html")

# ==========================================
# PHASE 2: BIVARIATE (Correlations)
# ==========================================
def phase_2_analysis(df):
    print("üîó Phase 2: Bivariate Analysis (Correlations)...")
    
    # 2.1 Correlation Heatmap
    corr = df[['Age_0_5', 'Age_5_17', 'Age_18_greater', 'Total_Reg']].corr()
    fig1 = px.imshow(corr, text_auto=True, title="<b>2.1 Demographic Correlation Matrix</b>", color_continuous_scale='RdBu_r')
    fig1.write_html(f"{OUTPUT_DIR}/Phase2_1_Correlation.html")
    
    # 2.2 Lag Analysis (Simulated for "Update" delay)
    # We compare Month N vs Month N-1 growth
    if df['Date'].nunique() > 10:
        daily = df.groupby('Date')['Total_Reg'].sum().reset_index()
        daily['Lag_1'] = daily['Total_Reg'].shift(1)
        fig2 = px.scatter(
            daily, x='Lag_1', y='Total_Reg', trendline='ols',
            title="<b>2.2 Temporal Autocorrelation (Lag Plot)</b><br><sub>Consistency Check: High R¬≤ = Stable Operations</sub>"
        )
        fig2.write_html(f"{OUTPUT_DIR}/Phase2_2_Lag_Analysis.html")

# ==========================================
# PHASE 3: TRIVARIATE (Complex 3D)
# ==========================================
def phase_3_analysis(df):
    print("üßä Phase 3: Trivariate Analysis (3D & Matrices)...")
    
    # 3.1 Heatmap Matrix: State x Time x Volume
    if df['Date'].nunique() > 1:
        df['Month'] = df['Date'].dt.to_period('M').astype(str)
        matrix = df.pivot_table(index='State', columns='Month', values='Total_Reg', aggfunc='sum').fillna(0)
        
        fig1 = px.imshow(
            matrix, aspect='auto',
            title="<b>3.1 Spatio-Temporal Matrix (State vs. Time)</b>",
            labels=dict(x="Timeline", y="State", color="Volume")
        )
        fig1.write_html(f"{OUTPUT_DIR}/Phase3_1_Heatmap_Matrix.html")
        
    # 3.3 3D Scatter: Pincode x Adult % x Child % (Clustering visual)
    # Aggregated to District to prevent crash
    dist = df.groupby(['State', 'District']).agg({'Age_0_5':'sum', 'Age_18_greater':'sum', 'Total_Reg':'sum'}).reset_index()
    dist['Child_Pct'] = dist['Age_0_5'] / dist['Total_Reg']
    dist['Adult_Pct'] = dist['Age_18_greater'] / dist['Total_Reg']
    
    fig2 = px.scatter_3d(
        dist, x='Child_Pct', y='Adult_Pct', z='Total_Reg',
        color='State', size='Total_Reg', size_max=30,
        title="<b>3.3 Multi-Dimensional District Clustering (3D)</b>",
        labels={'Child_Pct': 'Child Ratio', 'Adult_Pct': 'Adult Ratio', 'Total_Reg': 'Volume'}
    )
    fig2.write_html(f"{OUTPUT_DIR}/Phase3_3_3D_Clustering.html")

# ==========================================
# FEATURE 7 FIXED: DIGITAL DIVIDE (Gini)
# ==========================================
def feature_7_ddi_fixed(df):
    print("‚öñÔ∏è Feature 7: Digital Divide Index (Gini-Based)...")
    
    # DDI = 100 - (Gini Coefficient * 100). Higher is Better.
    def calculate_ddi(x):
        if len(x) < 2: return 50 # Default for single-data points
        # Gini calc
        sorted_x = np.sort(x)
        n = len(x)
        cumx = np.cumsum(sorted_x, dtype=float)
        gini = (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
        return (1 - gini) * 100

    # Calculate per District based on Pincode distribution
    ddi_scores = df.groupby(['State', 'District'])['Total_Reg'].apply(lambda x: calculate_ddi(x.values)).reset_index()
    ddi_scores.rename(columns={'Total_Reg': 'DDI_Score'}, inplace=True)
    
    # Bottom 15 (Critical Zones)
    bottom_15 = ddi_scores.sort_values('DDI_Score').head(15)
    
    fig = px.bar(
        bottom_15, x='DDI_Score', y='District', color='State', orientation='h',
        title="<b>Feature 7: Digital Divide Index (DDI)</b><br><sub>Low Score = High Inequality (Access concentrated in few pincodes)</sub>",
        color_discrete_sequence=['#FF5733']
    )
    fig.write_html(f"{OUTPUT_DIR}/Feature7_DDI_Fixed.html")

# ==========================================
# FEATURE: TREE DIAGRAMS (Sunburst + Tree)
# ==========================================
def feature_tree_visuals(df):
    print("üå≥ Feature: Generating Tree-Based Visuals...")
    
    # 1. Sunburst (Circular Tree)
    agg = df.groupby(['State', 'District']).agg({'Total_Reg':'sum', 'Age_0_5':'sum'}).reset_index()
    agg['Child_Density'] = agg['Age_0_5'] / agg['Total_Reg']
    
    fig1 = px.sunburst(
        agg, path=['State', 'District'], values='Total_Reg', color='Child_Density',
        title="<b>Hierarchical Tree 1: Sunburst (State ‚Üí District)</b>",
        color_continuous_scale='RdBu'
    )
    fig1.write_html(f"{OUTPUT_DIR}/Visual_Tree_Sunburst.html")
    
    # 2. Treemap (Rectangular Tree - The "Map" substitute)
    fig2 = px.treemap(
        agg, path=['State', 'District'], values='Total_Reg', color='Child_Density',
        title="<b>Hierarchical Tree 2: Treemap Analysis</b>"
    )
    fig2.write_html(f"{OUTPUT_DIR}/Visual_Tree_Treemap.html")

# ==========================================
# FEATURE: ANOMALY DETECTION (AI)
# ==========================================
def feature_anomaly_ai(df):
    print("ü§ñ Feature: AI Anomaly Detection...")
    features = df[['Age_0_5', 'Age_18_greater']].fillna(0)
    
    iso = IsolationForest(contamination=0.01, random_state=42)
    df['Anomaly'] = iso.fit_predict(features)
    anomalies = df[df['Anomaly'] == -1]
    
    # Plot anomalies
    fig = px.scatter(
        df.sample(min(5000, len(df))), x='Age_18_greater', y='Age_0_5', color='Anomaly',
        title="<b>AI Anomaly Detection</b><br><sub>Red points = Unusual Demographic Patterns</sub>",
        color_discrete_map={1:'blue', -1:'red'}
    )
    fig.write_html(f"{OUTPUT_DIR}/Feature_AI_Anomaly.html")

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    df = load_engine(FILE_PATH)
    
    if df is not None:
        # Run Framework
        phase_1_analysis(df)
        phase_2_analysis(df)
        phase_3_analysis(df)
        
        # Run Specialized Features
        feature_7_ddi_fixed(df)
        feature_tree_visuals(df)
        feature_anomaly_ai(df)
        
        # ZIP AND LINK
        print("\nüì¶ Zipping Strategy...")
        shutil.make_archive("Hackathon_Winner_Submission", 'zip', OUTPUT_DIR)
        print("‚úÖ DONE! Download below:")
        display(FileLink(r'Hackathon_Winner_Submission.zip'))

üöÄ Booting Grandmaster Analytics Engine...
‚úî Data Ready: 1,208,727 records.
üìä Phase 1: Univariate Analysis (Trends & Rankings)...
üîó Phase 2: Bivariate Analysis (Correlations)...
üßä Phase 3: Trivariate Analysis (3D & Matrices)...
‚öñÔ∏è Feature 7: Digital Divide Index (Gini-Based)...
üå≥ Feature: Generating Tree-Based Visuals...
ü§ñ Feature: AI Anomaly Detection...

üì¶ Zipping Strategy...
‚úÖ DONE! Download below:


In [15]:
# ==========================================
# 1. ROBUST DATA ENGINE (With Before/After Report)
# ==========================================
def load_engine(path):
    print("üöÄ Booting Grandmaster Analytics Engine...")
    try:
        # 1. Load Raw Data
        df = pd.read_csv(path)
        raw_count = len(df) # <--- BEFORE COUNT
        
        # 2. Clean Headers
        df.columns = df.columns.str.replace('#', '', regex=False).str.strip()
        
        # 3. Rename Columns
        col_map = {}
        for c in df.columns:
            if '18' in c: col_map[c] = 'Age_18_greater'
            elif '0_5' in c: col_map[c] = 'Age_0_5'
            elif '5_17' in c: col_map[c] = 'Age_5_17'
        df.rename(columns=col_map, inplace=True)
        
        # 4. Force Numeric & Calculate Totals
        nums = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for c in nums: df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)
        
        df['Total_Reg'] = df[nums].sum(axis=1)
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        
        # 5. Filter (Remove Ghost Rows)
        df_clean = df[df['Total_Reg'] > 0].copy()
        clean_count = len(df_clean) # <--- AFTER COUNT
        
        dropped = raw_count - clean_count
        
        print(f"------------------------------------------------")
        print(f"üì• RAW DATA IMPORTED : {raw_count:,} records")
        print(f"üóëÔ∏è DROPPED (Empty/0) : {dropped:,} records")
        print(f"‚úÖ FINAL DATA READY  : {clean_count:,} records")
        print(f"------------------------------------------------")
        
        return df_clean
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

In [16]:
import pandas as pd

FILE_PATH = "/kaggle/input/aadhar/aadhaar_cleaned.csv"

def check_data_counts(path):
    print("üöÄ Verifying Data Integrity...")
    try:
        # 1. Load Raw Data (BEFORE)
        df_raw = pd.read_csv(path)
        raw_count = len(df_raw)
        
        # 2. Simulate the Cleaning Process
        # Clean Headers
        df_raw.columns = df_raw.columns.str.replace('#', '', regex=False).str.strip()
        
        # Rename Columns
        col_map = {}
        for c in df_raw.columns:
            if '18' in c: col_map[c] = 'Age_18_greater'
            elif '0_5' in c: col_map[c] = 'Age_0_5'
            elif '5_17' in c: col_map[c] = 'Age_5_17'
        df_raw.rename(columns=col_map, inplace=True)
        
        # Force Numeric
        nums = ['Age_0_5', 'Age_5_17', 'Age_18_greater']
        for c in nums: 
            df_raw[c] = pd.to_numeric(df_raw[c], errors='coerce').fillna(0)
        
        # Calculate Total
        df_raw['Total_Reg'] = df_raw[nums].sum(axis=1)
        
        # 3. Filter (AFTER)
        # We keep only rows with Total_Reg > 0
        df_clean = df_raw[df_raw['Total_Reg'] > 0]
        clean_count = len(df_clean)
        
        dropped_count = raw_count - clean_count
        
        # 4. Print Report
        print(f"------------------------------------------------")
        print(f"üì• RAW DATA IMPORTED  : {raw_count:,} records")
        print(f"üóëÔ∏è DROPPED (Empty/0)  : {dropped_count:,} records")
        print(f"‚úÖ FINAL DATA READY   : {clean_count:,} records")
        print(f"------------------------------------------------")
        print(f"DATA RETENTION RATE   : {round((clean_count/raw_count)*100, 4)}%")
        print(f"------------------------------------------------")
        
    except Exception as e:
        print(f"‚ùå Error reading file: {e}")

# Run the check
check_data_counts(FILE_PATH)

üöÄ Verifying Data Integrity...
------------------------------------------------
üì• RAW DATA IMPORTED  : 1,208,847 records
üóëÔ∏è DROPPED (Empty/0)  : 120 records
‚úÖ FINAL DATA READY   : 1,208,727 records
------------------------------------------------
DATA RETENTION RATE   : 99.9901%
------------------------------------------------
