# Task 2: Comprehensive Exploratory Data Analysis

**Objective**: Analyze patterns, trajectories, and relationships in Ethiopia's financial inclusion data.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import numpy as np

# Make the project's helper modules (one directory up, under src/) importable
# from this notebook, then pull in the loader and the analysis helpers.
src_dir = os.path.abspath('../src')
sys.path.append(src_dir)
from data_loader import load_data
from analyze_data import analyze_access_slowdown, analyze_gender_gap

# Global plotting theme for every figure below.
sns.set_theme(style="whitegrid")

# Load the unified dataset once; all later cells read from `df`.
df = load_data(data_path=r"../data/raw/ethiopia_fi_unified_data.xlsx")

## 1. Dataset Overview
**Temporal Coverage**: Which years have data?

In [None]:
# Keep only observation records, then count the non-null `value_numeric`
# entries for every (indicator_code, data_year) pair.
observations = df[df['record_type'] == 'observation']
data_coverage = observations.pivot_table(
    index='indicator_code',
    columns='data_year',
    values='value_numeric',
    aggfunc='count',
)

# Pairs with no data come out of the pivot as NaN; show them as 0 in the grid.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data_coverage.fillna(0), cmap="YlGnBu", cbar=False, annot=True, ax=ax)
ax.set_title('Data Availability by Year and Indicator')
plt.show()

## 2. Access Analysis: Trajectory & Slowdown
Visualizing the account ownership trend and average annual growth.

In [None]:
# `analyze_access_slowdown` is a project helper; its result is plotted below
# using its `data_year` and `value_numeric` columns.
access_stats = analyze_access_slowdown(df)
print(access_stats)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=access_stats,
    x='data_year',
    y='value_numeric',
    marker='o',
    label='Account Ownership (%)',
    linewidth=2.5,
    ax=ax,
)

# Event markers: (x position as a fractional year, line colour, legend label).
event_markers = [
    (2021.4, 'orange', 'Telebirr Launch (May 2021)'),
    (2023.6, 'green', 'M-Pesa Entry (Aug 2023)'),
]
for year_pos, line_color, legend_text in event_markers:
    ax.axvline(x=year_pos, color=line_color, linestyle='--', alpha=0.7, label=legend_text)

ax.set_title('Account Ownership Trajectory vs Major Events')
ax.set_ylabel('Percentage (%)')
ax.legend()
ax.set_ylim(0, 60)  # fixed y-range for the percentage axis
plt.show()

### Insight: The 2021-2024 Slowdown
Despite the launch of Telebirr in 2021, Findex reported only a 3pp increase (46% -> 49%) in 3 years. This suggests that while **Usage** exploded (see below), **Access** (unique account ownership) may be approaching saturation, or measurement lag may be hiding recent gains.

## 3. Usage Analysis: The Payment Explosion
Contrasting the slow Access growth with the rapid Usage growth.

In [None]:
# Select observation rows for the two usage indicators (P2P and ATM counts),
# ordered by year so the bars appear chronologically.
is_usage_indicator = df['indicator_code'].isin(['USG_P2P_COUNT', 'USG_ATM_COUNT'])
is_observation = df['record_type'] == 'observation'
usage_df = df.loc[is_usage_indicator & is_observation].sort_values('data_year')

# One group of bars per year, split by the `indicator` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=usage_df, x='data_year', y='value_numeric', hue='indicator', ax=ax)
ax.set_title('Transaction Volume: Digital (P2P) vs Cash (ATM)')
ax.set_ylabel('Transactions (Millions)')
plt.show()

## 4. Gender Gap Evolution
Has the gap narrowed?

In [None]:
# `analyze_gender_gap` is a project helper; only its `male` and `female`
# columns are plotted here.
gender_stats = analyze_gender_gap(df)
print(gender_stats)

# Side-by-side bars: male in blue, female in red.
gender_ax = gender_stats[['male', 'female']].plot.bar(
    figsize=(10, 6),
    color=['#3498db', '#e74c3c'],
)
gender_ax.set_title('Account Ownership by Gender')
gender_ax.set_ylabel('Ownership Rate (%)')
plt.show()

## 5. Infrastructure: The 4G Correlation
Is infrastructure a leading indicator?

In [None]:
# Select the 4G coverage indicator, restricted to observation records.
# The coverage heatmap and the usage plot in this notebook both filter on
# record_type == 'observation'; without the same filter here, any
# non-observation rows carrying this indicator code would share a year with
# real observations and be averaged into the line by seaborn.
infra_df = df[
    (df['indicator_code'] == 'ACC_4G_COV') &
    (df['record_type'] == 'observation')
].sort_values('data_year')

# Single-series trend line; square markers show the individual data points.
plt.figure(figsize=(8, 5))
sns.lineplot(data=infra_df, x='data_year', y='value_numeric', marker='s', color='purple')
plt.title('4G Population Coverage Trend')
plt.ylabel('Coverage (%)')
plt.show()