# Wealth Inequality Analysis: Long Run Dynamics

This notebook analyzes the long-run dynamics of wealth inequality using data from multiple sources.

## Contents
1. Data Loading
2. Exploratory Data Analysis
3. Time Series Analysis
4. Cross-Country Comparisons
5. Statistical Analysis
6. Additional Visualizations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings

# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides every warning category; consider narrowing the
# filter while debugging.
warnings.filterwarnings('ignore')

# Set plotting style
# Newer matplotlib releases ship the seaborn styles under a 'seaborn-v0_8-'
# prefix; fall back to the older name when the prefixed one is not known.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # matplotlib reports an unknown style name with OSError. Catching only
    # that keeps KeyboardInterrupt and genuine programming errors visible,
    # which a bare `except:` would swallow.
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

# Configure display options
pd.set_option('display.max_columns', None)  # None = no column limit
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully")

## 1. Data Loading

Load processed data from the data/processed directory.

In [None]:
# Load processed data
# Defines `df` when merged_data.csv is present; otherwise fills `datasets`
# with one DataFrame per CSV found in the processed-data directory.
import os

data_dir = '../data/processed'

if os.path.exists(data_dir):
    # Collect every CSV file name in the directory
    files = []
    for entry in os.listdir(data_dir):
        if entry.endswith('.csv'):
            files.append(entry)
    print(f"Available data files: {files}")

    if 'merged_data.csv' not in files:
        print("Merged data not found. Loading individual datasets...")
        # Key each dataset by its file name with '_processed.csv' removed
        datasets = {}
        for csv_file in files:
            name = csv_file.replace('_processed.csv', '')
            csv_path = os.path.join(data_dir, csv_file)
            datasets[name] = pd.read_csv(csv_path)
            print(f"Loaded {name}: {datasets[name].shape}")
    else:
        merged_path = os.path.join(data_dir, 'merged_data.csv')
        df = pd.read_csv(merged_path)
        print(f"\nLoaded merged data: {df.shape[0]} rows, {df.shape[1]} columns")
        print(f"\nColumns: {df.columns.tolist()}")
        display(df.head())
else:
    # Nothing to load yet: tell the user how to produce the data
    print(f"Warning: {data_dir} does not exist. Please run data scraping and processing first.")
    print("Run: python scripts/data_scraper.py")
    print("Then: python scripts/data_processing.py")

## 2. Exploratory Data Analysis

Basic statistical summary and data exploration.

In [None]:
# Basic statistics
# Runs only when the merged DataFrame `df` has been loaded.
if 'df' in locals():
    print("Dataset Summary:")
    print("="*50)
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nDescriptive Statistics:")
    print("="*50)
    display(df.describe())

    # Count of missing values per column
    print("\nMissing Values:")
    print("="*50)
    print(df.isnull().sum())

In [None]:
# Country and time coverage
# Each report is printed only when `df` exists and has the relevant column.

# Number of distinct countries and how many rows each one contributes
if 'df' in locals() and 'country' in df.columns:
    print(f"Countries in dataset: {df['country'].nunique()}")
    print(df['country'].value_counts())

# First/last year present and the number of distinct years
if 'df' in locals() and 'year' in df.columns:
    print(f"\nYear range: {df['year'].min()} - {df['year'].max()}")
    print(f"Number of years: {df['year'].nunique()}")

## 3. Time Series Analysis

Analyze temporal trends in wealth inequality.

In [None]:
# Plot wealth inequality trends over time
# Draws a 2x2 grid with one panel per metric and one line per country, then
# saves the figure as a PNG.
import os  # used to create the figure directory before saving

if 'df' in locals() and 'year' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # One entry per panel: (axes, column, y-axis label, title)
    panels = [
        (axes[0, 0], 'gini_index', 'Gini Index', 'Gini Index Over Time'),
        (axes[0, 1], 'top10_wealth_share', 'Top 10% Wealth Share (%)',
         'Top 10% Wealth Share Over Time'),
        (axes[1, 0], 'top1_wealth_share', 'Top 1% Wealth Share (%)',
         'Top 1% Wealth Share Over Time'),
        (axes[1, 1], 'wealth_concentration_ratio',
         'Concentration Ratio (Top10% / Bottom50%)',
         'Wealth Concentration Ratio Over Time'),
    ]

    # Every panel splits the data by country, so each one needs that column;
    # checking it once avoids a KeyError in the panels that did not test it.
    has_country = 'country' in df.columns

    for ax, column, ylabel, title in panels:
        if not has_country or column not in df.columns:
            continue  # metric unavailable: leave this panel empty

        for country in df['country'].unique():
            # Drop missing observations and order by year so each line runs
            # left to right through the points that actually exist.
            country_data = (
                df[df['country'] == country]
                .dropna(subset=[column])
                .sort_values('year')
            )
            if len(country_data) > 0:
                ax.plot(country_data['year'], country_data[column],
                        marker='o', label=country)

        ax.set_xlabel('Year')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        # Only build a legend when at least one line was drawn
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    figure_path = '../output/figures/time_series_analysis.png'
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("Time series plots saved to output/figures/time_series_analysis.png")

## 4. Cross-Country Comparisons

Compare wealth inequality across different countries.

In [None]:
# Cross-country comparison
# Bar charts of the Gini index and the top-10% wealth share for the most
# recent year in the data, saved as a PNG.
import os  # used to create the figure directory before saving


def first_available_column(available, candidates):
    """Return the first name in ``candidates`` that is present in ``available``.

    Args:
        available: Container of column names supporting ``in`` (for example
            a list or ``DataFrame.columns``).
        candidates: Column names to try, in order of preference.

    Returns:
        The first matching name, or ``None`` when no candidate is present.
    """
    for name in candidates:
        if name in available:
            return name
    return None


if 'df' in locals() and 'country' in df.columns and 'year' in df.columns:
    # Get most recent year data
    latest_year = df['year'].max()
    recent_data = df[df['year'] == latest_year]

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # The top-10% share is spelled two ways in this notebook. Try the name
    # this cell has always used first, then fall back to the other spelling
    # so the chart is not silently skipped.
    top10_column = first_available_column(
        recent_data.columns, ('wealth_share_top10', 'top10_wealth_share'))

    # One entry per chart: (axes, column or None, y-axis label, title)
    bar_panels = [
        (axes[0], 'gini_index', 'Gini Index',
         f'Gini Index by Country ({latest_year})'),
        (axes[1], top10_column, 'Top 10% Wealth Share (%)',
         f'Top 10% Wealth Share by Country ({latest_year})'),
    ]

    for ax, column, ylabel, title in bar_panels:
        if column is None or column not in recent_data.columns:
            continue  # metric unavailable: leave this chart empty

        # Highest value first
        recent_data_sorted = recent_data.sort_values(column, ascending=False)
        ax.bar(recent_data_sorted['country'], recent_data_sorted[column])
        ax.set_xlabel('Country')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    figure_path = '../output/figures/cross_country_comparison.png'
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("Cross-country comparison saved to output/figures/cross_country_comparison.png")

## 5. Statistical Analysis

Perform statistical tests and regression analysis.

In [None]:
# Correlation analysis
# Heatmap of pairwise correlations between all numeric columns of `df`.
import os  # used to create the figure directory before saving

if 'df' in locals():
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # A correlation matrix needs at least two numeric columns
    if len(numeric_cols) > 1:
        correlation_matrix = df[numeric_cols].corr()

        plt.figure(figsize=(10, 8))
        # center=0 puts zero correlation at the middle of the colour map
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=1, fmt='.2f')
        plt.title('Correlation Matrix of Inequality Metrics')
        plt.tight_layout()

        figure_path = '../output/figures/correlation_matrix.png'
        # savefig does not create missing directories
        os.makedirs(os.path.dirname(figure_path), exist_ok=True)
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.show()

        print("Correlation matrix saved to output/figures/correlation_matrix.png")

In [None]:
# Trend analysis: fit a linear time trend per country and metric
# For each country, regress every available metric on the year with OLS and
# print the slope (change in the metric per year) and its p-value.


def usable_trend_rows(country_frame, metric):
    """Return the rows of ``country_frame`` that can support a trend fit.

    Rows with a missing ``metric`` or ``year`` value are dropped.

    Args:
        country_frame: DataFrame for one country; must have a 'year' column.
        metric: Name of the column to be regressed on the year.

    Returns:
        The filtered DataFrame, or ``None`` when ``metric`` is not a column
        or fewer than two distinct years remain after dropping missing rows.
    """
    if metric not in country_frame.columns:
        return None
    rows = country_frame.dropna(subset=[metric, 'year'])
    # With a single distinct year the regressor is constant: add_constant()
    # then leaves the design matrix without an intercept column and the fit
    # has no slope coefficient to read.
    if rows['year'].nunique() < 2:
        return None
    return rows


if 'df' in locals() and 'year' in df.columns and 'country' in df.columns:
    print("Calculating growth rates for wealth inequality metrics...\n")

    # Metrics to fit; defined once instead of on every loop iteration
    metrics = ['gini_index', 'top10_wealth_share', 'top1_wealth_share']

    for country in df['country'].unique():
        country_data = df[df['country'] == country].sort_values('year')

        if len(country_data) >= 2:
            print(f"\n{country}:")
            print("="*50)

            # Calculate trend for each metric
            for metric in metrics:
                data_clean = usable_trend_rows(country_data, metric)
                if data_clean is None:
                    continue

                # Simple linear regression: metric ~ const + year
                X = data_clean['year'].values.reshape(-1, 1)
                y = data_clean[metric].values

                # Using statsmodels for better statistics
                X_with_const = sm.add_constant(X)
                model = sm.OLS(y, X_with_const).fit()

                trend = model.params[1]  # Slope (index 0 is the intercept)
                p_value = model.pvalues[1]

                print(f"  {metric}: {trend:.4f} per year (p={p_value:.4f})")

## 6. Additional Visualizations

Create additional visualizations for the paper.

In [None]:
# Distribution plots
# Histogram of each available inequality metric on a 2x2 grid, saved as a PNG.
import os  # used to create the figure directory before saving

if 'df' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Both spellings of the top-10% share are listed; whichever exists in
    # `df` is plotted.
    metrics = ['gini_index', 'wealth_share_top10', 'top10_wealth_share', 'top1_wealth_share']

    # axes.flat walks the grid row by row, so metric i lands on
    # axes[i // 2, i % 2]
    for ax, metric in zip(axes.flat, metrics):
        if metric not in df.columns:
            # Hide the panel instead of leaving an empty frame in the figure
            ax.set_visible(False)
            continue

        label = metric.replace('_', ' ').title()

        # Distribution plot
        df[metric].dropna().hist(bins=20, ax=ax, edgecolor='black')
        ax.set_xlabel(label)
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution of {label}')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    figure_path = '../output/figures/distributions.png'
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("Distribution plots saved to output/figures/distributions.png")

## Summary Statistics Table

Generate a summary table for the paper.

In [None]:
# Create summary table
# Descriptive statistics of `df`, one row per described column, rounded to
# two decimals and written to CSV.
import os  # used to create the table directory before saving

if 'df' in locals():
    # Transpose so each described column becomes a row of the table
    summary_table = df.describe().T.round(2)

    # Save to CSV
    table_path = '../output/tables/summary_statistics.csv'
    # to_csv does not create missing directories
    os.makedirs(os.path.dirname(table_path), exist_ok=True)
    summary_table.to_csv(table_path)
    print("Summary statistics saved to output/tables/summary_statistics.csv")

    display(summary_table)

## Conclusion

This notebook provides a comprehensive analysis of long-run dynamics of wealth inequality. The analysis includes:

- Time series analysis of inequality metrics
- Cross-country comparisons
- Statistical analysis and correlations
- Trend calculations

When the merged dataset is available, all figures are saved to `output/figures/` and the summary table to `output/tables/` for use in the paper.