# Economic Indicators Exploratory Data Analysis

This notebook contains a descriptive analysis of the panel dataset for Latin American countries covering the period 2003–2023.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

# Plot appearance. The matplotlib style sheet is applied first and the seaborn
# palette second, so the palette is the last thing to set the color cycle.
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
COLOR_PALETTE = 'Set2'

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

# Import custom helper functions
from src.utils.helpers import plot_missing

## Load the panel dataset

In [None]:
# Read the processed panel from disk (path is relative to this notebook)
panel_csv = "../../data/processed/panel_2003_2023.csv"
panel = pd.read_csv(panel_csv)

# Report the (rows, columns) shape, then preview the first rows as the cell output
panel_shape = panel.shape
print(f"Dataset shape: {panel_shape}")
panel.head()

## Missing Data Analysis

Let's examine missing data in our poverty indicators. The visualization uses a different color for each of the two poverty-line variables:
- `lnpovhead`: Poverty headcount ratio at \$3.65 a day (2017 PPP)
- `lnpovhead215`: Poverty headcount ratio at \$2.15 a day (2017 PPP)

In [None]:
# Visualize missing data for the two poverty variables, keeping the country and
# year identifiers alongside them.
# NOTE(review): the country identifier is assumed to be 'ISO3' (not 'Country');
# confirm against panel.columns if the selection below raises a KeyError.
poverty_cols = ['ISO3', 'Year', 'lnpovhead', 'lnpovhead215']

# Single source of truth for the figure path, so the file handed to
# plot_missing and the file displayed below cannot drift apart.
poverty_missing_png = "../../reports/figures/missing_poverty.png"
plot_missing(panel[poverty_cols], poverty_missing_png)

# Display the image in the notebook (presumably written to this path by
# plot_missing). Image is imported with the other libraries at the top.
Image(poverty_missing_png)

## Overall Missing Data

In [None]:
# Missing-data figure across every column of the panel
all_missing_png = "../../reports/figures/missing_all.png"
plot_missing(panel, all_missing_png)

# Show the image at that path as the cell output
Image(all_missing_png)

## Summary Statistics

In [None]:
# Compute descriptive statistics for the panel and show them as the cell output
summary_stats = panel.describe()
summary_stats

## Distribution of Key Variables

In [None]:
# Plot a histogram (with KDE overlay) for every numeric column except 'Year',
# laid out on a two-column grid, and save the figure to reports/figures.
numeric_cols = [
    col
    for col in panel.select_dtypes(include=[np.number]).columns
    if col != 'Year'
]

if not numeric_cols:
    # plt.subplots rejects a zero-row grid, so there is nothing to draw or save.
    print("No numeric columns to plot.")
else:
    n_grid_cols = 2
    # Ceiling division: an odd number of variables needs one extra row.
    n_grid_rows = -(-len(numeric_cols) // n_grid_cols)

    # Height is 3 inches per grid row. The previous `3*len(numeric_cols)//2`
    # evaluated as (3*n)//2, which squeezed the figure for odd counts.
    fig, axes = plt.subplots(
        nrows=n_grid_rows,
        ncols=n_grid_cols,
        figsize=(14, 3 * n_grid_rows),
        squeeze=False,  # always a 2-D array, whatever the grid size
    )
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        # Drop missing values so each panel shows observed data only.
        sns.histplot(panel[col].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    # With an odd number of variables the last grid slot is unused; hide it
    # rather than leaving an empty frame in the saved figure.
    for spare_ax in axes[len(numeric_cols):]:
        spare_ax.set_visible(False)

    fig.tight_layout()
    fig.savefig("../../reports/figures/variable_distributions.png", dpi=300, bbox_inches='tight')
    plt.show()