# World Inequality Database Analysis

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

## Data Preparation

The `WID_countries.csv` file contains all the country codes used to label the country-specific `csv`s as well as some metadata on countries.

In [None]:
# Load the country metadata table. The file is semicolon-delimited, and only
# empty fields are parsed as missing values: pandas' default NA markers are
# disabled so that literal strings such as 'NA' are kept as text
# (presumably to protect two-letter codes like 'NA' -- TODO confirm).
countries = pd.read_csv(
    filepath_or_buffer='source_data/WID/WID_countries.csv',
    delimiter=';',
    na_values=[''],
    keep_default_na=False,
)

The variables we are interested in are as follows. For more information on each variable, see the [WID Codes Dictionary](https://wid.world/codes-dictionary/) and the [README](./source_data/WID/README.md).

In [None]:

# WID variables to extract for every country. Each entry is a tuple of:
#   1. the column name used for the variable in the resulting DataFrame,
#   2. the WID variable code, matched against the `variable` field of the
#      per-country files, and
#   3. the WID percentile code, matched against the `percentile` field.
# A country is only kept if every entry listed here is available for it.
variables = [
    # (Column Name, Variable Name, Percentile)
    # Exchange rate; the monetary columns are later divided by this one.
    ('lcu_per_usd', 'xlcusxi999', 'p0p100'),
    ('income_per_capita', 'anninci999', 'p0p100'),
    ('avg_primary_house_npish_income', 'aprihni999', 'p0p100'),
    ('avg_secondary_house_npish_income', 'asechni999', 'p0p100'),
    ('avg_primary_corp_income', 'apricoi999', 'p0p100'),
    ('avg_secondary_corp_income', 'aseccoi999', 'p0p100'),
    ('avg_primary_govt_income', 'aprigoi999', 'p0p100'),
    ('avg_secondary_govt_income', 'asecgoi999', 'p0p100'),
    # The same wealth variable is read at three different percentile ranges.
    ('avg_house_wealth', 'ahwealj992', 'p0p100'),
    ('avg_house_wealth_bot99', 'ahwealj992', 'p0p99'),
    ('avg_house_wealth_top1', 'ahwealj992', 'p99p100'),
    ('ss_benefits', 'assbhni999', 'p0p100'),
    # Shares of national income; these are plotted as fractions and are not
    # converted to USD.
    ('labor_share_income', 'wlabshi999', 'p0p100'),
    ('capital_share_income', 'wcapshi999', 'p0p100')
]

Using the country codes listed in `WID_countries.csv` we're able to collect all the necessary information from the individual `WID_data_XX.csv` files, where `XX` is the country code. For more information on these files see the [README](./source_data/WID/README.md). In these files, each variable is a record, and there's no guarantee that any given country will have that variable defined. Moreover, different countries may have these variables defined for different years. We want to obtain the highest possible number of data points. To this end, we will have to inspect each country and see what is defined and for what years the variables are defined, and select a year that maximizes the number of data points.

When this is done, the best year available to us within the range $[2012, 2022]$ is 2015.

In [None]:
# Year whose records are extracted from each country's data file.
best_year = 2015

Now we can go through the countries of the best year and obtain the relevant data for that country, converting variable records into `DataFrame` columns.

In [None]:
# This will contain one row per country: the country code followed by the
# value of every entry of `variables`, in order.
data = []

# We want to restrict ourselves to countries, ignoring subregions.
# Only two-character codes are kept.
country_codes = [code for code in countries.alpha2 if len(code) == 2]

for country_code in country_codes:
    # Semicolon-delimited file; only empty fields are parsed as missing, so
    # literal strings such as 'NA' are kept as text.
    country_data = pd.read_csv(
        f'source_data/WID/WID_data_{country_code}.csv',
        sep=';',
        keep_default_na=False,
        na_values=['']
    )

    def read_variable(variable, percentile):
        """Read the value of a variable for the current country in `best_year`.

        Args:
            variable: WID variable code to match against the `variable` field.
            percentile: WID percentile code to match against the `percentile`
                field.

        Returns:
            The matching record's `value`, or None when the country has no
            such record or the record's value is missing (NaN).

        Raises:
            ValueError: If more than one record matches, since the value
                would then be ambiguous.
        """
        values = country_data.loc[
            (country_data.variable == variable)
            & (country_data.percentile == percentile)
            & (country_data.year == best_year),
            'value'
        ]

        if values.empty:
            return None

        # Raise explicitly instead of asserting: assertions are stripped when
        # Python runs with -O, which would silently hide duplicate records.
        if len(values) != 1:
            raise ValueError(
                f'expected exactly one record for {variable}/{percentile} '
                f'in {country_code} ({best_year}), found {len(values)}'
            )

        value = values.iloc[0]
        # An empty field is parsed as NaN; report it as missing as well.
        return None if pd.isna(value) else value

    row = [country_code]
    for _, variable, percentile in variables:
        x = read_variable(variable, percentile)

        # We want to skip any countries that don't have all variables defined.
        # Compare against None explicitly: a value of 0 is a defined value and
        # must not cause the country to be dropped.
        if x is None:
            break

        row.append(x)
    else:
        # Only reached when the loop did not break, i.e. every variable was
        # found for this country.
        data.append(row)

data = pd.DataFrame.from_records(
    data,
    columns=['country'] + [col for col, _, _ in variables]
)

We'll be working with these countries:

In [None]:
# Display the codes of the countries that had every variable available.
data['country']

All the variables for each country are given in LCU (local currency units). We have to use the exchange rate into USD to normalize all the countries' variables into USD.

In [None]:
def norm_col_currency(df, col):
    """Convert one column of `df` from local currency units to USD, in place.

    Every value of `df[col]` is divided by the `lcu_per_usd` value of the
    same row, and the result is written back to `df[col]`.

    Args:
        df: DataFrame holding both `col` and an `lcu_per_usd` column.
        col: Name of the column to convert.

    Returns:
        None. `df` is modified in place, so calling this twice on the same
        column divides by the exchange rate twice.
    """
    exchange_rate = df.lcu_per_usd
    df[col] = df[col].div(exchange_rate)

# Convert every monetary column from LCU to USD. The exchange-rate column
# itself and the two income-share columns are left untouched.
for monetary_column in (
    'income_per_capita',
    'avg_primary_house_npish_income',
    'avg_secondary_house_npish_income',
    'avg_primary_corp_income',
    'avg_secondary_corp_income',
    'avg_primary_govt_income',
    'avg_secondary_govt_income',
    'avg_house_wealth',
    'avg_house_wealth_bot99',
    'avg_house_wealth_top1',
    'ss_benefits',
):
    norm_col_currency(data, monetary_column)

## Data Analysis

The following function will allow us to easily plot trendlines.

In [None]:
def plot_trendline(ax, x, y, **kwargs):
    """Draw the least-squares straight line through the points (x, y).

    A degree-1 polynomial is fitted to the data and evaluated at the given
    x values; the resulting line is drawn with `ax.plot`.

    Args:
        ax: Object providing a `plot(x, y, **kwargs)` method to draw on.
        x: x coordinates of the data points.
        y: y coordinates of the data points.
        **kwargs: Forwarded unchanged to `ax.plot`.
    """
    slope, intercept = np.polyfit(x, y, deg=1)
    fitted = x * slope + intercept
    ax.plot(x, fitted, **kwargs)

# Attach the helper to the Axes class so that it can be called as a method,
# i.e. `ax.trendline(x, y)`, on any axes object.
plt.Axes.trendline = plot_trendline

### Income

Unsurprisingly, net national income is correlated with average primary and secondary incomes of the household & NPISH, corporate, and government sectors.

In [None]:
# Two side-by-side panels (primary and secondary income) sharing both axes.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12.8, 4.8), sharex=True, sharey=True)

fig.supxlabel('Net National Income per Capita')

# One entry per panel: axes, title, y label and the columns to plot, listed
# in the same sector order as the legend labels below.
panels = (
    (
        ax1,
        'Primary Income',
        'Avg. Primary Income',
        (
            'avg_primary_house_npish_income',
            'avg_primary_corp_income',
            'avg_primary_govt_income',
        ),
    ),
    (
        ax2,
        'Secondary Income',
        'Avg. Secondary Income',
        (
            'avg_secondary_house_npish_income',
            'avg_secondary_corp_income',
            'avg_secondary_govt_income',
        ),
    ),
)

for panel_ax, panel_title, panel_ylabel, sector_columns in panels:
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.xaxis.set_major_formatter('${x:1.0f}')
    panel_ax.yaxis.set_major_formatter('${x:1.0f}')

    # All scatters are drawn before any trendline so that the first three
    # artists of the panel are the scatters.
    for sector_column in sector_columns:
        panel_ax.scatter(data.income_per_capita, data[sector_column])
    for sector_column in sector_columns:
        panel_ax.trendline(data.income_per_capita, data[sector_column])

fig.legend(['Household & NPISH', 'Corporate', 'Government'])

There doesn't appear to be any correlation between national income and labor or capital shares of national income.

In [None]:
# Labor and capital shares of national income against income per capita.
fig, ax = plt.subplots()

for share_column in ('labor_share_income', 'capital_share_income'):
    ax.scatter(data.income_per_capita, data[share_column])
ax.legend(['Labor', 'Capital'])

ax.set_xlabel('Net National Income per Capita')
ax.xaxis.set_major_formatter('${x:1.0f}')
ax.set_ylabel('Share of National Income')
# Shares are fractions of 1, so 1.0 is displayed as 100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))

# Pearson correlation of income with each share, shown as the cell output.
(
    stats.pearsonr(data.income_per_capita, data.labor_share_income),
    stats.pearsonr(data.income_per_capita, data.capital_share_income),
)

There is a positive correlation between national income and social security benefits.

In [None]:
# Social security benefits against income per capita, with a linear trendline.
fig, ax = plt.subplots()

ax.scatter(data.income_per_capita, data.ss_benefits)
ax.trendline(data.income_per_capita, data.ss_benefits)

ax.set_xlabel('Net National Income per Capita')
ax.set_ylabel('Social Security Benefits')
# Both axes show whole-dollar amounts.
for dollar_axis in (ax.xaxis, ax.yaxis):
    dollar_axis.set_major_formatter('${x:1.0f}')

# Pearson correlation, shown as the cell output.
stats.pearsonr(data.income_per_capita, data.ss_benefits)

### Income and Wealth

There is a correlation between average household wealth and national income.

In [None]:
# Average household wealth against income per capita, with a linear trendline.
fig, ax = plt.subplots()

ax.scatter(data.income_per_capita, data.avg_house_wealth)
ax.trendline(data.income_per_capita, data.avg_house_wealth)

ax.set_xlabel('Net National Income per Capita')
ax.set_ylabel('Avg. Household Wealth')
# Both axes show whole-dollar amounts.
for dollar_axis in (ax.xaxis, ax.yaxis):
    dollar_axis.set_major_formatter('${x:1.0f}')

# Pearson correlation, shown as the cell output.
stats.pearsonr(data.income_per_capita, data.avg_house_wealth)

Interestingly, this correlation is stronger for the top 1% of household wealth.

In [None]:
# Average household wealth of the bottom 99% and the top 1% against income
# per capita, each with its own linear trendline.
fig, ax = plt.subplots()

# Both scatters are drawn before the trendlines so that the two legend labels
# below are assigned to the scatters.
ax.scatter(data.income_per_capita, data.avg_house_wealth_bot99)
ax.scatter(data.income_per_capita, data.avg_house_wealth_top1)
ax.trendline(data.income_per_capita, data.avg_house_wealth_bot99)
ax.trendline(data.income_per_capita, data.avg_house_wealth_top1)
ax.legend(['Bottom 99%', 'Top 1%'])
# The x axis shows income per capita (the label previously said "Wealth",
# which did not match the plotted data).
ax.set_xlabel('Net National Income per Capita')
ax.set_ylabel('Avg. Household Wealth')
ax.xaxis.set_major_formatter('${x:1.0f}')
ax.yaxis.set_major_formatter('${x:1.0f}')

# Pearson correlation of income with each wealth group, shown as the cell
# output.
stats.pearsonr(data.income_per_capita, data.avg_house_wealth_bot99), stats.pearsonr(data.income_per_capita, data.avg_house_wealth_top1)

The correlation is still present, but much weaker, for the bottom 99% of households. Notice the scale of the y-axis has changed tremendously.

In [None]:
# Bottom-99% household wealth on its own axes, with a linear trendline.
fig, ax = plt.subplots()

ax.scatter(data.income_per_capita, data.avg_house_wealth_bot99)
ax.trendline(data.income_per_capita, data.avg_house_wealth_bot99)

ax.set_xlabel('Net National Income per Capita')
ax.set_ylabel('Avg. Household Wealth')
# Both axes show whole-dollar amounts.
for dollar_axis in (ax.xaxis, ax.yaxis):
    dollar_axis.set_major_formatter('${x:1.0f}')