# World Inequality Database Analysis

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

## Data Preparation

The `WID_countries.csv` file contains all the country codes used to label the country-specific `csv`s as well as some metadata on countries.

In [None]:
# Only empty fields count as missing here: pandas' built-in NA markers are
# switched off, so literal text such as "NA" in a column stays a string.
countries = pd.read_csv(
    'source_data/WID_countries.csv',
    na_values=[''],
    keep_default_na=False,
    sep=';',
)

The variables we are interested in are as follows. For more information on each variable, see the [WID Codes Dictionary](https://wid.world/codes-dictionary/) and the [README](./source_data/README.md).

In [None]:

# Each entry is (DataFrame column name, WID variable code, WID percentile code).
# The order matters: it fixes the column order of the assembled data set.
variables = [
    # Exchange rate used to convert local currency into USD.
    ('lcu_per_usd', 'xlcusxi999', 'p0p100'),

    # Headline per-capita aggregates.
    ('income_per_capita', 'anninci999', 'p0p100'),
    ('gdp_per_capita', 'agdproi999', 'p0p100'),

    # Primary and secondary income, by institutional sector.
    ('avg_primary_house_npish_income', 'aprihni999', 'p0p100'),
    ('avg_secondary_house_npish_income', 'asechni999', 'p0p100'),
    ('avg_primary_corp_income', 'apricoi999', 'p0p100'),
    ('avg_secondary_corp_income', 'aseccoi999', 'p0p100'),
    ('avg_primary_govt_income', 'aprigoi999', 'p0p100'),
    ('avg_secondary_govt_income', 'asecgoi999', 'p0p100'),

    # Household wealth: same variable code, three percentile ranges.
    ('avg_house_wealth', 'ahwealj992', 'p0p100'),
    ('avg_house_wealth_bot99', 'ahwealj992', 'p0p99'),
    ('avg_house_wealth_top1', 'ahwealj992', 'p99p100'),

    # Social security benefits.
    ('ss_benefits', 'assbhni999', 'p0p100'),

    # Labor and capital shares of income.
    ('labor_share_income', 'wlabshi999', 'p0p100'),
    ('capital_share_income', 'wcapshi999', 'p0p100'),
]

Using the country codes listed in `WID_countries.csv` we're able to collect all the necessary information from the individual `WID_data_XX.csv` files, where `XX` is the country code. For more information on these files see the [README](./source_data/README.md). In these files, each variable is a record, and there's no guarantee that any given country will have that variable defined. Moreover, different countries may have these variables defined for different years. We want to obtain the highest possible number of data points. To this end, we will have to inspect each country to see which variables are defined and for which years, and select the year that maximizes the number of data points.

In [None]:
# Maps each candidate year to the list of country codes that have every
# variable of interest defined for that year (the years are the keys, the
# lists of countries are the values).
years = {year: [] for year in range(1800, 2023)}

for country_code in countries.alpha2:
    country_data = pd.read_csv(
        f'source_data/WID_data_{country_code}.csv',
        sep=';',
        keep_default_na=False,
        na_values=['']
    )

    def get_variable_years(variable, percentile):
        """Return the years in which the current country has `variable`
        recorded for `percentile`.

        Reads the `country_data` frame loaded for this loop iteration.
        """
        values = country_data.loc[
            (country_data.variable == variable)
            & (country_data.percentile == percentile)
        ]

        return list(values['year'])

    # Start from every candidate year and keep only the years in which each
    # variable is defined; what survives has all variables defined.
    years_defined = set(years)

    for _, variable, percentile in variables:
        years_defined = years_defined.intersection(get_variable_years(variable, percentile))
        if not years_defined:
            # No year can satisfy the remaining variables either.
            break

    for year in years_defined:
        years[year].append(country_code)

# Pick the year covered by the most countries; when several years tie, take
# the most recent one. The maximum is computed once rather than once per year.
most_countries = max(len(codes) for codes in years.values())
best_year = max(
    year for year, codes in years.items() if len(codes) == most_countries
)

We'll be working with a subset of the available countries:

In [None]:
# Table of the countries retained for `best_year`: code plus short name,
# looked up in `countries` by alpha-2 code. As the last expression of the
# cell, the resulting frame is the cell's output.
(
    pd.DataFrame(years[best_year], columns=['Country Code'])
    .join(
        countries.set_index('alpha2')[['shortname']]
        .rename(columns={'shortname': 'Country Name'}),
        on='Country Code',
    )
)

Now we can go through the countries of the best year and obtain the relevant data for that country, converting variable records into `DataFrame` columns.

In [None]:
# One record per country: the country code followed by the `best_year` value
# of every variable, in the same order as `variables`.
records = []

for country_code in years[best_year]:
    country_data = pd.read_csv(
        f'source_data/WID_data_{country_code}.csv',
        sep=';',
        keep_default_na=False,
        na_values=['']
    )

    def read_variable(variable, percentile):
        """Return the current country's `best_year` value of `variable` at
        `percentile`, or None when no such record exists.

        Raises:
            ValueError: if more than one matching record is found, since
                picking one of them silently would hide a data problem.
        """
        values = country_data.loc[
            (country_data.variable == variable)
            & (country_data.percentile == percentile)
            & (country_data.year == best_year)
        ]

        if values.empty:
            return None

        # Explicit check instead of `assert`, which is skipped under
        # `python -O` and gives no hint about the offending record.
        if len(values) != 1:
            raise ValueError(
                f'expected exactly one {variable}/{percentile} record for '
                f'{country_code} in {best_year}, found {len(values)}'
            )
        return values.iloc[0].value

    records.append(
        [country_code]
        + [read_variable(variable, percentile) for _, variable, percentile in variables]
    )

# Turn the records into a frame with one named column per variable.
data = pd.DataFrame.from_records(
    records,
    columns=['country'] + [col for col, _, _ in variables]
)

All the monetary variables for each country are given in LCU (local currency units). We have to use the exchange rate into USD to normalize these variables into USD; the labor and capital shares of income are ratios and are left unchanged.

In [None]:
def norm_col_currency(df, col):
    """Divide column `col` of `df` by the `lcu_per_usd` column, in place.

    Each value is divided by the exchange rate found in the same row.
    Nothing is returned; `df` itself is modified, so calling this twice for
    the same column divides that column twice.
    """
    exchange_rate = df.lcu_per_usd
    local_amounts = df[col]
    df[col] = local_amounts / exchange_rate

# Convert every monetary column to USD. The exchange-rate column itself and
# the labor/capital share columns are not in this list and stay untouched.
for monetary_column in (
    'income_per_capita',
    'gdp_per_capita',
    'avg_primary_house_npish_income',
    'avg_secondary_house_npish_income',
    'avg_primary_corp_income',
    'avg_secondary_corp_income',
    'avg_primary_govt_income',
    'avg_secondary_govt_income',
    'avg_house_wealth',
    'avg_house_wealth_bot99',
    'avg_house_wealth_top1',
    'ss_benefits',
):
    norm_col_currency(data, monetary_column)

## Data Analysis

Unsurprisingly, net national income is correlated with average primary and secondary incomes of the household & NPISH, corporate, and government sectors.

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12.8, 4.8), sharex=True, sharey=True)

fig.supxlabel('Net National Income per Capita')

national_income = data.income_per_capita

# One entry per panel: (axes, title, y-axis label, sector columns).
# The sector columns are listed in the same order as the legend labels below.
panels = [
    (ax1, 'Primary Income', 'Avg. Primary Income', [
        'avg_primary_house_npish_income',
        'avg_primary_corp_income',
        'avg_primary_govt_income',
    ]),
    (ax2, 'Secondary Income', 'Avg. Secondary Income', [
        'avg_secondary_house_npish_income',
        'avg_secondary_corp_income',
        'avg_secondary_govt_income',
    ]),
]

for ax, title, y_label, sector_columns in panels:
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.xaxis.set_major_formatter('${x:1.0f}')
    ax.yaxis.set_major_formatter('${x:1.0f}')

    # All scatters are drawn before any trend line. The legend below is given
    # labels only, so the order in which artists are added is kept unchanged.
    for column in sector_columns:
        ax.scatter(national_income, data[column])

    # Degree-1 least-squares fit of each sector's income against national income.
    for column in sector_columns:
        slope, intercept = np.polyfit(national_income, data[column], deg=1)
        ax.plot(national_income, slope * national_income + intercept)

fig.legend(['Household & NPISH', 'Corporate', 'Government'])

There doesn't appear to be any correlation between GDP per capita and labor or capital shares of national income.

In [None]:
fig, ax = plt.subplots()

gdp = data.gdp_per_capita
labor_share = data.labor_share_income
capital_share = data.capital_share_income

ax.scatter(gdp, labor_share)
ax.scatter(gdp, capital_share)
ax.legend(['Labor', 'Capital'])

ax.set_xlabel('GDP per Capita')
ax.xaxis.set_major_formatter('${x:1.0f}')
ax.set_ylabel('Share of National Income')
# PercentFormatter(1) renders a value of 1 as 100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))

# Pearson correlation of GDP per capita with each share (labor first, then
# capital); as the last expression, this pair is the cell's output.
tuple(stats.pearsonr(gdp, share) for share in (labor_share, capital_share))

There is a positive correlation between GDP per capita and social security benefits.

In [None]:
fig, ax = plt.subplots()

gdp = data.gdp_per_capita
benefits = data.ss_benefits

ax.scatter(gdp, benefits)

# Degree-1 least-squares fit of benefits against GDP per capita.
slope, intercept = np.polyfit(gdp, benefits, deg=1)
ax.plot(gdp, slope * gdp + intercept)

ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Social Security Benefits')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter('${x:1.0f}')

# Pearson correlation, shown as the cell's output.
stats.pearsonr(gdp, benefits)

There is a correlation between average household wealth and GDP per capita.

In [None]:
fig, ax = plt.subplots()

gdp = data.gdp_per_capita
household_wealth = data.avg_house_wealth

ax.scatter(gdp, household_wealth)

# Degree-1 least-squares fit of average household wealth against GDP per capita.
slope, intercept = np.polyfit(gdp, household_wealth, deg=1)
ax.plot(gdp, slope * gdp + intercept)

ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Avg. Household Wealth')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter('${x:1.0f}')

# Pearson correlation, shown as the cell's output.
stats.pearsonr(gdp, household_wealth)

Interestingly, this correlation is stronger for the top 1% of household wealth.

In [None]:
fig, ax = plt.subplots()

gdp = data.gdp_per_capita
bottom_99 = data.avg_house_wealth_bot99
top_1 = data.avg_house_wealth_top1

# Label the scatters themselves so the legend entries are tied to them
# explicitly rather than to whichever artists happen to come first.
ax.scatter(gdp, bottom_99, label='Bottom 99%')
ax.scatter(gdp, top_1, label='Top 1%')

# Degree-1 least-squares trend line per group, in the same order as the
# scatters. The lines carry no label, so they do not appear in the legend.
for wealth in (bottom_99, top_1):
    reg_m, reg_b = np.polyfit(gdp, wealth, deg=1)
    ax.plot(gdp, reg_m * gdp + reg_b)

ax.legend()
# The x values are GDP per capita, matching the other figures' axis label.
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Avg. Household Wealth')
ax.xaxis.set_major_formatter('${x:1.0f}')
ax.yaxis.set_major_formatter('${x:1.0f}')

# Pearson correlation of GDP per capita with each group's average wealth
# (bottom 99% first, then top 1%), shown as the cell's output.
stats.pearsonr(gdp, bottom_99), stats.pearsonr(gdp, top_1)

The correlation is still present, but much weaker, for the bottom 99% of households. Notice the scale of the y-axis has changed tremendously.

In [None]:
fig, ax = plt.subplots()

gdp = data.gdp_per_capita
bottom_99_wealth = data.avg_house_wealth_bot99

ax.scatter(gdp, bottom_99_wealth)

# Degree-1 least-squares fit of the bottom 99%'s average wealth against GDP
# per capita.
slope, intercept = np.polyfit(gdp, bottom_99_wealth, deg=1)
ax.plot(gdp, slope * gdp + intercept)

ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Avg. Household Wealth')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter('${x:1.0f}')