In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance, energy_distance
import matplotlib.pyplot as plt

In [2]:
# Directory the notebook was started from (first entry of IPython's
# directory history); the data folder sits next to it, one level up.
cwd = globals()['_dh'][0]
df = pd.read_csv(os.path.join(os.path.dirname(cwd), 'data', 'ysc0_distributions_by_district_bd.csv'))

# Keep rows that have a district and belong to the 1980 or 1990 `bd` group.
has_district = df.district.notnull()
in_target_bd = df.bd.isin([1980, 1990])
df = df[has_district & in_target_bd].reset_index(drop=True)

# Pivot `bd` from rows into a second column level, one row per
# (iso, district, ysc0, major_religion).
df = (
    df.set_index(['iso', 'district', 'ysc0', 'major_religion', 'bd'], drop=True)
    .unstack()
    .reset_index(drop=False)
)

# For these four countries the 1980 `wy01418` values are replaced by the 1990 ones.
use_1990 = df.iso.isin(['LBR', 'MLI', 'NGA', 'TGO'])
df.loc[use_1990, ('wy01418', 1980)] = df.loc[use_1990, ('wy01418', 1990)]

# Positional selection: the four id columns plus columns 4 and 6.
# NOTE(review): presumably these are ('wy0', 1980) and ('wy01418', 1980) --
# this depends on the CSV's column order; confirm before changing the input file.
df = df.iloc[:, [0, 1, 2, 3, 4, 6]]
df.columns = df.columns.get_level_values(0).tolist()

# Missing cells count as zero weight, then each weight column is rescaled so it
# sums to 1 within every (iso, district, major_religion) group.
# Groups whose total is 0 come out as NaN (0 / 0).
df = df.fillna(0)
cell_groups = df.groupby(['iso', 'district', 'major_religion'])
df['wy0'] = df['wy0'] / cell_groups['wy0'].transform('sum')
df['wy01418'] = df['wy01418'] / cell_groups['wy01418'].transform('sum')
df

Unnamed: 0,iso,district,ysc0,major_religion,wy0,wy01418
0,BEN,1.0,0,Christian,0.705882,0.663102
1,BEN,1.0,0,Muslim,0.866531,0.847437
2,BEN,1.0,0,Traditional,0.905167,0.911111
3,BEN,1.0,1,Christian,0.048682,0.053476
4,BEN,1.0,1,Muslim,0.031440,0.022646
...,...,...,...,...,...,...
66334,ZMB,2757.0,11,Christian,0.017875,0.017722
66335,ZMB,2757.0,11,Traditional,0.027027,0.027778
66336,ZMB,2757.0,12,Christian,0.030785,0.022785
66337,ZMB,2757.0,12,Traditional,0.094595,0.083333


In [3]:
# Lookup table with one row per distinct (iso, district) pair found in `df`.
iso_dists = (
    df.loc[:, ['iso', 'district']]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

In [4]:
def _christian_muslim_distance(metric, weights_by_religion, weight_col):
    """Distance between the Christian and Muslim distributions of one district.

    Parameters
    ----------
    metric : callable
        `wasserstein_distance` or `energy_distance` from `scipy.stats`.
    weights_by_religion : pandas.DataFrame
        Frame indexed by `ysc0` with two column levels
        (weight column, major_religion), as built in the loop below.
    weight_col : str
        Top-level column to use as weights ('wy0' or 'wy01418').

    Returns
    -------
    float
        The distance, or NaN when the district has no 'Christian' or no
        'Muslim' column (KeyError) or scipy rejects the weights, e.g. because
        they sum to zero (ValueError).
    """
    support = weights_by_religion.index
    try:
        by_religion = weights_by_religion[weight_col]
        return metric(support,
                      support,
                      u_weights=by_religion['Christian'].values,
                      v_weights=by_religion['Muslim'].values)
    except (KeyError, ValueError):
        # Only the two expected "cannot compare" cases become NaN; anything
        # else (including KeyboardInterrupt) now propagates instead of being
        # silently swallowed by a bare `except:`.
        return np.nan


outlist = []

# One pass over the groups instead of re-filtering the whole frame for every
# district; groupby yields the district keys in sorted order.
# NOTE(review): districts are keyed by `district` alone, which assumes the ids
# are unique across countries -- confirm against the input data.
for district, district_rows in df.groupby('district'):
    if district % 500 == 0:
        # Coarse progress indicator.
        print(district)

    # Rows: ysc0 values. Columns: (weight column, major_religion).
    # Combinations absent for a religion get weight 0.
    sub_df = (district_rows[['ysc0', 'major_religion', 'wy0', 'wy01418']]
              .set_index(['ysc0', 'major_religion'], drop=True)
              .unstack()
              .fillna(0))

    w_all = _christian_muslim_distance(wasserstein_distance, sub_df, 'wy0')
    w_1418 = _christian_muslim_distance(wasserstein_distance, sub_df, 'wy01418')
    e_all = _christian_muslim_distance(energy_distance, sub_df, 'wy0')
    e_1418 = _christian_muslim_distance(energy_distance, sub_df, 'wy01418')
    outlist.append([district, w_all, w_1418, e_all, e_1418])

500.0
1500.0
2500.0


In [5]:
# Turn the per-district results into a frame and attach each district's `iso`
# code (default inner join on `district`).
out = iso_dists.merge(
    pd.DataFrame(outlist, columns=['district', 'w_all', 'w_1418', 'e_all', 'e_1418']),
    on=['district'],
)

In [6]:
def _rank_within_country(frame, col):
    """Replace `col` by its 1-based rank within each `iso`, ascending.

    Returns a new frame sorted by (`iso`, `col`); the input frame is not
    modified. Rank 1 is the smallest value of `col` in that country.
    """
    ranked = frame.sort_values(['iso', col])
    ranked[col] = ranked.groupby('iso').cumcount() + 1
    return ranked


# Work on explicit copies so the column assignments below never write into a
# view of `out`. Districts with a missing distance are dropped per variant.
out_all = out[['iso', 'district', 'w_all', 'e_all']].dropna().copy()
out_1418 = out[['iso', 'district', 'w_1418', 'e_1418']].dropna().copy()

# Keep the raw Wasserstein distance before `w_all` is overwritten by its rank.
out_all['wass_all'] = out_all['w_all']
out_all = _rank_within_country(out_all, 'w_all')
out_all = _rank_within_country(out_all, 'e_all')
# Number of ranked districts in the country.
out_all['n_all'] = out_all.groupby('iso')['iso'].transform('count')
# Global (cross-country) rank by raw Wasserstein distance.
out_all = out_all.sort_values(['wass_all'])
out_all['w_all_glob'] = np.arange(out_all.shape[0]) + 1

out_1418 = _rank_within_country(out_1418, 'w_1418')
out_1418 = _rank_within_country(out_1418, 'e_1418')
out_1418['n_1418'] = out_1418.groupby('iso')['iso'].transform('count')

# Outer join so a district ranked in only one variant is still exported.
out = pd.merge(out_all, out_1418, on=['iso', 'district'], how='outer')

out = out.set_index(['iso', 'district'], drop=True)
# The raw distance and the global rank stay in `out` but are not written to disk.
out.drop(['wass_all', 'w_all_glob'], axis=1).to_csv(os.path.join(os.path.dirname(cwd), 'data', '_distribution_similarity_y0_mc_bycountry.csv'))