In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance, energy_distance
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Resolve the notebook's starting directory from IPython's directory history;
# the input CSV lives in the `data` folder that sits beside that directory.
cwd = globals()['_dh'][0]
csv_path = os.path.join(os.path.dirname(cwd), 'data', 'ysc0_distributions_by_district_bd.csv')
df = pd.read_csv(csv_path)

# Keep only rows that carry both a district and a bd value, then zero-fill
# every remaining missing entry.
has_keys = df.district.notnull() & df.bd.notnull()
df = df[has_keys].reset_index(drop=True).fillna(0)

# Rescale each weight column so that it sums to one within every
# (iso, district, bd, major_religion) group.
cell_keys = ['iso', 'district', 'bd', 'major_religion']
for weight_col in ['wy0', 'wy01418']:
    df[weight_col] = df[weight_col] / df.groupby(cell_keys)[weight_col].transform('sum')

In [3]:
# One row per distinct (iso, district) pair; used later to attach the iso code
# to per-district results.
iso_dists = (
    df.loc[:, ['iso', 'district']]
      .drop_duplicates()
      .reset_index(drop=True)
)

In [4]:
def _religion_distance(dist_table, weight_col, other_religion, base_religion='Christian'):
    """Wasserstein distance between two religions' weighted ysc0 distributions.

    Parameters
    ----------
    dist_table : pandas.DataFrame
        Table indexed by ysc0 whose columns are a (weight column, major_religion)
        MultiIndex, as produced by the ``set_index(...).unstack()`` step below.
    weight_col : str
        Top-level column holding the weights ('wy0' or 'wy01418').
    other_religion : str
        Religion compared against ``base_religion``.
    base_religion : str, default 'Christian'
        Reference religion.

    Returns
    -------
    float
        The distance, or ``np.nan`` when it cannot be computed: one of the
        religions has no column in ``dist_table`` (KeyError), or scipy rejects
        the weights, e.g. because they sum to zero (ValueError).
    """
    try:
        weights = dist_table[weight_col]
        return wasserstein_distance(dist_table.index,
                                    dist_table.index,
                                    u_weights=weights[base_religion].values,
                                    v_weights=weights[other_religion].values)
    # Only the two expected failure modes are mapped to NaN. A bare `except:`
    # would also swallow KeyboardInterrupt and hide genuine bugs.
    except (KeyError, ValueError):
        return np.nan


outlist = []

# groupby visits districts in sorted order and hands over each district's rows
# directly, instead of re-scanning the whole frame once per district.
for district, sub_df in df.groupby('district'):
    if district % 500 == 0:
        print(district)  # coarse progress indicator
    for bd, bd_df in sub_df.groupby('bd'):
        # Rows: ysc0 values. Columns: (weight column, major_religion).
        # Combinations absent from the data get weight 0.
        dist_table = (bd_df[['ysc0', 'major_religion', 'wy0', 'wy01418']]
                      .set_index(['ysc0', 'major_religion'], drop=True)
                      .unstack()
                      .fillna(0))

        w_all_m = _religion_distance(dist_table, 'wy0', 'Muslim')
        w_all_t = _religion_distance(dist_table, 'wy0', 'Traditional')
        w_1418_m = _religion_distance(dist_table, 'wy01418', 'Muslim')
        w_1418_t = _religion_distance(dist_table, 'wy01418', 'Traditional')

        outlist.append([district, bd, w_all_m, w_all_t, w_1418_m, w_1418_t])

500.0
1500.0
2500.0


In [5]:
# Turn the collected per-(district, bd) rows into a frame and attach the iso
# code of each district.
result_columns = ['district', 'bd', 'w_all_m', 'w_all_t', 'w_1418_m', 'w_1418_t']
distances = pd.DataFrame(outlist, columns=result_columns)
out = iso_dists.merge(distances, on=['district'])

In [6]:
def _rank_distances(distances, suffix):
    """Rank the rows of one distance measure within bd and within (bd, iso).

    Parameters
    ----------
    distances : pandas.DataFrame
        Frame with columns 'iso', 'district', 'bd' and 'w_<suffix>'.
    suffix : str
        Measure identifier, e.g. 'all_m'; selects the 'w_<suffix>' column and
        names the generated columns.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-missing distance, sorted by (bd, iso, distance), with
        three added columns:
        'r_<suffix>'      1-based rank by ascending distance within bd,
        'r_<suffix>_byc'  1-based rank by ascending distance within (bd, iso),
        'n_<suffix>_byc'  number of rows in the (bd, iso) group.
    """
    w_col = 'w_' + suffix
    ranked = distances[['iso', 'district', 'bd', w_col]].dropna()

    # Select the grouped column explicitly. The previous `.agg('iso')` appears
    # to return the same grouped column only because pandas falls back to
    # attribute lookup for an unknown string aggregation.
    ranked = ranked.sort_values(['bd', w_col])
    ranked['r_' + suffix] = ranked.groupby('bd')['iso'].cumcount() + 1

    ranked = ranked.sort_values(['bd', 'iso', w_col])
    ranked['r_' + suffix + '_byc'] = ranked.groupby(['bd', 'iso'])['iso'].cumcount() + 1
    ranked['n_' + suffix + '_byc'] = ranked.groupby(['bd', 'iso'])['iso'].transform('count')
    return ranked


out_all_m = _rank_distances(out, 'all_m')
out_all_t = _rank_distances(out, 'all_t')
out_1418_m = _rank_distances(out, '1418_m')
out_1418_t = _rank_distances(out, '1418_t')

# Outer merges keep a (iso, district, bd) row even when only some of the four
# measures could be computed for it.
merge_keys = ['iso', 'district', 'bd']
out = pd.merge(out_all_m, out_all_t, on=merge_keys, how='outer')
out = pd.merge(out, out_1418_m, on=merge_keys, how='outer')
out = pd.merge(out, out_1418_t, on=merge_keys, how='outer')

out = out.set_index(['bd', 'district'], drop=True)
out.to_csv(os.path.join(os.path.dirname(cwd), 'data', '_distribution_similarity_y0_mct_bd.csv'))