In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8' prefix in
# matplotlib 3.6 and the bare 'seaborn' alias was removed in 3.8. Prefer the new
# name when this matplotlib ships it, and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
mpl.rcParams['font.family'] = 'serif'
import os

In [2]:
# Project directories, all derived from the folder recorded as the first entry of
# `_dh` (IPython's directory history; presumably the notebook's folder).
current_folder = globals()['_dh'][0]

# The project root is the grandparent of that folder.
rootdir = os.path.dirname(
    os.path.dirname(current_folder)
)

# Inputs: raw ethnicity correspondence files and intermediate data.
indir_prepro = os.path.join(
    rootdir, '_1_preprocessing', 'data', 'raw', 'eth'
)
indir_inter = os.path.join(
    rootdir, '_2_intermediate', 'data'
)

# Output: data behind the figures and tables.
outdir = os.path.join(
    rootdir, '_3_figures_tables', 'data'
)

In [3]:
# Observation counts (`nobs`) keyed by iso, bd, major_religion and eth_hrm.
df = pd.read_csv(os.path.join(indir_inter, '_nobs_bd_eth_rel.csv'))

# Lookup from (iso, eth_hrm) to a readable ethnicity name: keep rows with a
# non-missing code and exactly one entry per (iso, eth_hrm) pair.
ethconc = pd.read_csv(os.path.join(indir_prepro, 'all_iso_eth_correspondence.csv'))
ethconc = (
    ethconc.loc[ethconc['eth_hrm'].notna(), ['iso', 'eth_hrm', 'eth_hrm_name']]
    .drop_duplicates(subset=['iso', 'eth_hrm'])
    .reset_index(drop=True)
)

# Attach the ethnicity name (unmatched codes get NaN) and keep only the columns
# used downstream.
df = df.merge(ethconc, on=['iso', 'eth_hrm'], how='left')
df = df[['iso', 'bd', 'major_religion', 'eth_hrm_name', 'nobs']].copy()

In [4]:
# --- Counts per (iso, major_religion, eth_hrm_name) ---------------------------
# All `bd` values pooled: sum observations over everything else.
df_all = (
    df.groupby(['iso', 'major_religion', 'eth_hrm_name'])['nobs']
    .sum()
    .reset_index()
)
# Only the rows with bd == 1980 (not re-aggregated), without the `bd` column.
df_1980 = (
    df.loc[df['bd'] == 1980]
    .drop(columns='bd')
    .reset_index(drop=True)
)

# --- Total observations per (iso, eth_hrm_name) --------------------------------
# Both totals are stored under the column name 'neth_all'.
neth_all = (
    df_all.groupby(['iso', 'eth_hrm_name'])['nobs']
    .sum()
    .rename('neth_all')
    .reset_index()
)
neth_1980 = (
    df_1980.groupby(['iso', 'eth_hrm_name'])['nobs']
    .sum()
    .rename('neth_all')
    .reset_index()
)

# --- Share of each religion within its (iso, eth_hrm_name) group ---------------
df_all = df_all.merge(neth_all, on=['iso', 'eth_hrm_name'], how='inner')
df_1980 = df_1980.merge(neth_1980, on=['iso', 'eth_hrm_name'], how='inner')

df_all['shr'] = df_all['nobs'].div(df_all['neth_all'])
df_1980['shr'] = df_1980['nobs'].div(df_1980['neth_all'])

In [5]:
# Persist both share tables. Paths are built with os.path.join, consistent with
# the directory setup and with how 'eth_relshares_allbd.csv' is read back later.
df_all.to_csv(os.path.join(outdir, 'eth_relshares_allbd.csv'), index=False)
df_1980.to_csv(os.path.join(outdir, 'eth_relshares_1980bd.csv'), index=False)

In [6]:
# Squared within-ethnicity religion shares: the summands of the Herfindahl index.
df_all['shr2'] = df_all['nobs'].div(df_all['neth_all']).pow(2)
df_1980['shr2'] = df_1980['nobs'].div(df_1980['neth_all']).pow(2)

In [7]:
# Herfindahl index of religion shares within each (iso, eth_hrm_name) group:
# the sum of squared shares over religions.
#
# Grouping is done by column name so the keys always come from the frame being
# aggregated. The previous version keyed the 1980 aggregation on `df_all.iso`,
# which at that point is the already-collapsed all-`bd` frame; pandas aligns such
# a Series key to `df_1980` by row index, so rows were labelled with the wrong
# country and rows beyond its length were dropped.
df_all = df_all.groupby(['iso', 'eth_hrm_name'])['shr2'].sum().reset_index()
df_1980 = df_1980.groupby(['iso', 'eth_hrm_name'])['shr2'].sum().reset_index()

In [8]:
# Show the summed squared shares (`shr2`) per (iso, eth_hrm_name).
df_all

Unnamed: 0,iso,eth_hrm_name,shr2
0,BEN,Adja,0.420562
1,BEN,Bariba,0.441589
2,BEN,Dendi,0.951218
3,BEN,Fon,0.486362
4,BEN,Otamari,0.331818
...,...,...,...
190,ZMB,Nyanja,0.849888
191,ZMB,Other African,0.666310
192,ZMB,Tonga,0.881185
193,ZMB,Tumbuka,0.821130


In [9]:
# Histogram of the within-ethnicity Herfindahl index, all `bd` values pooled.
f, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_all['shr2'], bins=30, edgecolor='k')

# Axis cosmetics: labels and ticks share one font size; the index lies in [0, 1].
ax.set_xlabel('within-ethnicity Herfindahl index of religion shares', fontsize=15)
ax.set_ylabel('number of ethnic groups', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=15)
ax.set_xlim(0, 1)

# The figure is closed below without being shown or saved; re-enable one of the
# commented lines to get output.
# NOTE(review): the savefig path concatenates `outdir` and the file name without
# a path separator -- confirm the intended location before re-enabling.
# plt.show()
# f.savefig(outdir + '_3_herfindahl_eth_relshares_all_bd.pdf', bbox_inches='tight')
plt.close(f)

In [10]:
# Histogram of the within-ethnicity Herfindahl index, restricted to bd == 1980.
f, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_1980['shr2'], bins=30, edgecolor='k')

# Axis cosmetics: labels and ticks share one font size; the index lies in [0, 1].
ax.set_xlabel('within-ethnicity Herfindahl index of religion shares', fontsize=15)
ax.set_ylabel('number of ethnic groups', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=15)
ax.set_xlim(0, 1)

# The figure is closed below without being shown or saved; re-enable one of the
# commented lines to get output.
# NOTE(review): the savefig path concatenates `outdir` and the file name without
# a path separator -- confirm the intended location before re-enabling.
# plt.show()
# f.savefig(outdir + '_3_herfindahl_eth_relshares_1980_bd.pdf', bbox_inches='tight')
plt.close(f)

# Summary stats

In [11]:
# Reload the exported all-`bd` share table, keeping only the identifying columns.
df = pd.read_csv(os.path.join(outdir, 'eth_relshares_allbd.csv'))
df = df[['iso', 'major_religion', 'eth_hrm_name']].copy()

# neth: non-missing ethnicity names per (iso, major_religion) group.
# nrel: non-missing religions per (iso, eth_hrm_name) group.
df = df.assign(
    neth=df.groupby(['iso', 'major_religion'])['eth_hrm_name'].transform('count'),
    nrel=df.groupby(['iso', 'eth_hrm_name'])['major_religion'].transform('count'),
)

In [12]:
# Summary statistics of `nrel`, using one row per (iso, eth_hrm_name) pair.
eth = (
    df.drop_duplicates(subset=['iso', 'eth_hrm_name'])[['nrel']]
    .agg(['min', 'median', 'max', 'mean', 'std'])
)

In [13]:
# Show the summary statistics of `nrel` (religions counted per iso/ethnicity pair).
eth

Unnamed: 0,nrel
min,3.0
median,5.0
max,5.0
mean,4.738462
std,0.452151


In [14]:
# Summary statistics of `neth`, using one row per (iso, major_religion) pair.
rel = (
    df.drop_duplicates(subset=['iso', 'major_religion'])[['neth']]
    .agg(['min', 'median', 'max', 'mean', 'std'])
)

In [15]:
# Show the summary statistics of `neth` (ethnicities counted per iso/religion pair).
rel

Unnamed: 0,neth
min,8.0
median,12.0
max,21.0
mean,13.014085
std,4.148986
