In [1]:
import numpy as np
import pandas as pd
import re
import country_converter as coco
from textab import TexTab
import os

In [2]:
# Locate the project directories relative to where this notebook lives.
# IPython keeps its directory history in the `_dh` global; the first entry is
# the directory the kernel started in. Outside IPython (e.g. when the notebook
# is exported to a plain script) `_dh` does not exist, so fall back to the
# current working directory instead of failing with a KeyError.
current_folder = globals().get('_dh', [os.getcwd()])[0]
# The project root sits two levels above the notebook's folder.
rootdir = os.path.dirname(os.path.dirname(current_folder))
# Input directory read by the cells below.
wdir = os.path.join(rootdir, '_2_intermediate', 'data')
# Output directory the finished table is written to.
outdir = os.path.join(rootdir, '_3_figures_tables', 'data')

In [3]:
# Three-letter country codes of the 21 countries kept in the table.
ISO = [
    "BEN", "BFA", "BWA", "CMR", "EGY", "ETH", "GHA",
    "GIN", "LBR", "MLI", "MOZ", "MUS", "MWI", "NGA",
    "RWA", "SEN", "SLE", "TGO", "UGA", "ZAF", "ZMB",
]

# Population file: keep only the 1980 rows of the countries listed above.
pop = pd.read_csv(os.path.join(wdir, 'pop_world.csv'))
in_sample_1980 = (pop['year'] == 1980) & pop['iso'].isin(ISO)
pop = pop.loc[in_sample_1980].reset_index(drop=True)

# Store the 1980 population as an integer column and drop the columns that are
# no longer needed, leaving one `pop1980` value per `iso`.
pop['pop1980'] = pop['pop'].astype('int')
pop = pop.drop(columns=['year', 'pop'])
pop

Unnamed: 0,iso,pop1980
0,BEN,3717165
1,BFA,6822843
2,BWA,897868
3,CMR,8621406
4,EGY,43309063
5,ETH,35141712
6,GHA,11056116
7,GIN,4871435
8,LBR,1853001
9,MLI,7090126


In [4]:
# Observation counts per census; the n50 / n80 columns are not used in this table.
df = pd.read_csv(os.path.join(wdir, '_nobs_cens_condensed.csv')).drop(columns=['n50', 'n80'])

# Counts that are merged in by country and census year (ndist, neth).
de = pd.read_csv(os.path.join(wdir, '_ndist_eth.csv'))

# Left joins: every census row is kept even if nothing matches on the right.
df = df.merge(de, on=['iso', 'year'], how='left')
df = df.merge(pop, on=['iso'], how='left')

# Cross-check totals: take each country's maximum and add the maxima up, then
# append the summed 1980 population. The summed `year` entry is a by-product
# of the aggregation and carries no meaning.
de = de.groupby(de['iso']).max().sum()
de['pop'] = pop['pop1980'].sum()
de

year         42204.0
ndist         2285.0
neth           222.0
pop      279347780.0
dtype: float64

In [5]:
# Replace the ISO code with the short country name and put it in the first column.
df.insert(0, 'country', coco.convert(names=list(df['iso']), to='name_short'))
df = df.drop(columns='iso')
df = df.sort_values(['country', 'year']).reset_index(drop=True).copy(deep=True)
# Values left missing by the left merges become 0 (zero cells are blanked again
# when the LaTeX string is post-processed further down).
df = df.fillna(0)

# Build the 'total' row: column sums of everything after country / year.
tot = pd.DataFrame(df.iloc[:, 2:].sum(axis=0)).T
cols = list(tot.columns)
tot['country'] = 'total'
tot['year'] = ''
tot = tot[['country', 'year'] + cols]
df = pd.concat([df, tot], axis=0).reset_index(drop=True)
df['year'] = df['year'].astype('str')

# Summing ndist / neth / pop1980 over census rows counts a country once per
# census, so the totals of these columns are overwritten with fixed values.
# They are addressed by column name rather than by position, so a change in
# column order cannot silently put a number into the wrong cell.
total_overrides = {
    # this is the correct number of non-nan districts (can be obtained from
    # counting the number of unique non-nan districts in _F_dist_religion_bch10.csv)
    'ndist': 2286,
    # this is the correct number of ethnic groups
    'neth': 222,
    # NOTE(review): the source of this figure is not documented in this notebook;
    # the cross-check cell above reports 279,347,780 as the sum of pop1980 -- confirm.
    'pop1980': 278381741,
}
total_row = df.index[-1]
for col, value in total_overrides.items():
    df.loc[total_row, col] = value

# All count columns are shown as integers.
df = df.astype({col: 'int' for col in df.columns[2:]})

In [6]:
# Inspect the assembled table: one row per country-census plus the final 'total' row.
df

Unnamed: 0,country,year,n,nim18,nim25,nexposure,ndist,neth,pop1980
0,Benin,1992.0,255736,34784,65040,4928,77,9,3717165
1,Benin,2002.0,373452,57364,104331,7795,77,9,3717165
2,Benin,2013.0,559525,93329,170580,9227,77,9,3717165
3,Botswana,2001.0,109509,16077,29119,0,21,8,897868
4,Botswana,2011.0,138094,14276,28817,0,21,8,897868
5,Burkina Faso,1996.0,552062,95669,157808,0,45,0,6822843
6,Burkina Faso,2006.0,770161,123364,211275,0,45,12,6822843
7,Cameroon,2005.0,1003327,162672,295388,14650,204,0,8621406
8,Egypt,1986.0,4261935,693275,1345068,40708,173,0,43309063
9,Egypt,1996.0,3810835,695795,1230963,16798,231,0,43309063


In [7]:
# Wrap the table in a TexTab object; its create_tab_onepanel method is used
# below to produce the LaTeX string.
tt = TexTab(df)

In [8]:
# Settings passed to tt.create_tab_onepanel in the next cell.
# Column-number header: blank for country / year, then (1)-(7) for the counts.
cns = ['', '', '(1)', '(2)', '(3)', '(4)', '(5)', '(6)', '(7)']
# presumably the left-aligned label columns (the rendered tabular starts with `ll`) -- confirm against TexTab
lcols = ['country', 'year']
# presumably inserts a spacer column after `year` (the rendered table has one) -- confirm against TexTab
gaps = ['year']
cap = "Observations by Country and Census"
lab = "tab:_nobs_condensed"
# presumably marks the 'total' row for a separating rule -- confirm against TexTab
lines = ['total']
# Table notes; typos fixed ("oberve" -> "observe", "unique district" -> "unique districts").
notes = '''This table reports the number of observations by census. Column (1) shows 
the total number of observations for individuals aged 14+ for whom we observe religion
as well as their own educational attainment. Column (2) shows the number of individuals 
aged 14-18 for whom we observe religion, their own educational attainment, as well as 
the educational attainment of the previous generation in the family. Column (3) is the
same as column (2) but for individuals aged 14-25. Column (4) shows the number of 
individuals that enter in the exposure effects regressions. Column (5) shows the number
of unique districts in that census-year, column (6) shows the number of unique ethnic
groups and column (7) shows the country's population in 1980, which we use for weighting.
'''

In [9]:
# Produce the one-panel table as a string from the settings defined above.
tab = tt.create_tab_onepanel(
    cns=cns,
    lcols=lcols,
    gaps=gaps,
    cap=cap,
    lab=lab,
    lines=lines,
    notes=notes,
    clines=False,
)

In [10]:
# Post-process the LaTeX string produced by TexTab.

# Remove a header row of the form `&&&(1)...` if one is present.
# NOTE(review): in the printed output the numbering row starts with
# `&&\;\;...&(1)`, which this pattern does not match, so that row is kept.
# The table notes refer to columns by these numbers -- confirm this is intended.
tab = re.sub(r'&&&\(1\).+\n', r'', tab)

# Second header row with symbolic column names. These pieces are used as an
# re.sub replacement template, so literal backslashes are escaped for the
# template in addition to Python's own string escaping.
nl1 = '\\\\hline\n'
nl2 = '\\\\multicolumn{1}{l}{country}&\\\\multicolumn{1}{l}{year}&\\\\multicolumn{1}{l}{}'
nl3a = '&$N^{14+}$&$N^{14-18}_{\\\\text{olded}}$&$N^{14-25}_{\\\\text{olded}}$&$N^{14-25}_{\\\\text{exposure}}$'
nl3b = '&$N_{dist}$&$N_{eth}$&$\\\\text{pop}_{1980}$\\\\\\\n'
nl3 = nl3a + nl3b
# Insert the header just before the rule that precedes the first data row
# (the pattern is anchored on the first country, Benin).
tab = re.sub(r'(\\hline\nBenin)', r'{}{}{}\1'.format(nl1, nl2, nl3), tab)

# Blank out cells whose entire content is 0, both inside a row (`&0&`) and at
# the end of a row (`&0\\`). Lookarounds leave the delimiters unconsumed, so
# adjacent zero cells such as `&0&0&` are all blanked; a plain `&0&` pattern
# consumes the trailing `&` and would skip every second zero in such a run.
tab = re.sub(r'(?<=&)0(?=&|\\\\)', '', tab)
print(tab)

\begin{table}[ht!]
\singlespacing
\centering
\caption{Observations by Country and Census}
\label{tab:_nobs_condensed}
\resizebox{\columnwidth}{!}{
\begin{tabular}{llcccccccc}
\hline
\hline
&&\;\;\;\;\;\;\;\;\;\;\;&(1)&(2)&(3)&(4)&(5)&(6)&(7)\\
\hline
\multicolumn{1}{l}{country}&\multicolumn{1}{l}{year}&\multicolumn{1}{l}{}&$N^{14+}$&$N^{14-18}_{\text{olded}}$&$N^{14-25}_{\text{olded}}$&$N^{14-25}_{\text{exposure}}$&$N_{dist}$&$N_{eth}$&$\text{pop}_{1980}$\\
\hline
Benin&1992&&255,736&34,784&65,040&4,928&77&9&3,717,165\\
Benin&2002&&373,452&57,364&104,331&7,795&77&9&3,717,165\\
Benin&2013&&559,525&93,329&170,580&9,227&77&9&3,717,165\\
Botswana&2001&&109,509&16,077&29,119&&21&8&897,868\\
Botswana&2011&&138,094&14,276&28,817&&21&8&897,868\\
Burkina Faso&1996&&552,062&95,669&157,808&&45&&6,822,843\\
Burkina Faso&2006&&770,161&123,364&211,275&&45&12&6,822,843\\
Cameroon&2005&&1,003,327&162,672&295,388&14,650&204&&8,621,406\\
Egypt&1986&&4,261,935&693,275&1,345,068&40,708&173&&43,309,063\\
E

In [11]:
# Write the finished LaTeX table to the output directory. The context manager
# closes the file even if the write raises, and os.path.join matches how the
# other paths in this notebook are built.
with open(os.path.join(outdir, "_nobs_cens_condensed.tex"), "w") as fh:
    fh.write(tab)