This notebook gathers the data collected from our buffers and alternative buffers constructed in the notebooks `Construct-Buffers-v17`, `Construct-Buffers-v16`, `Construct-AltBuffers-v17`, and `Construct-AltBuffers-v16`, and combines it with Dickens's own dataset.

Note that each of the above-mentioned notebooks requires the Ethnologue to run, which is protected by copyright. Therefore, these notebooks can't be run here on Deepnote. However, you can see the outputs of each cell. If you want to understand how the data was constructed, please review those notebooks.

In [None]:
# Numerical / statistical stack
import numpy as np
import pandas as pd
from scipy.stats import zscore
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sys, os, time
# Widen pandas' text display (in characters) so wide frames wrap less often
pd.set_option('display.width', 140)

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size (in inches) and seaborn's "talk" scaling preset for all plots
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")

from IPython.display import display, HTML, Image

# Directory prefix for every .dta file this notebook reads and writes
# (concatenated directly with file names, so the trailing slash matters)
pathdata = '/work/Replication_Dickens_2022/data/'

IPyStata is loaded in batch mode.


## Preparing the Final Dataset

In [None]:
# Read every input in one pass: Dickens's original border dataset, then the
# statistics from our original-style buffers (v16, v17) and from our
# alternative buffers (v16, v17).
dfor, dfor_v16, dfor_v17, dfor_v16_alt, dfor_v17_alt = (
    pd.read_stata(pathdata + file_name)
    for file_name in (
        'EJ_Dickens_Border_100km.dta',
        'Dickens_OrBuf_v16_stats.dta',
        'Dickens_OrBuf_v17_stats.dta',
        'Dickens_AltBuf_v16_absdif.dta',
        'Dickens_AltBuf_v17_absdif.dta',
    )
)

# Buffers defined as in Dickens: build the *_oj variables from the pre-/post-1500
# calorie columns, divided by 1000 to stay close to Dickens's own variables.
for buffers in (dfor_v16, dfor_v17):
    pre_std = buffers['pre1500AverageCaloriesstd']
    post_std = buffers['post1500AverageCaloriesstd']
    pre_mean = buffers['pre1500AverageCaloriesmean']
    post_mean = buffers['post1500AverageCaloriesmean']

    buffers['csi_change_sd_oj'] = (post_std - pre_std) / 1000
    buffers['csi_sd_oj'] = pre_std / 1000
    buffers['csi_change_oj'] = (post_mean - pre_mean) / 1000
    buffers['csi_oj'] = pre_mean / 1000

# Alternative buffers: only the mean-based level and change are constructed.
for buffers in (dfor_v16_alt, dfor_v17_alt):
    pre_mean = buffers['pre1500AverageCaloriesmean']
    post_mean = buffers['post1500AverageCaloriesmean']

    buffers['csi_change_alt'] = (post_mean - pre_mean) / 1000
    buffers['csi_alt'] = pre_mean / 1000

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  dfor = pd.read_stata(pathdata + 'EJ_Dickens_Border_100km.dta')
  df['csi_change_sd_oj'] = (df.post1500AverageCaloriesstd - df.pre1500AverageCaloriesstd)/1000
  df['csi_sd_oj'] = (df.pre1500AverageCaloriesstd)/1000
  df['csi_change_oj'] = (df.post1500AverageCaloriesmean - df.pre1500AverageCaloriesmean)/1000
  df['csi_oj'] = (df.pre1500AverageCaloriesmean)/1000
  df['csi_change_sd_oj'] = (df.post1500AverageCaloriesstd - df.pre1500AverageCaloriesstd)/1000
  df['csi_sd_oj'] = (df.pre1500AverageCaloriesstd)/1000
  df['csi_change_oj'] = (df.post1500AverageCaloriesmean - df.pre1500AverageCaloriesmean)/1000
  df['csi_oj'] = (df.pre1500AverageCaloriesmean)/1000
  df['csi_change_alt'] = (df.post1500AverageCaloriesmean - df.p

In [None]:
# To work with the same buffer zones as Dickens, first keep only the rows of
# his dataset that have a linguistic distance (lingDist) recorded.
dfor = dfor[dfor.lingDist.notna()]

# v16: attach our original-style and alternative buffer statistics to
# Dickens's rows, matching on the buffer identifier (left join keeps all his rows).
dfor16 = (
    dfor
    .merge(dfor_v16, how='left', on='identifier')
    .merge(dfor_v16_alt, how='left', on='identifier')
)

# v17: same two left joins, using the v17 buffer statistics.
dfor17 = (
    dfor
    .merge(dfor_v17, how='left', on='identifier')
    .merge(dfor_v17_alt, how='left', on='identifier')
)

In [None]:
# The merged v17 data contains an error: a buffer identifier is repeated
# (it shows up three times). List every identifier that occurs more than once.
identifier_counts = dfor17.identifier.value_counts()
repeated_identifiers = identifier_counts[identifier_counts > 1].index.tolist()
# A bare expression in the middle of a cell is not displayed, so print explicitly.
print(repeated_identifiers)

# To fix this, drop every row belonging to a repeated buffer.
dfor17 = dfor17[~dfor17.identifier.isin(repeated_identifiers)]

# Further restrict the v17 sample to rows for which csi_alt is available.
dfor17 = dfor17[dfor17.csi_alt.notna()]

# Check the number of usable observations, counting the same variable
# (csi_alt) in both samples so the two figures are comparable.
print(dfor16.csi_alt.notna().sum())
print(dfor17.csi_alt.notna().sum())

8426
7582


In [None]:
# To match the number of observations used in the regressions, identify the
# family1 and family2 values that occur only once among the v16 rows that
# have a linguistic distance, and drop the rows that carry them.
rows_with_lingdist = dfor16.loc[dfor16.lingDist.notna()]

fam1_sizes = rows_with_lingdist.groupby('family1').identifier.count()
singleton_fam1 = fam1_sizes[fam1_sizes == 1].index.tolist()

fam2_sizes = rows_with_lingdist.groupby('family2').identifier.count()
singleton_fam2 = fam2_sizes[fam2_sizes == 1].index.tolist()

in_singleton_16 = dfor16.family1.isin(singleton_fam1) | dfor16.family2.isin(singleton_fam2)
dfor16 = dfor16[~in_singleton_16]
print(dfor16.csi_alt.notna().sum())

# The v17 sample is filtered with the same singleton lists, derived from v16.
in_singleton_17 = dfor17.family1.isin(singleton_fam1) | dfor17.family2.isin(singleton_fam2)
dfor17 = dfor17[~in_singleton_17]
print(dfor17.csi_alt.notna().sum())

8402
7564


In [None]:
# Specification (6) in Table 1 uses only same-country pairs and, within that
# subsample, also drops singletons: values of family1, family2 and ccode1
# that appear exactly once.
dfor_same_country = dfor16[dfor16.samecountry == 1]
print(dfor_same_country.shape)

# All three singleton lists are computed on the subsample before any row is dropped.
singletons = {}
for column in ('family1', 'family2', 'ccode1'):
    group_sizes = dfor_same_country.groupby(column).identifier.count()
    singletons[column] = group_sizes[group_sizes == 1].index.tolist()

in_singleton_group = (
    dfor_same_country.family1.isin(singletons['family1'])
    | dfor_same_country.family2.isin(singletons['family2'])
    | dfor_same_country.ccode1.isin(singletons['ccode1'])
)
dfor_same_country = dfor_same_country[~in_singleton_group]
print(dfor_same_country.csi_alt.notna().sum())

(7312, 409)
7291


In [None]:
# Export the final datasets used for the analysis.

# Column order: Dickens's four csi variables first, then the csi variables we
# constructed, then every remaining column of Dickens's dataset.
dickens_csi = ['csi', 'csi_sd', 'csi_change', 'csi_change_sd']
constructed_csi = ['csi_oj', 'csi_change_oj', 'csi_sd_oj', 'csi_change_sd_oj', 'csi_alt', 'csi_change_alt']

var_to_keep = dfor.columns.tolist()
for column in dickens_csi:
    # list.remove raises ValueError if an expected column is missing
    var_to_keep.remove(column)
var_to_keep = dickens_csi + constructed_csi + var_to_keep

# Write both samples in .dta format (dta version 117).
for sample, file_name in ((dfor16, 'Dickens_rep_v16.dta'), (dfor17, 'Dickens_rep_v17.dta')):
    sample[var_to_keep].to_stata(pathdata + file_name, version=117)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4a945f27-2c4c-4244-8f3b-ab6dff812a2f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>