In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process



In [2]:
brd_df = pd.read_csv("battle_deaths.csv")
pop_df = pd.read_csv("total_pop.csv")

In [3]:
# filter to intrastate deaths
print(set(brd_df["type_of_conflict"].tolist()))
brd_df = brd_df[brd_df["type_of_conflict"] != '2']
print(set(brd_df["type_of_conflict"].tolist()))

{'3', '4', '2', '3, 4'}
{'3', '4', '3, 4'}


In [4]:
brd_df.columns

Index(['conflict_id', 'dyad_id', 'location_inc', 'side_a', 'side_a_id',
       'side_a_2nd', 'side_b', 'side_b_id', 'side_b_2nd', 'incompatibility',
       'territory_name', 'year', 'bd_best', 'bd_low', 'bd_high',
       'type_of_conflict', 'battle_location', 'gwno_a', 'gwno_a_2nd', 'gwno_b',
       'gwno_b_2nd', 'gwno_loc', 'gwno_battle', 'region', 'version'],
      dtype='object')

In [5]:
brd_locations = sorted(list(set(brd_df["location_inc"].tolist())))
print(brd_locations)

['Afghanistan', 'Algeria', 'Angola', 'Azerbaijan', 'Bangladesh', 'Bosnia-Herzegovina', 'Burkina Faso', 'Burundi', 'Cambodia (Kampuchea)', 'Cameroon', 'Central African Republic', 'Chad', 'China', 'Colombia', 'Comoros', 'Congo', 'Croatia', 'DR Congo (Zaire)', 'Djibouti', 'Egypt', 'El Salvador', 'Eritrea', 'Ethiopia', 'Georgia', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'India', 'Indonesia', 'Iran', 'Iraq', 'Israel', 'Ivory Coast', 'Jordan', 'Kenya', 'Laos', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Malaysia', 'Mali', 'Mauritania', 'Mexico', 'Moldova', 'Morocco', 'Mozambique', 'Myanmar (Burma)', 'Nepal', 'Nicaragua', 'Niger', 'Nigeria', 'North Macedonia', 'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Romania', 'Russia (Soviet Union)', 'Rwanda', 'Senegal', 'Serbia (Yugoslavia)', 'Sierra Leone', 'Somalia', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Syria', 'Tajikistan', 'Thailand', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'Ukraine', '

In [6]:
pop_df.columns

Index(['LocID', 'Location', 'VarID', 'Variant', 'Time', 'MidPeriod', 'PopMale',
       'PopFemale', 'PopTotal', 'PopDensity'],
      dtype='object')

In [7]:
pop_locations = sorted(list(set(pop_df["Location"].tolist())))
print(pop_locations)

['Afghanistan', 'Africa', 'African Group', 'African Union', 'African Union: Central Africa', 'African Union: Eastern Africa', 'African Union: Northern Africa', 'African Union: Southern Africa', 'African Union: Western Africa', 'African, Caribbean and Pacific (ACP) Group of States', 'Albania', 'Algeria', 'American Samoa', 'Andean Community', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia', 'Asia-Pacific Economic Cooperation (APEC)', 'Asia-Pacific Group', 'Association of Southeast Asian Nations (ASEAN)', 'Australia', 'Australia/New Zealand', 'Austria', 'Azerbaijan', 'BRIC', 'BRICS', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Belt-Road Initiative (BRI)', 'Belt-Road Initiative: Africa', 'Belt-Road Initiative: Asia', 'Belt-Road Initiative: Europe', 'Belt-Road Initiative: Latin America and the Caribbean', 'Belt-Road Initiative: Pacific', 'Benin', 'Bermuda', 'Bhutan', 'Black Sea Economic Cooperation (BSEC)',

In [8]:
# are all locations in BRD locatable in the pop dataset?
for loc in brd_locations:
    if loc not in pop_locations:
        print(loc)
        print(process.extract(loc,pop_locations, limit=2))

Bosnia-Herzegovina
[('Bosnia and Herzegovina', 95), ('Asia', 68)]
Cambodia (Kampuchea)
[('Cambodia', 90), ('Chad', 68)]
DR Congo (Zaire)
[('Congo', 90), ('Democratic Republic of the Congo', 86)]
Iran
[('Iran (Islamic Republic of)', 90), ('Iraq', 75)]
Ivory Coast
[('Croatia', 61), ("Côte d'Ivoire", 58)]
Laos
[('Barbados', 68), ('Central Asia', 68)]
Moldova
[('Republic of Moldova', 90), ('American Samoa', 54)]
Myanmar (Burma)
[('Myanmar', 90), ('Oman', 64)]
Russia (Soviet Union)
[('African Union', 86), ('African Union: Northern Africa', 86)]
Serbia (Yugoslavia)
[('Serbia', 90), ('Latvia', 82)]
Syria
[('Syrian Arab Republic', 90), ('Serbia', 73)]
Venezuela
[('Venezuela (Bolivarian Republic of)', 90), ('Australia/New Zealand', 60)]
Yemen (North Yemen)
[('Yemen', 90), ('ESCAP region: East and North-East Asia', 86)]


In [11]:
# replace some country names in the UN population dataset
pop_df = pop_df.replace("Lao People's Democratic Republic", "Laos")
pop_df = pop_df.replace("Russian Federation", "Russia (Soviet Union)")
pop_df = pop_df.replace("Côte d'Ivoire", "Ivory Coast")
pop_df = pop_df.replace("Democratic Republic of the Congo", "Congo")

In [12]:
# get locations again and look at matches
pop_locations = sorted(list(set(pop_df["Location"].tolist())))

for loc in brd_locations:
    if loc not in pop_locations:
        pop_loc_string = process.extractOne(loc,pop_locations)[0]
        print(loc, pop_loc_string)
        pop_df = pop_df.replace(pop_loc_string, loc)

In [15]:
pop_df = pop_df.set_index(['Location', 'Time'])

In [28]:
pop_df.loc['Afghanistan', 1951]['PopTotal'].values[0]

  """Entry point for launching an IPython kernel.


7840.151

In [51]:
# now we create a new column of per capita deaths
brd_df["per_capita_deaths"] = brd_df.apply(lambda r: r['bd_best'] / pop_df.loc[r['location_inc'], r['year']]['PopTotal'].values[0], axis=1)

  


In [52]:
# check range of per capita deaths
brd_df[brd_df["per_capita_deaths"] > 1][["bd_best", "location_inc", "per_capita_deaths", "year"]]

Unnamed: 0,bd_best,location_inc,per_capita_deaths,year
36,27347,Syria,1.565767,2016
40,35349,Syria,1.964116,2015
44,3268,DR Congo (Zaire),1.107177,1998
543,4457,DR Congo (Zaire),1.554433,1997
597,44616,Syria,2.182901,2012
598,61728,Syria,3.152852,2013
599,56994,Syria,3.046063,2014
735,8005,Somalia,1.100491,1991


In [54]:
# also get ISO 3166-1 code
brd_df["loc_id"] = brd_df.apply(lambda r: pop_df.loc[r['location_inc']]['LocID'].values[0], axis=1)

In [56]:
clean_per_cap_deaths = brd_df[["loc_id", "location_inc", "year", "per_capita_deaths", "bd_best"]]

In [58]:
clean_per_cap_deaths.to_csv("conflict_deaths_cleaned.csv")