# Data and loading

The Nonviolent and Violent Campaigns and Outcomes (NAVCO) data project is the first of its kind to collect systematic data on both violent insurgencies and nonviolent civil resistance campaigns. The coverage is global but is restricted to maximalist campaigns (i.e. those seeking to overthrow an incumbent government, expel a foreign military occupation, or secede). NAVCO now has multiple published versions, as well as several others in progress.
Here I use NAVCO 2.1, the latest version at the moment (May 2022).

The granularity of the dataset is one campaign-year per row.


Dataset: https://dataverse.harvard.edu/dataverse/navco 

The Code Book can be accessed at: https://drive.google.com/file/d/1DVyiptpY4OqvoLpdyyjaIHq0LLvUVHwU/view 




In [None]:
pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import io
from google.colab import files
from statistics import mean, median, mode, stdev
from datetime import datetime, date # manipulating date formats
from dateutil.relativedelta import relativedelta
import numpy as np
import math    
import datetime
import os
import sys #import datetime, date # manipulating date formats
import altair as alt
from shapely.geometry import shape
import geopandas as gpd
# Switch off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Download the NAVCO 2.1 CSV from Google Drive to /NAVCO_2_1.csv.
if sys.version_info[0] < 3:
    # Python 2 exposes urlretrieve directly on urllib.
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

url = 'https://drive.google.com/uc?export=download&id=1c2yqrxbFeG46fB5iZ4NKhBt9NiAjVFd7'
urlretrieve(url, filename='/NAVCO_2_1.csv')

('/NAVCO_2_1.csv', <http.client.HTTPMessage at 0x7fecdc31f110>)

In [None]:
# Load the downloaded NAVCO file into a DataFrame.
df = pd.read_csv('/NAVCO_2_1.csv')

In [None]:
# Download the country outlines used for the maps to world-countries.json.
if sys.version_info[0] < 3:
    # Python 2 exposes urlretrieve directly on urllib.
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

url = 'https://drive.google.com/uc?export=download&id=1zF1PcGWziUtdxiUx_j45-5NOnXLGeyiF'
urlretrieve(url, filename='world-countries.json')

('world-countries.json', <http.client.HTTPMessage at 0x7fecda722150>)

In [None]:
# Read the downloaded country outlines with geopandas.
world = gpd.read_file('world-countries.json')

In [None]:
# The correction list was kindly provided by Natalia Volkova.
# The renamed column is assigned back to the frame: calling
# replace(inplace=True) on a selected column relies on chained assignment,
# which no longer updates the parent frame under pandas copy-on-write.
world["name"] = world["name"].replace({"Republic of Serbia": "Serbia",
                                       "China, Taiwan Province of China": "Taiwan",
                                       "Republic of Korea": "South Korea",
                                       "Republic of the Congo": "Congo",
                                       "The Bahamas": "Bahamas"})

In [None]:
# Download the per-country population table to population.csv.
if sys.version_info[0] < 3:
    # Python 2 exposes urlretrieve directly on urllib.
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

url = 'https://drive.google.com/uc?export=download&id=1DD9O68MlcfgCNv7NvsVdOUxknB7QA4yx'
urlretrieve(url, filename='population.csv')

('population.csv', <http.client.HTTPMessage at 0x7fecdcbe0850>)

In [None]:
# Load the downloaded population table.
population = pd.read_csv('population.csv')

In [None]:
# Lift Altair's row limit so that charts can be built from the full tables.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [None]:
# Download the Gapminder country table to /gapminder_country_data.csv.
if sys.version_info[0] < 3:
    # Python 2 exposes urlretrieve directly on urllib.
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

url = 'https://drive.google.com/uc?export=download&id=1-Y6gdockuvQxThVVO7lHjEM8ADVLpVqh'
urlretrieve(url, filename='/gapminder_country_data.csv')

('/gapminder_country_data.csv', <http.client.HTTPMessage at 0x7fecda54a790>)

In [None]:
# Load the Gapminder country table and preview its first rows.
df_regions = pd.read_csv('/gapminder_country_data.csv')
df_regions.head()

Unnamed: 0,country,time,income_per_person_gdppercapita_ppp_inflation_adjusted,life_expectancy_years,population_total,name,world_4region
0,afg,1800,603.0,28.21,3280000,Afghanistan,Asia
1,afg,1801,603.0,28.2,3280000,Afghanistan,Asia
2,afg,1802,603.0,28.19,3280000,Afghanistan,Asia
3,afg,1803,603.0,28.18,3280000,Afghanistan,Asia
4,afg,1804,603.0,28.17,3280000,Afghanistan,Asia


# Data preparation

## Data on regions

In [None]:
# Keep a single row per country name (the last one in table order).
df_regions = df_regions.drop_duplicates(subset=['name'], keep="last")

In [None]:
# Drop the country code, year and indicator columns; they are not needed for the region lookup.
df_regions.drop(['country', 'time', 'income_per_person_gdppercapita_ppp_inflation_adjusted', 'life_expectancy_years', 'population_total'], axis = 1, inplace=True)

In [None]:
# Use the column names the later merges expect: 'country' and 'region'.
df_regions = df_regions.rename(columns={'name':'country', 'world_4region':'region'})

## Data on population

In [None]:
# Display the raw population table (one column per year).
population

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,Afghanistan,3.28M,3.28M,3.28M,3.28M,3.28M,3.28M,3.28M,3.28M,3.28M,...,76.6M,76.4M,76.3M,76.1M,76M,75.8M,75.6M,75.4M,75.2M,74.9M
1,Angola,1.57M,1.57M,1.57M,1.57M,1.57M,1.57M,1.57M,1.57M,1.57M,...,168M,170M,172M,175M,177M,179M,182M,184M,186M,188M
2,Albania,400k,402k,404k,405k,407k,409k,411k,413k,414k,...,1.33M,1.3M,1.27M,1.25M,1.22M,1.19M,1.17M,1.14M,1.11M,1.09M
3,Andorra,2650,2650,2650,2650,2650,2650,2650,2650,2650,...,63k,62.9k,62.9k,62.8k,62.7k,62.7k,62.6k,62.5k,62.5k,62.4k
4,United Arab Emirates,40.2k,40.2k,40.2k,40.2k,40.2k,40.2k,40.2k,40.2k,40.2k,...,12.3M,12.4M,12.5M,12.5M,12.6M,12.7M,12.7M,12.8M,12.8M,12.9M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,Samoa,47.3k,47.3k,47.3k,47.3k,47.3k,47.3k,47.3k,47.2k,47.2k,...,315k,314k,314k,314k,313k,313k,312k,312k,311k,310k
193,Yemen,2.59M,2.59M,2.59M,2.59M,2.59M,2.59M,2.59M,2.59M,2.59M,...,54.5M,54.4M,54.3M,54.1M,54M,53.8M,53.7M,53.5M,53.4M,53.2M
194,South Africa,1.45M,1.45M,1.46M,1.46M,1.47M,1.47M,1.48M,1.49M,1.49M,...,79.8M,79.8M,79.7M,79.7M,79.6M,79.5M,79.5M,79.4M,79.3M,79.2M
195,Zambia,747k,758k,770k,782k,794k,806k,818k,831k,843k,...,74.4M,75.2M,76M,76.8M,77.6M,78.4M,79.2M,80M,80.8M,81.5M


In [None]:
# Keep the country name and three reference years. Take an explicit copy,
# because columns of this table are overwritten, added and dropped later on.
population_average = population[['country', '1900', '1950', '2020']].copy()

In [None]:
# Populations are written in abbreviated form ('3.28M', '400k', '2650').
# Parse the number and scale it by its suffix: textually swapping the suffix
# for zeros and dropping the decimal point would turn '3.28M' into 328000000.
_POPULATION_SUFFIXES = {'k': 10**3, 'M': 10**6, 'B': 10**9}


def _parse_population(value):
    """Return a count written as e.g. '3.28M', '400k' or '2650' as an int."""
    text = str(value).strip()
    factor = _POPULATION_SUFFIXES.get(text[-1:], 1)
    if factor != 1:
        # Strip the recognised suffix before parsing the numeric part.
        text = text[:-1]
    # round() absorbs float noise such as 3.28 * 10**6 == 3280000.0000000005.
    return int(round(float(text) * factor))


for _year in ('1900', '1950', '2020'):
    population_average[_year] = (population_average[_year]
                                 .map(_parse_population)
                                 .astype('int64'))

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [None]:
# Mean of the three snapshot years 1900, 1950 and 2020, used as the country's reference population.
population_average['20_cent_average'] = (population_average['1900'] + population_average['1950'] + population_average['2020'])/3

In [None]:
# Store the averaged population as a whole number of people.
population_average['20_cent_average'] = population_average['20_cent_average'].round().astype('int64')

## Navco

In [None]:
# NAVCO columns used in this analysis.
features_to_analyze = ['camp_name',
                       'id',
                       'year',
                       'location',
                       'start_year',
                       'end_year',
                       'camp_duration',
                       'prim_meth',
                       'resis_meth',
                       'camp_size',
                       'fatalities_range',
                       'success',
                       'progress']
# Explicit copy: the location values of this table are edited afterwards,
# which should not happen on a slice of the raw table.
df_navco = df[features_to_analyze].copy()

In [None]:
# Campaigns coded under the former Yugoslavia are moved to the present-day
# country they took place in (based on Wikipedia's account of the protests
# and the actual territory).
_yugoslavia_relocations = {
    'Croatian nationalists': 'Croatia',
    'Yugoslavia student protests': 'Serbia',
    'Croats': 'Croatia',
    'Kosovo Albanian': 'Kosovo',
    'Kosovo Albanian nationalist movement': 'Kosovo',
}
_in_yugoslavia = df_navco['location'] == 'Yugoslavia'
for _camp, _new_location in _yugoslavia_relocations.items():
    # A single .loc assignment instead of chained indexing
    # (df['location'][mask] = ...), which writes to a temporary object
    # under pandas copy-on-write and leaves the frame unchanged.
    df_navco.loc[_in_yugoslavia & (df_navco['camp_name'] == _camp), 'location'] = _new_location

In [None]:
# Harmonise NAVCO location names with the names used by the map data.
# The result is assigned back to the column: replace(inplace=True) on a
# selected column is a chained assignment and does not update the frame
# under pandas copy-on-write.
# NOTE(review): "Congo" is mapped to the Democratic Republic of the Congo and
# "Aruba" to the Netherlands - confirm these match the campaigns' territory.
df_navco["location"] = df_navco["location"].replace({
    "UK": "United Kingdom",
    "Guinea-Bissau": "Guinea Bissau",
    "Bosnia-Herzegovina": "Bosnia and Herzegovina",
    "Democratic Republic of Congo": "Democratic Republic of the Congo",
    "Congo": "Democratic Republic of the Congo",
    "Yemen (North Yemen)": "Yemen",
    "Yemen People's Republic": "Yemen",
    "Yemen Arab Republic": "Yemen",
    "Palestinian Territories": "Palestine",
    "South Vietnam": "Vietnam",
    "Myanmar/Burma": "Myanmar",
    "Aruba": "Netherlands",
    "Cambodia (Kampuchea)": "Cambodia",
    "Czechoslovakia": "Czech Republic",
    "USSR": "Ukraine",  # only Ukrainian protests are present
    "Princely State of Travencore": "India",
    "East Germany": "Germany",
})

In [None]:
"""
We know that there might be differences in naming
"""
present_in_navco_only = ((set(df_navco['location'].unique()) - set(world['name'].unique())))
present_in_map_only = ((set(world['name'].unique()) - set(df_navco['location'].unique())))

In [None]:
# NAVCO locations that have no matching name in the map data.
present_in_navco_only

{'Bahrain',
 'Comoros',
 "Cote d'Ivoire",
 'Hyderabad',
 'Maldives',
 'Palestine',
 'Tibet',
 'Tonga'}

In [None]:
# Rename the join key to 'country'.
df_navco = df_navco.rename(columns={'location':'country'})

In [None]:
# Rename the join key of the map data to 'country' as well.
world = world.rename(columns={'name':'country'})

In [None]:
# The three year columns are no longer needed once the average has been computed.
population_average.drop(['1900','1950','2020'], axis = 1, inplace = True)

In [None]:
# Left join: every NAVCO row is kept; rows whose country is missing from the map get NaN map columns.
df_final_navco = pd.merge(df_navco, world, how='left', on = 'country')

In [None]:
# Country names that occur in only one of the merged NAVCO table and the population table.
present_in_final_navco_only = set(df_final_navco['country'].unique()).difference(population_average['country'].unique())
present_in_pop_only = set(population_average['country'].unique()).difference(df_final_navco['country'].unique())

In [None]:
# NAVCO countries that have no matching name in the population table.
present_in_final_navco_only

{'Democratic Republic of the Congo',
 'East Timor',
 'Guinea Bissau',
 'Hyderabad',
 'Ivory Coast',
 'Kosovo',
 'Kyrgyzstan',
 'Laos',
 'Macedonia',
 'Slovakia',
 'Swaziland',
 'Tibet',
 'Western Sahara'}

In [None]:
# Align the population table's country names with the NAVCO spelling.
# The result is assigned back to the column: replace(inplace=True) on a
# selected column is a chained assignment and does not update the frame
# under pandas copy-on-write.
population_average["country"] = population_average["country"].replace({
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Timor-Leste": "East Timor",
    "Guinea-Bissau": "Guinea Bissau",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao": "Laos",
    "North Macedonia": "Macedonia",
    "Slovak Republic": "Slovakia",
    "Eswatini": "Swaziland",
})

In [None]:
# Align the region table's country names with the NAVCO spelling.
# The result is assigned back to the column: replace(inplace=True) on a
# selected column is a chained assignment and does not update the frame
# under pandas copy-on-write.
df_regions["country"] = df_regions["country"].replace({
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Timor-Leste": "East Timor",
    "Guinea-Bissau": "Guinea Bissau",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao": "Laos",
    "North Macedonia": "Macedonia",
    "Slovak Republic": "Slovakia",
    "Eswatini": "Swaziland",
})

In [None]:
# Attach the averaged population to every row; countries without a match get NaN.
df_final_navco = pd.merge(df_final_navco, population_average, how='left', on = 'country')

In [None]:
# Countries for which no population figure was found.
df_final_navco[df_final_navco['20_cent_average'].isna()]['country'].unique()

array(['Kosovo', 'Ivory Coast', 'Tibet', 'Western Sahara', 'Hyderabad'],
      dtype=object)

In [None]:
# Keep only the rows that have a population figure.
df_final_navco = df_final_navco[df_final_navco['20_cent_average'].notna()]


In [None]:
# Discard the suffixed id_y column and restore the name 'id' for id_x.
df_final_navco = (
    df_final_navco
    .drop(columns=['id_y'])
    .rename(columns={"id_x": "id"})
)

In [None]:
# Country names that occur in only one of the NAVCO table and the region table.
present_in_fn_only = set(df_final_navco['country'].unique()).difference(df_regions['country'].unique())
present_in_reg_only = set(df_regions['country'].unique()).difference(df_final_navco['country'].unique())

In [None]:
# NAVCO countries that have no matching name in the region table.
present_in_fn_only

{'Taiwan'}

In [None]:
# Attach the region columns to every row; countries without a match get NaN.
df_final_navco = pd.merge(df_final_navco, df_regions, how='left', on = 'country')

## Setting values from codes

In [None]:
"""
From the codebook:
Indicator of the general size of the campaign.
0=1-999 1=1000-9,999 2=10,000-99,999 3=100,000-499,999 4=500,000-1 million
5>=1 million -99=unknown
"""
casualty_cat = {0:999, 1:9999, 2:99999, 3:499999, 4:1000000}

In [None]:
# Swap the camp_size codes for the participant counts defined in casualty_cat.
df_final_navco['camp_size'] = df_final_navco['camp_size'].replace(to_replace = casualty_cat)

In [None]:
# Campaign size relative to the country's averaged population.
df_final_navco['ratio_of_pop_involved']=df_final_navco['camp_size']/df_final_navco['20_cent_average']

In [None]:
# Inspect which fatality codes occur in the data.
df_final_navco['fatalities_range'].unique()

array([  3.,   1.,   0.,   2.,   5.,   6., -99.,  nan,   4.])

In [None]:
# Treat missing fatality codes like the explicit "unknown" code (-99).
df_final_navco['fatalities_range'] = df_final_navco['fatalities_range'].fillna(-99)

In [None]:
"""
From the codebook:
Range of estimated fatalities among protestors / dissidents / fighters.

0 = no known fatalities
1 = 1-10 fatalities
2 = 11-25 fatalities
3 = 26-100 fatalities
4 = 101-1,000 fatalities
5 = 1,001-10,000 fatalities
6 = 10,001+ fatalities
-99 = unknown
"""

fatality_cat = {0:"no known fatalities", 
                1:"1-10 fatalities", 
                2:"11-25 fatalities", 
                3:"26-100 fatalities", 
                4:"101-1000 fatalities",
                5:"1001-10000 fatalities",
                6:"10001+ fatalities",
                -99: "unknown"}
df_final_navco['fatalities_range'] = df_final_navco['fatalities_range'].replace(to_replace = fatality_cat)

In [None]:
"""
Since the data given is for every year, final success or failure is indicated for the last year only. 
From the codebook:
Campaign’s maximalist goal ultimately achieved as a direct result of the campaign.
0=not successful 1= successful.

I take maximum of values for campain id for success for each year to use as final result.
"""
df_final_navco['final_result'] = df_final_navco.groupby('id')['success'].transform(lambda x: x.max()) 

In [None]:
# Readable labels for the campaign outcome codes.
repl_res = {0: 'Failure', 1: 'Success'}
df_final_navco['final_result'] = df_final_navco['final_result'].replace(repl_res)

In [None]:
"""
Since the data given is for every year, primary method (violent or not) is changing within the campain.
From the codebook:
Denotes the primary type of resistance method used in a campaign year.
0=primarily violent campaign 1=primarily nonviolent campaign.

I take the average for all the years, round it up and use everything above 0.5 as "mostly violent" and below - "mostly non-violent" respectively.
"""
df_final_navco['how_violent'] = df_final_navco.groupby('id')['prim_meth'].transform(lambda x: x.mean()) 

In [None]:
# Round the per-campaign mean to 0 or 1.
df_final_navco['how_violent'] = df_final_navco['how_violent'].round(0)

In [None]:
# The codebook codes prim_meth as 0 = primarily violent and 1 = primarily
# nonviolent, so a rounded per-campaign mean of 1 means that most
# campaign-years were nonviolent.
viol_repl = {1:"mostly non-violent",
             0:"mostly violent"}

In [None]:
# Swap the rounded value for its label from viol_repl.
df_final_navco['how_violent'] = df_final_navco['how_violent'].replace(to_replace=viol_repl)

In [None]:
# Inspect which primary-method codes occur in the data.
df_final_navco['prim_meth'].unique()

array([1, 0])

In [None]:
# The codebook codes prim_meth as 0 = primarily violent campaign-year and
# 1 = primarily nonviolent campaign-year.
prim_meth_repl = {1:"non-violent",
             0:"violent"}

In [None]:
"""
We have 1 for violent (for campaing year) and 0 for non-violent. 
It's a bit worrying that here we don't have -99 as in "unknown", since there's a lot of low confidence data in general.
"""
df_final_navco['prim_meth'] = df_final_navco['prim_meth'].replace(to_replace=prim_meth_repl)

In [None]:
# Fill in the region by hand for campaigns 215 and 216.
df_final_navco.loc[df_final_navco['id'].isin([215, 216]), 'region'] = 'Asia'

In [None]:
# List the columns of df_final_navco that still contain NaNs.

col_list = df_final_navco.columns.tolist()

nan_cols_df_final = [col for col in col_list if df_final_navco[col].isnull().any()]

nan_cols_df_final

['geometry']

In [None]:
# Separate table without the geometry column; the original table is left untouched.

df_final_navco_ = df_final_navco.drop(columns=['geometry'])

In [None]:
# List the columns of the geometry-free table that still contain NaNs.

col_list = df_final_navco_.columns.tolist()

nan_cols_df_final = [col for col in col_list if df_final_navco_[col].isnull().any()]

nan_cols_df_final

[]

# Visualization

In [None]:
# Radio buttons for picking a region; the None option is labelled 'All'.
radio_region = alt.binding_radio(options=[None,'Europe','Asia','Americas', 'Africa'],labels=['All','Europe','Asia','Americas', 'Africa'], name='Select region:')
# Selection on the 'region' field driven by the radio buttons; an empty selection matches all rows.
select_region = alt.selection_single(empty = 'all', bind = radio_region, fields = ['region'])

In [None]:
# Background layer: light-grey bars with the number of rows per year (1900-2019).
# It is not filtered by the region selection.
all_years_map_tlo = alt.Chart(df_final_navco_)\
    .transform_filter(alt.FieldRangePredicate(field='year', range=[1900, 2019]))\
    .mark_bar(fill = 'lightgrey', stroke = 'white')\
    .encode(
        x = alt.X('year:Q'),
        y = alt.Y('country:N', aggregate = 'count'),
        color = alt.Color('how_violent:N'))

# Foreground layer: the same counts coloured by how_violent and restricted to
# the region picked with the radio buttons.
all_years_map = alt.Chart(df_final_navco_)\
    .transform_filter(alt.FieldRangePredicate(field='year', range=[1900, 2019]))\
    .mark_bar()\
    .encode(
        x = alt.X('year:Q',
                  axis=alt.Axis(format="c", tickCount = 10, title = 'Years'),
                  scale = alt.Scale(zero = False, nice=False)),
        y = alt.Y('country:N', 
                  aggregate = 'count',
                  axis = alt.Axis(tickCount = 10, title = 'Number of campaigns')),
        color = alt.Color('how_violent:N', 
                          legend=alt.Legend(title='Type of campaign', labelFontSize = 12, titleFontSize = 12),
                          scale = alt.Scale(scheme = 'set2')))\
    .add_selection(select_region).transform_filter(select_region)
# Stack both layers, then set size, title and axis styling; the layered chart is the cell's output.
alt.layer(all_years_map_tlo, all_years_map)\
    .properties(width = 800, 
                height = 600, 
                background = '#F9F9F9', 
                padding = 25,
                title={
                    "text": ["The Nonviolent and Violent Campaigns Dynamics Worldwide"], 
                    "subtitle": " ",
                    "align": "left",
                    "anchor": "start",
                    "fontSize": 25,
                    "subtitleFontSize": 12
                    }
                )\
            .configure_axis(ticks = False,  
                            domain = False,
                            gridDash = [1.5, 1.5],
                            titleFontSize = 12, 
                            labelPadding = 8,
                            titlePadding = 10)

In [None]:
# Slider covering the first to the last year present in the data, in steps of one year.
input_slider = alt.binding_range(min=df_final_navco_.year.min(), max=df_final_navco_.year.max(), step=1, name='Select year: ')
# Selection on the 'year' field driven by the slider; it starts at 2013.
select_year = alt.selection_single(name="year", fields = ['year'], bind=input_slider, init = {'year': 2013})

In [None]:
# Background layer: every country in light grey, with a tooltip showing the country name.
base_chart = alt.Chart(world)\
    .project(type = 'equalEarth')\
    .mark_geoshape(fill = 'lightgrey', stroke = 'white')\
    .encode(
        tooltip = [alt.Tooltip(title = 'No campaign this year', field = 'country', type = 'nominal')]
        )

# Foreground layer: rows of the year chosen with the slider, coloured by
# primary method; the country shape is looked up in `world` by country name.
map_chart = alt.Chart(df_final_navco_)\
    .mark_geoshape()\
    .encode(
    color = alt.Color('prim_meth:N', 
                      legend=alt.Legend(title = "Primary method per year", labelFontSize = 12, titleFontSize = 12),
                      scale = alt.Scale(scheme = 'set2')),
    tooltip = [
               alt.Tooltip(title = 'country', field = 'country', type = 'nominal'),
               alt.Tooltip(title = 'year', field = 'year', type = 'nominal'),
               alt.Tooltip(title = 'primary method', field = 'prim_meth', type = 'nominal'),
               alt.Tooltip(title = 'campaign name', field = 'camp_name', type = 'nominal')
               ]
            )\
    .add_selection(select_year)\
    .transform_filter(select_year)\
    .transform_lookup(
        lookup='country',
        from_=alt.LookupData(world, 
                             key='country',
                             fields=["id", "country", "geometry", "type"])
        )\
    .project(type = 'equalEarth')


# Draw the coloured countries over the grey world map; the layered chart is the cell's output.
alt.layer(base_chart, map_chart)\
    .properties(width = 800, 
                height = 600, 
                background = '#F9F9F9', 
                padding = 25,
                title={
                    "text": ["The Nonviolent and Violent Campaigns Dynamics Worldwide"], 
                    "subtitle": "*Hovering the cursor over the country's territory on the map will show you more details",
                    "align": "left",
                    "anchor": "start",
                    "fontSize": 25,
                    "subtitleFontSize": 12
                    }
                )


In [None]:
# Bar chart of the number of distinct campaign ids per year, coloured by the
# campaign's final result and restricted to the region picked with the radio
# buttons. (The legend title previously read "campain".)
success_map = alt.Chart(df_final_navco_)\
    .mark_bar()\
    .encode(
        x = alt.X('year:Q', 
                  scale = alt.Scale(zero = False, nice=False, domain=[1945, 2013]),
                  axis=alt.Axis(format="c", title = "Years")),
        y = alt.Y('id:N', 
                  aggregate = 'distinct',
                  axis=alt.Axis(title = "Number of campaigns"),
                  scale = alt.Scale(zero = False, domain=[0, 60])),
        color = alt.Color('final_result:N',
                          legend=alt.Legend(title = "Result of the campaign", labelFontSize = 12, titleFontSize = 12),
                          scale = alt.Scale(scheme = 'set2'))
            )\
    .add_selection(select_region)\
    .transform_filter(select_region)\
    .properties(
        width=500,
        height=500
            )

In [None]:
# Show one panel of success_map per how_violent value, two panels per row,
# then set the title and axis styling; the faceted chart is the cell's output.
success_map.facet(facet = alt.Facet('how_violent:N', title = None), columns = 2)\
    .properties(
        background = '#F9F9F9', 
        padding = 30,
        title={
            "text": ["Primary method dynamics and success, 1945-2013"],
            "subtitle": [' '],
            "align": "left",
            "anchor": "start",
            "fontSize": 25,
            "subtitleFontSize": 12
            })\
    .configure_axis(ticks = False,  
                    domain = False,
                    gridDash = [1.5, 1.5],
                    titleFontSize = 12, 
                    labelPadding = 8,
                    titlePadding = 10)

In [None]:
# One row per campaign: keep the first row found for each campaign id.
df_navco_campaigns = df_final_navco_.drop_duplicates(subset=['id'], keep="first")

In [None]:
# Share of successful campaigns per country. The yearly `success` flag is only
# set in a campaign's last year, whereas this table keeps each campaign's first
# row, so the rate is derived from `final_result`, which carries the
# campaign-level outcome ('Success' / 'Failure') on every row.
df_navco_campaigns['country_success_rate'] = (
    df_navco_campaigns['final_result'].eq('Success')
    .groupby(df_navco_campaigns['country'])
    .transform('mean')
)

In [None]:
# Background layer: every country in light grey, with a tooltip showing the country name.
base_chart = alt.Chart(world)\
    .project(type = 'equalEarth')\
    .mark_geoshape(fill = 'lightgrey', stroke = 'white')\
    .encode(
        tooltip = [alt.Tooltip(title = 'No campaign', field = 'country', type = 'nominal')]
        )

# Foreground layer: countries coloured by country_success_rate (shown as a
# percentage); the country shape is looked up in `world` by country name.
map_chart = alt.Chart(df_navco_campaigns)\
    .mark_geoshape()\
    .encode(
        color = alt.Color('country_success_rate:Q', 
                          legend=alt.Legend(title = "Success rate", format = '.0%', labelFontSize = 12, titleFontSize = 12),
                          scale = alt.Scale(scheme = 'greens')),
        tooltip = [
                   alt.Tooltip(title = 'country', field = 'country', type = 'nominal'),
                   alt.Tooltip(title = 'country_success_rate', field = 'country_success_rate', type = 'quantitative', format = '.0%')
                   ]
            )\
    .transform_lookup(
        lookup='country',
        from_=alt.LookupData(world, 
                             key='country',
                             fields=["id", "country", "geometry", "type"])
        )\
    .project(type = 'equalEarth')


# Draw the coloured countries over the grey world map; the layered chart is the cell's output.
alt.layer(base_chart, map_chart)\
    .properties(width = 800,
                height = 600,
                background = '#F9F9F9',
                padding = 25,
                title={
                    "text": ["Success rate for regime change, antioccupation, or secession 1945-2013 by country"],
                    "subtitle": ["*Hovering the cursor over the country's territory on the map will show you more details", ' '],
                    "align": "left",
                    "anchor": "start",
                    "fontSize": 25,
                    "subtitleFontSize": 12
                    }
                )