In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the combined dataset from ../data into the working dataframe.
# NOTE(review): the column listing printed further down already contains 'final',
# 'ignore', 'arrests', etc., and the later lookup of 'stateresponse1' failed with a
# KeyError. This looks like an already-processed file rather than the raw input the
# cleaning cells below expect -- TODO confirm the intended source file.
df = pd.read_csv('../data/global_cia_combined.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,country,year,region,protest,protestnumber,startday,startmonth,startyear,endday,endmonth,...,govt_type,legal_system,gdp_purchasing_power_global_rank,gdp_growth_global_rank,electricity_access_percent,electricity_generating_capacity_global_rank,internet_access_percent,cell_phone_per_100,military_spending_annual_percent_gdp,transnational_disputes
0,Canada,1990,North America,1,1,15.0,1.0,1990.0,15.0,1.0,...,federal parliamentary democracy (Parliament of...,"common law system except in Quebec, where civi...",17.0,112.0,100.0,8.0,91.0,90.0,1.31,managed maritime boundary disputes with the US...
1,Canada,1990,North America,1,2,25.0,6.0,1990.0,25.0,6.0,...,federal parliamentary democracy (Parliament of...,"common law system except in Quebec, where civi...",17.0,112.0,100.0,8.0,91.0,90.0,1.31,managed maritime boundary disputes with the US...
2,Canada,1990,North America,1,3,1.0,7.0,1990.0,1.0,7.0,...,federal parliamentary democracy (Parliament of...,"common law system except in Quebec, where civi...",17.0,112.0,100.0,8.0,91.0,90.0,1.31,managed maritime boundary disputes with the US...
3,Canada,1990,North America,1,4,12.0,7.0,1990.0,6.0,9.0,...,federal parliamentary democracy (Parliament of...,"common law system except in Quebec, where civi...",17.0,112.0,100.0,8.0,91.0,90.0,1.31,managed maritime boundary disputes with the US...
4,Canada,1990,North America,1,5,14.0,8.0,1990.0,15.0,8.0,...,federal parliamentary democracy (Parliament of...,"common law system except in Quebec, where civi...",17.0,112.0,100.0,8.0,91.0,90.0,1.31,managed maritime boundary disputes with the US...


In [5]:
# Show the column labels; the row preview above abbreviates the middle columns with '...'.
df.columns

Index(['country', 'year', 'region', 'protest', 'protestnumber', 'startday',
       'startmonth', 'startyear', 'endday', 'endmonth', 'endyear',
       'protesterviolence', 'location', 'participants', 'protesteridentity',
       'sources', 'notes', 'final', 'ignore', 'crowd dispersal', 'arrests',
       'accomodation', 'shootings', 'beatings', 'killings',
       'political behavior, process', 'labor wage dispute',
       'price increases, tax policy', 'removal of politician',
       'police brutality', 'land farm issue', 'social restrictions', 'climate',
       'natural_resources', 'population_distribution',
       'net_migration_per_1000_population', 'age_0_14_percent',
       'age_15_24_percent', 'age_25_54_percent', 'age_55_64_percent',
       'age_65_over_percent', 'language', 'govt_type', 'legal_system',
       'gdp_purchasing_power_global_rank', 'gdp_growth_global_rank',
       'electricity_access_percent',
       'electricity_generating_capacity_global_rank',
       'internet_acce

In [4]:
# Plan for the next few cells: replace the existing state-response columns with one
# binary column per response type. Before doing that, a column holding whatever the
# final state response was for each protest is added, in case it is useful later or
# more chronological information is needed.
#
# The distinct response labels are collected from the first response column.

responses = df['stateresponse1'].value_counts().index.to_list()
responses

KeyError: 'stateresponse1'

In [None]:
# Store each protest's last recorded state response in a 'final' column.
#
# The response columns are visited from stateresponse1 to stateresponse7 and every
# recognised label overwrites the one found before it, so the latest response wins
# (assumes the numbered columns are ordered first-to-last -- TODO confirm).
# Rows with no recognised label in any column keep the placeholder value 0.
#
# The placeholder series is created once, before the loop: re-creating it on every
# pass would throw away what earlier passes found. Whole-column masking is used
# instead of element-wise df[col][row] = value, which is chained assignment and is
# not guaranteed to write back into df.
response_cols = [f'stateresponse{i}' for i in range(1, 8)]

final_response = pd.Series(0, index=df.index, dtype=object)
for col in response_cols:
    # NaN and any label not present in `responses` are skipped, not copied over.
    is_known = df[col].isin(responses)
    final_response = final_response.mask(is_known, df[col])

df['final'] = final_response

In [None]:
# Build one 0/1 indicator column per response label: 1 when the label appears in any
# of stateresponse1..stateresponse7 for that protest, 0 otherwise.
#
# Each indicator is assigned as a whole column. Writing single cells with
# df[col][row] = 1 is "chained assignment": pandas emits SettingWithCopyWarning
# because the write may land on a temporary copy, and with Copy-on-Write enabled it
# never reaches df at all, which would leave every indicator at 0.
response_cols = [f'stateresponse{i}' for i in range(1, 8)]

for item in responses:
    # eq() is False for NaN, so missing responses never set an indicator.
    df[f'{item}'] = df[response_cols].eq(f'{item}').any(axis=1).astype('int64')

In [None]:
# Re-check the column labels now that the response columns have been added.
df.columns

In [None]:
# From here on the binary response columns stand in for the original stateresponse
# columns. That deals with all the nulls those columns had and makes the responses
# usable as target variables: either a binary regression on a single response
# (e.g. "success"/"no success", "killings"/"no killings") or a multiclass regression
# with all government responses together as the target.

df.drop(columns=[f'stateresponse{n}' for n in range(1, 8)], inplace=True)

In [None]:
# Drop 'ccode': it is an arbitrary numerical tag applied to each country and offers
# no insight of its own. Drop 'id' as well, since it carries essentially the same
# information as the date columns.

df.drop(['ccode', 'id'], axis='columns', inplace=True)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Build one 0/1 indicator column per protester demand, mirroring the state-response
# indicators: 1 when the demand appears in any of protesterdemand1..protesterdemand4
# for that protest, 0 otherwise. These will be interesting to compare against the
# state responses (for example, do protests over the removal of a politician result
# in killings more often than a labor wage dispute?).
#
# Each indicator is assigned as a whole column. Element-wise df[col][row] = 1 is
# chained assignment: it triggers SettingWithCopyWarning and, with Copy-on-Write
# enabled, does not modify df at all.

# The demand labels are taken from the first demand column only.
demands = df['protesterdemand1'].value_counts().index.tolist()
demand_cols = [f'protesterdemand{i}' for i in range(1, 5)]

for item in demands:
    # eq() is False for NaN, so missing demands never set an indicator.
    df[f'{item}'] = df[demand_cols].eq(f'{item}').any(axis=1).astype('int64')

In [None]:
# The raw demand columns are no longer needed once the indicator columns exist.

df.drop(columns=[f'protesterdemand{n}' for n in range(1, 5)], inplace=True)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Remove the rows whose "notes" value is null and then see whether that clears most
# of the nulls in the dataframe. From a look at the data, the nulls appear to sit in
# basically the same rows.

df = df.dropna(subset=['notes'], axis='index')

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# As expected, the vast majority of nulls came from the same place. Getting rid of rows
# that had nulls in the "notes" column took care of most of the nulls in the dataset.

In [None]:
# After looking at the manual, I'm thinking it will make sense to remove the 
# 'participants_category' column. This is basically a column that puts the "participants"
# into buckets of various sizes. We will likely be more interested in the actual number
# of participants as opposed to the range, and I am confused at the huge number of nulls
# in the data for this categorical variable they created. If we decide to analyze the size
# of a protest with buckets, we can easily make them ourselves without including 5000+
# nulls.

df.drop(columns = 'participants_category', inplace = True)

In [None]:
# Frequency of each protester-identity value. The small number of nulls in this
# column is intentionally left in place.

df['protesteridentity'].value_counts()

In [None]:
# Frequency of each 'participants' value. This is an odd column that carries some
# really important information. It will either be used as categorical data or be
# converted into something more numerical later; it is left unchanged for now.

df['participants'].value_counts()

In [None]:
# Spot-check the 'accomodation' column (spelled as it appears in the column listing);
# presumably one of the state-response indicator columns -- verify.
df['accomodation']

At this point, I have created the necessary target variable columns, cleaned up the majority of the nulls, and have turned some categorical data into useful binary columns. I'm going to save this dataframe into the data folder, then I will save a dataframe for each region. This should be a good place to start.

In [None]:
# Save the cleaned dataframe for all regions; index=False keeps the row index out of the file.
df.to_csv('../data/global.csv', index = False)

In [None]:
# Distinct region labels, used to write one file per region.
regions = df['region'].value_counts().index.to_list()
regions

In [None]:
# Write one CSV per region, named after the region in lower snake_case.
for region_name in regions:
    file_stem = region_name.lower().replace(' ', '_')
    region_rows = df.loc[df['region'] == region_name]
    region_rows.to_csv(f'../data/{file_stem}.csv', index=False)