# Importing and preprocessing ACLED data

In [2]:
import pandas as pd
import urllib.parse
import urllib.error
import pickle
import time
import sys
import glob
from IPython.display import clear_output
import geopandas as gpd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import plotly.express as px
import pycountry
import numpy as np
import kaleido
import plotly.graph_objects as go

# set relative paths
countries_codes_path = '../../data/auxilary_data/countries_iso_fips_capitals.csv'
acled_data_path = '../../data/ACLED/'

### Reading in the dataframes from ACLED

In [3]:
# read in xlsx into pandas
middle_east_df = pd.read_csv(acled_data_path + 'MiddleEast.csv')
print('Middle East data loaded' + "\r")
asia_pacific_df = pd.read_csv(acled_data_path + 'AsiaPacific.csv')
print('Asia Pacific data loaded' + "\r")
africa_df = pd.read_csv(acled_data_path + 'Africa.csv')
print('Africa data loaded' + "\r")
europe_central_asia_df = pd.read_csv(acled_data_path + 'EuropeCentralAsia.csv')
print('Europe Central Asia data loaded' + "\r")
latin_america_df = pd.read_csv(acled_data_path + 'LatinAmerica.csv')
print('Latin America data loaded' + "\r")
usa_canada_df = pd.read_csv(acled_data_path + 'USACanada.csv')
print('USA Canada data loaded' + "\r")

Middle East data loaded
Asia Pacific data loaded
Africa data loaded
Europe Central Asia data loaded
Latin America data loaded
USA Canada data loaded


  usa_canada_df = pd.read_csv(acled_data_path + 'USACanada.csv')


In [4]:
all_country_df = pd.concat([middle_east_df, asia_pacific_df, africa_df, europe_central_asia_df, latin_america_df, usa_canada_df])

In [14]:
all_country_df["event_date"] = pd.to_datetime(all_country_df["event_date"])

In [5]:
def plot_fatilities_country(df_raw, country, smooth):
    df = df_raw[df_raw["country"] == country]
    if df.empty:
        print(f"No data for {country}")
        return
    # group by date and count the number of fatalities
    df_sum = df.groupby("event_date")["fatalities"].sum().reset_index()

    if smooth:
        # smooth the fatalities over time
        df_sum["fatalities"] = df_sum["fatalities"].rolling(window=smooth).mean()

    # plot fatalities over time
    fig = px.line(df_sum, x="event_date", y="fatalities", title=f"Fatalities in {country} over time")
    fig.show()

In [16]:
plot_fatilities_country(all_country_df, "Afghanistan", 1)

## Calculating the number of "Hard cases" in the ACLED data

TODO, to check:

[X] Download data for all of the world

[X] Check fatalities per "event_type"

| EVENT_TYPE | Fatalities |
|------------|------------|
| Battles    | 1009381    |
| Explosions/Remote violence | 361225 |
| Violence against civilians | 312965 |
| Riots | 30459 |
| Protests | 5517 |
| Strategic developments | 4430 |

- [ ] Make rolling window based solely on date, not on the index
- [ ] Calculate "Number of fatalities in last month"
- [ ] Classify fatality occurences after month of 0 fatalities as "hard cases"
- [ ] Calculate "hard cases" per country

A hard case is defined by Mueller & Rauh (2022) as a month of 0 fatalities followed by a month with at least one fatality. Furthermore, the prediction challenge of VIEWS uses a monthly prediction framework, so we can smooth our data on the monthly level. From this we can then calculate the total number of fatalities on the country-month level, and classify the hard cases in the dataset. 

In [21]:
# group by by month and country, and calculate number of fatalities
df_monthly = all_country_df.groupby([pd.Grouper(key='country'), pd.Grouper(key='event_date', freq='M')])['fatalities'].sum().reset_index()

And then calculate the hard cases from this df:

In [38]:
# set df_montly["hard cases"] to true if fatalities of current month is > 0 and fatalities of previous month was 0
df_monthly["hard_cases"] = (df_monthly["fatalities"] > 0) & (df_monthly["fatalities"].shift(1) == 0)

Then we also calculate the log change in fatalities, as given by Hegre e.a. (2022):

$$
O_{s,i,t} = \Delta_s\ln(Y_{i, t}+1) = \ln(Y_{i, t}+1) - \ln(Y_{i, t-s}+1)
$$

In [76]:
df_monthly["log change"] = np.log(df_monthly["fatalities"] + 1) - np.log(df_monthly["fatalities"].shift(1) + 1)

Finally, since the ACLED data contains significantly more entries of violence, we want to changee our criteria for what is classified as a hard case. We will therefore classify "significant change" as a month that has an absolute log change of more than 3, in fatalities. 
So: 
$$
S_{s, i, t}=
    \begin{cases}
      True, & \text{if}\  |(O_{s, i, t})|  > 3\\
      False, & \text{otherwise}
    \end{cases}
$$

This makes our hard case classification significantly more nuanced.

In [135]:
df_monthly["significant_change"] = (abs(df_monthly["log change"]) > 3)
df_monthly["sudden_escalation"] = (df_monthly["log change"] > 3)
df_monthly["sudden_deescalation"] = (df_monthly["log change"] < -3)

df_monthly.head()

Unnamed: 0,country,event_date,fatalities,hard_cases,log change,significant_change,sudden_escalation,sudden_deescalation
0,Afghanistan,2017-01-31,2415,False,,False,False,False
1,Afghanistan,2017-02-28,2213,False,-0.087312,False,False,False
2,Afghanistan,2017-03-31,3217,False,0.373959,False,False,False
3,Afghanistan,2017-04-30,3681,False,0.134696,False,False,False
4,Afghanistan,2017-05-31,4351,False,0.167179,False,False,False


We then save this df to a csv file.

In [18]:
df_monthly.to_csv(acled_data_path + 'intensity_change_monthly_country.csv', index=False)

## Exploring amount of hard cases

In [17]:
df_monthly = pd.read_csv(acled_data_path + 'intensity_change_monthly_country.csv')

In [19]:
df_hard = df_monthly.groupby("country")["hard_cases"].sum().sort_values(ascending=False).reset_index()
df_sudden = df_monthly.groupby("country")["significant_change"].sum().sort_values(ascending=False).reset_index()
# merge df
df_hard.merge(df_sudden, on="country", suffixes=("_hard", "_sudden")).head(20).sort_values("significant_change", ascending=False)

Unnamed: 0,country,hard_cases,significant_change
11,Chad,34,38
16,Central African Republic,31,29
10,Angola,34,18
2,Liberia,50,14
7,Guinea,42,13
14,Republic of Congo,32,11
0,Senegal,57,11
6,Ivory Coast,44,10
3,Tanzania,49,9
19,Rwanda,29,6


#### Make function for plotting hard cases

We can then plot the fatalities per country and label the hard cases, and cases where significant change occurs:

In [None]:
def plot_fatilities_monthly(df, country, s_change = 3):
    df_country = df[df["country"] == country]
    if df_country.empty:
        print(f"No data for {country}")
        return

    fig = go.Figure()
    
    # Add line plot for fatalities
    fig.add_trace(go.Scatter(x=df_country["event_date"], y=df_country["fatalities"], mode='lines', name="Fatalities"))
    
    #Add line plot for log change, on different y-axis
    fig.add_trace(go.Scatter(x=df_country["event_date"], 
                             y=df_country["log change"], 
                             mode='lines', 
                             name="Log change", 
                             yaxis="y2", 
                             line=dict(dash='dash', color="grey")))

    fig.update_layout(
        title=f"Fatalities and Log Change in {country} per month",
        yaxis=dict(title="Fatalities"),
        yaxis2=dict(title="Log Change", overlaying="y", side="right")
    )

     # Add line for scatter markers for hard cases
    fig.add_trace(go.Scatter(x=df_country[df_country["hard_cases"]]["event_date"], 
                             y=df_country[df_country["hard_cases"]]["fatalities"], 
                             mode='markers', 
                             marker=dict(color='red'), 
                             name="Standard hard case"))
    
    # Add lines for scatter markers for significant changes
    fig.add_trace(go.Scatter(x=df_country[df_country["sudden_escalation"]]["event_date"], 
                             y=abs(df_country[df_country["sudden_escalation"]]["fatalities"]), 
                             mode='markers', 
                             marker=dict(color='orange'), 
                             name="Sudden escalation"))
    
    fig.add_trace(go.Scatter(x=df_country[df_country["sudden_deescalation"]]["event_date"], 
                             y=abs(df_country[df_country["sudden_deescalation"]]["fatalities"]), 
                             mode='markers', 
                             marker=dict(color='green'), 
                             name="Sudden de-escalation"))

    fig.show()

#### Exploring the hard cases

In [27]:
plot_fatilities_monthly(df_monthly, "Yemen")

## Compare all countries by saving all plots:

In [16]:
country_list = df_monthly["country"].unique()

235