In [1]:
import numpy as np
import pandas as pd

In [2]:
# MIDI: one record per incident.
midi = pd.read_csv(filepath_or_buffer="../correlates_data/MIDI 5.0.csv")
# MIDIP: one record per participant per incident.
midip = pd.read_csv(filepath_or_buffer="../correlates_data/MIDIP 5.0.csv")

---
1. MIDI shape: (4483, 16)
2. MIDIP shape: (9619, 19)
---
**Objective:**

The objective of this project is to predict attributes of an incident (its duration or its highest military action).  To do that, the per-participant attributes in MIDIP must be collapsed into the per-incident records of MIDI, producing a single dataframe that records attributes from both sides of each conflict.

1. Unknown values are stored as -9 within the dataset.  Populate these with `np.nan` so they can be handled properly.
2. Declare an empty dataframe containing the features desired for predictions.
3. Populate the empty dataframe using values from MIDI and MIDIP.
4. Save the new dataframe as a csv file for modelling in another notebook.

In [3]:
def populate_nines(df, nans, unknowns=None):
    """Replace the -9 "unknown" sentinel in selected columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  Its columns are reassigned, so the caller's frame is
        modified as well as returned.
    nans : list of str
        Columns whose -9 values become ``np.nan``.
    unknowns : dict of {str: value}, optional
        Columns whose -9 values become the given per-column replacement.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    # NaN substitutions come first, then the column-specific ones, in the order given.
    substitutions = [(column, np.nan) for column in nans]
    if unknowns is not None:
        substitutions.extend(unknowns.items())

    for column, fill_value in substitutions:
        df[column] = df[column].replace(-9, fill_value)

    return df

In [4]:
# Columns whose -9 "unknown" sentinel is replaced with NaN.
# The MIDI list is the MIDIP list plus "duration".
midip_nans = ["stday", "endday", "fatality", "fatalpre"]
midi_nans = midip_nans + ["duration"]

# MIDIP columns whose -9 is replaced with a column-specific value instead of NaN.
midip_unknowns = dict(revtype1=0, revtype2=0)

In [5]:
# Swap the -9 sentinels in both frames for NaN (or the per-column replacement).
midi = populate_nines(midi, nans=midi_nans)
midip = populate_nines(midip, nans=midip_nans, unknowns=midip_unknowns)

---
---

In [6]:
# Features shared by the whole incident.
universal_features = [
    "dispnum",
    "incidnum",
    "y_duration",  # duration target; original note: maxdur+mindur/2
    "year",        # year of initiation (styear)
]

# Per-side features; side A additionally carries a coalition count.
side_a_features = [
    "a_country",        # primary side A nation (if only one, or one engaged prior to allies; otherwise "coalition")
    "a_rev_territory",  # binary: revisionism vis a vis 'territory'
    "a_rev_policy",     # binary: revisionism vis a vis 'policy'
    "a_rev_regime",     # binary: revisionism vis a vis 'regime'
    "a_rev_other",      # binary: revisionism for any other
    "a_fatalities",     # fatalities for side a
    "a_hiact",          # highest level of action by side a
    "a_hostlev",        # highest hostility level by side a
    "a_coalition",      # number of states engaged in conflict past the first (usually 0)
]

side_b_features = [
    "b_country",
    "b_rev_territory",
    "b_rev_policy",
    "b_rev_regime",
    "b_rev_other",
    "b_fatalities",
    "b_hiact",
    "b_hostlev",
]

new_features = universal_features + side_a_features + side_b_features

# Empty frame with the model's columns; it is populated in the cells below.
df = pd.DataFrame(columns=new_features)

In [7]:
#Universal assignments (from MIDI)

# model feature -> MIDI source column
universal_sources = {
    "dispnum": "dispnum",
    "incidnum": "incidnum",
    "y_duration": "duration",
    "year": "styear",
}

for feature, midi_column in universal_sources.items():
    df[feature] = midi[midi_column]

---
---

In [8]:
#general functions for getting slices and values from a dataframe

def slice_midip_by_incidnum(incidnum):
    """Return the rows of the module-level `midip` frame whose "incidnum" equals `incidnum`."""
    matches = midip["incidnum"].eq(incidnum)
    return midip.loc[matches]


def fetch_value(subdf, val, side=True):
    """Return column `val` for the rows of `subdf` on one side of the conflict.

    A truthy `side` selects rows where "sidea" == 1; a falsy `side` selects
    rows where "sidea" == 0.

    NOTE(review): this filters on "sidea", whereas the allies computation later
    in this notebook filters on "insidea" -- confirm which column is intended.
    """
    side_flag = 1 if side else 0
    on_side = subdf["sidea"] == side_flag
    return subdf.loc[on_side, val]

In [9]:
#Functions for specific features

def side_breakdown_num_check(incidnum):
    """Count the participants on each side of an incident.

    Looks the incident up in the module-level `midip` frame and returns a
    tuple ``(side_a_count, side_b_count)``, i.e. the number of its rows with
    "sidea" == 1 and with "sidea" == 0.
    """
    sides = midip.loc[midip["incidnum"] == incidnum, "sidea"]
    side_a_count = int((sides == 1).sum())
    side_b_count = int((sides == 0).sum())
    return side_a_count, side_b_count


def sum_fatality_ordinals(*fatality_levels):
    """Combine the fatality ordinals of one side's participants into one ordinal.

    Each ordinal is translated to a representative fatality count, the counts
    are summed, and the sum is mapped back onto the ordinal scale.

    Parameters
    ----------
    *fatality_levels : number or iterable of numbers
        Fatality ordinals (0-6).  They may be given as separate arguments or as
        one or more lists/arrays/Series, which are flattened.  Previously a
        list argument was compared whole against the ordinal keys, never
        matched, and always produced 0.  Numeric values that are not on the
        0-6 scale contribute nothing to the sum.

    Returns
    -------
    int or float
        The ordinal (0-6) whose range contains the summed count, or ``np.nan``
        when any supplied level is missing (the total is then unknown).

    Raises
    ------
    ValueError, TypeError
        If a supplied level cannot be interpreted as a number.
    """
    ordinal_translation = {0: 0, 1: 10, 2: 50, 3: 150, 4: 350, 5: 750, 6: 1000}
    # (inclusive upper bound of the summed count, ordinal).  Anything above the
    # last bound is ordinal 6, so very large sums no longer fall through to None.
    ordinal_upper_bounds = ((0, 0), (25, 1), (100, 2), (250, 3), (500, 4), (999, 5))

    # Flatten scalars and iterables alike into one list of floats.
    levels = [
        value
        for level in fatality_levels
        for value in np.asarray(level, dtype=float).ravel().tolist()
    ]

    # An unknown level makes the side's total unknown.
    if any(np.isnan(value) for value in levels):
        return np.nan

    sum_fatalities = sum(ordinal_translation.get(value, 0) for value in levels)

    for upper_bound, ordinal in ordinal_upper_bounds:
        if sum_fatalities <= upper_bound:
            return ordinal
    return 6

---
---
Assignment of values to the new model dataframe.
---
---

In [10]:
#This cell assigns the ccode of conflicts.  Where a side has more than one participant, "coalition" is encoded.

for i in range(len(df)):
    # Slice the incident's participants once and reuse the slice for both sides.
    participants = slice_midip_by_incidnum(df.at[i, "incidnum"])

    for side, country_col in ((True, "a_country"), (False, "b_country")):
        ccodes = fetch_value(participants, "ccode", side)

        if len(ccodes) == 1:
            # Pull the scalar out explicitly: int() on a one-element Series is deprecated in pandas.
            df.at[i, country_col] = int(ccodes.iloc[0])
        elif len(ccodes) > 1:
            df.at[i, country_col] = "coalition"
        # A side with no recorded participants keeps its missing value instead of
        # being mislabelled "coalition".

In [11]:
#This cell populates revisionism status.
a_rev_dict = {1:"a_rev_territory", 2:"a_rev_policy", 3:"a_rev_regime", 4:"a_rev_other"}
b_rev_dict = {1:"b_rev_territory", 2:"b_rev_policy", 3:"b_rev_regime", 4:"b_rev_other"}


def _side_revision_codes(participants, side):
    """Return the set of revision-type codes held by any participant on one side.

    Collecting every code (rather than only the maximum of each column) keeps a
    coalition member's revision type from being discarded just because an ally
    has a numerically larger code.  Missing values are skipped.
    """
    codes = set()
    for column in ("revtype1", "revtype2"):
        codes.update(fetch_value(participants, column, side).dropna().astype(int).tolist())
    return codes


for i in range(len(df)):
    participants = slice_midip_by_incidnum(df.at[i, "incidnum"])

    # Same logic for both sides; only the side flag and target columns differ.
    for side, rev_dict in ((True, a_rev_dict), (False, b_rev_dict)):
        codes = _side_revision_codes(participants, side)
        for key, rev_type in rev_dict.items():
            df.at[i, rev_type] = int(key in codes)

In [12]:
#This cell assigns the hiact & hostility levels of conflicts


def _side_max(participants, column, side):
    """Return the highest non-missing value of `column` on one side, or NaN if there is none.

    Replaces the previous bare ``except:`` around ``int(max(list(...)))``, which
    hid every error (including a misspelt column name) and gave order-dependent
    results when a NaN was present in the values.
    """
    highest = fetch_value(participants, column, side).max()  # Series.max skips NaN
    return np.nan if pd.isna(highest) else int(highest)


for i in range(len(df)):
    participants = slice_midip_by_incidnum(df.at[i, "incidnum"])

    for prefix, side in (("a", True), ("b", False)):
        df.at[i, f"{prefix}_hiact"] = _side_max(participants, "action", side)
        df.at[i, f"{prefix}_hostlev"] = _side_max(participants, "hostlev", side)

In [13]:
#This cell assigns the fatality level of conflicts

for i in range(len(df)):
    participants = slice_midip_by_incidnum(df.at[i, "incidnum"])

    for side, fatality_col in ((True, "a_fatalities"), (False, "b_fatalities")):
        levels = fetch_value(participants, "fatality", side)

        # An unknown fatality level makes the side's total unknown: leave the
        # cell as NaN instead of writing a misleading ordinal.
        if levels.isna().any():
            continue

        # Unpack the levels.  Passing the list as one argument made it a single
        # vararg that never matched an ordinal, so every result was 0.
        ordinal = sum_fatality_ordinals(*levels.tolist())
        if ordinal is not None:
            df.at[i, fatality_col] = int(ordinal)

In [14]:
#This cell sets coalition values for side a

# One vectorised assignment replaces the per-row loop: the loop built a full row
# with midi.loc[i] on every iteration, and a mixed-dtype row upcasts the count.
# df shares midi's index (its columns were assigned from midi), so values align.
df["a_coalition"] = midi["numa"] - 1

In [15]:
# Rows missing any of these columns are discarded.
nadrops = [
    "y_duration",
    "a_hiact",
    "a_hostlev",
    "b_hiact",
    "b_hostlev",
]
df = df.dropna(subset=nadrops, axis=0)

In [17]:
# Persist the per-incident model frame (without the index column).
df.to_csv(path_or_buf="../final_data/incidents.csv", index=False)

---
---
### Creation of alternate midip dataframe - for predicting highest action per state per conflict

(rather than predicting highest action per side per conflict)

In [19]:
# Re-read MIDIP from disk: one record per participant per incident.
midip = pd.read_csv(filepath_or_buffer="../correlates_data/MIDIP 5.0.csv")

In [20]:
# Columns removed from the per-participant frame.
ip_drops = [
    "ccode",
    "stday",
    "stmon",
    "endday",
    "endmon",
    "endyear",
    "fatalpre",
    "version",
    "revtype1",
    "revtype2",
]

# Old column name -> new column name.
ip_renames = dict(styear="year", insidea="incid_a", sidea="side_a", stabb="ccode")

# Columns added to the per-participant frame.
ip_column_adds = ["rev_territory", "rev_policy", "rev_regime", "rev_other", "allies"]

In [21]:
# Replace -9 in the frame with np.nan (or the per-column value), which is what it represents.
midip = populate_nines(midip, nans=midip_nans, unknowns=midip_unknowns)

In [22]:
#This cell populates revisionism status in the new dataframe.

midip[ip_column_adds] = np.nan  #Create the new columns; they are filled in below and in the next cell

rev_dict = {1:"rev_territory", 2:"rev_policy", 3:"rev_regime", 4:"rev_other"}

# One vectorised comparison per revision type replaces the row-by-row loop,
# which built a full row with midip.loc[i] twice per iteration.
for key, rev_type in rev_dict.items():
    has_revision = midip["revtype1"].eq(key) | midip["revtype2"].eq(key)
    # Stored as 1.0/0.0, matching the float columns initialised above.
    midip[rev_type] = has_revision.astype(float)

In [23]:
#This cell provides a column on how many allies aside from itself each nation has in each incident.

# Count the rows sharing each (incident, side) pair in one grouped pass instead
# of re-slicing the whole frame for every row, which was quadratic.
# Rows are split into insidea == 1 versus everything else, which matches the
# original if/else when insidea only holds 0 and 1 -- TODO confirm against the data.
on_side_a = midip["insidea"].eq(1).rename("on_side_a")
same_side_count = midip.groupby(["incidnum", on_side_a])["incidnum"].transform("size")

# Subtract the nation itself; kept as float like the NaN-initialised column it fills.
midip["allies"] = (same_side_count - 1).astype(float)

In [24]:
# Remove unused columns, then rename some of the rest to follow my own conventions
# or altered use of the feature.  'styear' becomes "year" because this is supposed
# to predict for ongoing conflict actions.
midip = (
    midip
    .drop(columns=ip_drops)
    .rename(columns=ip_renames)
)

In [25]:
# Persist the per-participant frame (without the index column).
midip.to_csv(path_or_buf='../final_data/incids_by_country.csv', index=False)