In [1]:
import numpy as np
import pandas as pd

In [2]:
mida = pd.read_csv("../correlates_data/MIDA 5.0.csv") #One record per dispute
midb = pd.read_csv("../correlates_data/MIDB 5.0.csv") #One record per participant per dispute

In [3]:
#Curtails data to only MIDs that occur on or after 1993.  Incidents were only recorded post-1993.
mida = mida[mida["dispnum"]>3550]
midb = midb[midb["dispnum"]>3550]

#reset index for dataframe folding
mida = mida.reset_index(drop=True)
midb = midb.reset_index(drop=True)

---
POST-1993 DATASETS
1. MIDA shape: (847, 20)
2. MIDB shape: (2074, 19)
---
**Objective:**

The object of this project is to predict attributes (duration or highest military action) of a dispute.  Therefore, the attributes of MIDB must be collapsed into MIDA so that a model can be created.  Create a dataframe that records attributes from both sides of each conflict.

1. Unknown values are stored as -9 within the dataset.  Populate these with `np.nan` so they can be handled properly.
2. Declare an empty dataframe containing the features desired for predictions.
3. Populate the empty dataframe using values from MIDA and MIDB.
4. Save the new dataframe as a csv file for modelling in another notebook.

In [4]:
def populate_nines(df, nans, unknowns):
    #expects nans to be a list of columns whose unknown values are going to be replaced with np.nan
    #expects unknowns to be a dictionary of column name: column replacement value for -9 (can vary by column)
    for col in nans:
        df[col] = df[col].replace(-9, np.nan)
    for col, value in unknowns.items():
        df[col] = df[col].replace(-9, value)
    
    return df

In [5]:
mida_nans = ["stday", "endday", "fatality", "fatalpre"]
mida_unknowns = {"outcome":8, "settle":4, }

midb_nans = ["stday", "endday", "fatality", "fatalpre"]
midb_unknowns = {"revtype1":0, "revtype2":0}

In [6]:
mida=populate_nines(mida, mida_nans, mida_unknowns)
midb=populate_nines(midb, midb_nans, midb_unknowns)

---
---

In [7]:
new_features = [
    #Universal features
    "dispnum",
    "y_avgdur",          #maxdur+mindur/2
    "year",              #year of initiation (styear)
    "outcome",           #Object value regarding the outcome
    "settle",            #What type of settlement it was
    "recip",             #Whether the dispute was reciprocated
    #Side A features
    "a_country",         #Primary side A nation (if only one, or one engaged prior to allies; otherwise "coalition")
    "a_rev_territory",   #bin value: revisionism vis a vis 'territory'
    "a_rev_policy",      #bin value: revisionism vis a vis 'policy'
    "a_rev_regime",      #bin value: revisionism vis a vis 'regime'
    "a_rev_other",       #bin value: revisionism for any other
    "a_fatalities",      #fatalities for side a
    "a_hiact",           #highest level of action by side a
    "a_hostlev",         #highest hostility level by side a
    "a_orig",            #whether side a was the originator of the conflict
    "a_coalition",       #number of states engaged in conflict past the first (usually 0)
    #Side B features
    "b_country",
    "b_rev_territory",
    "b_rev_policy",
    "b_rev_regime",
    "b_rev_other",
    "b_fatalities",
    "b_hiact",
    "b_hostlev",
    "b_orig",
    "b_coalition"
]

df = pd.DataFrame(columns=new_features)

In [8]:
#Universal assignments (from MIDA)

df["dispnum"] = mida["dispnum"]
df["y_avgdur"] = (mida["maxdur"] + mida["mindur"]) / 2
df["year"] = mida["styear"]
df["outcome"] = mida["outcome"]
df["settle"] = mida["settle"]
df["recip"] = mida["recip"]

In [9]:
df = df.reset_index(drop=True)

---
### Basic functions for handling folding MIDB data into the new dataframe

In [10]:
#general functions for getting slices and values from a dataframe

def slice_midb_by_dispnum(dispnum):
    return midb[midb["dispnum"]==dispnum]


def fetch_value(subdf, val, side=True):
    if side:
        a = subdf[subdf["sidea"]==1]
        return a[val]
    else:
        b = subdf[subdf["sidea"]==0]
        return b[val]

In [11]:
#Functions for specific features

def side_breakdown_num_check(dispnum):
    #gets the number of participants in side A or side B, returned as a tuple
    temp_df = midb[midb["dispnum"]==dispnum]
    return (
        #Returns a tuple of side a and side b
        len(temp_df[temp_df["sidea"]==1]),
        len(temp_df[temp_df["sidea"]==0])
    )


def sum_fatality_ordinals(*fatality_levels):
    #takes an arbitrary number of fatality ordinals for a given "side" of the conflict (a or b) and returns the ordinal that corresponds to the added value
    ordinal_translation = {0:0, 1:10, 2:50, 3:150, 4:350, 5:750, 6:1000}
    ordinal_range_array = {0:range(0, 1), 1:range(1, 26), 2:range(26, 101), 3:range(101, 251), 4:range(251, 501), 5:range(501, 1000), 6:range(1000, 100_000)}
    fatalities_array = []

    for i in fatality_levels:
        for key, value in ordinal_translation.items():
            if i==key:
                fatalities_array.append(value)
    
    sum_fatalities = sum(fatalities_array)
    
    for key, value in ordinal_range_array.items():
        if sum_fatalities in value:
            return key

---
---
Assignations of values to the new model dataframe.
---
---

In [12]:
#This cell assigns the ccode of conflicts.  Where there were more than one partipant, "coalition" is encoded.

for i in range(len(df)):
    dispnum = df.loc[i]["dispnum"]
    
    if side_breakdown_num_check(dispnum)[0] == 1:
        df.at[i, "a_country"] = int(fetch_value(slice_midb_by_dispnum(dispnum), "ccode"))
    else:  df.at[i, "a_country"] = "coalition"
        
    if side_breakdown_num_check(dispnum)[1] == 1:
        df.at[i, "b_country"] = int(fetch_value(slice_midb_by_dispnum(dispnum), "ccode", False))
    else:  df.at[i, "b_country"] = "coalition"

---
---

In [13]:
#This cell populates revisionism status. 
#Notes on methodology: a quick review of the lists of all revtypes grouped by side, by dispute number showed that there were very few "mixed cases"; e.i. 
#if multiple states were on the same side of the conflict they were overwhelmingly likely to have the same revisionist goals.  In addition, amongst the mixed cases
#(of which there were less than half a dozen across the whole dataset), the only occurences were NUM + 0: e.i., there were allies with revisionist goals and allies
#with no revisionist goals.  This means that there was never a dispute where the revisionist goals were separate from each other between allies.
#Therefore, I considered it acceptable to get the "max" of a side when retrieiving a list of revisionist goals, because that was the only incidence of revisionism
#present on that side, and the side is then accurately represented in the dataframe with how revisionism is encoded.


a_rev_dict = {1:"a_rev_territory", 2:"a_rev_policy", 3:"a_rev_regime", 4:"a_rev_other"}
b_rev_dict = {1:"b_rev_territory", 2:"b_rev_policy", 3:"b_rev_regime", 4:"b_rev_other"}


for i in range(len(df)):
    dispnum = df.loc[i]["dispnum"]
    
    #population of a_country revtypes
    revision1 = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "revtype1"))))
    revision2 = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "revtype2"))))

    for key, rev_type in a_rev_dict.items():
        if (key==revision1) or (key==revision2):
            df.at[i, rev_type] = 1
        else:
            df.at[i, rev_type] = 0


    #population of b_country revtypes        
    revision1 = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "revtype1", False))))
    revision2 = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "revtype2", False))))

    for key, rev_type in b_rev_dict.items():
        if (key==revision1) or (key==revision2):
            df.at[i, rev_type] = 1
        else:
            df.at[i, rev_type] = 0

---
---

In [14]:
#This cell assigns the hiact, hostility levels, and origination of conflicts

for i in range(len(df)):
    dispnum = df.loc[i]["dispnum"]
    
    df.at[i, "a_hiact"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "hiact"))))
    df.at[i, "b_hiact"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "hiact", False))))

    df.at[i, "a_hostlev"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "hostlev"))))
    df.at[i, "b_hostlev"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "hostlev", False))))

    df.at[i, "a_orig"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "orig"))))
    df.at[i, "b_orig"] = int(max(list(fetch_value(slice_midb_by_dispnum(dispnum), "orig", False))))        

---
---

In [15]:
#This cell assigns the fatality level of conflicts

for i in range(len(df)):
    dispnum = df.loc[i]["dispnum"]
    
    try:
        df.at[i, "a_fatalities"] = int(sum_fatality_ordinals(list(fetch_value(slice_midb_by_dispnum(dispnum), "fatality"))))
    except ValueError:
        pass #pass over assignation of NaN values so that the NaNs are retained without tripping the program

    try:
        df.at[i, "b_fatalities"] = int(sum_fatality_ordinals(list(fetch_value(slice_midb_by_dispnum(dispnum), "fatality", False))))
    except ValueError:
        pass #pass over assignation of NaN values so that the NaNs are retained without tripping the program

---
---

In [16]:
#This cell sets coalition values for sides a & b

for i in range(len(df)):    
    df.at[i, "a_coalition"] = (mida.loc[i]["numa"]) -1
    df.at[i, "b_coalition"] = (mida.loc[i]["numb"]) -1

---
---

In [17]:
#Leftover issue with a number of numeric columns being interpreted as object
tagged_for_numeric = ['a_rev_territory', 'a_rev_policy', 'a_rev_regime', 'a_rev_other', 'a_fatalities', 'a_hiact', 'a_hostlev', 'a_orig', 'a_coalition',
                      'b_rev_territory', 'b_rev_policy', 'b_rev_regime', 'b_rev_other', 'b_fatalities', 'b_hiact', 'b_hostlev', 'b_orig', 'b_coalition',
                     ]

for feature in tagged_for_numeric:
    df[feature] = pd.to_numeric(df[feature])

In [18]:
df.info() #double check

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dispnum          847 non-null    int64  
 1   y_avgdur         847 non-null    float64
 2   year             847 non-null    int64  
 3   outcome          847 non-null    int64  
 4   settle           847 non-null    int64  
 5   recip            847 non-null    int64  
 6   a_country        847 non-null    object 
 7   a_rev_territory  847 non-null    int64  
 8   a_rev_policy     847 non-null    int64  
 9   a_rev_regime     847 non-null    int64  
 10  a_rev_other      847 non-null    int64  
 11  a_fatalities     847 non-null    int64  
 12  a_hiact          847 non-null    int64  
 13  a_hostlev        847 non-null    int64  
 14  a_orig           847 non-null    int64  
 15  a_coalition      847 non-null    float64
 16  b_country        847 non-null    object 
 17  b_rev_territory 

In [20]:
df.to_csv("../final_data/disputes.csv", index=False)