# Merge CoW and prep for time series integration

In [1]:
import pandas as pd
import numpy as np

read in the relevant tables. don't need the war transitions table

In [2]:
cow_par = pd.read_csv("../Data/CoW/Wrangled/war_participants.csv")
cow_pol = pd.read_csv("../Data/CoW/Wrangled/polities.csv")
cow_war = pd.read_csv("../Data/CoW/Wrangled/wars.csv", usecols=['WarID', 'WarTypeName', 'IsIntervention', 'IsInternational'])
cow_loc = pd.read_csv("../Data/CoW/Wrangled/war_locations.csv")

limit the cow_par table to only those rows where the participant is a state

In [3]:
cow_pol_states = cow_pol[cow_pol['PolityType']=='State']
cow_pol_states_list = list(cow_pol_states['PolityID'].unique())

cow_par = cow_par[cow_par['PolityID'].isin(cow_pol_states_list)]

expand the start and end dates so there is one row per year for each war-participant observation, then reduce to only rows with year of 1946 or greater

In [4]:
cow_par['EndDate'] = cow_par['EndDate'].fillna('2008-12-31')
cow_par['StartDate'] = pd.to_datetime(cow_par['StartDate'])
cow_par['EndDate'] = pd.to_datetime(cow_par['EndDate'])

cow_par['StartDateClean'] = cow_par['StartDate'].apply(lambda dt: dt.replace(day=1, month=1))
cow_par['EndDateClean'] = cow_par['EndDate'].apply(lambda dt: dt.replace(day=1, month=1))


cow_par_ts = pd.concat([pd.DataFrame({'year': pd.date_range(row.StartDateClean, row.EndDateClean, freq='YS'),
                                      'StartDate': row.StartDate,
                                      'EndDate': row.EndDate,
                                      'cow_id': row.PolityID, 
                                      'war_id': row.WarID, 
                                      'IsInitiator': row.IsInitiator, 
                                      'Outcome': row.Outcome, 
                                      'total_deaths': row.Deaths}, 
                                 columns=['year', 'StartDate', 'EndDate', 'cow_id', 'war_id', 'IsInitiator', 'Outcome', 'total_deaths']) 
                                 for i, row in cow_par.iterrows()], ignore_index=True)
cow_par_ts['year'] = cow_par_ts['year'].dt.year
cow_par_ts = cow_par_ts[cow_par_ts['year'] > 1945].reset_index(drop=True)
cow_par_ts

Unnamed: 0,year,StartDate,EndDate,cow_id,war_id,IsInitiator,Outcome,total_deaths
0,1947,1947-10-26,1949-01-01,750,147,1,6,2500.0
1,1948,1947-10-26,1949-01-01,750,147,1,6,2500.0
2,1949,1947-10-26,1949-01-01,750,147,1,6,2500.0
3,1947,1947-10-26,1949-01-01,770,147,0,6,1000.0
4,1948,1947-10-26,1949-01-01,770,147,0,6,1000.0
...,...,...,...,...,...,...,...,...
1528,1949,1948-04-03,1949-05-01,732,1573,0,1,
1529,1968,1968-09-01,1971-10-06,698,1577,0,6,
1530,1969,1968-09-01,1971-10-06,698,1577,0,6,
1531,1970,1968-09-01,1971-10-06,698,1577,0,6,


find the number of days spend in the conflict for the given year

In [5]:
def days_in_conflict(row):
    startyear = row.StartDate.year
    endyear = row.EndDate.year
    year = row.year
    daysinyear = 365
    if (year%4==0) & (year%100==0):
        daysinyear = 366
    
    lastday = pd.Timestamp(year=year, month=12, day=31)
    firstday = pd.Timestamp(year=year, month=1, day=1)
    
    days = 0
    
    if (startyear == year) & (endyear == year):
        days = (row.EndDate - row.StartDate).days + 1
    elif (startyear == year) & (endyear != year):
        days = (lastday - row.StartDate).days + 1
    elif (startyear != year) & (endyear == year):
        days = (row.EndDate - firstday).days + 1
    else:
        days = daysinyear
    
    return (days / daysinyear)

In [6]:
cow_par_ts['TimeAtWar'] = cow_par_ts.apply(days_in_conflict, axis=1).round(3)
cow_par_ts

Unnamed: 0,year,StartDate,EndDate,cow_id,war_id,IsInitiator,Outcome,total_deaths,TimeAtWar
0,1947,1947-10-26,1949-01-01,750,147,1,6,2500.0,0.184
1,1948,1947-10-26,1949-01-01,750,147,1,6,2500.0,1.000
2,1949,1947-10-26,1949-01-01,750,147,1,6,2500.0,0.003
3,1947,1947-10-26,1949-01-01,770,147,0,6,1000.0,0.184
4,1948,1947-10-26,1949-01-01,770,147,0,6,1000.0,1.000
...,...,...,...,...,...,...,...,...,...
1528,1949,1948-04-03,1949-05-01,732,1573,0,1,,0.332
1529,1968,1968-09-01,1971-10-06,698,1577,0,6,,0.334
1530,1969,1968-09-01,1971-10-06,698,1577,0,6,,1.000
1531,1970,1968-09-01,1971-10-06,698,1577,0,6,,1.000


determine if conflict was fought in same region where participant is

In [7]:
wars = list(cow_par_ts['war_id'].unique())
cow_loc = cow_loc[cow_loc['WarID'].isin(wars)]
cow_loc2 = cow_loc.groupby('WarID').agg({'Region':lambda x: ','.join(list(x))})

In [8]:
cow_warloc = cow_war.merge(cow_loc2, on='WarID')
cow_warloc = cow_warloc.rename(columns={'WarID': 'war_id', 'WarTypeName': 'war_type', 'Region': 'war_region'})
cow_merged = cow_par_ts.merge(cow_warloc, on=['war_id'], how='left')

In [9]:
def get_state_region(row):
    state = row.cow_id
    region = "none"
    
    if state < 200:
        region = "W. Hemisphere"
    elif state < 400:
        region = "Europe"
    elif state < 600:
        region = "Africa"
    elif state < 700:
        region = "Middle East"
    elif state < 900:
        region = "Asia"
    else:
        region = "Oceania"
    
    return region

In [10]:
cow_merged['state_region'] = cow_merged.apply(get_state_region, axis=1)
cow_merged['IsSameRegion'] = cow_merged.apply(lambda x: x.state_region in x.war_region.split(','), axis=1).astype(int)
cow_merged

Unnamed: 0,year,StartDate,EndDate,cow_id,war_id,IsInitiator,Outcome,total_deaths,TimeAtWar,war_type,IsIntervention,IsInternational,war_region,state_region,IsSameRegion
0,1947,1947-10-26,1949-01-01,750,147,1,6,2500.0,0.184,Inter-State War,,,Asia,Asia,1
1,1948,1947-10-26,1949-01-01,750,147,1,6,2500.0,1.000,Inter-State War,,,Asia,Asia,1
2,1949,1947-10-26,1949-01-01,750,147,1,6,2500.0,0.003,Inter-State War,,,Asia,Asia,1
3,1947,1947-10-26,1949-01-01,770,147,0,6,1000.0,0.184,Inter-State War,,,Asia,Asia,1
4,1948,1947-10-26,1949-01-01,770,147,0,6,1000.0,1.000,Inter-State War,,,Asia,Asia,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1528,1949,1948-04-03,1949-05-01,732,1573,0,1,,0.332,Non-State War,,,Asia,Asia,1
1529,1968,1968-09-01,1971-10-06,698,1577,0,6,,0.334,Non-State War,,,Middle East,Middle East,1
1530,1969,1968-09-01,1971-10-06,698,1577,0,6,,1.000,Non-State War,,,Middle East,Middle East,1
1531,1970,1968-09-01,1971-10-06,698,1577,0,6,,1.000,Non-State War,,,Middle East,Middle East,1


find how many years each conflict lasted for so the total_deaths can be normalized to the average per year.
note: discarding this

In [11]:
#cow_merged_wars = cow_merged.groupby(['war_id', 'cow_id']).agg({'year': 'count'})
#cow_merged_wars = cow_merged_wars.reset_index().rename(columns={'year':'num_years'})

#cow_merged2 = cow_merged.merge(cow_merged_wars, on=['war_id', 'cow_id'])
#cow_merged2['avg_deaths'] = (cow_merged2['total_deaths'] / cow_merged2['num_years'])
#cow_merged2['avg_deaths'] = cow_merged2['avg_deaths'].round(0).astype('Int64')
#cow_merged2

create dummy variables for each war type

In [12]:
cow_merged['war_type'].unique()

array(['Inter-State War', 'Extra-State War', 'Intra-State War',
       'Non-State War'], dtype=object)

In [13]:
type_map = {'Inter-State War':'InterState', 'Extra-State War':'ExtraState', 'Intra-State War':'IntraState', 'Non-State War':'NonState'}
cow_merged['war_type'] = cow_merged['war_type'].map(type_map)
type_dummy = pd.get_dummies(cow_merged['war_type'], prefix='type')
cow_merged = pd.concat([cow_merged, type_dummy], axis=1)

create dummy variables for each war outcome type

note - should this be limited to only the last year of the war?

In [14]:
cow_merged['Outcome'].unique()

array([6, 1, 2, 4, 3, 7, 5])

In [15]:
cow_merged.loc[cow_merged['EndDate'].dt.year != cow_merged['year'], 'Outcome'] = 5
outcome_map = {1: 'won', 2: 'lost', 3: 'compromised', 4: 'transWarType', 5: 'ongoing', 6: 'stalemate', 7: 'contConflict'}
cow_merged['Outcome'] = cow_merged['Outcome'].map(outcome_map)
outcome_dummy = pd.get_dummies(cow_merged['Outcome'], prefix='status')
cow_merged = pd.concat([cow_merged, outcome_dummy], axis=1)

In [16]:
cow_merged

Unnamed: 0,year,StartDate,EndDate,cow_id,war_id,IsInitiator,Outcome,total_deaths,TimeAtWar,war_type,...,type_InterState,type_IntraState,type_NonState,status_compromised,status_contConflict,status_lost,status_ongoing,status_stalemate,status_transWarType,status_won
0,1947,1947-10-26,1949-01-01,750,147,1,ongoing,2500.0,0.184,InterState,...,1,0,0,0,0,0,1,0,0,0
1,1948,1947-10-26,1949-01-01,750,147,1,ongoing,2500.0,1.000,InterState,...,1,0,0,0,0,0,1,0,0,0
2,1949,1947-10-26,1949-01-01,750,147,1,stalemate,2500.0,0.003,InterState,...,1,0,0,0,0,0,0,1,0,0
3,1947,1947-10-26,1949-01-01,770,147,0,ongoing,1000.0,0.184,InterState,...,1,0,0,0,0,0,1,0,0,0
4,1948,1947-10-26,1949-01-01,770,147,0,ongoing,1000.0,1.000,InterState,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1528,1949,1948-04-03,1949-05-01,732,1573,0,won,,0.332,NonState,...,0,0,1,0,0,0,0,0,0,1
1529,1968,1968-09-01,1971-10-06,698,1577,0,ongoing,,0.334,NonState,...,0,0,1,0,0,0,1,0,0,0
1530,1969,1968-09-01,1971-10-06,698,1577,0,ongoing,,1.000,NonState,...,0,0,1,0,0,0,1,0,0,0
1531,1970,1968-09-01,1971-10-06,698,1577,0,ongoing,,1.000,NonState,...,0,0,1,0,0,0,1,0,0,0


group by country and year (so wars get crunched together) and compute aggregates - count number of unique wars, sum the dummy variables, and sum the average deaths.

In [17]:
cow_merged.columns

Index(['year', 'StartDate', 'EndDate', 'cow_id', 'war_id', 'IsInitiator',
       'Outcome', 'total_deaths', 'TimeAtWar', 'war_type', 'IsIntervention',
       'IsInternational', 'war_region', 'state_region', 'IsSameRegion',
       'type_ExtraState', 'type_InterState', 'type_IntraState',
       'type_NonState', 'status_compromised', 'status_contConflict',
       'status_lost', 'status_ongoing', 'status_stalemate',
       'status_transWarType', 'status_won'],
      dtype='object')

In [18]:
cow_gb = cow_merged.groupby(['cow_id', 'year']).agg({'war_id': 'nunique', 
                                                      'IsInitiator': 'sum', 
                                                      'TimeAtWar': 'sum',
                                                      'IsSameRegion': 'sum',
                                                      'type_ExtraState': 'sum', 
                                                      'type_InterState': 'sum', 
                                                      'type_IntraState': 'sum',
                                                      'type_NonState': 'sum', 
                                                      'status_compromised': 'sum', 
                                                      'status_contConflict': 'sum',
                                                      'status_lost': 'sum', 
                                                      'status_ongoing': 'sum', 
                                                      'status_stalemate': 'sum',
                                                      'status_transWarType': 'sum', 
                                                      'status_won': 'sum'})
cow_gb = cow_gb.rename(columns={'war_id': 'warCount'})
cow_gb = cow_gb.add_prefix('cow_')
cow_gb

Unnamed: 0_level_0,Unnamed: 1_level_0,cow_warCount,cow_IsInitiator,cow_TimeAtWar,cow_IsSameRegion,cow_type_ExtraState,cow_type_InterState,cow_type_IntraState,cow_type_NonState,cow_status_compromised,cow_status_contConflict,cow_status_lost,cow_status_ongoing,cow_status_stalemate,cow_status_transWarType,cow_status_won
cow_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,1950,1,0,0.515,0,0,1,0,0,0,0,0,1,0,0,0
2,1951,1,0,1.000,0,0,1,0,0,0,0,0,1,0,0,0
2,1952,1,0,1.000,0,0,1,0,0,0,0,0,1,0,0,0
2,1953,1,0,0.570,0,0,1,0,0,0,0,0,0,1,0,0
2,1958,1,0,0.170,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,2008,2,2,2.006,0,2,0,0,0,0,0,0,2,0,0,0
910,1989,1,1,0.468,0,0,0,1,0,0,0,0,1,0,0,0
910,1990,1,1,1.000,0,0,0,1,0,0,0,0,1,0,0,0
910,1991,1,1,1.000,0,0,0,1,0,0,0,0,1,0,0,0


merge onto base and fill NAs with 0

In [19]:
base_ts = pd.read_csv("../Data/FINAL/countrycodes_ts-base.csv", usecols=['country', 'year', 'cow_id']).dropna()
base_ts

Unnamed: 0,country,year,cow_id
0,Afghanistan,1946,700.0
1,Afghanistan,1947,700.0
2,Afghanistan,1948,700.0
3,Afghanistan,1949,700.0
4,Afghanistan,1950,700.0
...,...,...,...
11254,Zimbabwe,2014,552.0
11255,Zimbabwe,2015,552.0
11256,Zimbabwe,2016,552.0
11257,Zimbabwe,2017,552.0


In [20]:
cow_final = base_ts.merge(cow_gb.reset_index(), on=['cow_id','year'], how='left')

In [21]:
cow_final = cow_final[cow_final['year']<=2008]
cow_final = cow_final.fillna(0).astype(int, errors="ignore")

In [22]:
cow_final

Unnamed: 0,country,year,cow_id,cow_warCount,cow_IsInitiator,cow_TimeAtWar,cow_IsSameRegion,cow_type_ExtraState,cow_type_InterState,cow_type_IntraState,cow_type_NonState,cow_status_compromised,cow_status_contConflict,cow_status_lost,cow_status_ongoing,cow_status_stalemate,cow_status_transWarType,cow_status_won
0,Afghanistan,1946,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Afghanistan,1947,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Afghanistan,1948,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Afghanistan,1949,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Afghanistan,1950,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10871,Zimbabwe,2004,552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10872,Zimbabwe,2005,552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10873,Zimbabwe,2006,552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10874,Zimbabwe,2007,552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
cow_final.to_csv("../Data/FINAL/cow.csv", index=False)