# Data Mashup - Create Time Series

This notebook will combine the following data sources into a state-year time series:

- Correlates of War
- UCDP/PRIO Armed Conflict
- Polity IV
- World Bank WDI

The year will range from 1946 - 2016, and the states will be identified by their ISO code.

In [1]:
import pandas as pd
import numpy as np

## Create base time series

In [2]:
cow_ts = pd.read_csv("../Data/CoW/Raw/system2016.csv") \
            .drop(columns='version').rename(columns={'stateabb': 'iso_alpha3', 'ccode': 'cow_id'})
cow_ts = cow_ts[cow_ts['year'] > 1945].reset_index(drop=True)
cow_ts

Unnamed: 0,iso_alpha3,cow_id,year
0,USA,2,1946
1,CAN,20,1946
2,CUB,40,1946
3,HAI,41,1946
4,DOM,42,1946
...,...,...,...
10513,NAU,970,2016
10514,MSI,983,2016
10515,PAL,986,2016
10516,FSM,987,2016


In [3]:
gw = pd.read_csv("../Data/GW/gw_codes.dat", sep = "\t", header=None, names = ['gw_id', 'ISO_alpha3', 'name', 'startdate', 'enddate'], encoding="latin-1")
gw['startdate'] = pd.to_datetime(gw['startdate'], format = '%d:%m:%Y')
gw['enddate'] = pd.to_datetime(gw['enddate'], format = '%d:%m:%Y')
gw = gw.rename(columns={'name': 'gw_name', 'startdate': 'gw_startdate', 'enddate': 'gw_enddate'})
gw

Unnamed: 0,gw_id,ISO_alpha3,gw_name,gw_startdate,gw_enddate
0,2,USA,United States of America,1816-01-01,2017-12-31
1,20,CAN,Canada,1867-07-01,2017-12-31
2,31,BHM,Bahamas,1973-07-10,2017-12-31
3,40,CUB,Cuba,1902-05-20,2017-12-31
4,41,HAI,Haiti,1816-01-01,1915-07-04
...,...,...,...,...,...
211,900,AUL,Australia,1901-01-01,2017-12-31
212,910,PNG,Papua New Guinea,1975-09-16,2017-12-31
213,920,NEW,New Zealand,1907-09-01,2017-12-31
214,940,SOL,Solomon Islands,1978-07-07,2017-12-31


In [4]:
gw_ts = pd.concat([pd.DataFrame({'year': pd.date_range(row.gw_startdate, row.gw_enddate, freq='Y'),
                                 'gw_id': row.gw_id,
                                 'iso_alpha3': row.ISO_alpha3}, 
                                columns=['year', 'gw_id', 'iso_alpha3']) 
           for i, row in gw.iterrows()], ignore_index=True)
gw_ts['year'] = gw_ts['year'].dt.year
gw_ts = gw_ts[(gw_ts['year'] > 1945) & (gw_ts['year'] < 2017)].reset_index(drop=True)
gw_ts

Unnamed: 0,year,gw_id,iso_alpha3
0,1946,2,USA
1,1947,2,USA
2,1948,2,USA
3,1949,2,USA
4,1950,2,USA
...,...,...,...
9957,2012,950,FJI
9958,2013,950,FJI
9959,2014,950,FJI
9960,2015,950,FJI


In [5]:
base_ts = gw_ts.merge(cow_ts, how='outer', on=['iso_alpha3', 'year'])
base_ts

Unnamed: 0,year,gw_id,iso_alpha3,cow_id
0,1946,2.0,USA,2.0
1,1947,2.0,USA,2.0
2,1948,2.0,USA,2.0
3,1949,2.0,USA,2.0
4,1950,2.0,USA,2.0
...,...,...,...,...
10797,2016,,NAU,970.0
10798,2016,,MSI,983.0
10799,2016,,PAL,986.0
10800,2016,,FSM,987.0


### need to resolve problem of identifiers that don't match

In [6]:
base_ts[base_ts['gw_id'].isna()].cow_id.unique()

array([360., 678., 812., 511., 950.,  55., 403., 817., 591., 990.,  54.,
        56.,  57.,  58., 935.,  60., 223., 255., 265., 680., 983., 987.,
       331., 221., 232., 986., 946., 955., 970., 947., 345.])

In [7]:
base_ts[base_ts['gw_id'].isna()].iso_alpha3.unique()

array(['ROM', 'YAR', 'LAO', 'ZAN', 'FIJ', 'GRN', 'STP', 'RVN', 'SEY',
       'WSM', 'DMA', 'SLU', 'SVG', 'AAB', 'VAN', 'SKN', 'LIE', 'GMY',
       'GDR', 'YPR', 'MSI', 'FSM', 'SNM', 'MNC', 'AND', 'PAL', 'KIR',
       'TON', 'NAU', 'TUV', 'YUG'], dtype=object)

In [8]:
base_ts[base_ts['cow_id'].isna()].gw_id.unique()

array([260., 265., 305., 343., 340., 360., 652., 678., 698., 711., 732.,
       740., 760., 850., 950.])

In [9]:
base_ts[base_ts['cow_id'].isna()].iso_alpha3.unique()

array(['GFR', 'GDR', 'AUS', 'MAC', 'SER', 'RUM', 'SYR', 'YEM', 'OMA',
       'TBT', 'ROK', 'JPN', 'BHU', 'INS', 'FJI'], dtype=object)

## Incorporate Polity data

In [10]:
polity = pd.read_csv("../Data/PolityIV/p4v2018.csv", usecols=['ccode', 'scode', 'year', 'democ', 'autoc', 'polity', 'polity2', 'durable', 'd4', 'sf', 'regtrans'])
polity = polity[(polity['year'] > 1945) & (polity['year'] < 2017)]
polity = polity.rename(columns={'ccode': 'cow_id', 'scode': 'iso_alpha3'})
polity

Unnamed: 0,cow_id,iso_alpha3,year,democ,autoc,polity,polity2,durable,d4,sf,regtrans
146,2,USA,1946,10,0,10,10.0,137.0,,,
147,2,USA,1947,10,0,10,10.0,138.0,,,
148,2,USA,1948,10,0,10,10.0,139.0,,,
149,2,USA,1949,10,0,10,10.0,140.0,,,
150,2,USA,1950,10,0,10,10.0,141.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
17555,950,FJI,2012,0,4,-4,-4.0,5.0,,,
17556,950,FJI,2013,0,4,-4,-4.0,6.0,,,
17557,950,FJI,2014,3,1,2,2.0,0.0,1.0,,3.0
17558,950,FJI,2015,3,1,2,2.0,1.0,,,


In [11]:
ts_1 = base_ts.merge(polity, how="outer", on=['cow_id', 'iso_alpha3', 'year'])
ts_1 = ts_1.rename(columns={'democ': 'pol_democ', 'autoc': 'pol_autoc', 'polity': 'pol_polity', 
                            'polity2': 'pol_polity2', 'durable': 'pol_durable', 'd4': 'pol_regtransComplete', 
                            'sf': 'pol_isStateFailure', 'regtrans': 'pol_regtrans'})
ts_1

Unnamed: 0,year,gw_id,iso_alpha3,cow_id,pol_democ,pol_autoc,pol_polity,pol_polity2,pol_durable,pol_regtransComplete,pol_isStateFailure,pol_regtrans
0,1946,2.0,USA,2.0,10.0,0.0,10.0,10.0,137.0,,,
1,1947,2.0,USA,2.0,10.0,0.0,10.0,10.0,138.0,,,
2,1948,2.0,USA,2.0,10.0,0.0,10.0,10.0,139.0,,,
3,1949,2.0,USA,2.0,10.0,0.0,10.0,10.0,140.0,,,
4,1950,2.0,USA,2.0,10.0,0.0,10.0,10.0,141.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
11326,2012,,FJI,950.0,0.0,4.0,-4.0,-4.0,5.0,,,
11327,2013,,FJI,950.0,0.0,4.0,-4.0,-4.0,6.0,,,
11328,2014,,FJI,950.0,3.0,1.0,2.0,2.0,0.0,1.0,,3.0
11329,2015,,FJI,950.0,3.0,1.0,2.0,2.0,1.0,,,


note: the 'regtrans' variable needs to be expanded so that certain control codes (see pg 35-36 in codebook) get their own columns

## Incorporate World Bank data

- will need to deal with missing data

In [12]:
wdi = pd.read_csv("../Data/WorldBank/Wrangled/wdi_timeseries_top25.csv", usecols=['country', 'indicator', 'value', 'year'])
wdi = wdi[wdi['year'] < 2017]
wdi

Unnamed: 0,country,indicator,value,year
3,AFG,BX.KLT.DINV.CD.WD,93591315.3,2016
4,AFG,BX.KLT.DINV.CD.WD,169146608.0,2015
5,AFG,BX.KLT.DINV.CD.WD,42975262.5,2014
6,AFG,BX.KLT.DINV.CD.WD,48311346.0,2013
7,AFG,BX.KLT.DINV.CD.WD,56823660.0,2012
...,...,...,...,...
287995,ZWE,ST.INT.ARVL,,1964
287996,ZWE,ST.INT.ARVL,,1963
287997,ZWE,ST.INT.ARVL,,1962
287998,ZWE,ST.INT.ARVL,,1961


In [13]:
wdi_pivot = wdi.pivot_table(index=['country', 'year'], columns='indicator', values='value')
wdi_pivot = wdi_pivot.add_prefix('wb_')
wdi_pivot = wdi_pivot.reset_index()
wdi_pivot = wdi_pivot.rename(columns={'country': 'iso_alpha3'})
wdi_pivot

indicator,iso_alpha3,year,wb_BX.KLT.DINV.CD.WD,wb_EG.ELC.ACCS.ZS,wb_EN.ATM.CO2E.PC,wb_EN.POP.DNST,wb_FP.CPI.TOTL.ZG,wb_MS.MIL.XPND.GD.ZS,wb_NE.EXP.GNFS.ZS,wb_NY.GDP.MKTP.CD,...,wb_SI.POV.GINI,wb_SL.UEM.TOTL.ZS,wb_SP.DYN.IMRT.IN,wb_SP.DYN.LE00.IN,wb_SP.DYN.TFRT.IN,wb_SP.POP.GROW,wb_SP.POP.TOTL,wb_SP.POP.TOTL.FE.IN,wb_SP.URB.TOTL.IN.ZS,wb_ST.INT.ARVL
0,AFG,1960,,,0.046057,,,,4.132233,5.377778e+08,...,,,,32.446,7.450,1.828639,8996973.0,4347397.0,8.401,
1,AFG,1961,,,0.053589,14.044987,,,4.453443,5.488889e+08,...,,,236.5,32.962,7.450,1.898476,9169410.0,4439158.0,8.684,
2,AFG,1962,,,0.073721,14.323808,,,4.878051,5.466667e+08,...,,,232.6,33.471,7.450,1.965751,9351441.0,4535392.0,8.976,
3,AFG,1963,,,0.074161,14.617537,,,9.171601,7.511112e+08,...,,,228.9,33.971,7.450,2.029893,9543205.0,4636172.0,9.276,
4,AFG,1964,,,0.086174,14.926295,,,8.888893,8.000000e+08,...,,,225.1,34.463,7.450,2.090248,9744781.0,4741531.0,9.586,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10918,ZWE,2012,3.498500e+08,44.000000,0.594152,33.902368,3.721087,2.263931,25.163254,1.711485e+10,...,,5.608,45.7,55.032,4.058,1.698001,13115131.0,6874607.0,32.834,1794000.0
10919,ZWE,2013,3.730500e+08,38.344547,0.874563,34.510420,1.631622,2.343084,21.987759,1.909102e+10,...,,5.623,42.8,56.897,4.030,1.777645,13350356.0,7001145.0,32.654,1833000.0
10920,ZWE,2014,4.728000e+08,32.300000,0.884721,35.121316,-0.212940,2.324734,20.930146,1.949552e+10,...,,5.518,40.4,58.410,3.974,1.754692,13586681.0,7126780.0,32.504,1880000.0
10921,ZWE,2015,3.992000e+08,33.700000,,35.710557,-2.409500,2.343629,19.160176,1.996312e+10,...,,5.438,38.5,59.534,3.896,1.663813,13814629.0,7245857.0,32.385,2057000.0


In [14]:
ts_2 = ts_1.merge(wdi_pivot, how='left', on=['iso_alpha3', 'year'])
ts_2

Unnamed: 0,year,gw_id,iso_alpha3,cow_id,pol_democ,pol_autoc,pol_polity,pol_polity2,pol_durable,pol_regtransComplete,...,wb_SI.POV.GINI,wb_SL.UEM.TOTL.ZS,wb_SP.DYN.IMRT.IN,wb_SP.DYN.LE00.IN,wb_SP.DYN.TFRT.IN,wb_SP.POP.GROW,wb_SP.POP.TOTL,wb_SP.POP.TOTL.FE.IN,wb_SP.URB.TOTL.IN.ZS,wb_ST.INT.ARVL
0,1946,2.0,USA,2.0,10.0,0.0,10.0,10.0,137.0,,...,,,,,,,,,,
1,1947,2.0,USA,2.0,10.0,0.0,10.0,10.0,138.0,,...,,,,,,,,,,
2,1948,2.0,USA,2.0,10.0,0.0,10.0,10.0,139.0,,...,,,,,,,,,,
3,1949,2.0,USA,2.0,10.0,0.0,10.0,10.0,140.0,,...,,,,,,,,,,
4,1950,2.0,USA,2.0,10.0,0.0,10.0,10.0,141.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11326,2012,,FJI,950.0,0.0,4.0,-4.0,-4.0,5.0,,...,,4.516,20.3,66.859,2.773,0.187444,865069.0,424109.0,53.196,661000.0
11327,2013,,FJI,950.0,0.0,4.0,-4.0,-4.0,6.0,,...,36.7,4.357,20.4,66.950,2.786,0.062288,865608.0,424846.0,53.706,659000.0
11328,2014,,FJI,950.0,3.0,1.0,2.0,2.0,0.0,1.0,...,,4.533,20.6,67.030,2.795,0.097572,866453.0,425743.0,54.216,693000.0
11329,2015,,FJI,950.0,3.0,1.0,2.0,2.0,1.0,,...,,4.548,20.8,67.103,2.800,0.250594,868627.0,427258.0,54.726,755000.0


## Incorporate UCDP/PRIO data

need to aggregate data to create appropriate variables

- isConflict: 0 = no, 1 = yes
- conflictLevel: highest level for country-year
- isInternal, isInterstate, isExtrasystemic

In [16]:
ucdp_par = pd.read_csv("../Data/UCDP_PRIO/Wrangled/participants_gw.csv")
ucdp_con = pd.read_csv("../Data/UCDP_PRIO/Wrangled/conflicts.csv", usecols=['conflict_id', 'type_of_conflict', 'incompatibility_isGovernment', 'incompatibility_isTerritory'])
ucdp_obs = pd.read_csv("../Data/UCDP_PRIO/Wrangled/observations.csv", usecols=['obs_id', 'ep_end', 'intensity_level', 'cumulative_intensity', 'isInternational'])
ucdp_loc = pd.read_csv("../Data/UCDP_PRIO/Wrangled/locations.csv")

In [26]:
ucdp_loc['isLocation'] = 1

ucdp = ucdp_par.merge(ucdp_con, on='conflict_id') \
                .merge(ucdp_obs, on='obs_id') \
                .merge(ucdp_loc, how='left', on=['conflict_id', 'gw_id']).fillna({'isLocation': 0})

ucdp

Unnamed: 0,obs_id,gw_id,side,role,conflict_id,year,type_of_conflict,incompatibility_isTerritory,incompatibility_isGovernment,ep_end,intensity_level,cumulative_intensity,isInternational,isLocation
0,200-1946,145,A,primary,200,1946,Internal,0,1,1,War,War,0.0,1.0
1,200-1952,145,A,primary,200,1952,Internal,0,1,1,Minor,War,0.0,1.0
2,200-1967,145,A,primary,200,1967,Internal,0,1,1,Minor,War,0.0,1.0
3,201-1946,220,A,primary,201,1946,Extrasystemic,1,0,0,Minor,Minor,,0.0
4,201-1947,220,A,primary,201,1947,Extrasystemic,1,0,0,Minor,Minor,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4442,14275-2017,840,A,primary,14275,2017,Internal,1,0,0,War,War,0.0,1.0
4443,14275-2018,840,A,primary,14275,2018,Internal,1,0,0,Minor,War,0.0,1.0
4444,14333-2016,616,A,primary,14333,2016,Internal,1,0,1,Minor,Minor,0.0,1.0
4445,14609-2018,630,A,primary,14609,2018,Interstate,0,1,0,Minor,Minor,,1.0


In [27]:
# create role dummy variables
ucdp['role_primary'] = 0
ucdp['role_primary'][ucdp['role'] == 'primary'] = 1

ucdp['role_secondary'] = 0
ucdp['role_secondary'][ucdp['role'] == 'secondary'] = 1

# create type dummy variables
ucdp['type_interstate'] = 0
ucdp['type_interstate'][ucdp['type_of_conflict'] == 'Interstate'] = 1

ucdp['type_internal'] = 0
ucdp['type_internal'][ucdp['type_of_conflict'] == 'Internal'] = 1

ucdp['type_extrasystemic'] = 0
ucdp['type_extrasystemic'][ucdp['type_of_conflict'] == 'Extrasystemic'] = 1

# create intensity dummy variables
ucdp['intensity_level'] = ucdp['intensity_level'].replace({'Minor': 1, 'War': 2})
ucdp['cumulative_intensity'] = ucdp['cumulative_intensity'].replace({'Minor': 1, 'War': 2})

ucdp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be

Unnamed: 0,obs_id,gw_id,side,role,conflict_id,year,type_of_conflict,incompatibility_isTerritory,incompatibility_isGovernment,ep_end,intensity_level,cumulative_intensity,isInternational,isLocation,role_primary,role_secondary,type_interstate,type_internal,type_extrasystemic
0,200-1946,145,A,primary,200,1946,Internal,0,1,1,2,2,0.0,1.0,1,0,0,1,0
1,200-1952,145,A,primary,200,1952,Internal,0,1,1,1,2,0.0,1.0,1,0,0,1,0
2,200-1967,145,A,primary,200,1967,Internal,0,1,1,1,2,0.0,1.0,1,0,0,1,0
3,201-1946,220,A,primary,201,1946,Extrasystemic,1,0,0,1,1,,0.0,1,0,0,0,1
4,201-1947,220,A,primary,201,1947,Extrasystemic,1,0,0,1,1,,0.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4442,14275-2017,840,A,primary,14275,2017,Internal,1,0,0,2,2,0.0,1.0,1,0,0,1,0
4443,14275-2018,840,A,primary,14275,2018,Internal,1,0,0,1,2,0.0,1.0,1,0,0,1,0
4444,14333-2016,616,A,primary,14333,2016,Internal,1,0,1,1,1,0.0,1.0,1,0,0,1,0
4445,14609-2018,630,A,primary,14609,2018,Interstate,0,1,0,1,1,,1.0,1,0,1,0,0


In [32]:
ucdp_gb = ucdp.groupby(['gw_id', 'year']).agg({'conflict_id':'count', 
                                               'ep_end': 'max', 
                                               'intensity_level': 'max', 
                                               'cumulative_intensity': 'max', 
                                               'isLocation': 'sum', 
                                               'role_primary': 'sum', 
                                               'role_secondary': 'sum', 
                                               'type_interstate': 'sum', 
                                               'type_internal': 'sum',
                                               'isInternational': 'sum',
                                               'type_extrasystemic': 'sum', 
                                               'incompatibility_isTerritory': 'sum', 
                                               'incompatibility_isGovernment': 'sum'})
ucdp_gb = ucdp_gb.add_prefix('ucdp_')
ucdp_gb = ucdp_gb.reset_index().rename(columns={'ucdp_conflict_id': 'ucdp_conflict_count'})
ucdp_gb

Unnamed: 0,gw_id,year,ucdp_conflict_count,ucdp_ep_end,ucdp_intensity_level,ucdp_cumulative_intensity,ucdp_isLocation,ucdp_role_primary,ucdp_role_secondary,ucdp_type_interstate,ucdp_type_internal,ucdp_isInternational,ucdp_type_extrasystemic,ucdp_incompatibility_isTerritory,ucdp_incompatibility_isGovernment
0,2,1950,2,1,2,2,1.0,1,1,1,0,0.0,1,2,0
1,2,1951,1,0,2,2,0.0,0,1,1,0,0.0,0,1,0
2,2,1952,1,0,2,2,0.0,0,1,1,0,0.0,0,1,0
3,2,1953,1,1,2,2,0.0,0,1,1,0,0.0,0,1,0
4,2,1954,1,1,2,2,0.0,0,1,1,0,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2844,972,2008,1,0,2,2,0.0,0,1,0,1,1.0,0,0,1
2845,972,2011,1,0,2,2,0.0,0,1,0,1,1.0,0,0,1
2846,972,2012,1,0,2,2,0.0,0,1,0,1,1.0,0,0,1
2847,972,2013,1,0,2,2,0.0,0,1,0,1,1.0,0,0,1


In [33]:
ts_3 = ts_2.merge(ucdp_gb, how='left', on=['gw_id', 'year'])
ts_3

Unnamed: 0,year,gw_id,iso_alpha3,cow_id,pol_democ,pol_autoc,pol_polity,pol_polity2,pol_durable,pol_regtransComplete,...,ucdp_cumulative_intensity,ucdp_isLocation,ucdp_role_primary,ucdp_role_secondary,ucdp_type_interstate,ucdp_type_internal,ucdp_isInternational,ucdp_type_extrasystemic,ucdp_incompatibility_isTerritory,ucdp_incompatibility_isGovernment
0,1946,2.0,USA,2.0,10.0,0.0,10.0,10.0,137.0,,...,,,,,,,,,,
1,1947,2.0,USA,2.0,10.0,0.0,10.0,10.0,138.0,,...,,,,,,,,,,
2,1948,2.0,USA,2.0,10.0,0.0,10.0,10.0,139.0,,...,,,,,,,,,,
3,1949,2.0,USA,2.0,10.0,0.0,10.0,10.0,140.0,,...,,,,,,,,,,
4,1950,2.0,USA,2.0,10.0,0.0,10.0,10.0,141.0,,...,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11326,2012,,FJI,950.0,0.0,4.0,-4.0,-4.0,5.0,,...,,,,,,,,,,
11327,2013,,FJI,950.0,0.0,4.0,-4.0,-4.0,6.0,,...,,,,,,,,,,
11328,2014,,FJI,950.0,3.0,1.0,2.0,2.0,0.0,1.0,...,,,,,,,,,,
11329,2015,,FJI,950.0,3.0,1.0,2.0,2.0,1.0,,...,,,,,,,,,,


## Incorporate CoW data

need to aggregate data to create appropriate variables