# Explore & Resolve Discrepancies in State Identifiers

Summary:

In the first part, I attempt to use CShapes as a bridge between datasets. However, I discover that this is not feasible.

In the second part, I use a dataset from the R package 'countrycode'. I fix the errors I can identify, and as a result I have my base country-year time series.

In [1]:
import pandas as pd

In [2]:
cshapes = pd.read_csv("../Data/CShapes/Raw/country_shapes.csv", na_values=[-1])
cow = pd.read_csv("../Data/CoW/Raw/states2016.csv")
cow_ts = pd.read_csv("../Data/CoW/Raw/system2016.csv", usecols=['ccode', 'year'])
polity = pd.read_csv("../Data/PolityIV/p4v2018.csv", usecols = ['ccode', 'country', 'year'])
wb = pd.read_csv("../Data/WorldBank/Raw_API/countries_list.csv")
gw_states = pd.read_csv("../Data/GW/Raw/gw_codes.dat", sep = "\t", header=None, names = ['gw_id', 'abbr', 'name', 'startdate', 'enddate'], encoding="latin-1")
gw_micro = pd.read_csv("../Data/GW/Raw/microstatessystem.dat", sep = "\t", header=None, names = ['gw_id', 'abbr', 'name', 'startdate', 'enddate'], encoding="latin-1")
ucdp = pd.read_csv("../Data/UCDP_PRIO/Wrangled/participants_gw.csv")

## CShapes

In [3]:
cshapes['cow_edate'] = pd.to_datetime(cshapes.cow_endyear*10000+cshapes.cow_endmonth*100+cshapes.cow_endday,format='%Y%m%d')
cshapes['cow_sdate'] = pd.to_datetime(cshapes.cow_startyear*10000+cshapes.cow_startmonth*100+cshapes.cow_startday,format='%Y%m%d')
cshapes['gw_edate'] = pd.to_datetime(cshapes.gw_endyear*10000+cshapes.gw_endmonth*100+cshapes.gw_endday,format='%Y%m%d')
cshapes['gw_sdate'] = pd.to_datetime(cshapes.gw_startyear*10000+cshapes.gw_startmonth*100+cshapes.gw_startday,format='%Y%m%d')

cshapes = cshapes.drop(columns=['area', 'capital_lat', 'capital_long', 'capital_name', 
                                'cow_endyear', 'cow_endmonth', 'cow_endday', 'cow_startyear', 'cow_startmonth', 'cow_startday',
                                'gw_endyear', 'gw_endmonth', 'gw_endday', 'gw_startyear', 'gw_startmonth', 'gw_startday'])
cshapes = cshapes.sort_values(by=['iso_alpha3', 'gw_sdate', 'cow_sdate'])
cshapes

Unnamed: 0,country_name,cow_code,feature_id,gw_code,iso_alpha2,iso_alpha3,iso_name,iso_num,cow_edate,cow_sdate,gw_edate,gw_sdate
132,Afghanistan,700.0,134,700.0,AF,AFG,Afghanistan,4,2016-06-30,1946-01-01,2016-06-30,1946-01-01
164,Angola,540.0,168,540.0,AO,AGO,Angola,24,2016-06-30,1975-11-11,2016-06-30,1975-11-11
81,Albania,339.0,82,339.0,AL,ALB,Albania,8,2016-06-30,1946-01-01,2016-06-30,1946-01-01
67,Andorra,232.0,68,232.0,AD,AND,Andorra,20,2016-06-30,1993-07-28,2016-06-30,1946-01-01
131,United Arab Emirates,696.0,133,696.0,AE,ARE,United Arab Emirates,784,2016-06-30,1971-12-02,2016-06-30,1971-12-02
...,...,...,...,...,...,...,...,...,...,...,...,...
231,Kosovo,347.0,236,347.0,,,,0,2016-06-30,2008-02-20,2016-06-30,2008-02-17
194,Egypt (United Arab Republic),651.0,199,,,,,0,1961-08-31,1958-02-01,NaT,NaT
209,Mali Federation,432.0,214,,,,,0,1960-08-19,1960-06-20,NaT,NaT
192,Egypt,651.0,197,,,,,0,1967-05-31,1961-09-01,NaT,NaT


In [4]:
cshapes_iso = list(cshapes['iso_alpha3'].unique())
cshapes_cow = list(cshapes['cow_code'].unique())
cshapes_gw = list(cshapes['gw_code'].unique())

## G&W

note: source for these files are from http://ksgleditsch.com/data-4.html

In [5]:
gw = pd.concat([gw_states, gw_micro])

In [6]:
gw['startdate'] = pd.to_datetime(gw['startdate'], format = '%d:%m:%Y')
gw['enddate'] = pd.to_datetime(gw['enddate'], format = '%d:%m:%Y')
gw = gw.rename(columns={'name': 'gw_name', 'startdate': 'gw_startdate', 'enddate': 'gw_enddate'})
gw

Unnamed: 0,gw_id,abbr,gw_name,gw_startdate,gw_enddate
0,2,USA,United States of America,1816-01-01,2017-12-31
1,20,CAN,Canada,1867-07-01,2017-12-31
2,31,BHM,Bahamas,1973-07-10,2017-12-31
3,40,CUB,Cuba,1902-05-20,2017-12-31
4,41,HAI,Haiti,1816-01-01,1915-07-04
...,...,...,...,...,...
18,973,TUV,Tuvalu,1978-10-01,2017-12-31
19,983,MSI,Marshall Islands,1986-10-21,2017-12-31
20,986,PAL,Palau,1994-10-01,2017-12-31
21,987,FSM,Federated States of Micronesia,1986-11-03,2017-12-31


In [7]:
gw.to_csv("../Data/Other/gw_codes.csv", index=False, encoding='utf-8')

In [8]:
gw = gw[gw['gw_enddate'] > pd.to_datetime('1945-12-31')]

In [9]:
gw.to_csv("../Data/Other/gw_states_post1946.csv", index=False, encoding='utf-8')

In [10]:
gw_gw = list(gw['gw_id'].unique())
gw_iso = list(gw['abbr'].unique())

## UCDP

In [11]:
ucdp_gw = list(ucdp['gw_id'].unique())

## CoW

In [12]:
cow['cow_startdate'] = pd.to_datetime(cow.styear*10000+cow.stmonth*100+cow.stday,format='%Y%m%d')
cow['cow_enddate'] = pd.to_datetime(cow.endyear*10000+cow.endmonth*100+cow.endday,format='%Y%m%d')

cow = cow.drop(columns=['styear', 'stmonth', 'stday', 'endyear', 'endmonth', 'endday', 'version'])
cow = cow.rename(columns={'stateabb': 'ISO_alpha3', 'ccode': 'cow_id', 'statenme': 'cow_name'})
cow

Unnamed: 0,ISO_alpha3,cow_id,cow_name,cow_startdate,cow_enddate
0,USA,2,United States of America,1816-01-01,2016-12-31
1,CAN,20,Canada,1920-01-10,2016-12-31
2,BHM,31,Bahamas,1973-07-10,2016-12-31
3,CUB,40,Cuba,1902-05-20,1906-09-25
4,CUB,40,Cuba,1909-01-23,2016-12-31
...,...,...,...,...,...
238,NAU,970,Nauru,1999-09-14,2016-12-31
239,MSI,983,Marshall Islands,1991-09-17,2016-12-31
240,PAL,986,Palau,1994-12-15,2016-12-31
241,FSM,987,Federated States of Micronesia,1991-09-17,2016-12-31


In [13]:
cow = cow[cow['cow_enddate'] > pd.to_datetime('1945-12-31')]

In [14]:
cow.to_csv("../Data/CoW/cow_states_post1946.csv", index=False, encoding='utf-8')

In [15]:
cow_cow = list(cow['cow_id'].unique())
cow_iso = list(cow['ISO_alpha3'].unique())

## Polity

In [16]:
polity = polity[polity['year'] > 1945]

In [17]:
polity = polity.drop_duplicates(subset=['ccode'])

In [18]:
polity.to_csv("../Data/PolityIV/polity_country_list.csv", index=False)

In [19]:
pol_cow = list(polity['ccode'].unique())

## World Bank

In [20]:
wb = wb[wb['region_name']!='Aggregates']

In [21]:
wb.to_csv("../Data/WorldBank/wb_countries.csv", index=False, encoding='utf-8')

In [22]:
wb_iso = list(wb['id'].unique())

## Compare

### CShapes vs original CoW and GW

In [23]:
set(cshapes_cow) - set(cow_cow)

{nan}

In [24]:
set(cow_cow) - set(cshapes_cow)

set()

In [25]:
cshapes_not_in_gw = set(cshapes_gw) - set(gw_gw)
cshapes[cshapes['gw_code'].isin(list(cshapes_not_in_gw))]

Unnamed: 0,country_name,cow_code,feature_id,gw_code,iso_alpha2,iso_alpha3,iso_name,iso_num,cow_edate,cow_sdate,gw_edate,gw_sdate
227,USSR,365.0,232,,SU,SUN,Union of Soviet Socialist Republics,810,1991-12-25,1991-09-06,NaT,NaT
225,Yugoslavia,345.0,230,,YU,YUG,"Yugoslavia, Socialist Federal Republic of",890,1992-04-06,1992-01-15,NaT,NaT
228,Yugoslavia,345.0,233,,YU,YUG,"Yugoslavia, Socialist Federal Republic of",890,1993-04-07,1992-04-07,NaT,NaT
194,Egypt (United Arab Republic),651.0,199,,,,,0,1961-08-31,1958-02-01,NaT,NaT
209,Mali Federation,432.0,214,,,,,0,1960-08-19,1960-06-20,NaT,NaT
192,Egypt,651.0,197,,,,,0,1967-05-31,1961-09-01,NaT,NaT
193,Syria,652.0,198,,,,,0,1967-05-31,1961-09-29,NaT,NaT


^ excepting the NaNs, these all look like microstates, and may be included in the seperate G&W microstate list

In [26]:
gw_not_in_cshapes = set(gw_gw) - set(cshapes_gw)
gw[gw['gw_id'].isin(list(gw_not_in_cshapes))]

Unnamed: 0,gw_id,abbr,gw_name,gw_startdate,gw_enddate
183,711,TBT,Tibet,1913-01-01,1950-10-01
10,396,ABK,Abkhazia,2008-08-26,2017-12-31
11,397,SOT,South Ossetia,2008-08-26,2017-12-31


### UCDP vs GW

751 is the Government of Hyderabad. Unclear if this is actually a state or just a territory.

In [27]:
set(ucdp_gw) - set(gw_gw)

{751}

### Polity & CoW

In [28]:
pol_not_in_cow = set(pol_cow) - set(cow_cow)

In [29]:
polity[polity['ccode'].isin(list(pol_not_in_cow))]

Unnamed: 0,ccode,country,year
7730,342,Serbia,2006
7913,348,Montenegro,2006
8557,364,USSR,1946
11606,525,South Sudan,2011
11614,529,Ethiopia,1993
15934,769,Pakistan,1947
16808,818,Vietnam,1976


^ polity codes are supposed to be derived from CoW country codes, but they must have added some

- Serbia does not exist in Polity between 1921 and 2005
- Ethiopia given an unofficial code (ETI instead of ETH) in 1993

In [30]:
cow_not_in_pol = set(cow_cow) - set(pol_cow)

In [31]:
cow[cow['cow_id'].isin(list(cow_not_in_pol))]

Unnamed: 0,ISO_alpha3,cow_id,cow_name,cow_startdate,cow_enddate
2,BHM,31,Bahamas,1973-07-10,2016-12-31
11,BAR,53,Barbados,1966-11-30,2016-12-31
12,DMA,54,Dominica,1978-11-03,2016-12-31
13,GRN,55,Grenada,1974-02-07,2016-12-31
14,SLU,56,St. Lucia,1979-02-22,2016-12-31
15,SVG,57,St. Vincent and the Grenadines,1979-10-27,2016-12-31
16,AAB,58,Antigua & Barbuda,1981-11-01,2016-12-31
17,SKN,60,St. Kitts and Nevis,1983-09-19,2016-12-31
19,BLZ,80,Belize,1981-09-21,2016-12-31
49,MNC,221,Monaco,1993-05-28,2016-12-31


^ again, these all look like microstates, which Polity must not track

### World Bank vs CoW & GW

In [32]:
wb_not_in_cshapes = set(wb_iso) - set(cshapes_iso)
wb[wb['id'].isin(list(wb_not_in_cshapes))]

Unnamed: 0,id,iso2code,name,capitalcity,latitude,longitude,region_name,adminregion_name,incomelevel_value,lendingtype_value
0,ABW,AW,Aruba,Oranjestad,12.5167,-70.0167,Latin America & Caribbean,,High income,Not classified
11,ASM,AS,American Samoa,Pago Pago,-14.2846,-170.691,East Asia & Pacific,East Asia & Pacific (excluding high income),Upper middle income,Not classified
32,BMU,BM,Bermuda,Hamilton,32.3293,-64.706,North America,,High income,Not classified
47,CHI,JG,Channel Islands,,,,Europe & Central Asia,,High income,Not classified
63,CUW,CW,Curacao,Willemstad,,,Latin America & Caribbean,,High income,Not classified
64,CYM,KY,Cayman Islands,George Town,19.3022,-81.3857,Latin America & Caribbean,,High income,Not classified
101,FRO,FO,Faroe Islands,Torshavn,61.8926,-6.91181,Europe & Central Asia,,High income,Not classified
108,GIB,GI,Gibraltar,,,,Europe & Central Asia,,High income,Not classified
115,GRL,GL,Greenland,Nuuk,64.1836,-51.7214,Europe & Central Asia,,High income,Not classified
117,GUM,GU,Guam,Agana,13.4443,144.794,East Asia & Pacific,,High income,Not classified


In [33]:
cshapes_not_in_wb = set(cshapes_iso) - set(wb_iso)
cshapes[cshapes['iso_alpha3'].isin(list(cshapes_not_in_wb))]

Unnamed: 0,country_name,cow_code,feature_id,gw_code,iso_alpha2,iso_alpha3,iso_name,iso_num,cow_edate,cow_sdate,gw_edate,gw_sdate
186,Czechoslovakia,315.0,190,315.0,CS,CSK,Czechoslovakia,200,1992-12-31,1946-01-01,1992-12-31,1946-01-01
185,Germany Democratic Republic,265.0,189,265.0,DD,DDR,German Democratic Republic,278,1990-10-02,1954-03-25,1990-10-02,1949-10-05
226,Serbia and Montenegro,345.0,231,345.0,CS,SCG,Serbia and Montenegro,891,2006-06-11,1993-04-08,2006-06-02,1992-04-27
188,USSR,365.0,192,365.0,SU,SUN,Union of Soviet Socialist Republics,810,1991-09-05,1946-01-01,1991-08-24,1946-01-01
247,USSR,,249,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-08-26,1991-08-25
248,USSR,,250,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-08-30,1991-08-27
249,USSR,,251,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-09-05,1991-08-31
250,USSR,,252,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-09-08,1991-09-06
251,USSR,,253,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-10-26,1991-09-09
252,USSR,,254,365.0,SU,SUN,Union of Soviet Socialist Republics,810,NaT,NaT,1991-11-30,1991-10-27


## Conclusion

This is not a solvable problem with the datasets that I have. Instead, I will use the R package "countrycode". See the github repo here: https://github.com/vincentarelbundock/countrycode

# Part 2: Explore & resolve discrepancies using 'countrycode' package 

In [34]:
pd.set_option('display.max_rows', 100)

In [35]:
cc = pd.read_csv("../Data/countrycode/country_conversion_table.csv", dtype={'cown': 'Int64', 'gwn': 'Int64', 'p4n': 'Int64'})

In [36]:
cc_cow = list(cc['cown'].unique())
cow_not_in_cc = set(cow_cow) - set(cc_cow)
cow[cow['cow_id'].isin(list(cow_not_in_cc))]

Unnamed: 0,ISO_alpha3,cow_id,cow_name,cow_startdate,cow_enddate
221,DRV,816,Vietnam,1954-07-21,2016-12-31


In [37]:
cc_gw = list(cc['gwn'].unique())
gw_not_in_cc = set(gw_gw) - set(cc_gw)
gw[gw['gw_id'].isin(list(gw_not_in_cc))]

Unnamed: 0,gw_id,abbr,gw_name,gw_startdate,gw_enddate
183,711,TBT,Tibet,1913-01-01,1950-10-01
203,816,DRV,"Vietnam, Democratic Republic of",1954-05-01,2017-12-31
10,396,ABK,Abkhazia,2008-08-26,2017-12-31
11,397,SOT,South Ossetia,2008-08-26,2017-12-31
12,403,STP,São Tomé and Principe,1975-07-12,2017-12-31


In [38]:
cc_p4 = list(cc['p4n'].unique())
pol_not_in_cc = set(pol_cow) - set(cc_p4)
polity[polity['ccode'].isin(list(pol_not_in_cc))]

Unnamed: 0,ccode,country,year
7913,348,Montenegro,2006
8557,364,USSR,1946
11606,525,South Sudan,2011
11614,529,Ethiopia,1993
15934,769,Pakistan,1947
16808,818,Vietnam,1976


In [39]:
cc_wb = list(cc['wb'].unique())
wb_not_in_cc = set(wb_iso) - set(cc_wb)
wb[wb['id'].isin(list(wb_not_in_cc))]

Unnamed: 0,id,iso2code,name,capitalcity,latitude,longitude,region_name,adminregion_name,incomelevel_value,lendingtype_value
0,ABW,AW,Aruba,Oranjestad,12.5167,-70.0167,Latin America & Caribbean,,High income,Not classified
11,ASM,AS,American Samoa,Pago Pago,-14.2846,-170.691,East Asia & Pacific,East Asia & Pacific (excluding high income),Upper middle income,Not classified
32,BMU,BM,Bermuda,Hamilton,32.3293,-64.706,North America,,High income,Not classified
47,CHI,JG,Channel Islands,,,,Europe & Central Asia,,High income,Not classified
63,CUW,CW,Curacao,Willemstad,,,Latin America & Caribbean,,High income,Not classified
64,CYM,KY,Cayman Islands,George Town,19.3022,-81.3857,Latin America & Caribbean,,High income,Not classified
101,FRO,FO,Faroe Islands,Torshavn,61.8926,-6.91181,Europe & Central Asia,,High income,Not classified
108,GIB,GI,Gibraltar,,,,Europe & Central Asia,,High income,Not classified
115,GRL,GL,Greenland,Nuuk,64.1836,-51.7214,Europe & Central Asia,,High income,Not classified
117,GUM,GU,Guam,Agana,13.4443,144.794,East Asia & Pacific,,High income,Not classified


## Things to Fix

- Vietnam
    - fill NA values for 'cown' with 816 
    - correct gwn by replacing 817 with 816 for Vietnam after 1976
    - fill single NA value for 'p4n' with 816 (in 1954)
    - change p4n for Vietnam after 1976 to 818
- Russia
    - change p4n for years before 1992 to 364
- Pakistan
    - change p4n for years before 1972 to 769
- Ethiopia
    - change p4n for years after 1993 to 529
- South Sudan
    - change p4n to 525 (626 is wrong)
    - 626 refers to North Sudan so...
- North Sudan
    - change p4n for Sudan (SDN) after 2010 to 626
- Somalia
    - give wb id of SOM
    - remove years before 1960
- Montenegro
    - drop rows with year < 2006
    - change p4n to 348 (341 is wrong)
    - 341 refers to Kosovo so...
- Kosovo
    - drop rows with year < 2008
    - change p4n to 341 (347 is wrong)
    - add iso code XKX
    - continent = Europe, region = Southern Europe
    - 347 refers to Yugoslavia so...
- Yugoslavia
    - change p4n to 347 after 1990
    - delete rows after 2006, Yugoslavia becomes Serbia
    - add wb id SRB
    - continent = Europe, region = Southern Europe
- Serbia
    - delete rows before 2006 (was Yugoslavia)
    - add CoW code 345


In [40]:
mask_vietnam = cc['iso3c']=="VNM"

cc.loc[mask_vietnam, 'cown'] = cc.loc[mask_vietnam, 'cown'].fillna(816)
cc.loc[mask_vietnam, 'gwn'] = cc.loc[mask_vietnam, 'cown']
cc.loc[mask_vietnam, 'p4n'] = cc.loc[mask_vietnam, 'p4n'].fillna(816)

mask_vietnamnew = (cc['cown']==816) & (cc['year'] >= 1976)

cc.loc[mask_vietnamnew, 'p4n'] = 818

cc[mask_vietnam]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
12072,Vietnam,1955,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12073,Vietnam,1956,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12074,Vietnam,1957,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12075,Vietnam,1958,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12076,Vietnam,1959,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12077,Vietnam,1960,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12078,Vietnam,1961,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12079,Vietnam,1962,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12080,Vietnam,1963,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12081,Vietnam,1964,Asia,South-Eastern Asia,817,817,VNM,817,VNM


In [41]:
mask_russia = cc['iso3c']=="RUS"
mask_ussr = (cc['iso3c']=="RUS") & (cc['year'] < 1992)
cc.loc[mask_ussr, 'p4n'] = 364
cc[mask_russia]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
9072,Russia,1992,Europe,Eastern Europe,365,365,RUS,365,RUS
9073,Russia,1993,Europe,Eastern Europe,365,365,RUS,365,RUS
9074,Russia,1994,Europe,Eastern Europe,365,365,RUS,365,RUS
9075,Russia,1995,Europe,Eastern Europe,365,365,RUS,365,RUS
9076,Russia,1996,Europe,Eastern Europe,365,365,RUS,365,RUS
9077,Russia,1997,Europe,Eastern Europe,365,365,RUS,365,RUS
9078,Russia,1998,Europe,Eastern Europe,365,365,RUS,365,RUS
9079,Russia,1999,Europe,Eastern Europe,365,365,RUS,365,RUS
9080,Russia,2000,Europe,Eastern Europe,365,365,RUS,365,RUS
9081,Russia,2001,Europe,Eastern Europe,365,365,RUS,365,RUS


In [42]:
mask_pakistan = cc['iso3c']=="PAK"
mask_pakistan2 = (cc['iso3c']=="PAK") & (cc['year'] < 1972)
cc.loc[mask_pakistan2, 'p4n'] = 769
cc[mask_pakistan]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
8318,Pakistan,1972,Asia,Southern Asia,770,770,PAK,770,PAK
8319,Pakistan,1973,Asia,Southern Asia,770,770,PAK,770,PAK
8320,Pakistan,1974,Asia,Southern Asia,770,770,PAK,770,PAK
8321,Pakistan,1975,Asia,Southern Asia,770,770,PAK,770,PAK
8322,Pakistan,1976,Asia,Southern Asia,770,770,PAK,770,PAK
8323,Pakistan,1977,Asia,Southern Asia,770,770,PAK,770,PAK
8324,Pakistan,1978,Asia,Southern Asia,770,770,PAK,770,PAK
8325,Pakistan,1979,Asia,Southern Asia,770,770,PAK,770,PAK
8326,Pakistan,1980,Asia,Southern Asia,770,770,PAK,770,PAK
8327,Pakistan,1981,Asia,Southern Asia,770,770,PAK,770,PAK


In [43]:
mask_ethiopia = cc['iso3c']=="ETH"
mask_ethiopia2 = (cc['iso3c']=="ETH") & (cc['year'] > 1993)
cc.loc[mask_ethiopia2, 'p4n'] = 529
cc[mask_ethiopia]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
3633,Ethiopia,1946,Africa,Eastern Africa,530,530,ETH,530,ETH
3634,Ethiopia,1947,Africa,Eastern Africa,530,530,ETH,530,ETH
3635,Ethiopia,1948,Africa,Eastern Africa,530,530,ETH,530,ETH
3636,Ethiopia,1949,Africa,Eastern Africa,530,530,ETH,530,ETH
3637,Ethiopia,1950,Africa,Eastern Africa,530,530,ETH,530,ETH
3638,Ethiopia,1951,Africa,Eastern Africa,530,530,ETH,530,ETH
3639,Ethiopia,1952,Africa,Eastern Africa,530,530,ETH,530,ETH
3640,Ethiopia,1953,Africa,Eastern Africa,530,530,ETH,530,ETH
3641,Ethiopia,1954,Africa,Eastern Africa,530,530,ETH,530,ETH
3642,Ethiopia,1955,Africa,Eastern Africa,530,530,ETH,530,ETH


In [44]:
mask_ssudan = cc['iso3c']=="SSD"
cc.loc[mask_ssudan, 'p4n'] = 525
cc[mask_ssudan]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
10118,South Sudan,2011,Africa,Northern Africa,626,626,SSD,525,SSD
10119,South Sudan,2012,Africa,Northern Africa,626,626,SSD,525,SSD
10120,South Sudan,2013,Africa,Northern Africa,626,626,SSD,525,SSD
10121,South Sudan,2014,Africa,Northern Africa,626,626,SSD,525,SSD
10122,South Sudan,2015,Africa,Northern Africa,626,626,SSD,525,SSD
10123,South Sudan,2016,Africa,Northern Africa,626,626,SSD,525,SSD
10124,South Sudan,2017,Africa,Northern Africa,626,626,SSD,525,SSD
10125,South Sudan,2018,Africa,Northern Africa,626,626,SSD,525,SSD


In [45]:
mask_sudan = cc['iso3c']=="SDN"
mask_sudan2 = (cc['iso3c']=="SDN") & (cc['year'] > 2010)
cc.loc[mask_sudan2, 'p4n'] = 626
cc[mask_sudan]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
10409,Sudan,2012,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10410,Sudan,2013,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10411,Sudan,2014,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10412,Sudan,2015,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10413,Sudan,2016,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10414,Sudan,2017,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10415,Sudan,2018,Africa,Northern Africa,625.0,625,SDN,626.0,SDN
10416,Sudan,1956,Africa,Northern Africa,625.0,625,SDN,625.0,SDN
10417,Sudan,1957,Africa,Northern Africa,625.0,625,SDN,625.0,SDN
10418,Sudan,1958,Africa,Northern Africa,625.0,625,SDN,625.0,SDN


In [46]:
mask_somalia = cc['iso3c']=="SOM"
cc.loc[mask_somalia, 'wb'] = "SOM"
cc = cc[~(mask_somalia & cc['cown'].isna())]
cc[mask_somalia]

  cc[mask_somalia]


Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
9899,Somalia,2008,Africa,Eastern Africa,520,520,SOM,520,SOM
9900,Somalia,2009,Africa,Eastern Africa,520,520,SOM,520,SOM
9901,Somalia,2010,Africa,Eastern Africa,520,520,SOM,520,SOM
9902,Somalia,2011,Africa,Eastern Africa,520,520,SOM,520,SOM
9903,Somalia,2012,Africa,Eastern Africa,520,520,SOM,520,SOM
9904,Somalia,2013,Africa,Eastern Africa,520,520,SOM,520,SOM
9905,Somalia,2014,Africa,Eastern Africa,520,520,SOM,520,SOM
9906,Somalia,2015,Africa,Eastern Africa,520,520,SOM,520,SOM
9907,Somalia,2016,Africa,Eastern Africa,520,520,SOM,520,SOM
9908,Somalia,2017,Africa,Eastern Africa,520,520,SOM,520,SOM


In [47]:
mask_montenegro = cc['iso3c']=="MNE"
cc = cc[~(mask_montenegro & cc['cown'].isna())]
cc.loc[mask_montenegro, 'p4n'] = 348
cc[mask_montenegro]

  cc[mask_montenegro]


Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
7328,Montenegro,2006,Europe,Southern Europe,341,341,MNE,348,MNE
7329,Montenegro,2007,Europe,Southern Europe,341,341,MNE,348,MNE
7330,Montenegro,2008,Europe,Southern Europe,341,341,MNE,348,MNE
7331,Montenegro,2009,Europe,Southern Europe,341,341,MNE,348,MNE
7332,Montenegro,2010,Europe,Southern Europe,341,341,MNE,348,MNE
7333,Montenegro,2011,Europe,Southern Europe,341,341,MNE,348,MNE
7334,Montenegro,2012,Europe,Southern Europe,341,341,MNE,348,MNE
7335,Montenegro,2013,Europe,Southern Europe,341,341,MNE,348,MNE
7336,Montenegro,2014,Europe,Southern Europe,341,341,MNE,348,MNE
7337,Montenegro,2015,Europe,Southern Europe,341,341,MNE,348,MNE


In [48]:
mask_kosovo = cc['country.name.en']=="Kosovo"
cc = cc[~(mask_kosovo & cc['cown'].isna())]
cc.loc[mask_kosovo, 'p4n'] = 341
cc.loc[mask_kosovo, 'iso3c'] = "XKX"
cc.loc[mask_kosovo, 'continent'] = "Europe"
cc.loc[mask_kosovo, 'region'] = "Southern Europe"
cc[mask_kosovo]

  cc[mask_kosovo]


Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
5835,Kosovo,2008,Europe,Southern Europe,347,347,XKX,341,XKX
5836,Kosovo,2009,Europe,Southern Europe,347,347,XKX,341,XKX
5837,Kosovo,2010,Europe,Southern Europe,347,347,XKX,341,XKX
5838,Kosovo,2011,Europe,Southern Europe,347,347,XKX,341,XKX
5839,Kosovo,2012,Europe,Southern Europe,347,347,XKX,341,XKX
5840,Kosovo,2013,Europe,Southern Europe,347,347,XKX,341,XKX
5841,Kosovo,2014,Europe,Southern Europe,347,347,XKX,341,XKX
5842,Kosovo,2015,Europe,Southern Europe,347,347,XKX,341,XKX
5843,Kosovo,2016,Europe,Southern Europe,347,347,XKX,341,XKX
5844,Kosovo,2017,Europe,Southern Europe,347,347,XKX,341,XKX


In [49]:
mask_yugoslavia = cc['country.name.en']=="Yugoslavia"

mask_yugoslavia2 = (cc['country.name.en']=="Yugoslavia") & (cc['year'] > 1990)
mask_yugoslavia3 = (cc['country.name.en']=="Yugoslavia") & (cc['year'] > 2007)

cc = cc[~(mask_yugoslavia3)]
cc.loc[mask_yugoslavia2, 'p4n'] = 347
cc.loc[mask_yugoslavia, 'iso3c'] = "YUG"
cc.loc[mask_yugoslavia, 'wb'] = "SRB"
cc.loc[mask_yugoslavia, 'continent'] = "Europe"
cc.loc[mask_yugoslavia, 'region'] = "Southern Europe"

cc[mask_yugoslavia]

  cc[mask_yugoslavia]


Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
12235,Yugoslavia,1992,Europe,Southern Europe,345,345,YUG,347,SRB
12236,Yugoslavia,1993,Europe,Southern Europe,345,345,YUG,347,SRB
12237,Yugoslavia,1994,Europe,Southern Europe,345,345,YUG,347,SRB
12238,Yugoslavia,1995,Europe,Southern Europe,345,345,YUG,347,SRB
12239,Yugoslavia,1996,Europe,Southern Europe,345,345,YUG,347,SRB
12240,Yugoslavia,1997,Europe,Southern Europe,345,345,YUG,347,SRB
12241,Yugoslavia,1998,Europe,Southern Europe,345,345,YUG,347,SRB
12242,Yugoslavia,1999,Europe,Southern Europe,345,345,YUG,347,SRB
12243,Yugoslavia,2000,Europe,Southern Europe,345,345,YUG,347,SRB
12244,Yugoslavia,2001,Europe,Southern Europe,345,345,YUG,347,SRB


In [50]:
mask_serbia = cc['country.name.en']=="Serbia"
mask_serbia_drop = (cc['country.name.en']=="Serbia") & (cc['year'] < 2006)

cc = cc[~(mask_serbia_drop)]
cc.loc[mask_serbia, 'cown'] = 345

cc[mask_serbia]

  cc[mask_serbia]


Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb
9478,Serbia,2010,Europe,Southern Europe,345,340,SRB,342,SRB
9479,Serbia,2011,Europe,Southern Europe,345,340,SRB,342,SRB
9480,Serbia,2012,Europe,Southern Europe,345,340,SRB,342,SRB
9481,Serbia,2013,Europe,Southern Europe,345,340,SRB,342,SRB
9482,Serbia,2014,Europe,Southern Europe,345,340,SRB,342,SRB
9483,Serbia,2015,Europe,Southern Europe,345,340,SRB,342,SRB
9484,Serbia,2016,Europe,Southern Europe,345,340,SRB,342,SRB
9485,Serbia,2017,Europe,Southern Europe,345,340,SRB,342,SRB
9486,Serbia,2018,Europe,Southern Europe,345,340,SRB,342,SRB
9504,Serbia,2006,Europe,Southern Europe,345,340,SRB,342,SRB


## Compare to base G&W time series

note: countrycode isn't filling in the G&W codes correctly - filling in values where country doesn't yet exist

In [51]:
gw['StartDate'] = gw['gw_startdate'].apply(lambda dt: dt.replace(day=1, month=1))
gw['EndDate'] = gw['gw_enddate'].apply(lambda dt: dt.replace(day=1, month=1))


gw_ts = pd.concat([pd.DataFrame({'year': pd.date_range(row.StartDate, row.EndDate, freq='YS'),
                                 'gwn': row.gw_id}, 
                                 columns=['year', 'gwn']) 
                                 for i, row in gw.iterrows()], ignore_index=True)
gw_ts['year'] = gw_ts['year'].dt.year
gw_ts = gw_ts[gw_ts['year'] > 1945].reset_index(drop=True)
gw_ts['in_gw'] = 1
gw_ts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gw['StartDate'] = gw['gw_startdate'].apply(lambda dt: dt.replace(day=1, month=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gw['EndDate'] = gw['gw_enddate'].apply(lambda dt: dt.replace(day=1, month=1))


Unnamed: 0,year,gwn,in_gw
0,1946,2,1
1,1947,2,1
2,1948,2,1
3,1949,2,1
4,1950,2,1
...,...,...,...
11123,2013,990,1
11124,2014,990,1
11125,2015,990,1
11126,2016,990,1


In [52]:
gw_ts['gwn'] = gw_ts['gwn'].astype('Int64')
cc_check_gw = cc.merge(gw_ts, on=['year', 'gwn'], how='outer')
cc_check_gw

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb,in_gw
0,Afghanistan,1946,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
1,Afghanistan,1947,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
2,Afghanistan,1948,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
3,Afghanistan,1949,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
4,Afghanistan,1950,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
...,...,...,...,...,...,...,...,...,...,...
12847,,1971,,,,990,,,,1.0
12848,,1972,,,,990,,,,1.0
12849,,1973,,,,990,,,,1.0
12850,,1974,,,,990,,,,1.0


find countries that exist in G&W but not in countrycode

Yemen & Vietnam are the only ones that need G&W codes added; rest are microstates

In [53]:
cc_check_gw[cc_check_gw['country.name.en'].isna()]['gwn'].unique()

array([678, 711, 816, 221, 223, 331, 232, 396, 397, 403, 970, 971, 972,
       973, 983, 987, 990], dtype=object)

#### Need to fix Yemen

- Where cown is 679, fill gwn NAs with 678

In [54]:
cc_check_gw[cc_check_gw['gwn']==678]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb,in_gw
12075,Yemen Arab Republic,1946,,,678.0,678,,678.0,,1.0
12076,Yemen Arab Republic,1947,,,678.0,678,,678.0,,1.0
12077,Yemen Arab Republic,1948,,,678.0,678,,678.0,,1.0
12078,Yemen Arab Republic,1949,,,678.0,678,,678.0,,1.0
12079,Yemen Arab Republic,1950,,,678.0,678,,678.0,,1.0
12080,Yemen Arab Republic,1951,,,678.0,678,,678.0,,1.0
12081,Yemen Arab Republic,1952,,,678.0,678,,678.0,,1.0
12082,Yemen Arab Republic,1953,,,678.0,678,,678.0,,1.0
12083,Yemen Arab Republic,1954,,,678.0,678,,678.0,,1.0
12084,Yemen Arab Republic,1955,,,678.0,678,,678.0,,1.0


In [55]:
mask_yemen = cc_check_gw['cown']==679

cc_check_gw.loc[mask_yemen, 'gwn'] = cc_check_gw.loc[mask_yemen, 'gwn'].fillna(678)

#### can't fix Vietnam

for some reason countrycode doesn't count North Vietnam, and this isn't something I can readily fix without potentially messing up other merges.

In [56]:
cc_check_gw[cc_check_gw['gwn']==816]

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb,in_gw
12032,Vietnam,2018,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,
12033,Vietnam,2010,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12034,Vietnam,2011,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12035,Vietnam,2016,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12036,Vietnam,2017,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12037,Vietnam,1978,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12038,Vietnam,1979,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12039,Vietnam,1980,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12040,Vietnam,1981,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0
12041,Vietnam,1982,Asia,South-Eastern Asia,816.0,816,VNM,818.0,VNM,1.0


find countries that exist in countrycode but not in G&W

In [57]:
mask_valid_gw = (cc_check_gw['in_gw'].isna()) & (cc_check_gw['year'] < 2018)
cc_check_gw[mask_valid_gw]['gwn'].unique()

array([615, 540, 371, 373, 692, 53, 370, 434, 760, 571, 439, 516, 811,
       402, 482, 483, 581, 484, 490, 437, 344, 352, 316, 522, 54, 265,
       411, 531, 366, 950, 481, 420, 372, 260, 452, 438, 404, 110, 750,
       51, 705, 501, 690, 703, 812, 367, 570, 368, 580, 553, 820, 781,
       432, 435, 590, 359, 600, 541, 775, 565, 436, 475, 731, 910, 694,
       517, nan, 678, 433, 591, 451, 830, 349, 940, 732, 680, 780, 625,
       115, 572, 713, 702, 510, 860, 461, 52, 616, 701, 500, 369, 704,
       935, 345, 551, 511, 552], dtype=object)

In [58]:
cc_check_gw[mask_valid_gw].head(100)

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb,in_gw
203,Algeria,1956,Africa,Northern Africa,,615,DZA,,DZA,
204,Algeria,1957,Africa,Northern Africa,,615,DZA,,DZA,
205,Algeria,1958,Africa,Northern Africa,,615,DZA,,DZA,
206,Algeria,1959,Africa,Northern Africa,,615,DZA,,DZA,
207,Algeria,1960,Africa,Northern Africa,,615,DZA,,DZA,
208,Algeria,1961,Africa,Northern Africa,,615,DZA,,DZA,
209,Algeria,1946,Africa,Northern Africa,,615,DZA,,DZA,
210,Algeria,1947,Africa,Northern Africa,,615,DZA,,DZA,
211,Algeria,1948,Africa,Northern Africa,,615,DZA,,DZA,
212,Algeria,1949,Africa,Northern Africa,,615,DZA,,DZA,


removing rows where both CoW ID is NA and G&W ID should have been NA

In [59]:
cc_check_gw = cc_check_gw[~((cc_check_gw['in_gw'].isna()) & (cc_check_gw['cown'].isna()))]
cc_check_gw

Unnamed: 0,country.name.en,year,continent,region,cown,gwn,iso3c,p4n,wb,in_gw
0,Afghanistan,1946,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
1,Afghanistan,1947,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
2,Afghanistan,1948,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
3,Afghanistan,1949,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
4,Afghanistan,1950,Asia,Southern Asia,700,700,AFG,700,AFG,1.0
...,...,...,...,...,...,...,...,...,...,...
12847,,1971,,,,990,,,,1.0
12848,,1972,,,,990,,,,1.0
12849,,1973,,,,990,,,,1.0
12850,,1974,,,,990,,,,1.0


drop rows that were added by the GW merge

In [60]:
cc_clean = cc_check_gw.dropna(subset=['country.name.en'])
cc_clean = cc_clean.sort_values(by=['country.name.en', 'year']).reset_index(drop=True).drop(columns=['in_gw'])
cc_clean = cc_clean.rename(columns={'country.name.en': 'country_name', 'cown': 'cow_id', 'gwn': 'gw_id', 'iso3c': 'iso3alpha', 'p4n': 'p4_id', 'wb': 'wb_id'})
cc_clean

Unnamed: 0,country_name,year,continent,region,cow_id,gw_id,iso3alpha,p4_id,wb_id
0,Afghanistan,1946,Asia,Southern Asia,700,700,AFG,700,AFG
1,Afghanistan,1947,Asia,Southern Asia,700,700,AFG,700,AFG
2,Afghanistan,1948,Asia,Southern Asia,700,700,AFG,700,AFG
3,Afghanistan,1949,Asia,Southern Asia,700,700,AFG,700,AFG
4,Afghanistan,1950,Asia,Southern Asia,700,700,AFG,700,AFG
...,...,...,...,...,...,...,...,...,...
10964,Zimbabwe,2014,Africa,Eastern Africa,552,552,ZWE,552,ZWE
10965,Zimbabwe,2015,Africa,Eastern Africa,552,552,ZWE,552,ZWE
10966,Zimbabwe,2016,Africa,Eastern Africa,552,552,ZWE,552,ZWE
10967,Zimbabwe,2017,Africa,Eastern Africa,552,552,ZWE,552,ZWE


### Compare to CoW base time series

In [None]:
cow_ts = cow_ts.rename(columns={'ccode':'cow_id'})
cow_ts = cow_ts[cow_ts['year']>1945]
cow_ts['inCoW'] = 1
cc_check_cow = cc_clean.merge(cow_ts, on=['cow_id', 'year'], how='outer')
cc_check_cow

BOOKMARK HERE

In [61]:
cc_clean.duplicated(subset=['year', 'country_name']).sum()

0

'year' + 'country_name' is the primary key

In [62]:
cc_clean.to_csv("../Data/FINAL/countrycodes_ts-base.csv", index=False, encoding='utf-8')