# Clean countrycode's country-year panel

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read every input table used below; all paths are relative to this notebook.
# CShapes: the value -1 is parsed as missing.
cshapes = pd.read_csv(
    "../Data/CShapes/Raw/country_shapes.csv",
    na_values=[-1],
)
# GW state lists: tab-separated, no header row, latin-1 encoded text.
gw_states = pd.read_csv(
    "../Data/GW/Raw/gw_codes.dat",
    sep="\t",
    header=None,
    names=['gw_id', 'abbr', 'name', 'startdate', 'enddate'],
    encoding="latin-1",
)
gw_micro = pd.read_csv(
    "../Data/GW/Raw/microstatessystem.dat",
    sep="\t",
    header=None,
    names=['gw_id', 'abbr', 'name', 'startdate', 'enddate'],
    encoding="latin-1",
)
ucdp = pd.read_csv("../Data/UCDP_PRIO/Wrangled/participants_gw.csv")
# CoW: state list plus the country-year system membership table.
cow = pd.read_csv("../Data/CoW/Raw/states2016.csv")
cow_ts = pd.read_csv(
    "../Data/CoW/Raw/system2016.csv",
    usecols=['ccode', 'year'],
)
polity = pd.read_csv(
    "../Data/PolityIV/p4v2018.csv",
    usecols=['ccode', 'country', 'year'],
)
wb = pd.read_csv(
    "../Data/WorldBank/Raw_API/countries_list.csv",
    usecols=['id', 'iso2code', 'name', 'region_name'],
)
wb_ts1 = pd.read_csv(
    "../Data/FINAL/wdi_top25.csv",
    usecols=['country', 'value', 'year'],
)
# countrycode panel: nullable Int64 keeps the numeric codes integral
# even where they are missing.
cc = pd.read_csv(
    "../Data/countrycode/country_conversion_table.csv",
    dtype={'cown': 'Int64', 'gwn': 'Int64', 'p4n': 'Int64'},
)

### Unique country codes for all datasets

Get lists of unique values for each dataset. This is replicating code from Explore_State_Identifiers notebook

In [3]:
# cshapes: one set of distinct codes per identifier column
cshapes_isocodes, cshapes_cowcodes, cshapes_gwcodes = (
    set(cshapes[col].unique()) for col in ('iso_alpha3', 'cow_code', 'gw_code')
)

# gw: stack states and microstates, parse the dd:mm:YYYY dates, and keep
# only entries whose end date falls after 1945
gw = pd.concat([gw_states, gw_micro]).rename(
    columns={'name': 'gw_name', 'startdate': 'gw_startdate', 'enddate': 'gw_enddate'}
)
gw = gw.assign(
    gw_startdate=pd.to_datetime(gw['gw_startdate'], format = '%d:%m:%Y'),
    gw_enddate=pd.to_datetime(gw['gw_enddate'], format = '%d:%m:%Y'),
)
gw = gw.loc[gw['gw_enddate'] > pd.to_datetime('1945-12-31')]
gw_gwcodes = set(gw['gw_id'].unique())

# ucdp
ucdp_gwcodes = set(ucdp['gw_id'].unique())

# cow: assemble start/end dates from their year/month/day parts (YYYYMMDD),
# discard the raw parts, harmonise column names, keep post-1945 entries
cow = (
    cow.assign(
        cow_startdate=pd.to_datetime(
            cow.styear*10000 + cow.stmonth*100 + cow.stday, format='%Y%m%d'
        ),
        cow_enddate=pd.to_datetime(
            cow.endyear*10000 + cow.endmonth*100 + cow.endday, format='%Y%m%d'
        ),
    )
    .drop(columns=['styear', 'stmonth', 'stday', 'endyear', 'endmonth', 'endday', 'version'])
    .rename(columns={'stateabb': 'ISO_alpha3', 'ccode': 'cow_id', 'statenme': 'cow_name'})
)
cow = cow.loc[cow['cow_enddate'] > pd.to_datetime('1945-12-31')]
cow_cowcodes = set(cow['cow_id'].unique())

# polity: post-1945 country-years only
polity = polity.loc[polity['year'] > 1945]
pol_p4codes = set(polity['ccode'].unique())

# wb: leave out the aggregate (non-country) entries
wb = wb.loc[wb['region_name'] != 'Aggregates']
wb_wbcodes = set(wb['id'].unique())

# cc: the codes the conversion table currently knows, per scheme
cc_cowcodes, cc_gwcodes, cc_p4codes, cc_wbcodes = (
    set(cc[col].unique()) for col in ('cown', 'gwn', 'p4n', 'wb')
)

### Find discrepancies - when cc is missing a code

cc is the original base time-series. However, there are some problems...

Let's investigate what codes should be in cc but are missing.

In [4]:
# Use the short column name 'country' for the English country name
cc = cc.rename({'country.name.en': 'country'}, axis='columns')
cc

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
0,Afghanistan,1946,Asia,Southern Asia,700,700,AFG,700,AFG
1,Afghanistan,1947,Asia,Southern Asia,700,700,AFG,700,AFG
2,Afghanistan,1948,Asia,Southern Asia,700,700,AFG,700,AFG
3,Afghanistan,1949,Asia,Southern Asia,700,700,AFG,700,AFG
4,Afghanistan,1950,Asia,Southern Asia,700,700,AFG,700,AFG
...,...,...,...,...,...,...,...,...,...
12522,Zimbabwe,1960,Africa,Eastern Africa,,552,ZWE,,ZWE
12523,Zimbabwe,1961,Africa,Eastern Africa,,552,ZWE,,ZWE
12524,Zimbabwe,1962,Africa,Eastern Africa,,552,ZWE,,ZWE
12525,Zimbabwe,1963,Africa,Eastern Africa,,552,ZWE,,ZWE


cc is missing 816, aka Vietnam, from 'cown'. Note: there are two "Vietnam"s in CoW, the other is the Republic of Vietnam (aka South Vietnam), which ceased to exist in 1975

In [5]:
# CoW codes present in the CoW state list but absent from cc
missing_cow_codes = cow_cowcodes.difference(cc_cowcodes)
cow.loc[cow['cow_id'].isin(missing_cow_codes)]

Unnamed: 0,ISO_alpha3,cow_id,cow_name,cow_startdate,cow_enddate
221,DRV,816,Vietnam,1954-07-21,2016-12-31


cc is missing 816, aka Vietnam, from 'gwn' - same problem as with CoW. 

Abkhazia, South Ossetia, and São Tomé and Principe are microstates and not a concern because they are not tracked. Tibet is a bit more complicated - but since it is not recorded as a state for CoW or Polity, we can leave it out of the dataset.

In [6]:
# GW codes present in the GW lists but absent from cc
missing_gw_codes = gw_gwcodes.difference(cc_gwcodes)
gw.loc[gw['gw_id'].isin(missing_gw_codes)]

Unnamed: 0,gw_id,abbr,gw_name,gw_startdate,gw_enddate
183,711,TBT,Tibet,1913-01-01,1950-10-01
203,816,DRV,"Vietnam, Democratic Republic of",1954-05-01,2017-12-31
10,396,ABK,Abkhazia,2008-08-26,2017-12-31
11,397,SOT,South Ossetia,2008-08-26,2017-12-31
12,403,STP,São Tomé and Principe,1975-07-12,2017-12-31


cc is missing several p4 codes. This is a complex problem because they cascade - fixing one of these codes leads to discovering other missing codes.

In [7]:
# Polity codes absent from cc; show the first row encountered for each code
missing_p4_codes = pol_p4codes.difference(cc_p4codes)
polity.loc[polity['ccode'].isin(missing_p4_codes)].drop_duplicates(subset=['ccode'])

Unnamed: 0,ccode,country,year
7913,348,Montenegro,2006
8557,364,USSR,1946
11606,525,South Sudan,2011
11614,529,Ethiopia,1993
15934,769,Pakistan,1947
16808,818,Vietnam,1976


Most of the missing WB codes are microstates or not sovereign, and therefore not a concern. 

Somalia's wb code needs to be added to cc

In [8]:
# World Bank ids (aggregates already excluded) that cc does not contain
missing_wb_codes = wb_wbcodes.difference(cc_wbcodes)
wb.loc[wb['id'].isin(missing_wb_codes)]

Unnamed: 0,id,iso2code,name,region_name
0,ABW,AW,Aruba,Latin America & Caribbean
11,ASM,AS,American Samoa,East Asia & Pacific
32,BMU,BM,Bermuda,North America
47,CHI,JG,Channel Islands,Europe & Central Asia
63,CUW,CW,Curacao,Latin America & Caribbean
64,CYM,KY,Cayman Islands,Latin America & Caribbean
101,FRO,FO,Faroe Islands,Europe & Central Asia
108,GIB,GI,Gibraltar,Europe & Central Asia
115,GRL,GL,Greenland,Europe & Central Asia
117,GUM,GU,Guam,East Asia & Pacific


### The question of identifiers

- Can a country have multiple iso3c codes? **NO**
- Do any countries have no iso3c codes? **YES**

In [9]:
# Countries whose number of distinct iso3c codes is anything other than one
(
    cc.groupby('country')[['iso3c']]
    .nunique()
    .loc[lambda counts: counts['iso3c'] != 1]
)

Unnamed: 0_level_0,iso3c
country,Unnamed: 1_level_1
Czechoslovakia,0
East Germany,0
German Democratic Republic,0
Kosovo,0
South Yemen,0
Yemen Arab Republic,0
Yemen People's Republic,0
Yugoslavia,0
Zanzibar,0


## The first repair: fix cc based on identified missing codes

What needs to be done?

### Things to Fix

- Vietnam
    - fill NA values for 'cown' with 816 
    - correct gwn by replacing 817 with 816 for Vietnam after 1976
    - fill single NA value for 'p4n' with 816 (in 1954)
    - change p4n for Vietnam after 1976 to 818
- Russia
    - change p4n for years before 1992 to 364
    - note: There is a retired ISO code for the USSR (SUN). Can optionally revise 'iso3c' to reflect this.
- Pakistan
    - change p4n for years before 1972 to 769
- Ethiopia
    - change p4n for years after 1993 to 529
- South Sudan
    - change p4n to 525 (626 is wrong)
    - 626 refers to North Sudan so...
- (North) Sudan
    - change p4n for Sudan (SDN) after 2010 to 626
- Somalia
    - give wb id of SOM
    - remove years before 1960
- Montenegro
    - drop rows with year < 2006
    - change p4n to 348 (341 is wrong)
    - 341 refers to Kosovo so...

#### These codes are not missing initially, but cascade from the erroneous coding of Montenegro

- Kosovo
    - drop rows with year < 2008
    - change p4n to 341 (347 is wrong)
    - add iso code XKX
    - continent = Europe, region = Southern Europe
    - 347 refers to Yugoslavia so...
- Yugoslavia
    - change p4n to 347 after 1990
    - delete rows after 2006, Yugoslavia becomes Serbia
    - add wb id SRB
    - continent = Europe, region = Southern Europe
- Serbia
    - delete rows before 2006 (was Yugoslavia)
    - add CoW code 345


- [x] fill NA values for 'cown' with 816 
- [x] correct gwn by replacing 817 with 816 for Vietnam after 1976
- [x] fill single NA value for 'p4n' with 816 (in 1954)
- [x] change p4n for Vietnam after 1976 to 818

In [10]:
# All rows of cc's single "Vietnam" entry
mask_vietnam = cc['country'].eq("Vietnam")

# Missing Polity code -> 816 (independent of the CoW/GW repair below)
cc.loc[mask_vietnam, 'p4n'] = cc.loc[mask_vietnam, 'p4n'].fillna(816)
# Missing CoW code -> 816, then copy the repaired CoW code into the GW column
cc.loc[mask_vietnam, 'cown'] = cc.loc[mask_vietnam, 'cown'].fillna(816)
cc.loc[mask_vietnam, 'gwn'] = cc.loc[mask_vietnam, 'cown']

# Rows coded 816 from 1976 onward take Polity code 818
mask_vietnamnew = cc['cown'].eq(816) & cc['year'].ge(1976)
cc.loc[mask_vietnamnew, 'p4n'] = 818

with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_vietnam].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
12093,Vietnam,1954,Asia,South-Eastern Asia,817,817,VNM,816,VNM
12072,Vietnam,1955,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12073,Vietnam,1956,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12074,Vietnam,1957,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12075,Vietnam,1958,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12076,Vietnam,1959,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12077,Vietnam,1960,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12078,Vietnam,1961,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12079,Vietnam,1962,Asia,South-Eastern Asia,817,817,VNM,817,VNM
12080,Vietnam,1963,Asia,South-Eastern Asia,817,817,VNM,817,VNM


- [x] change p4n for years before 1992 to 364
- note: There is a retired ISO code for the USSR (SUN). Can optionally revise 'iso3c' to reflect this.

In [11]:
# Rows labelled "Russia"; those before 1992 are recoded as the USSR
mask_russia = cc['country'].eq("Russia")
mask_ussr = mask_russia & cc['year'].lt(1992)
cc.loc[mask_ussr, 'iso3c'] = "SUN"   # retired ISO code for the USSR
cc.loc[mask_ussr, 'p4n'] = 364       # Polity code listed for the USSR
with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_russia].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
9099,Russia,1946,Europe,Eastern Europe,365,365,SUN,364,RUS
9100,Russia,1947,Europe,Eastern Europe,365,365,SUN,364,RUS
9101,Russia,1948,Europe,Eastern Europe,365,365,SUN,364,RUS
9102,Russia,1949,Europe,Eastern Europe,365,365,SUN,364,RUS
9103,Russia,1950,Europe,Eastern Europe,365,365,SUN,364,RUS
9104,Russia,1951,Europe,Eastern Europe,365,365,SUN,364,RUS
9105,Russia,1952,Europe,Eastern Europe,365,365,SUN,364,RUS
9106,Russia,1953,Europe,Eastern Europe,365,365,SUN,364,RUS
9107,Russia,1954,Europe,Eastern Europe,365,365,SUN,364,RUS
9108,Russia,1955,Europe,Eastern Europe,365,365,SUN,364,RUS


- [x] change p4n for years before 1972 to 769

In [12]:
# Pakistan rows before 1972 take Polity code 769
mask_pakistan = cc['country'].eq("Pakistan")
mask_pakistan2 = mask_pakistan & cc['year'].lt(1972)
cc.loc[mask_pakistan2, 'p4n'] = 769
with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_pakistan].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
8365,Pakistan,1947,Asia,Southern Asia,770,770,PAK,769,PAK
8366,Pakistan,1948,Asia,Southern Asia,770,770,PAK,769,PAK
8367,Pakistan,1949,Asia,Southern Asia,770,770,PAK,769,PAK
8368,Pakistan,1950,Asia,Southern Asia,770,770,PAK,769,PAK
8369,Pakistan,1951,Asia,Southern Asia,770,770,PAK,769,PAK
8370,Pakistan,1952,Asia,Southern Asia,770,770,PAK,769,PAK
8371,Pakistan,1953,Asia,Southern Asia,770,770,PAK,769,PAK
8372,Pakistan,1954,Asia,Southern Asia,770,770,PAK,769,PAK
8373,Pakistan,1955,Asia,Southern Asia,770,770,PAK,769,PAK
8374,Pakistan,1956,Asia,Southern Asia,770,770,PAK,769,PAK


- [x] change p4n for years after 1993 to 529

In [13]:
# Ethiopia rows after 1993 take Polity code 529
mask_ethiopia = cc['country'].eq("Ethiopia")
mask_ethiopia2 = mask_ethiopia & cc['year'].gt(1993)
cc.loc[mask_ethiopia2, 'p4n'] = 529
with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_ethiopia].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
3633,Ethiopia,1946,Africa,Eastern Africa,530,530,ETH,530,ETH
3634,Ethiopia,1947,Africa,Eastern Africa,530,530,ETH,530,ETH
3635,Ethiopia,1948,Africa,Eastern Africa,530,530,ETH,530,ETH
3636,Ethiopia,1949,Africa,Eastern Africa,530,530,ETH,530,ETH
3637,Ethiopia,1950,Africa,Eastern Africa,530,530,ETH,530,ETH
3638,Ethiopia,1951,Africa,Eastern Africa,530,530,ETH,530,ETH
3639,Ethiopia,1952,Africa,Eastern Africa,530,530,ETH,530,ETH
3640,Ethiopia,1953,Africa,Eastern Africa,530,530,ETH,530,ETH
3641,Ethiopia,1954,Africa,Eastern Africa,530,530,ETH,530,ETH
3642,Ethiopia,1955,Africa,Eastern Africa,530,530,ETH,530,ETH


- [x] change p4n to 525 (626 is wrong)

In [14]:
# South Sudan: every row takes Polity code 525
mask_ssudan = cc['country'].eq("South Sudan")
cc.loc[mask_ssudan, 'p4n'] = 525
with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_ssudan].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
10118,South Sudan,2011,Africa,Northern Africa,626,626,SSD,525,SSD
10119,South Sudan,2012,Africa,Northern Africa,626,626,SSD,525,SSD
10120,South Sudan,2013,Africa,Northern Africa,626,626,SSD,525,SSD
10121,South Sudan,2014,Africa,Northern Africa,626,626,SSD,525,SSD
10122,South Sudan,2015,Africa,Northern Africa,626,626,SSD,525,SSD
10123,South Sudan,2016,Africa,Northern Africa,626,626,SSD,525,SSD
10124,South Sudan,2017,Africa,Northern Africa,626,626,SSD,525,SSD
10125,South Sudan,2018,Africa,Northern Africa,626,626,SSD,525,SSD


- [x] change p4n for Sudan (SDN) after 2010 to 626

In [15]:
# Sudan rows after 2010 take Polity code 626
mask_sudan = cc['country'].eq("Sudan")
mask_sudan2 = mask_sudan & cc['year'].gt(2010)
cc.loc[mask_sudan2, 'p4n'] = 626
with pd.option_context("display.max_rows", 100):
    display(cc.loc[mask_sudan].sort_values('year'))

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
10472,Sudan,1946,Africa,Northern Africa,,625,SDN,,SDN
10473,Sudan,1947,Africa,Northern Africa,,625,SDN,,SDN
10474,Sudan,1948,Africa,Northern Africa,,625,SDN,,SDN
10475,Sudan,1949,Africa,Northern Africa,,625,SDN,,SDN
10476,Sudan,1950,Africa,Northern Africa,,625,SDN,,SDN
10477,Sudan,1951,Africa,Northern Africa,,625,SDN,,SDN
10478,Sudan,1952,Africa,Northern Africa,,625,SDN,,SDN
10479,Sudan,1953,Africa,Northern Africa,,625,SDN,,SDN
10480,Sudan,1954,Africa,Northern Africa,,625,SDN,,SDN
10481,Sudan,1955,Africa,Northern Africa,,625,SDN,,SDN


- [x] give wb id of SOM
- [x] remove years before 1960

In [16]:
# Somalia: assign the World Bank id, then discard the rows before 1960.
cc.loc[cc['country'] == "Somalia", 'wb'] = "SOM"
mask_somalia_drop = (cc['country'] == "Somalia") & (cc['year'] < 1960)
cc = cc.drop(cc[mask_somalia_drop].index)
# Build the display mask only after the drop: a mask created on the
# pre-drop frame no longer shares cc's index, so pandas has to reindex it
# and emits "Boolean Series key will be reindexed to match DataFrame index".
mask_somalia = cc['country'] == "Somalia"
with pd.option_context("display.max_rows", 100):
    display(cc[mask_somalia].sort_values(by='year'))

  display(cc[mask_somalia].sort_values(by='year'))


Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
9910,Somalia,1960,Africa,Eastern Africa,520,520,SOM,520,SOM
9911,Somalia,1961,Africa,Eastern Africa,520,520,SOM,520,SOM
9912,Somalia,1962,Africa,Eastern Africa,520,520,SOM,520,SOM
9913,Somalia,1963,Africa,Eastern Africa,520,520,SOM,520,SOM
9914,Somalia,1964,Africa,Eastern Africa,520,520,SOM,520,SOM
9915,Somalia,1965,Africa,Eastern Africa,520,520,SOM,520,SOM
9916,Somalia,1966,Africa,Eastern Africa,520,520,SOM,520,SOM
9917,Somalia,1967,Africa,Eastern Africa,520,520,SOM,520,SOM
9918,Somalia,1968,Africa,Eastern Africa,520,520,SOM,520,SOM
9919,Somalia,1969,Africa,Eastern Africa,520,520,SOM,520,SOM


- [x] drop rows with year < 2006
- [x] change p4n to 348 (341 is wrong)

In [17]:
# Montenegro: drop the rows before 2006, then set Polity code 348.
mask_montenegro_drop = (cc['country'] == "Montenegro") & (cc['year'] < 2006)
cc = cc.drop(cc[mask_montenegro_drop].index)
# Build the mask only after the drop so that it shares cc's current index;
# a pre-drop mask forces pandas to reindex it and triggers a UserWarning.
mask_montenegro = cc['country'] == "Montenegro"
cc.loc[mask_montenegro, 'p4n'] = 348
with pd.option_context("display.max_rows", 100):
    display(cc[mask_montenegro].sort_values(by='year'))

  display(cc[mask_montenegro].sort_values(by='year'))


Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
7328,Montenegro,2006,Europe,Southern Europe,341,341,MNE,348,MNE
7329,Montenegro,2007,Europe,Southern Europe,341,341,MNE,348,MNE
7330,Montenegro,2008,Europe,Southern Europe,341,341,MNE,348,MNE
7331,Montenegro,2009,Europe,Southern Europe,341,341,MNE,348,MNE
7332,Montenegro,2010,Europe,Southern Europe,341,341,MNE,348,MNE
7333,Montenegro,2011,Europe,Southern Europe,341,341,MNE,348,MNE
7334,Montenegro,2012,Europe,Southern Europe,341,341,MNE,348,MNE
7335,Montenegro,2013,Europe,Southern Europe,341,341,MNE,348,MNE
7336,Montenegro,2014,Europe,Southern Europe,341,341,MNE,348,MNE
7337,Montenegro,2015,Europe,Southern Europe,341,341,MNE,348,MNE


- [x] drop rows with year < 2008
- [x] change p4n to 341 (347 is wrong)
- [x] add iso code XKX
- [x] continent = Europe, region = Southern Europe

In [18]:
# Kosovo: keep 2008 onward only, then repair its identifiers.
mask_kosovo_drop = (cc['country'] == "Kosovo") & (cc['year'] < 2008)
cc = cc.drop(cc[mask_kosovo_drop].index)
# Build the mask only after the drop. The assignments and the display below
# then use a mask aligned with cc's current index instead of a stale one
# that still carries the labels of the rows just removed.
mask_kosovo = cc['country'] == "Kosovo"
cc.loc[mask_kosovo, 'p4n'] = 341
cc.loc[mask_kosovo, 'iso3c'] = "XKX"
cc.loc[mask_kosovo, 'continent'] = "Europe"
cc.loc[mask_kosovo, 'region'] = "Southern Europe"
with pd.option_context("display.max_rows", 100):
    display(cc[mask_kosovo].sort_values(by='year'))

  display(cc[mask_kosovo].sort_values(by='year'))


Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
5835,Kosovo,2008,Europe,Southern Europe,347,347,XKX,341,XKX
5836,Kosovo,2009,Europe,Southern Europe,347,347,XKX,341,XKX
5837,Kosovo,2010,Europe,Southern Europe,347,347,XKX,341,XKX
5838,Kosovo,2011,Europe,Southern Europe,347,347,XKX,341,XKX
5839,Kosovo,2012,Europe,Southern Europe,347,347,XKX,341,XKX
5840,Kosovo,2013,Europe,Southern Europe,347,347,XKX,341,XKX
5841,Kosovo,2014,Europe,Southern Europe,347,347,XKX,341,XKX
5842,Kosovo,2015,Europe,Southern Europe,347,347,XKX,341,XKX
5843,Kosovo,2016,Europe,Southern Europe,347,347,XKX,341,XKX
5844,Kosovo,2017,Europe,Southern Europe,347,347,XKX,341,XKX


- [x] change p4n to 347 after 1990
- [x] delete rows after 2006, Yugoslavia becomes Serbia
- [x] add wb id SRB
- [x] continent = Europe, region = Southern Europe

In [19]:
# Yugoslavia: remove the rows after 2006 first.
mask_yugoslavia_drop = (cc['country'] == "Yugoslavia") & (cc['year'] > 2006)
cc = cc.drop(cc[mask_yugoslavia_drop].index)

# Both masks are built after the drop so they are aligned with cc's current
# index (a pre-drop mask would have to be reindexed by pandas, with a warning).
mask_yugoslavia = cc['country'] == "Yugoslavia"
mask_yugoslavia2 = mask_yugoslavia & (cc['year'] > 1990)

# Polity code 347 applies to the rows after 1990
cc.loc[mask_yugoslavia2, 'p4n'] = 347

# Identifiers and geography for every remaining Yugoslavia row
cc.loc[mask_yugoslavia, 'iso3c'] = "YUG"
cc.loc[mask_yugoslavia, 'wb'] = "SRB"
cc.loc[mask_yugoslavia, 'continent'] = "Europe"
cc.loc[mask_yugoslavia, 'region'] = "Southern Europe"
with pd.option_context("display.max_rows", 100):
    display(cc[mask_yugoslavia].sort_values(by='year'))

  display(cc[mask_yugoslavia].sort_values(by='year'))


Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
12246,Yugoslavia,1946,Europe,Southern Europe,345,345,YUG,345,SRB
12247,Yugoslavia,1947,Europe,Southern Europe,345,345,YUG,345,SRB
12248,Yugoslavia,1948,Europe,Southern Europe,345,345,YUG,345,SRB
12249,Yugoslavia,1949,Europe,Southern Europe,345,345,YUG,345,SRB
12250,Yugoslavia,1950,Europe,Southern Europe,345,345,YUG,345,SRB
12251,Yugoslavia,1951,Europe,Southern Europe,345,345,YUG,345,SRB
12252,Yugoslavia,1952,Europe,Southern Europe,345,345,YUG,345,SRB
12253,Yugoslavia,1953,Europe,Southern Europe,345,345,YUG,345,SRB
12254,Yugoslavia,1954,Europe,Southern Europe,345,345,YUG,345,SRB
12255,Yugoslavia,1955,Europe,Southern Europe,345,345,YUG,345,SRB


- [x] delete rows before 2006 (was Yugoslavia)
- [x] add CoW code 345 (for 2006 and after)

In [20]:
# Serbia: drop the rows before 2006, then give the rest CoW code 345.
mask_serbia_drop = (cc['country'] == "Serbia") & (cc['year'] < 2006)
cc = cc.drop(cc[mask_serbia_drop].index)
# Build the mask only after the drop so that it matches cc's current index;
# reusing a pre-drop mask makes pandas reindex it and emit a UserWarning.
mask_serbia = cc['country'] == "Serbia"
cc.loc[mask_serbia, 'cown'] = 345
with pd.option_context("display.max_rows", 100):
    display(cc[mask_serbia].sort_values(by='year'))

  display(cc[mask_serbia].sort_values(by='year'))


Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb
9504,Serbia,2006,Europe,Southern Europe,345,340,SRB,342,SRB
9505,Serbia,2007,Europe,Southern Europe,345,340,SRB,342,SRB
9506,Serbia,2008,Europe,Southern Europe,345,340,SRB,342,SRB
9507,Serbia,2009,Europe,Southern Europe,345,340,SRB,342,SRB
9478,Serbia,2010,Europe,Southern Europe,345,340,SRB,342,SRB
9479,Serbia,2011,Europe,Southern Europe,345,340,SRB,342,SRB
9480,Serbia,2012,Europe,Southern Europe,345,340,SRB,342,SRB
9481,Serbia,2013,Europe,Southern Europe,345,340,SRB,342,SRB
9482,Serbia,2014,Europe,Southern Europe,345,340,SRB,342,SRB
9483,Serbia,2015,Europe,Southern Europe,345,340,SRB,342,SRB


## Find Time Series Discrepancies

Now that we have corrected cc based on the discovered missing identifiers, let's explore on a deeper level - the time series. This means we need to construct a country-year time series for GW. Polity & WB are already in country-year format. CoW has a separate dataset in country-year format.

### Time Series for comparison & checking CC

In [21]:
# Move both ends of every GW entry to 1 January of their year, so the yearly
# range built below contains the start year and the end year.
gw['StartDate'] = gw['gw_startdate'].map(lambda ts: ts.replace(month=1, day=1))
gw['EndDate'] = gw['gw_enddate'].map(lambda ts: ts.replace(month=1, day=1))

# One small frame per GW entry (one row per year), stacked together
gw_ts = pd.concat(
    [
        pd.DataFrame(
            {
                'year': pd.date_range(spell.StartDate, spell.EndDate, freq='YS'),
                'gwn': spell.gw_id,
                'name': spell.gw_name,
            },
            columns=['year', 'gwn', 'name'],
        )
        for _, spell in gw.iterrows()
    ],
    ignore_index=True,
)
# Reduce timestamps to plain years, keep 1946 onward, and flag membership
gw_ts = (
    gw_ts.assign(year=gw_ts['year'].dt.year)
    .loc[lambda frame: frame['year'] > 1945]
    .reset_index(drop=True)
    .assign(in_gw=1)
)
gw_ts

Unnamed: 0,year,gwn,name,in_gw
0,1946,2,United States of America,1
1,1947,2,United States of America,1
2,1948,2,United States of America,1
3,1949,2,United States of America,1
4,1950,2,United States of America,1
...,...,...,...,...
11123,2013,990,Samoa/Western Samoa,1
11124,2014,990,Samoa/Western Samoa,1
11125,2015,990,Samoa/Western Samoa,1
11126,2016,990,Samoa/Western Samoa,1


In [22]:
# CoW country-year panel from 1946 onward, labelled with the CoW state name.
cow_ts = cow_ts.rename(columns={'ccode':'cown'})
cow_ts = cow_ts[cow_ts['year']>1945]
# `cow` carries a start/end date per row, so the same cow_id can appear in
# more than one row (presumably states with interrupted membership - TODO
# confirm against states2016.csv). De-duplicate the name lookup so the left
# merge cannot multiply country-years; `validate` enforces that guarantee.
cow_ts = cow_ts.merge(
    cow[['cow_id', 'cow_name']].drop_duplicates(subset='cow_id'),
    left_on=['cown'],
    right_on=['cow_id'],
    how='left',
    validate='many_to_one',
).drop(columns=['cow_id'])
cow_ts['in_cow'] = 1
cow_ts

Unnamed: 0,cown,year,cow_name,in_cow
0,2,1946,United States of America,1
1,20,1946,Canada,1
2,40,1946,Cuba,1
3,41,1946,Haiti,1
4,42,1946,Dominican Republic,1
...,...,...,...,...
10582,970,2016,Nauru,1
10583,983,2016,Marshall Islands,1
10584,986,2016,Palau,1
10585,987,2016,Federated States of Micronesia,1


In [23]:
# Polity country-years under cc's column names, plus a membership flag
p4_ts = polity.rename(
    columns={'ccode':'p4n', 'country':'p4_name'}
).assign(in_p4=1)
p4_ts

Unnamed: 0,p4n,p4_name,year,in_p4
146,2,United States,1946,1
147,2,United States,1947,1
148,2,United States,1948,1
149,2,United States,1949,1
150,2,United States,1950,1
...,...,...,...,...
17557,950,Fiji,2014,1
17558,950,Fiji,2015,1
17559,950,Fiji,2016,1
17560,950,Fiji,2017,1


In [24]:
# Distinct country-years that have at least one non-missing WDI value
wb_ts = (
    wb_ts1.dropna()
    .drop('value', axis=1)
    .drop_duplicates()
    .assign(in_wb=1)
)
wb_ts

Unnamed: 0,country,year,in_wb
1,AFG,2018,1
2,AFG,2017,1
3,AFG,2016,1
4,AFG,2015,1
5,AFG,2014,1
...,...,...,...
288811,SRB,1988,1
288812,SRB,1987,1
288813,SRB,1986,1
288814,SRB,1985,1


In [25]:
# Flag rows that belong to cc, then order the panel by country and year
cc['in_cc'] = 1
cc = cc.sort_values(['country', 'year'])
cc = cc.reset_index(drop=True)
cc

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb,in_cc
0,Afghanistan,1946,Asia,Southern Asia,700,700,AFG,700,AFG,1
1,Afghanistan,1947,Asia,Southern Asia,700,700,AFG,700,AFG,1
2,Afghanistan,1948,Asia,Southern Asia,700,700,AFG,700,AFG,1
3,Afghanistan,1949,Asia,Southern Asia,700,700,AFG,700,AFG,1
4,Afghanistan,1950,Asia,Southern Asia,700,700,AFG,700,AFG,1
...,...,...,...,...,...,...,...,...,...,...
12419,Zimbabwe,2014,Africa,Eastern Africa,552,552,ZWE,552,ZWE,1
12420,Zimbabwe,2015,Africa,Eastern Africa,552,552,ZWE,552,ZWE,1
12421,Zimbabwe,2016,Africa,Eastern Africa,552,552,ZWE,552,ZWE,1
12422,Zimbabwe,2017,Africa,Eastern Africa,552,552,ZWE,552,ZWE,1


### CoW vs CC

- Vietnam is a strange case - there is only one Vietnam in the original CC. There are two Vietnams for CoW and GW. My solution is to make the Republic of Vietnam (South) the official Vietnam from 1954 - 1975, until the North wins and Vietnam unifies, at which point Vietnam (North) becomes the official (and is the only) Vietnam. An alternative solution is to add a second Vietnam - the problem is that WB only records one Vietnam.
- could convert 'cown' for Germany from 255 to 260 for 1990, as the conversion occurred in October 1990

In [26]:
# Cast the CoW key to nullable Int64 so its dtype matches cc['cown']
cow_ts = cow_ts.astype({'cown':'Int64'})
# Outer join keeps country-years found in only one source; the membership
# flags left empty by the join become 0 and are stored as plain ints.
cc_and_cow = (
    cc.merge(cow_ts, how='outer', on=['cown', 'year'])
    .fillna({'in_cow': 0, 'in_cc': 0})
    .astype({'in_cow':'int', 'in_cc':'int'})
)

no rows have a CoW code but shouldn't

In [27]:
# cc rows before 2017 that carry a CoW code with no match in the CoW panel
cc_and_cow.loc[
    cc_and_cow['in_cow'].eq(0)
    & cc_and_cow['year'].lt(2017)
    & cc_and_cow['cown'].notna()
]

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb,in_cc,cow_name,in_cow


these rows don't have a CoW code but should

In [28]:
# CoW country-years with no matching cc row
cc_and_cow.loc[cc_and_cow['in_cc'].eq(0)]

Unnamed: 0,country,year,continent,region,cown,gwn,iso3c,p4n,wb,in_cc,cow_name,in_cow
12493,,1954,,,816,,,,,0,Vietnam,1
12494,,1955,,,816,,,,,0,Vietnam,1
12495,,1956,,,816,,,,,0,Vietnam,1
12496,,1957,,,816,,,,,0,Vietnam,1
12497,,1958,,,816,,,,,0,Vietnam,1
12498,,1959,,,816,,,,,0,Vietnam,1
12499,,1960,,,816,,,,,0,Vietnam,1
12500,,1961,,,816,,,,,0,Vietnam,1
12501,,1962,,,816,,,,,0,Vietnam,1
12502,,1963,,,816,,,,,0,Vietnam,1


Conclusion: nothing we need to fix with the 'cown' column

### GW vs CC

GW codes have a much bigger problem than CoW codes - many rows have a 'gwn' but shouldn't.

In [29]:
# Cast the GW key to nullable Int64 so its dtype matches cc['gwn']
gw_ts = gw_ts.astype({'gwn':'Int64'})
# Outer join keeps country-years found in only one source; the membership
# flags left empty by the join become 0 and are stored as plain ints.
cc_and_gw = (
    cc.merge(gw_ts, how='outer', on=['gwn', 'year'])
    .fillna({'in_gw': 0, 'in_cc': 0})
    .astype({'in_gw':'int', 'in_cc':'int'})
)

these rows have GW codes but shouldn't

In [30]:
# cc rows before 2018 that carry a GW code with no match in the GW panel
cc_not_in_gw = cc_and_gw.loc[
    cc_and_gw['in_gw'].eq(0)
    & cc_and_gw['year'].lt(2018)
    & cc_and_gw['gwn'].notna()
]
# First and last affected year for each country
with pd.option_context("display.max_rows", 100):
    display(
        cc_not_in_gw
        .groupby('country')
        .agg({'year': ['min', 'max']})
    )

Unnamed: 0_level_0,year,year
Unnamed: 0_level_1,min,max
country,Unnamed: 1_level_2,Unnamed: 2_level_2
Algeria,1946,1961
Angola,1946,1974
Armenia,1990,1990
Azerbaijan,1990,1990
Bahrain,1946,1970
Barbados,1946,1965
Belarus,1990,1990
Benin,1946,1959
Bhutan,1946,1948
Botswana,1946,1965


these rows don't have a GW code but should

In [31]:
# GW country-years with no cc row, leaving out three GW names by design
gw_not_in_cc = cc_and_gw.loc[
    cc_and_gw['in_cc'].eq(0)
    & ~cc_and_gw['name'].isin(['Abkhazia', 'South Ossetia', 'São Tomé and Principe'])
].sort_values(by=['name', 'year'])
# First and last missing year for each GW name
gw_not_in_cc.groupby('name').agg({'year': ['min', 'max']})

Unnamed: 0_level_0,year,year
Unnamed: 0_level_1,min,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2
Andorra,1946,1992
Federated States of Micronesia,1986,1990
Kiribati,1979,1998
Liechtenstein,1946,1989
Marshall Islands,1986,1990
Monaco,1946,1992
Nauru,1968,1998
Samoa/Western Samoa,1962,1975
San Marino,1946,1991
Tibet,1946,1950


Conclusion: there are major problems with the 'gwn' column. Need to remove the 'gwn' for some rows, and remove those rows if 'cown' is also null. Need to fill in 'gwn' where possible for some rows, if they exist in the original cc.

## Repair 2: erroneous country-year combos for G&W codes

### Fill in G&W codes where they should exist but don't

Yemen is complicated...

- CC has 4 Yemens: 
    - "South Yemen" (1946 - 1966), only code is gwn = 680
    - "Yemen" (1990 - 2018), cown = 679, gwn is missing, iso3c & wb = YEM
    - "Yemen Arab Republic" (1946 - 1990), cown, gwn, and p4n = 678, iso3c & wb missing
    - "Yemen People's Republic" (1967 - 1990), cown, gwn, and p4n = 680, iso3c & wb missing
- CoW has 3 Yemens
    - "Yemen" (1990 - present) 679
    - "Yemen Arab Republic" (1946 - 1990) 678
    - "Yemen People's Republic" (1967 - 1990) 680
    - (south yemen not present)
- GW has 2 Yemens
    - "Yemen (Arab Republic of Yemen)" (1946 - present) 678
    - "Yemen, People's Republic of" (1967 - 1990) 680

Fix:

- drop "South Yemen" in CC
- cc's "Yemen" should get gwn = 678
- the other Yemens need continent = "Asia", region = "Western Asia"
- need to decide which of the earlier Yemens gets the WB code

In [32]:
# Remove the "South Yemen" entry entirely
cc = cc.drop(cc.loc[cc['country'].eq("South Yemen")].index)

# Masks are built after the drop so they match cc's current index
yemen_mask = cc['country'].eq("Yemen")
yemenar_mask = cc['country'].eq("Yemen Arab Republic")
yemenpr_mask = cc['country'].eq("Yemen People's Republic")

# "Yemen" takes GW code 678
cc.loc[yemen_mask, 'gwn'] = 678

# Geography for the two earlier Yemens
cc.loc[yemenar_mask, 'region'] = "Western Asia"
cc.loc[yemenar_mask, 'continent'] = "Asia"
cc.loc[yemenpr_mask, 'region'] = "Western Asia"
cc.loc[yemenpr_mask, 'continent'] = "Asia"

# ISO code for the People's Republic
cc.loc[yemenpr_mask, 'iso3c'] = "YMD"

For some countries, they exist in G&W before they exist in CoW, and CC doesn't include those years. This is an issue for...

- Andorra
- Federated States of Micronesia
- Kiribati
- Liechtenstein
- Marshall Islands
- Monaco
- Nauru
- Samoa/Western Samoa
- San Marino
- Tonga
- Tuvalu

In [33]:
# Year span (first/last missing year) for every GW name that cc lacks.
# Tibet (711), Vietnam (816) and Yemen (678) are dealt with elsewhere in this
# notebook, so they are excluded here. Excluding them by GW code instead of
# by their position in the grouped result keeps this cell correct if the set
# of missing names ever changes.
gw_countries_to_add = (
    gw_not_in_cc[~gw_not_in_cc['gwn'].isin([711, 816, 678])]
    .groupby('name')
    .agg({'year': ['min', 'max']})
    .reset_index()
)
gw_countries_to_add.columns = ['country', 'year_min', 'year_max']
gw_countries_to_add_ids = gw_countries_to_add.country.tolist()

# Time-invariant attributes cc already stores for those countries
cc_gw_countries_to_add = cc[cc['country'].isin(gw_countries_to_add_ids)][['country', 'continent', 'region', 'gwn', 'iso3c', 'p4n', 'wb']].drop_duplicates()
# NOTE(review): this is an inner join on the country *name*, so any GW name
# that has no identically spelled cc 'country' is silently left out - verify
# that "Federated States of Micronesia" and "Samoa/Western Samoa" are handled.
gw_countries_to_add_allvars = gw_countries_to_add.merge(cc_gw_countries_to_add, on='country')

In [34]:
# Turn the integer year bounds into 1-January timestamps so they can be
# passed to pd.date_range
gw_countries_to_add_allvars = gw_countries_to_add_allvars.assign(
    year_min=lambda spans: pd.to_datetime(spans['year_min'], format='%Y'),
    year_max=lambda spans: pd.to_datetime(spans['year_max'], format='%Y'),
)
gw_countries_to_add_allvars

Unnamed: 0,country,year_min,year_max,continent,region,gwn,iso3c,p4n,wb
0,Andorra,1946-01-01,1992-01-01,Europe,Southern Europe,232,AND,,AND
1,Kiribati,1979-01-01,1998-01-01,Oceania,Micronesia,970,KIR,,KIR
2,Liechtenstein,1946-01-01,1989-01-01,Europe,Western Europe,223,LIE,,LIE
3,Marshall Islands,1986-01-01,1990-01-01,Oceania,Micronesia,983,MHL,,MHL
4,Monaco,1946-01-01,1992-01-01,Europe,Western Europe,221,MCO,,MCO
5,Nauru,1968-01-01,1998-01-01,Oceania,Micronesia,971,NRU,,NRU
6,San Marino,1946-01-01,1991-01-01,Europe,Southern Europe,331,SMR,,SMR
7,Tonga,1970-01-01,1998-01-01,Oceania,Polynesia,972,TON,,TON
8,Tuvalu,1978-01-01,1999-01-01,Oceania,Polynesia,973,TUV,,TUV


In [35]:
# Expand every (country, year_min, year_max) span into one row per year,
# repeating the country's attributes on each row.
gw_rows_to_add = pd.concat(
    [
        pd.DataFrame(
            {
                'year': pd.date_range(span.year_min, span.year_max, freq='YS'),
                'country': span.country,
                'continent': span.continent,
                'region': span.region,
                'gwn': span.gwn,
                'iso3c': span.iso3c,
                'p4n': span.p4n,
                'wb': span.wb,
            },
            columns=['country', 'year', 'continent', 'region', 'gwn', 'iso3c', 'p4n', 'wb'],
        )
        for _, span in gw_countries_to_add_allvars.iterrows()
    ],
    ignore_index=True,
)
# Plain integer years; in_cc = 2 marks rows created here rather than read
# from the original table
gw_rows_to_add = gw_rows_to_add.assign(
    year=gw_rows_to_add['year'].dt.year,
    in_cc=2,
)

In [36]:
# Preview the country-year rows that are about to be appended to cc
gw_rows_to_add

Unnamed: 0,country,year,continent,region,gwn,iso3c,p4n,wb,in_cc
0,Andorra,1946,Europe,Southern Europe,232,AND,,AND,2
1,Andorra,1947,Europe,Southern Europe,232,AND,,AND,2
2,Andorra,1948,Europe,Southern Europe,232,AND,,AND,2
3,Andorra,1949,Europe,Southern Europe,232,AND,,AND,2
4,Andorra,1950,Europe,Southern Europe,232,AND,,AND,2
...,...,...,...,...,...,...,...,...,...
286,Tuvalu,1995,Oceania,Polynesia,973,TUV,,TUV,2
287,Tuvalu,1996,Oceania,Polynesia,973,TUV,,TUV,2
288,Tuvalu,1997,Oceania,Polynesia,973,TUV,,TUV,2
289,Tuvalu,1998,Oceania,Polynesia,973,TUV,,TUV,2


In [37]:
# Append the generated rows, then restore the country/year ordering
cc = (
    pd.concat([cc, gw_rows_to_add], sort=True)
    .sort_values(['country', 'year'])
    .reset_index(drop=True)
)

One country exists in G&W but not in CoW: Tibet

Tibet no longer exists in 1950 (according to G&W), so I will not add this country to CC.

### Drop G&W codes where they shouldn't exist

In [38]:
# Attach the in_gw flag (0) to the country-years previously found to carry a
# GW code that has no match in GW; all other rows get a missing flag.
cc = cc.merge(
    cc_not_in_gw.loc[:, ['country', 'year', 'in_gw']],
    how='left',
    on=['country', 'year'],
)

In [39]:
# Blank the GW code on country-years flagged as absent from GW.
mask_badGWcodes = cc['in_gw'] == 0
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
# and raises AttributeError there.
cc.loc[mask_badGWcodes, 'gwn'] = np.nan

In [40]:
# Inspect the rows whose GW code was just blanked
cc.loc[cc['in_gw'].eq(0)]

Unnamed: 0,continent,country,cown,gwn,in_cc,iso3c,p4n,region,wb,year,in_gw
146,Africa,Algeria,,,1,DZA,,Northern Africa,DZA,1946,0.0
147,Africa,Algeria,,,1,DZA,,Northern Africa,DZA,1947,0.0
148,Africa,Algeria,,,1,DZA,,Northern Africa,DZA,1948,0.0
149,Africa,Algeria,,,1,DZA,,Northern Africa,DZA,1949,0.0
150,Africa,Algeria,,,1,DZA,,Northern Africa,DZA,1950,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12635,Africa,Zimbabwe,,,1,ZWE,,Eastern Africa,ZWE,1960,0.0
12636,Africa,Zimbabwe,,,1,ZWE,,Eastern Africa,ZWE,1961,0.0
12637,Africa,Zimbabwe,,,1,ZWE,,Eastern Africa,ZWE,1962,0.0
12638,Africa,Zimbabwe,,,1,ZWE,,Eastern Africa,ZWE,1963,0.0


Now there are many rows with neither a CoW nor a G&W code; those country-years don't exist in either schema, so these rows should be dropped.

In [41]:
# A row is dropped only when BOTH state-system codes (cown and gwn) are missing.
mask_dropNAcodes = cc[['cown', 'gwn']].isna().all(axis=1)
cc = cc.loc[~mask_dropNAcodes]

In [42]:
# Re-sort by country/year, renumber the index, and keep only the panel columns in
# their presentation order (this discards the helper in_gw column).
cc = (
    cc
    .sort_values(by=['country', 'year'])
    .reset_index(drop=True)
    .loc[:, ['country', 'year', 'continent', 'region', 'iso3c', 'cown', 'gwn', 'p4n', 'wb', 'in_cc']]
)

## Now compare CC to Polity & WorldBank timeseries

### CC vs P4

In [43]:
# Outer-join the panel with the Polity time series on (p4n, year). Rows present on
# only one side come back with a missing membership flag, which is replaced by 0.
cc_and_p4 = (
    cc
    .merge(p4_ts, how='outer', on=['p4n', 'year'])
    .assign(
        in_p4=lambda merged: merged['in_p4'].fillna(0),
        in_cc=lambda merged: merged['in_cc'].fillna(0),
    )
)

There are no rows where there is a p4 code that shouldn't exist.

In [44]:
# Rows flagged in_p4 == 0 that nevertheless carry a non-missing p4n.
cc_and_p4.loc[lambda merged: merged['in_p4'].eq(0) & merged['p4n'].notna()]

Unnamed: 0,country,year,continent,region,iso3c,cown,gwn,p4n,wb,in_cc,p4_name,in_p4


There are some rows where there should be a p4 code but there isn't.

In [45]:
# For merged rows flagged in_cc == 0, summarise per Polity name the first year,
# last year and number of such rows.
(
    cc_and_p4
    .loc[cc_and_p4['in_cc'].eq(0)]
    .groupby('p4_name')
    .agg({'year': ['min', 'max', 'count']})
)

Unnamed: 0_level_0,year,year,year
Unnamed: 0_level_1,min,max,count
p4_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Bhutan,1946,1948,3
Croatia,1991,1991,1
Ethiopia,1993,1993,1
Germany East,1946,1948,3
Germany West,1946,1990,4
Jamaica,1959,1961,3
Rwanda,1961,1961,1
Singapore,1959,1964,6
Slovenia,1991,1991,1
Sudan,2011,2011,1


Issues with Bhutan, Croatia, Jamaica, Rwanda, Singapore, and Slovenia all arise because Polity records their existence before CoW and G&W do.

Issues with Ethiopia, Germany West, Sudan, Yugoslavia due to years of transition.

North Vietnam does not exist in this dataset.

Issues to fix:
- rename "East Germany" to "German Democratic Republic"

In [46]:
# Relabel "East Germany" rows with the country name "German Democratic Republic".
mask_eastgermany = cc['country'].eq("East Germany")
cc.loc[mask_eastgermany, 'country'] = "German Democratic Republic"

In [47]:
# Fill in the ISO code and geography for the German Democratic Republic rows.
mask_gdr = cc['country'].eq("German Democratic Republic")
for gdr_column, gdr_value in (('iso3c', "DDR"),
                              ('continent', "Europe"),
                              ('region', "Eastern Europe")):
    cc.loc[mask_gdr, gdr_column] = gdr_value

### CC vs WB

In [48]:
# Align the World Bank key name with the panel ('country' -> 'wb'), then outer-join
# on (wb, year). Rows present on only one side come back with a missing membership
# flag, which is replaced by 0.
wb_ts = wb_ts.rename(columns={'country': 'wb'})
cc_and_wb = (
    cc
    .merge(wb_ts, how='outer', on=['wb', 'year'])
    .assign(
        in_wb=lambda merged: merged['in_wb'].fillna(0),
        in_cc=lambda merged: merged['in_cc'].fillna(0),
    )
)

In [49]:
# Inspect the outer-merged frame together with its in_cc / in_wb flags.
cc_and_wb

Unnamed: 0,country,year,continent,region,iso3c,cown,gwn,p4n,wb,in_cc,in_wb
0,Afghanistan,1946,Asia,Southern Asia,AFG,700,700,700,AFG,1.0,0.0
1,Afghanistan,1947,Asia,Southern Asia,AFG,700,700,700,AFG,1.0,0.0
2,Afghanistan,1948,Asia,Southern Asia,AFG,700,700,700,AFG,1.0,0.0
3,Afghanistan,1949,Asia,Southern Asia,AFG,700,700,700,AFG,1.0,0.0
4,Afghanistan,1950,Asia,Southern Asia,AFG,700,700,700,AFG,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
13016,,2019,,,,,,,WSM,0.0,1.0
13017,,2019,,,,,,,YEM,0.0,1.0
13018,,2019,,,,,,,ZAF,0.0,1.0
13019,,2019,,,,,,,ZMB,0.0,1.0


In [50]:
# From 1960 onwards: panel rows that have a wb code but are flagged in_wb == 0,
# summarised per country as first year, last year and row count.
(
    cc_and_wb
    .loc[lambda merged: merged['in_wb'].eq(0)
                        & merged['year'].ge(1960)
                        & merged['wb'].notna()]
    .groupby('country')
    .agg({'year': ['min', 'max', 'count']})
)

Unnamed: 0_level_0,year,year,year
Unnamed: 0_level_1,min,max,count
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Taiwan,1960,2018,59
Yugoslavia,1960,1983,21


No WB data on Taiwan. For Yugoslavia, it appears there is no collected data prior to 1984.

In [51]:
# Before 2019: merged rows flagged in_cc == 0, summarised per wb code as first
# year, last year and row count.
(
    cc_and_wb
    .loc[lambda merged: merged['in_cc'].eq(0) & merged['year'].lt(2019)]
    .groupby('wb')
    .agg({'year': ['min', 'max', 'count']})
)

Unnamed: 0_level_0,year,year,year
Unnamed: 0_level_1,min,max,count
wb,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AGO,1960,1974,15
ARE,1960,1970,11
ARM,1960,1990,31
ATG,1960,1980,21
AZE,1960,1990,31
...,...,...,...
WSM,1960,1975,16
XKX,1960,2007,48
YEM,1960,1989,30
ZMB,1960,1963,4


In [52]:
# Spot check: non-missing World Bank values recorded for AGO in years before 1974.
wb_ts1.loc[lambda wdi: wdi['country'].eq("AGO")
                       & wdi['year'].lt(1974)
                       & wdi['value'].notna()]

Unnamed: 0,country,year,value
11746,AGO,1973,6.496962e+06
11747,AGO,1972,6.248552e+06
11748,AGO,1971,6.040777e+06
11749,AGO,1970,5.890365e+06
11750,AGO,1969,5.803254e+06
...,...,...,...
174719,AGO,1960,1.369926e+00
209626,AGO,1973,7.540000e+06
209627,AGO,1972,2.160000e+06
209628,AGO,1971,1.860000e+06


It would appear WB has data on countries before they actually became countries. Not sure why, but this shouldn't be a problem. This data just won't make it to the final dataset.

## Final Checks

In [53]:
# Countries that still have a missing region.
cc.loc[cc['region'].isna(), 'country'].unique()

array(['Czechoslovakia', 'Zanzibar'], dtype=object)

In [54]:
# Fill in the ISO code and geography that are missing for Czechoslovakia.
mask_czechoslovakia = cc['country']=="Czechoslovakia"
cc.loc[mask_czechoslovakia, 'iso3c'] = "CSK"
cc.loc[mask_czechoslovakia, 'continent'] = "Europe"
cc.loc[mask_czechoslovakia, 'region'] = "Eastern Europe"

# Remove the Zanzibar 2018 row first.
# NOTE(review): the reason this single country-year is removed is not visible in
# this notebook section — confirm it is a spurious row.
mask_zanzibar_drop = (cc['country']=="Zanzibar") & (cc['year'] == 2018)
cc = cc.drop(cc[mask_zanzibar_drop].index)

# Build the Zanzibar mask from the frame as it is AFTER the drop. Previously the
# mask was computed before the drop and only worked because .loc silently aligned
# the longer mask to the shortened frame's index.
mask_zanzibar = cc['country']=="Zanzibar"
cc.loc[mask_zanzibar, 'continent'] = "Africa"
cc.loc[mask_zanzibar, 'region'] = "Eastern Africa"

In [55]:
# Countries that still have a missing ISO alpha-3 code.
cc.loc[cc['iso3c'].isna(), 'country'].unique()

array(['Yemen Arab Republic', 'Zanzibar'], dtype=object)

In [56]:
# Countries that still have a missing World Bank code.
cc.loc[cc['wb'].isna(), 'country'].unique()

array(['Czechoslovakia', 'German Democratic Republic',
       'Yemen Arab Republic', "Yemen People's Republic", 'Zanzibar'],
      dtype=object)

In [57]:
# Final shaping before export: drop the bookkeeping in_cc flag, switch the code
# columns to their *_id names, and order rows by country/year.
# Caution: re-running this cell raises KeyError because in_cc is already gone.
cc = (
    cc
    .drop(columns=['in_cc'])
    .rename(columns={'cown': 'cow_id', 'gwn': 'gw_id', 'p4n': 'p4_id', 'wb': 'wb_id'})
    .sort_values(by=['country', 'year'])
)

In [58]:
# Write the cleaned country-year panel to disk without the positional index.
cc.to_csv(
    path_or_buf="../Data/FINAL/countrycodes_ts-base.csv",
    index=False,
)