In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

# Reading

In [3]:
# Load the id -> display-name table and preview its first rows.
country_names = pd.read_csv(filepath_or_buffer='country/country_names.csv')
country_names.head(n=5)

Unnamed: 0,id,name
0,Q1000,Gabão
1,Q1005,Gâmbia
2,Q1006,Guiné
3,Q1007,Guiné-Bissau
4,Q1008,Costa do Marfim


In [4]:
# Load the per-country area table and preview its first rows.
country_areas = pd.read_csv(filepath_or_buffer='country/country_areas.csv')
country_areas.head(n=5)

Unnamed: 0,id,name,area
0,Q1000,Gabon,267667.0
1,Q1005,Gambia,11300.0
2,Q1006,Guinea,245857.0
3,Q1007,Guinea-Bissau,36125.0
4,Q1008,Ivory Coast,322463.0


In [5]:
# The continent column contains the literal code 'NA' (it is used later in
# this notebook as an index label), which pandas would turn into a missing
# value by default. Switch the default NA markers off and treat only empty
# fields as missing.
country_continents = pd.read_csv(
    filepath_or_buffer='country/country_continents.csv',
    keep_default_na=False,
    na_values=[""],
)
country_continents.head(n=5)

Unnamed: 0,id,name,continent
0,Q1000,Gabon,AF
1,Q1005,Gambia,AF
2,Q1006,Guinea,AF
3,Q1007,Guinea-Bissau,AF
4,Q1008,Ivory Coast,AF


In [6]:
# Load the population table (one row per country and year in the preview).
country_populations = pd.read_csv(filepath_or_buffer='country/country_populations.csv')
country_populations.head(n=5)

Unnamed: 0,id,name,year,population
0,Q1000,Gabon,1960,498823.0
1,Q1000,Gabon,1961,503762.0
2,Q1000,Gabon,1962,509348.0
3,Q1000,Gabon,1963,515762.0
4,Q1000,Gabon,1964,523236.0


# Merging

In [7]:
# Attach each country's area to the names table. The areas table's own `name`
# column is dropped so that the names from `country_names` are the ones kept.
# The join key is spelled out: without `on`, merge joins on every column the
# two frames happen to share, so an unexpected shared column would silently
# become part of the key.
df = country_names.merge(
    country_areas.drop(columns=['name']),
    on='id',
    how='left',
)
df.head()

Unnamed: 0,id,name,area
0,Q1000,Gabão,267667.0
1,Q1005,Gâmbia,11300.0
2,Q1006,Guiné,245857.0
3,Q1007,Guiné-Bissau,36125.0
4,Q1008,Costa do Marfim,322463.0


In [8]:
# Attach the continent code(s). A country listed under several continents
# yields one row per continent after this left join. The key is given
# explicitly so the join never depends on which column names overlap.
df = df.merge(
    country_continents.drop(columns=['name']),
    on='id',
    how='left',
)
df.head()

Unnamed: 0,id,name,area,continent
0,Q1000,Gabão,267667.0,AF
1,Q1005,Gâmbia,11300.0,AF
2,Q1006,Guiné,245857.0,AF
3,Q1007,Guiné-Bissau,36125.0,AF
4,Q1008,Costa do Marfim,322463.0,AF


In [9]:
def fill_population(pop):
    """Fill gaps in one country's data by interpolation.

    Uses pandas' default ``interpolate()`` settings, so values missing
    before the first known observation are left as NaN.

    Parameters
    ----------
    pop : pandas.DataFrame or pandas.Series
        The rows (or a single numeric column) belonging to one country.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A new object of the same type and index as ``pop``; the input is
        not modified.
    """
    if isinstance(pop, pd.Series):
        return pop.interpolate()

    filled = pop.copy()
    # Interpolate numeric columns only. Calling interpolate() on the whole
    # frame would also touch object columns such as `id`, which recent pandas
    # versions deprecate.
    for column in pop.select_dtypes(include='number').columns:
        filled[column] = pop[column].interpolate()
    return filled

# Interpolate the population of each country separately, so a gap is never
# filled using a neighbouring country's values. `transform` on the single
# `population` column keeps the original flat index and leaves `id` as an
# ordinary column, whereas the layout returned by `groupby(...).apply(...)`
# (extra index level, handling of the grouping column) depends on the pandas
# version and can break the later merge on `id`.
populations = country_populations.drop(columns=['name'])
pop = populations.assign(
    population=populations.groupby('id')['population'].transform(fill_population)
)

In [10]:
# Attach the (interpolated) yearly populations; every country row is repeated
# once per year found in `pop`. The key is given explicitly so the join never
# depends on which column names the two frames happen to share.
df = df.merge(pop, on='id', how='left')
df.head()

Unnamed: 0,id,name,area,continent,year,population
0,Q1000,Gabão,267667.0,AF,1960,498823.0
1,Q1000,Gabão,267667.0,AF,1961,503762.0
2,Q1000,Gabão,267667.0,AF,1962,509348.0
3,Q1000,Gabão,267667.0,AF,1963,515762.0
4,Q1000,Gabão,267667.0,AF,1964,523236.0


# Cleaning

In [11]:
# Start the cleaned table from the merged frame, keyed and ordered by id.
clean = (
    df
    .set_index('id')
    .sort_index()
)
clean.head(n=5)

Unnamed: 0_level_0,name,area,continent,year,population
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q1000,Gabão,267667.0,AF,1960,498823.0
Q1000,Gabão,267667.0,AF,1961,503762.0
Q1000,Gabão,267667.0,AF,1962,509348.0
Q1000,Gabão,267667.0,AF,1963,515762.0
Q1000,Gabão,267667.0,AF,1964,523236.0


## Transcontinental countries

In [12]:
# Flag the ids that appear with more than one distinct continent.
# Only the (id, continent) pairs matter, so every other column is discarded
# before de-duplicating and counting the rows per id.
id_continent_pairs = df.drop(columns=['name', 'area', 'year', 'population']).drop_duplicates()
continents_per_id = id_continent_pairs.groupby(by='id').size()
transcontinental = (continents_per_id > 1).rename('transcontinental')
transcontinental.head(n=5)

id
Q1000    False
Q1005    False
Q1006    False
Q1007    False
Q1008    False
Name: transcontinental, dtype: bool

In [13]:
# Add the per-id `transcontinental` flag (index-on-index join), then key the
# table by (id, continent) so a single continent of a country can be addressed.
flagged = pd.merge(clean, transcontinental, left_index=True, right_index=True)
clean = flagged.reset_index().set_index(['id', 'continent'])

In [14]:
# Preview the table now that it is indexed by (id, continent).
clean.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,area,year,population,transcontinental
id,continent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Q1000,AF,Gabão,267667.0,1960,498823.0,False
Q1000,AF,Gabão,267667.0,1961,503762.0,False
Q1000,AF,Gabão,267667.0,1962,509348.0,False
Q1000,AF,Gabão,267667.0,1963,515762.0,False
Q1000,AF,Gabão,267667.0,1964,523236.0,False


In [15]:
# List the names of the countries flagged as transcontinental.
clean.loc[clean['transcontinental'], 'name'].unique()

array(['Rússia', 'Azerbaijão', 'Chipre', 'Geórgia', 'Cazaquistão',
       'Holanda', 'Estados Unidos', 'Arménia', 'Turquia',
       'Reino da Dinamarca', 'Egito', 'Panamá'], dtype=object)

###### Russia

> The Russian Federation includes substantial territory in North Asia, historically incorporated into the Tsardom of Russia in the 17th century. European Russia has a population of about 110 million, or some 75% of the country's total population, though more than 75% of Russia's territory is in Asia.

In [16]:
# Split Russia's totals between its two continents instead of counting the
# full area and population twice.
# NOTE: the scaling is applied in place, so re-running this cell scales the
# values again; re-run the notebook from the "Cleaning" section instead.
RUSSIA_ID = 'Q159'
# (continent, share of area, share of population): roughly 75% of the
# territory is in Asia while roughly 75% of the population lives in Europe.
RUSSIA_SHARES = [
    ('AS', 0.75, 0.25),
    ('EU', 0.25, 0.75),
]

# Boolean masks are used instead of `clean.loc[(id, continent), ...]`: the
# MultiIndex is not lexsorted, so tuple look-ups emit a PerformanceWarning.
# The masks also leave the row order untouched.
row_ids = clean.index.get_level_values('id')
row_continents = clean.index.get_level_values('continent')
for continent, area_share, population_share in RUSSIA_SHARES:
    rows = (row_ids == RUSSIA_ID) & (row_continents == continent)
    # Fail loudly, as the tuple look-up did, if the expected rows are absent.
    assert rows.any(), 'no rows found for ' + RUSSIA_ID + ' / ' + continent
    clean.loc[rows, 'area'] *= area_share
    clean.loc[rows, 'population'] *= population_share


  return self._getitem_tuple(key)
  coro.send(None)


###### Azerbaijan

In [17]:
# Azerbaijan (Q227) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q227', 'EU')])

###### Cyprus

In [18]:
# Cyprus (Q229) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q229', 'EU')])

###### Georgia

In [19]:
# Georgia (Q230) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q230', 'EU')])

###### Kazakhstan

In [20]:
# Kazakhstan (Q232) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q232', 'EU')])

###### Kingdom of the Netherlands

In [21]:
# Kingdom of the Netherlands (Q29999) is counted under Europe only:
# discard its 'NA' rows.
clean = clean.drop(index=[('Q29999', 'NA')])

###### United States

In [22]:
# United States (Q30) is counted under North America only:
# discard its 'OC' rows.
clean = clean.drop(index=[('Q30', 'OC')])

###### Armenia

In [23]:
# Armenia (Q399) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q399', 'EU')])

###### Turkey

In [24]:
# Turkey (Q43) is counted under Asia only: discard its 'EU' rows.
clean = clean.drop(index=[('Q43', 'EU')])

###### Kingdom of Denmark

In [25]:
# Kingdom of Denmark (Q756617) is counted under Europe only:
# discard its 'NA' rows.
clean = clean.drop(index=[('Q756617', 'NA')])

###### Egypt

In [26]:
# Egypt (Q79) is counted under Africa only: discard its 'AS' rows.
clean = clean.drop(index=[('Q79', 'AS')])

###### Panama

In [27]:
# Panama (Q804) is counted under North America only: discard its 'SA' rows.
clean = clean.drop(index=[('Q804', 'SA')])

# Testing

In [28]:
# Expected size: 196 (id, continent) pairs with 60 rows each.
# NOTE(review): 196 is presumably the 195 countries plus the second continent
# kept for Russia, and 60 the number of years per pair — confirm.
assert clean.shape[0] == 196 * 60

In [29]:
# No missing label among the values of the first index level (id).
assert not clean.index.levels[0].isnull().any()

In [30]:
# Exactly 195 distinct ids remain in the first index level.
assert len(clean.index.get_level_values(0).unique()) == 195

In [31]:
# Every row has a name.
assert not clean['name'].isnull().any()

In [32]:
# Exactly 195 distinct names, matching the number of distinct ids expected.
assert clean['name'].nunique(dropna=False) == 195

In [33]:
# Every row has an area.
assert not clean['area'].isnull().any()

In [34]:
# No missing label among the values of the second index level (continent).
assert not clean.index.levels[1].isnull().any()

In [35]:
# Exactly 6 distinct continent codes remain in the second index level.
assert len(clean.index.get_level_values(1).unique()) == 6

In [36]:
# Unlike the other checks, this one passes only if at least one population
# value IS missing.
# NOTE(review): presumably this records that some gaps survive interpolation
# on purpose — confirm it was not meant to be a not-null check.
assert clean['population'].isna().any()

In [37]:
# Every row carries a transcontinental flag.
assert not clean['transcontinental'].isnull().any()

In [38]:
# At least one row is still flagged as transcontinental.
assert clean['transcontinental'].any()

# Writing

In [39]:
# The helper flag was only needed during cleaning; leave it out of the export.
out = clean.drop(columns=['transcontinental'])
out.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,area,year,population
id,continent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q1000,AF,Gabão,267667.0,1960,498823.0
Q1000,AF,Gabão,267667.0,1961,503762.0
Q1000,AF,Gabão,267667.0,1962,509348.0
Q1000,AF,Gabão,267667.0,1963,515762.0
Q1000,AF,Gabão,267667.0,1964,523236.0


In [40]:
# Write the cleaned table; the (id, continent) index becomes the first two
# columns of the file.
out.to_csv(path_or_buf='country.csv')

# References

https://en.wikipedia.org/wiki/List_of_transcontinental_countries

https://population.un.org/wpp/DataQuery