************
# World data
************

In [1]:
import pandas as pd

## Area de los paises

In [2]:
A = pd.read_csv('../dat/table-1.csv')
# source: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area
# preprocessed with https://wikitable2csv.ggor.de/

In [3]:
# eliminar los ultimos paises porque tienen areas muy chicas
A2 = A[:-20]

In [4]:
A2[-10:]

Unnamed: 0,Rank,Sovereign state/dependency,Total in km2 (mi2),Land in km2 (mi2),Water in km2 (mi2),% water,Notes
233,–,British Virgin Islands (United Kingdom),151 (58),151 (58),0 (0),0,
234,–,Wallis and Futuna (France),142 (55),142 (55),0 (0),0,
235,–,Christmas Island (Australia),135 (52),135 (52),0 (0),0,[Note 61]
236,–,Jersey (United Kingdom),116 (45),116 (45),0 (0),0,
237,–,Montserrat (United Kingdom),102 (39),102 (39),0 (0),0,
238,–,Anguilla (United Kingdom),91 (35),91 (35),0 (0),0,
239,–,Guernsey (United Kingdom),78 (30),78 (30),0 (0),0,
240,190,San Marino,61 (24),61 (24),0 (0),0,
241,–,British Indian Ocean Territory (United Kingdom),60 (23),60 (23),"54,340 (20,980)",99.89,[Note 62]
242,–,Saint Martin (France),54 (21),54.4 (21.0),Negligible,Negligible,[Note 16]


In [5]:
d = A2['Total in km2 (mi2)'].values

In [6]:
d[-20:]

array(['264 (102)', '261 (101)', '260 (100)', '253.8 (98.0)', '242 (93)',
       '236 (91)', '199 (77)', '181 (70)', '180 (69)', '160 (62)',
       '151 (58)', '142 (55)', '135 (52)', '116 (45)', '102 (39)',
       '91 (35)', '78 (30)', '61 (24)', '60 (23)', '54 (21)'],
      dtype=object)

In [7]:
area = []
for a in d:
    s = a.split('(')[0].replace(',','').strip()
    s = float(s)
    area.append(s)

In [8]:
area[:10]

[510072000.0,
 17098246.0,
 14000000.0,
 9984670.0,
 9596961.0,
 9525067.0,
 8515767.0,
 7692024.0,
 3287263.0,
 2780400.0]

In [9]:
print(A2.shape)
print(len(area))

(243, 7)
243


In [10]:
country = A2['Sovereign state/dependency'].values
d = {'country': country, 'area': area}
df = pd.DataFrame(data=d)

In [11]:
df.to_csv('../dat/world_area.csv', index=None)

## Poblacion de los paises

In [12]:
P = pd.read_csv('../dat/table-2.csv')
# source: https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)
# preprocessed with https://wikitable2csv.ggor.de/

In [13]:
population = []
for pop in P['Population(1 July 2019)']:
    pp = pop.replace(',','')
    pp = float(pp)
    population.append(pp)

In [14]:
population[:6]

[1433783686.0,
 1366417754.0,
 329064917.0,
 270625568.0,
 216565318.0,
 211049527.0]

In [15]:
P[:5]

Unnamed: 0,Country or area,UN continentalregion[4],UN statisticalregion[4],Population(1 July 2018),Population(1 July 2019),Change
0,China[a],Asia,Eastern Asia,1427647786,1433783686,+0.43%
1,India,Asia,Southern Asia,1352642280,1366417754,+1.02%
2,United States,Americas,Northern America,327096265,329064917,+0.60%
3,Indonesia,Asia,South-eastern Asia,267670543,270625568,+1.10%
4,Pakistan,Asia,Southern Asia,212228286,216565318,+2.04%


In [16]:
P.keys()

Index(['Country or area', 'UN continentalregion[4]', 'UN statisticalregion[4]',
       'Population(1 July 2018)', 'Population(1 July 2019)', 'Change'],
      dtype='object')

In [17]:
country = P['Country or area'].values

In [18]:
d = {'country': country, 'population': population}

In [19]:
df = pd.DataFrame(data=d)

In [20]:
df

Unnamed: 0,country,population
0,China[a],1.433784e+09
1,India,1.366418e+09
2,United States,3.290649e+08
3,Indonesia,2.706256e+08
4,Pakistan,2.165653e+08
...,...,...
229,Falkland Islands,3.377000e+03
230,Niue,1.615000e+03
231,Tokelau,1.340000e+03
232,Vatican City[u],7.990000e+02


In [21]:
df.to_csv('../dat/world_population.csv', index=None)

# Combine area and population data

In [22]:
import pandas as pd

In [23]:
P = pd.read_csv('../dat/world_population.csv')

In [24]:
P

Unnamed: 0,country,population
0,China[a],1.433784e+09
1,India,1.366418e+09
2,United States,3.290649e+08
3,Indonesia,2.706256e+08
4,Pakistan,2.165653e+08
...,...,...
229,Falkland Islands,3.377000e+03
230,Niue,1.615000e+03
231,Tokelau,1.340000e+03
232,Vatican City[u],7.990000e+02


In [25]:
A = pd.read_csv('../dat/world_area.csv')

In [26]:
A

Unnamed: 0,country,area
0,World,510072000.0
1,Russia,17098246.0
2,Antarctica,14000000.0
3,Canada,9984670.0
4,China,9596961.0
...,...,...
238,Anguilla (United Kingdom),91.0
239,Guernsey (United Kingdom),78.0
240,San Marino,61.0
241,British Indian Ocean Territory (United Kingdom),60.0


In [27]:
df = pd.merge(P, A, on='country')

In [28]:
df

Unnamed: 0,country,population,area
0,India,1.366418e+09,3287263.0
1,United States,3.290649e+08,9525067.0
2,Indonesia,2.706256e+08,1910931.0
3,Pakistan,2.165653e+08,907132.0
4,Brazil,2.110495e+08,8515767.0
...,...,...,...
165,Saint Kitts and Nevis,5.282300e+04,261.0
166,Liechtenstein,3.801900e+04,160.0
167,San Marino,3.386000e+04,61.0
168,Palau,1.800800e+04,459.0


In [29]:
df.to_csv('../dat/pop_area.csv')