# Data preparation for tutorial

This notebook contains the code to convert the raw, downloaded external data into cleaned or simplified versions for use in the tutorial.


The raw data is expected to be in the `./raw` sub-directory (not included in the git repo).

In [1]:
%matplotlib inline

import geopandas

## Natural Earth - Countries dataset

http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/

In [2]:
# Read the raw Natural Earth countries layer straight out of the downloaded zip archive.
countries_zip = "zip://./raw/original_data_ne/ne_110m_admin_0_countries.zip"
countries = geopandas.read_file(countries_zip)

In [3]:
# Preview the first five rows to see which attribute columns the raw layer provides.
countries.head(n=5)

Unnamed: 0,scalerank,featurecla,LABELRANK,SOVEREIGNT,SOV_A3,ADM0_DIF,LEVEL,TYPE,ADMIN,ADM0_A3,...,REGION_WB,NAME_LEN,LONG_LEN,ABBREV_LEN,TINY,HOMEPART,MIN_ZOOM,MIN_LABEL,MAX_LABEL,geometry
0,1,Admin-0 country,3.0,Afghanistan,AFG,0.0,2.0,Sovereign country,Afghanistan,AFG,...,South Asia,11.0,11.0,4.0,-99.0,1.0,0.0,3.0,7.0,"POLYGON ((61.21081709172574 35.65007233330923,..."
1,1,Admin-0 country,3.0,Angola,AGO,0.0,2.0,Sovereign country,Angola,AGO,...,Sub-Saharan Africa,6.0,6.0,4.0,-99.0,1.0,0.0,3.0,7.0,(POLYGON ((23.90415368011818 -11.7222815894063...
2,1,Admin-0 country,6.0,Albania,ALB,0.0,2.0,Sovereign country,Albania,ALB,...,Europe & Central Asia,7.0,7.0,4.0,-99.0,1.0,0.0,5.0,10.0,"POLYGON ((21.0200403174764 40.84272695572588, ..."
3,1,Admin-0 country,4.0,United Arab Emirates,ARE,0.0,2.0,Sovereign country,United Arab Emirates,ARE,...,Middle East & North Africa,20.0,20.0,6.0,-99.0,1.0,0.0,4.0,9.0,"POLYGON ((51.57951867046327 24.24549713795111,..."
4,1,Admin-0 country,2.0,Argentina,ARG,0.0,2.0,Sovereign country,Argentina,ARG,...,Latin America & Caribbean,9.0,9.0,4.0,-99.0,1.0,0.0,2.0,7.0,(POLYGON ((-66.95992000000001 -54.896810000000...


In [4]:
# Number of features (rows) in the raw countries layer.
countries.shape[0]

177

In [5]:
# Keep only the handful of attributes used in the tutorial, plus the geometry.
countries_columns = ['ADM0_A3', 'NAME', 'CONTINENT', 'POP_EST', 'GDP_MD_EST', 'geometry']
countries_subset = countries[countries_columns]

In [6]:
# Lower-case every column label. `rename` returns a new frame instead of
# overwriting `.columns` on the existing object, so the frame selected from
# `countries` is never mutated in place and this cell stays safe to re-run.
countries_subset = countries_subset.rename(columns=str.lower)

In [7]:
# Expose the three-letter country code under the name 'iso_a3'.
# NOTE(review): the source column is ADM0_A3; presumably it matches the ISO
# alpha-3 code for the features kept here — confirm the 'iso_a3' label is intended.
column_renames = {'adm0_a3': 'iso_a3'}
countries_subset = countries_subset.rename(columns=column_renames)

In [8]:
# Check the result: five rows of the simplified countries table.
countries_subset.head(n=5)

Unnamed: 0,iso_a3,name,continent,pop_est,gdp_md_est,geometry
0,AFG,Afghanistan,Asia,34124811.0,64080.0,"POLYGON ((61.21081709172574 35.65007233330923,..."
1,AGO,Angola,Africa,29310273.0,189000.0,(POLYGON ((23.90415368011818 -11.7222815894063...
2,ALB,Albania,Europe,3047987.0,33900.0,"POLYGON ((21.0200403174764 40.84272695572588, ..."
3,ARE,United Arab Emirates,Asia,6072475.0,667200.0,"POLYGON ((51.57951867046327 24.24549713795111,..."
4,ARG,Argentina,South America,44293293.0,879400.0,(POLYGON ((-66.95992000000001 -54.896810000000...


In [9]:
# Write the simplified countries layer as a shapefile (relative path, so it lands in the working directory).
countries_subset.to_file("ne_110m_admin_0_countries.shp", driver="ESRI Shapefile")

## Natural Earth - Cities dataset

http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-populated-places/ (simple, version 4.0.0, downloaded May 2018)

In [10]:
# Read the raw Natural Earth populated-places layer straight out of the downloaded zip archive.
cities_zip = "zip://./raw/original_data_ne/ne_110m_populated_places_simple.zip"
cities = geopandas.read_file(cities_zip)

In [11]:
# Preview the first five rows to see which attribute columns the raw layer provides.
cities.head(n=5)

Unnamed: 0,scalerank,natscale,labelrank,featurecla,name,namepar,namealt,diffascii,nameascii,adm0cap,...,pop_other,rank_max,rank_min,geonameid,meganame,ls_name,ls_match,checkme,min_zoom,geometry
0,8,10,3,Admin-0 capital,Vatican City,,,0,Vatican City,1.0,...,562430,2,2,6691831.0,,Vatican City,1,0,7.0,POINT (12.45338654497177 41.90328217996012)
1,7,20,0,Admin-0 capital,San Marino,,,0,San Marino,1.0,...,0,7,7,3168070.0,,San Marino,1,5,6.1,POINT (12.44177015780014 43.936095834768)
2,7,20,0,Admin-0 capital,Vaduz,,,0,Vaduz,1.0,...,33009,7,5,3042030.0,,Vaduz,1,0,6.7,POINT (9.516669472907267 47.13372377429357)
3,6,30,8,Admin-0 capital alt,Lobamba,,,0,Lobamba,0.0,...,0,5,4,935048.0,,Lobamba,1,5,6.0,POINT (31.19999710971274 -26.46666746135247)
4,6,30,8,Admin-0 capital,Luxembourg,,,0,Luxembourg,1.0,...,106219,9,8,2960316.0,,Luxembourg,1,0,6.0,POINT (6.130002806227083 49.61166037912108)


In [12]:
# Number of features (rows) in the raw cities layer.
cities.shape[0]

243

In [13]:
# The tutorial only needs each city's name and its point geometry.
cities_columns = ['name', 'geometry']
cities_subset = cities[cities_columns]

In [14]:
# Check the result: five rows of the simplified cities table.
cities_subset.head(n=5)

Unnamed: 0,name,geometry
0,Vatican City,POINT (12.45338654497177 41.90328217996012)
1,San Marino,POINT (12.44177015780014 43.936095834768)
2,Vaduz,POINT (9.516669472907267 47.13372377429357)
3,Lobamba,POINT (31.19999710971274 -26.46666746135247)
4,Luxembourg,POINT (6.130002806227083 49.61166037912108)


In [15]:
# Write the simplified cities layer as a shapefile (relative path, so it lands in the working directory).
cities_subset.to_file("ne_110m_populated_places.shp", driver="ESRI Shapefile")

## Natural Earth - Rivers dataset

http://www.naturalearthdata.com/downloads/50m-physical-vectors/50m-rivers-lake-centerlines/ (version 4.0.0, downloaded May 2018)

In [16]:
# Read the raw Natural Earth rivers / lake-centerlines layer straight out of the downloaded zip archive.
rivers_zip = "zip://./raw/ne_50m_rivers_lake_centerlines.zip"
rivers = geopandas.read_file(rivers_zip)

In [17]:
# Preview the first five rows to see which attribute columns the raw layer provides.
rivers.head(n=5)

Unnamed: 0,scalerank,featurecla,name,note,min_zoom,name_alt,name_en,min_label,geometry
0,6,Lake Centerline,Kama,,5.0,,Kama,6.0,LINESTRING (51.9371337598152 55.70106609892139...
1,6,River,Kama,,5.0,,Kama,6.0,LINESTRING (53.69384765584471 58.2063174502901...
2,3,Lake Centerline,Abay,,3.0,,Abay,4.0,LINESTRING (37.11301150887408 11.8549872909308...
3,3,Lake Centerline,Al Furat,,3.0,,Al Furat,4.0,LINESTRING (38.56119184742585 35.8626433379197...
4,6,Lake Centerline,Alabama,,5.0,,Alabama,6.0,(LINESTRING (-86.52176754393696 33.03211843501...


Remove rows with missing geometry:

In [18]:
# Row count before dropping the features without a geometry.
rivers.shape[0]

462

In [19]:
# Flag the features that carry no geometry, keep all the others, and
# renumber the index from 0 so it is contiguous again.
missing_geometry = rivers.geometry.isna()
rivers = rivers.loc[~missing_geometry].reset_index(drop=True)

In [20]:
# Row count after dropping the features without a geometry.
rivers.shape[0]

461

Subset of the columns:

In [21]:
# Keep the feature class, the English name and the geometry; the English
# name column is exposed simply as 'name'.
rivers_columns = ['featurecla', 'name_en', 'geometry']
rivers_renames = {'name_en': 'name'}
rivers_subset = rivers[rivers_columns].rename(columns=rivers_renames)

In [22]:
# Check the result: five rows of the simplified rivers table.
rivers_subset.head(n=5)

Unnamed: 0,featurecla,name,geometry
0,Lake Centerline,Kama,LINESTRING (51.9371337598152 55.70106609892139...
1,River,Kama,LINESTRING (53.69384765584471 58.2063174502901...
2,Lake Centerline,Abay,LINESTRING (37.11301150887408 11.8549872909308...
3,Lake Centerline,Al Furat,LINESTRING (38.56119184742585 35.8626433379197...
4,Lake Centerline,Alabama,(LINESTRING (-86.52176754393696 33.03211843501...


In [23]:
# Write the simplified rivers layer as a shapefile (relative path, so it lands in the working directory).
rivers_subset.to_file("ne_50m_rivers_lake_centerlines.shp", driver="ESRI Shapefile")