# Data preparation for tutorial

This notebook contains the code to convert raw downloaded external data into a cleaned or simplified version for tutorial purposes.


The raw data is expected to be in the `./raw` sub-directory (not included in the git repo).

In [1]:
%matplotlib inline

import pandas as pd
import geopandas

## Countries dataset

http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/

In [2]:
countries = geopandas.read_file("zip://./raw/original_data_ne/ne_110m_admin_0_countries.zip")

In [3]:
countries.head()

Unnamed: 0,scalerank,featurecla,LABELRANK,SOVEREIGNT,SOV_A3,ADM0_DIF,LEVEL,TYPE,ADMIN,ADM0_A3,...,REGION_WB,NAME_LEN,LONG_LEN,ABBREV_LEN,TINY,HOMEPART,MIN_ZOOM,MIN_LABEL,MAX_LABEL,geometry
0,1,Admin-0 country,3.0,Afghanistan,AFG,0.0,2.0,Sovereign country,Afghanistan,AFG,...,South Asia,11.0,11.0,4.0,-99.0,1.0,0.0,3.0,7.0,"POLYGON ((61.21081709172574 35.65007233330923,..."
1,1,Admin-0 country,3.0,Angola,AGO,0.0,2.0,Sovereign country,Angola,AGO,...,Sub-Saharan Africa,6.0,6.0,4.0,-99.0,1.0,0.0,3.0,7.0,(POLYGON ((23.90415368011818 -11.7222815894063...
2,1,Admin-0 country,6.0,Albania,ALB,0.0,2.0,Sovereign country,Albania,ALB,...,Europe & Central Asia,7.0,7.0,4.0,-99.0,1.0,0.0,5.0,10.0,"POLYGON ((21.0200403174764 40.84272695572588, ..."
3,1,Admin-0 country,4.0,United Arab Emirates,ARE,0.0,2.0,Sovereign country,United Arab Emirates,ARE,...,Middle East & North Africa,20.0,20.0,6.0,-99.0,1.0,0.0,4.0,9.0,"POLYGON ((51.57951867046327 24.24549713795111,..."
4,1,Admin-0 country,2.0,Argentina,ARG,0.0,2.0,Sovereign country,Argentina,ARG,...,Latin America & Caribbean,9.0,9.0,4.0,-99.0,1.0,0.0,2.0,7.0,(POLYGON ((-66.95992000000001 -54.896810000000...


In [4]:
len(countries)

177

In [5]:
countries_subset = countries[['ADM0_A3', 'NAME', 'CONTINENT', 'POP_EST', 'GDP_MD_EST', 'geometry']]

In [6]:
countries_subset.columns = countries_subset.columns.str.lower()

In [7]:
countries_subset = countries_subset.rename(columns={'adm0_a3': 'iso_a3'})

In [8]:
countries_subset.head()

Unnamed: 0,iso_a3,name,continent,pop_est,gdp_md_est,geometry
0,AFG,Afghanistan,Asia,34124811.0,64080.0,"POLYGON ((61.21081709172574 35.65007233330923,..."
1,AGO,Angola,Africa,29310273.0,189000.0,(POLYGON ((23.90415368011818 -11.7222815894063...
2,ALB,Albania,Europe,3047987.0,33900.0,"POLYGON ((21.0200403174764 40.84272695572588, ..."
3,ARE,United Arab Emirates,Asia,6072475.0,667200.0,"POLYGON ((51.57951867046327 24.24549713795111,..."
4,ARG,Argentina,South America,44293293.0,879400.0,(POLYGON ((-66.95992000000001 -54.896810000000...


In [9]:
countries_subset.to_file("ne_110m_admin_0_countries.shp")

## Natural Earth - Cities dataset

http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-populated-places/ (simple, version 4.0.0, downloaded May 2018)

In [10]:
cities = geopandas.read_file("zip://./raw/original_data_ne/ne_110m_populated_places_simple.zip")

In [11]:
cities.head()

Unnamed: 0,scalerank,natscale,labelrank,featurecla,name,namepar,namealt,diffascii,nameascii,adm0cap,...,pop_other,rank_max,rank_min,geonameid,meganame,ls_name,ls_match,checkme,min_zoom,geometry
0,8,10,3,Admin-0 capital,Vatican City,,,0,Vatican City,1.0,...,562430,2,2,6691831.0,,Vatican City,1,0,7.0,POINT (12.45338654497177 41.90328217996012)
1,7,20,0,Admin-0 capital,San Marino,,,0,San Marino,1.0,...,0,7,7,3168070.0,,San Marino,1,5,6.1,POINT (12.44177015780014 43.936095834768)
2,7,20,0,Admin-0 capital,Vaduz,,,0,Vaduz,1.0,...,33009,7,5,3042030.0,,Vaduz,1,0,6.7,POINT (9.516669472907267 47.13372377429357)
3,6,30,8,Admin-0 capital alt,Lobamba,,,0,Lobamba,0.0,...,0,5,4,935048.0,,Lobamba,1,5,6.0,POINT (31.19999710971274 -26.46666746135247)
4,6,30,8,Admin-0 capital,Luxembourg,,,0,Luxembourg,1.0,...,106219,9,8,2960316.0,,Luxembourg,1,0,6.0,POINT (6.130002806227083 49.61166037912108)


In [12]:
len(cities)

243

In [13]:
cities_subset = cities[['name', 'geometry']]

In [14]:
cities_subset.head()

Unnamed: 0,name,geometry
0,Vatican City,POINT (12.45338654497177 41.90328217996012)
1,San Marino,POINT (12.44177015780014 43.936095834768)
2,Vaduz,POINT (9.516669472907267 47.13372377429357)
3,Lobamba,POINT (31.19999710971274 -26.46666746135247)
4,Luxembourg,POINT (6.130002806227083 49.61166037912108)


In [15]:
cities_subset.to_file("ne_110m_populated_places.shp")

## Natural Earth - Rivers dataset

http://www.naturalearthdata.com/downloads/50m-physical-vectors/50m-rivers-lake-centerlines/ (version 4.0.0, downloaded May 2018)

In [16]:
rivers = geopandas.read_file("zip://./raw/ne_50m_rivers_lake_centerlines.zip")

In [17]:
rivers.head()

Unnamed: 0,scalerank,featurecla,name,note,min_zoom,name_alt,name_en,min_label,geometry
0,6,Lake Centerline,Kama,,5.0,,Kama,6.0,LINESTRING (51.9371337598152 55.70106609892139...
1,6,River,Kama,,5.0,,Kama,6.0,LINESTRING (53.69384765584471 58.2063174502901...
2,3,Lake Centerline,Abay,,3.0,,Abay,4.0,LINESTRING (37.11301150887408 11.8549872909308...
3,3,Lake Centerline,Al Furat,,3.0,,Al Furat,4.0,LINESTRING (38.56119184742585 35.8626433379197...
4,6,Lake Centerline,Alabama,,5.0,,Alabama,6.0,(LINESTRING (-86.52176754393696 33.03211843501...


Remove rows with missing geometry:

In [18]:
len(rivers)

462

In [19]:
rivers = rivers[~rivers.geometry.isna()].reset_index(drop=True)

In [20]:
len(rivers)

461

Subset of the columns:

In [21]:
rivers_subset = rivers[['featurecla', 'name_en', 'geometry']].rename(columns={'name_en': 'name'})

In [22]:
rivers_subset.head()

Unnamed: 0,featurecla,name,geometry
0,Lake Centerline,Kama,LINESTRING (51.9371337598152 55.70106609892139...
1,River,Kama,LINESTRING (53.69384765584471 58.2063174502901...
2,Lake Centerline,Abay,LINESTRING (37.11301150887408 11.8549872909308...
3,Lake Centerline,Al Furat,LINESTRING (38.56119184742585 35.8626433379197...
4,Lake Centerline,Alabama,(LINESTRING (-86.52176754393696 33.03211843501...


In [23]:
rivers_subset.to_file("ne_50m_rivers_lake_centerlines.shp")

## Paris districts

Source: https://opendata.paris.fr/explore/dataset/quartier_paris/ (downloaded as GeoJSON file on August 20, 2018)

Administrative districts, polygon dataset

In [2]:
districts = geopandas.read_file("./raw/quartier_paris.geojson")

In [3]:
districts.head()

Unnamed: 0,n_sq_qu,perimetre,objectid,longueur,c_qu,surface,n_sq_ar,c_quinsee,l_qu,c_ar,geometry
0,750000010,2139.625388,50,2139.535591,10,271750.323937,750000003,7510302,Enfants-Rouges,3,"POLYGON ((2.367101341254551 48.86162755885409,..."
1,750000016,3283.163371,56,3282.999717,16,378252.153674,750000004,7510404,Notre-Dame,4,"POLYGON ((2.361313701339139 48.84858030437791,..."
2,750000018,4052.729521,58,4052.473226,18,798389.398463,750000005,7510502,Jardin-des-Plantes,5,"POLYGON ((2.364561460891576 48.84365746114398,..."
3,750000025,3827.253353,7,3827.053421,25,826559.43678,750000007,7510701,Saint-Thomas-d'Aquin,7,"POLYGON ((2.322133508640103 48.84924973446431,..."
4,750000039,3245.891413,21,3245.778222,39,609034.654451,750000010,7511003,Porte-Saint-Martin,10,"POLYGON ((2.363917183048105 48.86754108728465,..."


In [4]:
districts = districts.rename(columns={'l_qu': 'district_name', 'c_qu': 'id'}).sort_values('id').reset_index(drop=True)

Add population data (based on pdfs downloaded from ..):

In [5]:
population = pd.read_csv("./raw/paris-population.csv")

In [6]:
population['temp'] = population.district_name.str.lower()

In [7]:
population['temp'] = population['temp'].replace({
    'javel': 'javel 15art',
    'saint avoye': 'sainte avoie',
    "saint germain l'auxerrois": "st germain l'auxerrois",
    'plaine monceau': 'plaine de monceaux',
    'la   chapelle': 'la chapelle'})

In [8]:
districts['temp'] = (districts.district_name.str.lower().str.replace('-', ' ')
                              .str.replace('é', 'e').str.replace('è', 'e').str.replace('ê', 'e').str.replace('ô', 'o'))

In [9]:
res = pd.merge(districts, population[['population', 'temp']], on='temp', how='outer')

In [10]:
assert len(res) == len(districts)

In [11]:
districts = res[['id', 'district_name', 'population', 'geometry']]

In [12]:
districts.head()

Unnamed: 0,id,district_name,population,geometry
0,1,St-Germain-l'Auxerrois,1672,"POLYGON ((2.344593389828428 48.85404991486192,..."
1,2,Halles,8984,"POLYGON ((2.349365804803003 48.86057567227663,..."
2,3,Palais-Royal,3195,"POLYGON ((2.339465868602756 48.86213531210705,..."
3,4,Place-Vendôme,3044,"POLYGON ((2.331944969393234 48.86491285292422,..."
4,5,Gaillon,1345,"POLYGON ((2.336320212305949 48.8679713890312, ..."


In [13]:
districts.to_file("processed/paris_districts.geojson", driver='GeoJSON')

In [14]:
districts = districts.to_crs(epsg=32631)

In [15]:
districts.to_file("paris_districts_utm.geojson", driver='GeoJSON')

## Commerces de Paris

Source: https://opendata.paris.fr/explore/dataset/commercesparis/ (downloaded as csv file (`commercesparis.csv`) on October 30, 2018)

In [2]:
df = pd.read_csv("./raw/commercesparis.csv", sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.iloc[0]

ORDRE                              2379
ARRONDISSEMENT                    75001
QUARTIER                              3
IRIS                         7.5101e+08
ILOT                         7.5101e+10
NUMERO                                3
LET                                 NaN
TYPE VOIE                           RUE
LIBELLE VOIE                  PYRAMIDES
ADRESSE COMPLETE        3 RUE PYRAMIDES
CFA                                7902
X                                651020
Y                           6.86291e+06
XY                  48.864299, 2.332401
SEQUENCE                              3
SITUATION                       Sur rue
CODE ACTIVITE                     CH106
LIBELLE ACTIVITE    Restaurant européen
SURFACE                               1
CC ID                               NaN
CC NIV                              NaN
Name: 0, dtype: object

Take subset of the restaurants:

In [4]:
restaurants = df[df['CODE ACTIVITE'].str.startswith('CH1', na=False)].copy()

In [5]:
restaurants['LIBELLE ACTIVITE'].value_counts()

Restaurant traditionnel français                  1947
Restaurant asiatique                              1643
Restaurant européen                               1178
Restaurant indien, pakistanais et Moyen Orient     394
Restaurant maghrébin                               207
Restaurant africain                                138
Autre restaurant du monde                          107
Restaurant central et sud américain                 97
Restaurant antillais                                27
Name: LIBELLE ACTIVITE, dtype: int64

In [6]:
restaurants = restaurants.dropna(subset=['XY']).reset_index(drop=True)

Translate the restaurants and rename column:

In [7]:
restaurants['LIBELLE ACTIVITE'] = restaurants['LIBELLE ACTIVITE'].replace({
    'Restaurant traditionnel français': 'Traditional French restaurant',
    'Restaurant asiatique': 'Asian restaurant',
    'Restaurant européen': 'European restuarant',
    'Restaurant indien, pakistanais et Moyen Orient': 'Indian / Middle Eastern restaurant',
    'Restaurant maghrébin': 'Maghrebian restaurant',
    'Restaurant africain': 'African restaurant',
    'Autre restaurant du monde': 'Other world restaurant',
    'Restaurant central et sud américain': 'Central and South American restuarant',
    'Restaurant antillais': 'Caribbean restaurant'
})

In [8]:
restaurants = restaurants.rename(columns={'LIBELLE ACTIVITE': 'type'})

Create GeoDataFrame

In [9]:
from shapely.geometry import Point

In [10]:
restaurants['geometry'] = restaurants['XY'].str.split(', ').map(lambda x: Point(float(x[1]), float(x[0])))

In [11]:
restaurants = geopandas.GeoDataFrame(restaurants[['type', 'geometry']], crs={'init': 'epsg:4326'})

In [12]:
restaurants.head()

Unnamed: 0,type,geometry
0,European restuarant,POINT (2.332401 48.864299)
1,Traditional French restaurant,POINT (2.331778 48.86526)
2,Traditional French restaurant,POINT (2.332541 48.865932)
3,Indian / Middle Eastern restaurant,POINT (2.332785 48.866285)
4,Traditional French restaurant,POINT (2.332008 48.866444)


In [13]:
restaurants.to_file("processed/paris_restaurants.gpkg", driver='GPKG')