# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
# Let pandas render up to 300 rows before it truncates a displayed frame.
pd.options.display.max_rows = 300

## Load dataset

In [2]:
# Read the ';'-separated, cp1250-encoded export; 'debut' and 'fin' are parsed as dates.
# The row number is promoted to a 'mission_id' column and the 'libelle' column is discarded.
missions = (
    pd.read_csv(
        "data/missions.csv",
        sep=';',
        header=0,
        encoding='cp1250',
        parse_dates=['debut', 'fin'],
    )
    .reset_index()
    .rename(columns={'index': 'mission_id'})
    .drop(columns='libelle')
)
missions.sample(n=5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,faculte
649,649,2012-07-15,2011-08-12,,,France,Bibracte,Centre archéologique européen Bibracte,,HAA,PHILOSCSOC
1357,1357,2014-05-15,2014-05-16,,,Allemagne,Heidelberg,Universität Heidelberg,,L&L,LTC
829,829,2012-12-10,2012-12-12,,,france,avignon,Université d'Avignon et des pays du Vaucluse,,DECA,LTC
5314,5314,2019-06-03,2019-06-05,Belgique,Bruxelles,France,Nancy,Université de Lorraine,voiture_diesel,DECS,SBS
4140,4140,2018-10-07,2018-10-15,Belgique,Bruxelles,Canada,Quebec,"Université de Laval, Quebec",avion_long_courrier_eco,ScPOL,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [3]:
# Some missions have their start ('debut') and end ('fin') dates inverted.
# The original start dates are saved before anything is overwritten: writing 'debut'
# into 'fin' first and then copying 'fin' back into 'debut' would leave both columns
# equal to the old start date instead of swapping them.
inverted = missions['fin'] < missions['debut']
debut_before_swap = missions['debut'].copy()
missions.loc[inverted, 'debut'] = missions.loc[inverted, 'fin'].values
missions.loc[inverted, 'fin'] = debut_before_swap[inverted].values
# Duration in days, counting both the first and the last day of the mission.
missions['elapsed'] = ((missions['fin'] - missions['debut']) + pd.Timedelta("1D")).dt.days

### Correct underpopulated category

In [4]:
# Fold the 'PHILA' label into 'PHILO'.
missions['acronyme'] = missions['acronyme'].replace(to_replace='PHILA', value='PHILO')

### Normalize city names

In [5]:
def norm(x, fromkey, tokey):
    """Store an ASCII-folded, trimmed, lower-case copy of a text column.

    The values of ``x[fromkey]`` are decomposed with Unicode NFKD, every
    character without an ASCII representation (combining accents included) is
    dropped, surrounding whitespace is stripped and the text is lower-cased.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the source column; it is modified in place.
    fromkey : str
        Name of the column to normalise.
    tokey : str
        Name of the column receiving the normalised text.

    Returns
    -------
    None
    """
    decomposed = x[fromkey].str.normalize('NFKD')
    # Non-ASCII code points are silently discarded by the 'ignore' error handler.
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    ascii_text = ascii_bytes.str.decode('utf-8')
    x[tokey] = ascii_text.str.strip().str.lower()

In [6]:
# Normalised copies of the origin and destination city columns.
for raw_column, clean_column in (('ville_origine', 'from_city'), ('ville', 'dest_city')):
    norm(missions, raw_column, clean_column)

### Cleanse & split normalized city names

In [7]:
# Anything written between parentheses is removed from the destination text.
regreplace = r"\((.*?)\)"
# Separators found between several destinations listed in a single field.
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal string by default, which would leave the parenthesised text in place.
missions['dest_city'] = (
    missions['dest_city']
    .str.replace(regreplace, "", regex=True)
    .str.split(regsplit)
)

In [8]:
# One row per destination; rows coming from the same mission share an index label.
df = missions.explode('dest_city')
# Remove digits, then trim. The pattern is a raw string ("\d" is an invalid escape in a
# normal string literal) and regex=True is explicit because pandas >= 2.0 would
# otherwise search for the literal two-character text "\d".
df['dest_city'] = df['dest_city'].str.replace(r"\d", "", regex=True)
df['dest_city'] = df['dest_city'].str.strip()

In [9]:
# Correction table: the 'city' column holds a name to replace, 'corrected' its replacement.
corrections = pd.read_csv("data/city_trans.csv")
citytrans = dict(zip(corrections['city'], corrections['corrected']))

In [10]:
# Apply the correction table to both city columns.
for city_column in ('from_city', 'dest_city'):
    df[city_column] = df[city_column].replace(citytrans)

In [11]:
# A missing origin or destination falls back to 'bruxelles'.
for city_column in ('from_city', 'dest_city'):
    df[city_column] = df[city_column].fillna('bruxelles')

### Geocode city names

In [12]:
# Every distinct, non-missing city name appearing as an origin or as a destination.
all_city_names = [*df['from_city'], *df['dest_city']]
city = pd.DataFrame({'citycleansed': all_city_names}).dropna().drop_duplicates()
city.sample(n=5)

Unnamed: 0,citycleansed
7643,valladolid
8810,kiel
9225,konstanz
9414,gigny
7095,osuna


In [13]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    """Query the OSM geocoder for ``x`` and return the selected result fields.

    Parameters
    ----------
    x
        Query passed unchanged to ``geocoder.osm``.
    keys : list of str
        Names of the fields to pick from the JSON result.

    Returns
    -------
    dict
        One entry per name in ``keys``; the value is ``None`` when the field is
        absent or when the geocoder returned no result.

    Notes
    -----
    Each call sleeps 1.2 seconds after the request (see the usage policy linked
    below).
    NOTE(review): a later cell assigns a DataFrame to the name ``geocode``, after
    which this function is no longer reachable under that name.
    """
    # https://operations.osmfoundation.org/policies/nominatim/
    answer = geocoder.osm(x).json
    time.sleep(1.2)
    if not answer:
        answer = {}
    return {field: answer.get(field) for field in keys}

In [14]:
# Load previously stored geocoding results instead of querying the service again.
# NOTE(review): this rebinds `geocode`, hiding the function of the same name defined
# in the previous cell.
geocode = pd.read_json(path_or_buf='data/geocode.json')
geocode

Unnamed: 0,geocodeid,citycleansed,ISO2,city,lat,lon,accuracy,confidence
0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...,...
1138,1138,sitia,GR,,35.206625,26.104971,0.476402,4.0
1139,1139,szklarska poreba,PL,Szklarska Poręba,50.827585,15.521166,0.566010,3.0
1140,1140,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1141,1141,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0


### Create Travel Cycle

In [15]:
# One row per mission: its origin and the ordered list of its destinations.
# The former `default='first'` keyword was dropped: it is not an argument of
# `GroupBy.agg`; extra keywords are meant for the aggregation functions, and neither
# 'first' nor `list` accepts a `default` argument.
travels = (
    df.groupby("mission_id")
    .agg({'from_city': 'first', 'dest_city': list})
    .reset_index()
    .rename(columns={'from_city': 'from', 'dest_city': 'to'})
    .dropna()
)
travels.sample(5)

Unnamed: 0,mission_id,from,to
1822,1822,bruxelles,[bruxelles]
3770,3770,bruxelles,[lisbonne]
5988,5988,bruxelles,[bamberg]
2005,2005,bruxelles,"[cracovie, sofia]"
1578,1578,bruxelles,[montpellier]


In [16]:
# Closed itinerary of each mission: origin, every destination in order, origin again.
travels['chain'] = travels.apply(
    lambda row: [row['from'], *row['to'], row['from']],
    axis=1,
)
# One row per visited stop.
travels = travels.explode('chain')
# Pair each stop with the following stop of the same mission.
travels['end'] = travels.groupby("mission_id")["chain"].shift(-1)
# The final stop of a mission has no successor, so that row is discarded;
# the remaining rows are numbered to give each leg a 'travel_id'.
legs = travels.dropna(subset=['end']).reset_index(drop=True)
legs = legs.reset_index().rename(columns={'index': 'travel_id'})
# The mission-level 'from'/'to' columns are replaced by the endpoints of each leg.
travels = legs.drop(columns=['from', 'to']).rename(columns={'chain': 'from', 'end': 'to'})

In [17]:
# First legs, to check the chaining of consecutive stops.
travels.head(n=5)

Unnamed: 0,travel_id,mission_id,from,to
0,0,0,bruxelles,londres
1,1,0,londres,oxford
2,2,0,oxford,bruxelles
3,3,1,bruxelles,londres
4,4,1,londres,bruxelles


In [18]:
# Attach the geocoding columns twice: prefixed 'from_' for the start of each leg,
# then prefixed 'to_' for its end. Legs without a match keep missing values (left join).
for side in ('from', 'to'):
    travels = travels.merge(
        geocode.add_prefix(f'{side}_'),
        left_on=side,
        right_on=f'{side}_citycleansed',
        how='left',
    )

In [19]:
# The join keys duplicate 'from_citycleansed' / 'to_citycleansed' and are removed.
travels = travels.drop(columns=['from', 'to'])

In [20]:
# Random legs, to inspect the geocoded endpoints.
travels.sample(n=5)

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,to_geocodeid,to_citycleansed,to_ISO2,to_city,to_lat,to_lon,to_accuracy,to_confidence
12368,12368,5921,449,louvain la neuve,BE,,50.674169,4.614157,0.742712,6.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0
6309,6309,2993,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,130,cracovie,PL,,50.046943,19.997153,0.69366,1.0
7177,7177,3411,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,27,rome,IT,Roma,41.89332,12.482932,0.789611,1.0
11951,11951,5718,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,940,tilburg,NL,Tilburg,51.55847,5.083076,0.633356,2.0
12081,12081,5781,489,aarhus,DK,Aarhus,56.149628,10.213405,0.695327,1.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0


### Clean Missions

In [21]:
# English names for the columns that are kept.
column_names = {
    'debut': 'start',
    'fin': 'stop',
    'mode_deplacement': 'travel_type',
    'acronyme': 'acronym',
    'faculte': 'faculty',
}
# Location columns are removed from the mission table.
location_columns = ['pays_origine', 'ville_origine', 'pays', 'lieu', 'ville', 'from_city', 'dest_city']
# Short labels for the travel types listed here; other values are left unchanged.
travel_type_labels = {
    'avion_courte_distance': 'plane/short',
    'avion_long_courrier_eco': 'plane/eco',
    'avion_long_courrier_business': 'plane/first',
    'voiture_essence': 'car/gas',
    'voiture_diesel': 'car/diesel',
}
missions = missions.rename(columns=column_names).drop(columns=location_columns)
missions['travel_type'] = missions['travel_type'].replace(travel_type_labels)
missions.sample(n=10)

Unnamed: 0,mission_id,start,stop,travel_type,acronym,faculty,elapsed
2644,2644,2017-01-15,2017-01-16,,ScSOC,PHILOSCSOC,2
511,511,2012-03-20,2012-03-23,,L&L,LTC,4
1442,1442,2014-07-04,2014-07-09,,SIC,LTC,6
3973,3973,2018-04-19,2018-04-20,train,SIC,LTC,2
4829,4829,2019-01-22,2019-01-22,,DECS,SBS,1
3234,3234,2017-09-12,2017-09-17,,ScPOL,PHILOSCSOC,6
841,841,2013-01-13,2013-01-19,,L&L,LTC,7
97,97,2010-11-18,2010-11-19,,HAA,PHILOSCSOC,2
3573,3573,2018-03-15,2018-03-16,,ScPOL,PHILOSCSOC,2
2843,2843,2017-07-11,2017-07-25,,ScPOL,PHILOSCSOC,15


In [22]:
# Attach the mission attributes to every leg. The key is spelled out: without `on`,
# pandas joins on every column name the two frames share, so a column added to both
# frames later would silently change the join.
travels = travels.merge(missions, on='mission_id', how='inner')

In [23]:
# Display the merged table; the rendering is truncated to the first and last rows.
travels

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,...,to_lat,to_lon,to_accuracy,to_confidence,start,stop,travel_type,acronym,faculty,elapsed
0,0,0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-05-28,2010-06-01,,DECA,LTC,5
1,1,0,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,51.752013,-1.257850,0.760604,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
2,2,0,99,oxford,GB,Oxford,51.752013,-1.257850,0.760604,4.0,...,50.843671,4.367437,0.777530,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
3,3,1,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-06-22,2010-07-08,,SIC,LTC,17
4,4,1,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,50.843671,4.367437,0.777530,4.0,2010-06-22,2010-07-08,,SIC,LTC,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12802,12802,6128,36,amsterdam,NL,Amsterdam,52.372760,4.893604,0.826813,1.0,...,48.856697,2.351462,0.931710,2.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12803,12803,6128,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0,...,52.372760,4.893604,0.826813,1.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12804,12804,6129,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,38.894893,-77.036553,0.849289,1.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10
12805,12805,6129,267,washington,US,Washington,38.894893,-77.036553,0.849289,1.0,...,35.913154,-79.055780,0.746859,4.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10


### Compute distance using Geodesic

Avoid Cartesian (planar) distance computations on geographic coordinates unless the projection system in use is compatible with them, which is only reasonable over a small portion of the world.
For plane travel, the radius is increased by about 10 km when the distance is appreciable; how much this affects precision, relative to the uncertainty on the actual path, still needs to be quantified.

In [24]:
def distance(x):
    """Return the geodesic length, in kilometres, of one travel leg.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row providing 'from_lat', 'from_lon', 'to_lat' and 'to_lon'.

    Returns
    -------
    float or None
        Distance on the WGS-84 ellipsoid, or ``None`` when the coordinates are
        rejected by ``geodesic`` (for example missing or invalid values).

    Raises
    ------
    KeyError
        If one of the four coordinate fields is absent from ``x``. This used to
        be swallowed, which silently produced a column of missing distances.
    """
    # Field lookups stay outside the try block so that a missing column is reported.
    origin = (x['from_lat'], x['from_lon'])
    destination = (x['to_lat'], x['to_lon'])
    try:
        return geodesic(origin, destination, ellipsoid='WGS-84').kilometers
    except (ValueError, TypeError):
        # Best effort: an unusable coordinate pair yields no distance. A bare `except`
        # would also have intercepted KeyboardInterrupt during a long `apply`.
        return None

In [25]:
# Row-wise evaluation: one geodesic distance per leg.
travels['distance'] = travels.apply(distance, axis='columns')

In [26]:
# Random legs, to inspect the computed distances.
travels.sample(n=10)

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,...,to_lon,to_accuracy,to_confidence,start,stop,travel_type,acronym,faculty,elapsed,distance
7807,7807,3711,885,la sage,CH,,46.098757,7.514699,0.475,6.0,...,4.367437,0.77753,4.0,2018-02-08,2018-02-10,plane/short,ScPOL,PHILOSCSOC,3,576.537887
2565,2565,1221,463,provo,US,Provo,40.233844,-111.658534,0.62578,2.0,...,4.367437,0.77753,4.0,2014-04-21,2014-05-05,,HAA,PHILOSCSOC,15,8159.364495
3416,3416,1627,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,-0.127647,0.830783,1.0,2015-02-16,2015-02-17,,PHILO,PHILOSCSOC,2,322.849447
9596,9596,4574,48,bucarest,RO,București,44.436141,26.10272,0.701529,1.0,...,26.10272,0.701529,1.0,2018-10-26,2018-11-03,plane/short,ScPOL,PHILOSCSOC,9,0.0
5161,5161,2449,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,5.324099,0.569652,4.0,2016-08-25,2016-08-25,,HAA,PHILOSCSOC,1,75.083119
3739,3739,1781,226,aiseau-presles,BE,Aiseau-Presles,50.416358,4.571198,0.55,4.0,...,4.367437,0.77753,4.0,2015-06-22,2015-07-10,,HAA,PHILOSCSOC,19,49.672945
11876,11876,5680,270,rotterdam,NL,Rotterdam,51.927847,4.488595,0.767394,2.0,...,4.367437,0.77753,4.0,2019-09-23,2020-02-22,train,DECS,SBS,153,120.915175
10042,10042,4789,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,3.063528,0.753204,4.0,2019-01-22,2019-01-31,train,ScPOL,PHILOSCSOC,10,94.879968
3555,3555,1694,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,2.351462,0.93171,2.0,2015-04-15,2015-04-15,,SIC,LTC,1,264.293514
4231,4231,2009,2,paris,FR,Paris,48.856697,2.351462,0.93171,2.0,...,4.367437,0.77753,4.0,2016-05-19,2016-05-19,,ScSOC,PHILOSCSOC,1,264.293514


In [30]:
# Write the final table of legs to an Excel workbook (the index is written as well).
# NOTE(review): the execution counter jumps from 26 to 30 before this cell; confirm
# that the notebook reproduces this file with Restart & Run All.
travels.to_excel(excel_writer='data/travels.xlsx')