# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load dataset

In [2]:
# Read the raw mission export (semicolon-separated, cp1250-encoded), parsing the
# start/end columns as dates, then expose the row position as `mission_id` and
# discard the `libelle` column.
missions = (
    pd.read_csv(
        "data/missions.csv",
        sep=';',
        header=0,
        encoding='cp1250',
        parse_dates=['debut', 'fin'],
    )
    .reset_index()
    .rename(columns={'index': 'mission_id'})
    .drop(columns='libelle')
)
missions.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,faculte
3232,3232,2017-07-02,2017-07-09,,,Brésil,"Săo Paulo, Brasilia, Rio de Janeiro","USP, Cebrap, UnB, UERJ, UFF, FGV, etc.",,ScPOL,PHILOSCSOC
3025,3025,2017-04-09,2017-04-13,,,CANADA,MONTREAL,UNIVERSITE DE MONTREAL,,ScPOL,PHILOSCSOC
1785,1785,2015-07-31,2015-08-02,,,Angelterre,Londres - Dorset,Facet Publishing,,SIC,LTC
5269,5269,2019-05-08,2019-05-09,Belgique,Bruxelles,France,Paris,Société Française de Statistique,train,ScPOL,PHILOSCSOC
5882,5882,2019-11-11,2019-11-12,Belgique,Bruxelles,Allemagne,Hambourg,Universität Hamburg [DE],train,DECS,SBS


## Data Cleansing

### Swap inverted timestamps

In [3]:
# Rows whose end date precedes the start date have the two fields inverted.
# Comparisons involving NaT are False, so rows with a missing date are left alone.
inverted = missions['fin'] < missions['debut']

# Swap both columns at once. The two replacement Series are computed from the
# *original* frame before `assign` builds the new one, so neither side reads an
# already-overwritten value (assigning `fin` first and then copying it back into
# `debut` would leave both columns equal to the original `debut`).
missions = missions.assign(
    debut=missions['debut'].mask(inverted, missions['fin']),
    fin=missions['fin'].mask(inverted, missions['debut']),
)

# Inclusive duration in days: a mission starting and ending the same day counts as 1.
missions['elapsed'] = ((missions['fin'] - missions['debut']) + pd.Timedelta("1D")).dt.days

### Correct subpopulated category

In [4]:
# Fold the 'PHILA' label into 'PHILO'.
missions['acronyme'] = missions['acronyme'].replace('PHILA', 'PHILO')

### Normalize city names

In [5]:
def norm(x, fromkey, tokey):
    """Store a normalized copy of the text column ``x[fromkey]`` in ``x[tokey]``.

    The text is NFKD-decomposed, characters that cannot be encoded as ASCII are
    dropped (which removes the combining accents produced by the decomposition),
    surrounding whitespace is stripped and the result is lower-cased.

    The frame ``x`` is modified in place; nothing is returned.
    """
    decomposed = x[fromkey].str.normalize('NFKD')
    ascii_only = decomposed.str.encode('ascii', errors='ignore').str.decode('utf-8')
    x[tokey] = ascii_only.str.strip().str.lower()

In [6]:
# Normalize the origin and destination city columns into new working columns.
for source_col, target_col in (('ville_origine', 'from_city'), ('ville', 'dest_city')):
    norm(missions, source_col, target_col)

### Cleanse & split normalized city names

In [7]:
# Anything between parentheses (including the parentheses themselves).
regreplace = r"\((.*?)\)"
# Separators used to list several destinations in a single field.
regsplit = r'/|,|\&| et | puis |\+| and | - |;'

# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, in which case the parenthesised parts would never
# be removed.
missions['dest_city'] = (
    missions['dest_city']
    .str.replace(regreplace, "", regex=True)
    .str.split(regsplit)
)

In [8]:
# One row per (mission, destination) pair.
df = missions.explode('dest_city')
# Remove digits, then trim the whitespace left around each name. The pattern is
# a raw string (a plain "\d" is an invalid escape sequence) and `regex=True` is
# explicit because pandas >= 2.0 otherwise matches the pattern literally.
df['dest_city'] = df['dest_city'].str.replace(r"\d", "", regex=True).str.strip()

In [9]:
# Load the correction table and turn it into a {city: corrected} lookup.
citytrans = pd.read_csv("data/city_trans.csv")
citytrans = dict(zip(citytrans['city'], citytrans['corrected']))

In [10]:
# Apply the correction lookup to both city columns.
for city_col in ('from_city', 'dest_city'):
    df[city_col] = df[city_col].replace(citytrans)

In [11]:
# Missing origin or destination names are filled with 'bruxelles'.
for city_col in ('from_city', 'dest_city'):
    df[city_col] = df[city_col].fillna('bruxelles')

### Geocode city names

In [12]:
# Every distinct, non-missing city name appearing as an origin or a destination.
city = (
    pd.concat([df['from_city'], df['dest_city']], ignore_index=True)
    .to_frame('citycleansed')
    .dropna()
    .drop_duplicates()
)
city.sample(5)

Unnamed: 0,citycleansed
9824,colombo
9288,barcelonne
12887,tarragona
7578,poitou
11059,tongeren


In [13]:
def geocode(x, keys=('country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence')):
    """Look up ``x`` with ``geocoder.osm`` and return the selected result fields.

    Parameters
    ----------
    x : str
        Query passed as-is to ``geocoder.osm``.
    keys : iterable of str
        Fields to extract from the JSON result. The default is an immutable
        tuple so it cannot be altered between calls.

    Returns
    -------
    dict
        ``{key: value}`` for every requested key; the value is ``None`` when the
        key is absent or when the lookup produced no JSON result at all.

    Notes
    -----
    The function sleeps 1.2 s after every request; see the usage policy linked
    below.
    """
    # https://operations.osmfoundation.org/policies/nominatim/
    res = geocoder.osm(x).json
    time.sleep(1.2)
    # `.json` may be empty/None when nothing was found; fall back to an empty dict.
    payload = res or {}
    return {k: payload.get(k) for k in keys}

In [14]:
# Load the previously saved geocoding results.
# NOTE(review): this rebinds `geocode`, shadowing the lookup function of the
# same name defined in the previous cell; later cells use `geocode` as a DataFrame.
geocode = pd.read_json(path_or_buf='data/geocode.json')
geocode

Unnamed: 0,geocodeid,citycleansed,ISO2,city,lat,lon,accuracy,confidence
0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...,...
1138,1138,sitia,GR,,35.206625,26.104971,0.476402,4.0
1139,1139,szklarska poreba,PL,Szklarska Poręba,50.827585,15.521166,0.566010,3.0
1140,1140,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1141,1141,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0


### Create Travel Cycle

In [15]:
# One row per mission: its origin (first value seen) and the ordered list of
# destinations. The stray `default='first'` keyword was dropped: it is not an
# aggregation parameter and had no effect on the result.
travels = (
    df.groupby("mission_id")
    .agg({'from_city': 'first', 'dest_city': list})
    .reset_index()
    .rename(columns={'from_city': 'from', 'dest_city': 'to'})
    .dropna()
)
travels.sample(5)

Unnamed: 0,mission_id,from,to
4170,4170,bruxelles,[chicago]
970,970,bruxelles,[grenoble]
2118,2118,bruxelles,[paris]
4785,4785,bruxelles,[leyde]
57,57,bruxelles,[prague]


In [16]:
# Build the full itinerary of each mission (origin -> destinations -> origin)
# and unfold it into one row per visited place.
travels = travels.assign(
    chain=travels.apply(lambda row: [row['from'], *row['to'], row['from']], axis=1)
).explode('chain')

# Pair every place with the next one visited within the same mission.
travels = travels.assign(end=travels.groupby("mission_id")["chain"].shift(-1))

# The last place of each itinerary has no successor and is discarded; the
# remaining rows are the individual legs, numbered consecutively as `travel_id`.
travels = (
    travels[travels['end'].notna()]
    .reset_index(drop=True)
    .rename_axis('travel_id')
    .reset_index()
    .drop(columns=['from', 'to'])
    .rename(columns={'chain': 'from', 'end': 'to'})
)

In [17]:
# Preview the first legs.
travels.head(n=5)

Unnamed: 0,travel_id,mission_id,from,to
0,0,0,bruxelles,londres
1,1,0,londres,oxford
2,2,0,oxford,bruxelles
3,3,1,bruxelles,londres
4,4,1,londres,bruxelles


In [18]:
# Attach the geocoding columns to each end of the leg, prefixed with 'from_' / 'to_'.
# Left joins keep legs whose city has no geocoding entry.
for side in ('from', 'to'):
    travels = travels.merge(
        geocode.add_prefix(side + '_'),
        left_on=side,
        right_on=side + '_citycleansed',
        how='left',
    )

In [19]:
# The join keys duplicate `from_citycleansed` / `to_citycleansed`; drop them.
travels = travels.drop(columns=['from', 'to'])

In [20]:
# Spot-check a few geocoded legs.
travels.sample(n=5)

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,to_geocodeid,to_citycleansed,to_ISO2,to_city,to_lat,to_lon,to_accuracy,to_confidence
6127,6127,2906,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,307,helsinki,FI,Helsinki,60.16741,24.942577,0.8385,1.0
5525,5525,2623,436,amiens,FR,Amiens,49.894171,2.295695,0.724949,4.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0
8409,8409,4000,798,campomoro,FR,,41.628822,8.81647,0.375,6.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0
12678,12678,6070,179,rennes,FR,Rennes,48.111339,-1.68002,0.725112,4.0,6,liege,BE,Liège,50.645138,5.57342,0.6366,3.0
11213,11213,5357,6,liege,BE,Liège,50.645138,5.57342,0.6366,3.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0


### Clean Missions

In [21]:
# Remove the raw location columns and the normalized working columns, switch
# the remaining column names to English, and recode the travel modes.
missions = (
    missions
    .drop(columns=['pays_origine', 'ville_origine', 'pays', 'lieu', 'ville', 'from_city', 'dest_city'])
    .rename(columns={
        'debut': 'start',
        'fin': 'stop',
        'mode_deplacement': 'travel_type',
        'acronyme': 'acronym',
        'faculte': 'faculty',
    })
)
missions = missions.assign(
    travel_type=missions['travel_type'].replace({
        'avion_courte_distance': 'plane/short',
        'avion_long_courrier_eco': 'plane/eco',
        'avion_long_courrier_business': 'plane/first',
        'voiture_essence': 'car/gas',
        'voiture_diesel': 'car/diesel',
    })
)
missions.sample(10)

Unnamed: 0,mission_id,start,stop,travel_type,acronym,faculty,elapsed
6077,6077,2020-02-27,2020-03-03,train,L&L,LTC,6
4646,4646,2018-11-10,2018-11-18,plane/eco,SIC,LTC,9
5431,5431,2019-06-14,2019-06-17,car/diesel,DECS,SBS,4
5568,5568,2019-06-24,2019-06-24,train,ScSOC,PHILOSCSOC,1
895,895,2013-02-13,2013-02-14,,PHILO,PHILOSCSOC,2
3447,3447,2017-10-24,2017-10-24,,L&L,LTC,1
5343,5343,2019-06-18,2019-06-24,plane/eco,DECA,LTC,7
5675,5675,2019-09-25,2019-09-28,,DECS,SBS,4
2182,2182,2016-05-17,2016-05-18,,PHILO,PHILOSCSOC,2
2679,2679,2016-12-05,2016-12-14,,SIC,LTC,10


In [22]:
# Attach the mission attributes to every leg. The key is spelled out so that a
# column name accidentally shared by both frames cannot silently become part of
# the join condition.
travels = travels.merge(missions, on='mission_id', how='inner')

In [23]:
# Display the merged legs table.
travels

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,...,to_lat,to_lon,to_accuracy,to_confidence,start,stop,travel_type,acronym,faculty,elapsed
0,0,0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-05-28,2010-06-01,,DECA,LTC,5
1,1,0,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,51.752013,-1.257850,0.760604,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
2,2,0,99,oxford,GB,Oxford,51.752013,-1.257850,0.760604,4.0,...,50.843671,4.367437,0.777530,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
3,3,1,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-06-22,2010-07-08,,SIC,LTC,17
4,4,1,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,50.843671,4.367437,0.777530,4.0,2010-06-22,2010-07-08,,SIC,LTC,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12802,12802,6128,36,amsterdam,NL,Amsterdam,52.372760,4.893604,0.826813,1.0,...,48.856697,2.351462,0.931710,2.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12803,12803,6128,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0,...,52.372760,4.893604,0.826813,1.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12804,12804,6129,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,38.894893,-77.036553,0.849289,1.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10
12805,12805,6129,267,washington,US,Washington,38.894893,-77.036553,0.849289,1.0,...,35.913154,-79.055780,0.746859,4.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10


### Compute distance using Geodesic

Avoid cartesian distance computations when dealing with geographic coordinates, unless your projection system is compatible with them (which is only reasonable over a small portion of the world).
For plane travel the effective radius is increased by about 10 km when the distance is appreciable; TODO: quantify how much this affects precision compared with the uncertainty on the actual path.

In [24]:
def distance(x):
    """Return the WGS-84 geodesic distance, in kilometres, between a leg's endpoints.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide ``from_lat``, ``from_lon``, ``to_lat`` and ``to_lon``.

    Returns
    -------
    float or None
        The distance in kilometres, or ``None`` when the coordinates are
        rejected as invalid (``ValueError`` / ``TypeError``), e.g. presumably
        for legs without a geocoding match -- TODO confirm against the
        installed geopy version.

    Notes
    -----
    Only coordinate-related errors are swallowed. A missing column (``KeyError``)
    or a user interrupt now propagates instead of silently yielding ``None``.
    """
    origin = (x['from_lat'], x['from_lon'])
    destination = (x['to_lat'], x['to_lon'])
    try:
        return geodesic(origin, destination, ellipsoid='WGS-84').kilometers
    except (ValueError, TypeError):
        return None

In [25]:
# Row-wise geodesic distance of every leg.
travels = travels.assign(distance=travels.apply(distance, axis='columns'))

In [26]:
# Spot-check the computed distances.
travels.sample(n=10)

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,...,to_lon,to_accuracy,to_confidence,start,stop,travel_type,acronym,faculty,elapsed,distance
10048,10048,4792,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,-3.703582,0.85807,1.0,2019-01-20,2019-01-23,,SIC,LTC,4,1316.913134
10076,10076,4805,32,maastricht,NL,Maastricht,50.851244,5.690977,0.692908,4.0,...,4.367437,0.77753,4.0,2019-01-25,2019-01-25,car/gas,HAA,PHILOSCSOC,1,93.21632
6945,6945,3297,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,-0.137406,0.700602,5.0,2017-09-13,2017-09-13,,SIC,LTC,1,317.324284
1726,1726,818,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,19.146073,0.840844,1.0,2012-11-23,2012-11-25,,DECA,LTC,3,1138.521333
2345,2345,1116,58,montreal,CA,Montréal,45.497216,-73.610364,0.711736,1.0,...,4.367437,0.77753,4.0,2013-09-21,2013-09-29,,SIC,LTC,9,5555.626465
11874,11874,5679,32,maastricht,NL,Maastricht,50.851244,5.690977,0.692908,4.0,...,4.367437,0.77753,4.0,2019-09-19,2019-09-20,car/diesel,DECS,SBS,2,93.21632
16,16,7,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,26.254175,0.275,6.0,2010-07-10,2010-07-28,,HAA,PHILOSCSOC,19,2472.69118
11629,11629,5559,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,...,13.38886,0.887539,1.0,2019-05-05,2019-05-07,plane/short,DECS,SBS,3,650.656508
12338,12338,5906,142,lyon,FR,Lyon,45.757814,4.832011,0.791378,4.0,...,4.367437,0.77753,4.0,2019-11-25,2019-11-29,train,HAA,PHILOSCSOC,5,566.573495
7919,7919,3766,2,paris,FR,Paris,48.856697,2.351462,0.93171,2.0,...,4.367437,0.77753,4.0,2018-03-14,2018-03-14,train,HAA,PHILOSCSOC,1,264.293514


In [27]:
# Export the legs table, without the index column.
travels.to_excel(excel_writer='data/travels.xlsx', index=False)