# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
df = pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df = df.reset_index().rename(columns={'index': 'mission_id'})
df.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
5367,5367,2019-06-14,2019-06-19,Belgique,Bruxelles,Portugal,Porto,Université de Porto,avion_courte_distance,L&L,Département de Langues et Lettres,LTC
5610,5610,2019-10-24,2019-10-27,Belgique,Bruxelles,Amsterdam,Pays-Bas,Universiteit van Amsterdam,train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS
4519,4519,2018-10-15,2018-10-17,Belgique,Bruxelles,FRANCE,Paris,CNL,train,DECA,"Décanat de la Faculté de Lettres, Traduction e...",LTC
665,665,2012-06-25,2011-01-01,,,France,Lille,Université Lille III,,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
3079,3079,2017-05-17,2017-05-20,,,Italie,Rome,Academia Belgica,,L&L,Département de Langues et Lettres,LTC


## Data Cleansing

### Swap inverted timestamps

In [3]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = ((df['fin'] - df['debut']) + pd.Timedelta("1D")).dt.days

### Normalize country names

In [4]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [5]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].dropna().unique())
    return keys

In [6]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [7]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

### Cleanse & split normalized city names

In [8]:
regreplace = r"\((.*?)\)"
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
df['dest_city'] = df['dest_city'].str.replace(regreplace,"").str.split(regsplit)
df = df.explode('dest_city')
df['dest_city'] = df['dest_city'].str.replace("\d", "")
df['dest_city'] = df['dest_city'].str.strip()

In [9]:
#df['dest_city'].sample(30)

### Geocode city names

https://operations.osmfoundation.org/policies/nominatim/

In [26]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    res = geocoder.osm(x).json
    time.sleep(1.2)
    return {k: (res or {}).get(k) for k in keys}

In [11]:
city = pd.DataFrame(list(df['from_city'].dropna().unique()) + list(df['dest_city'].unique()), columns=['city'])
city = city.dropna().drop_duplicates()
city['city'].sample(30).values

array(['la paix-dieu awap', 'bath', 'saint petersbourg', 'sfax',
       'auvelais', 'bonifacio', 'porto', 'aalborg', 'mexico city df',
       'nimcgue', 'middelburg', 'tokyo', 'loughborough', 'cairo',
       'les eyzies', 'tournus', 'mindelo',
       'egligny  abbaye cistercienne de preuilly', 'grenade', 'nanjing',
       "xi'an", 'irvine', 'palekastro crete', 'karlstad', 'weimar',
       'cotonou', 'la rochelle', 'jyvaskyla', 'abidjen', 'essen'],
      dtype=object)

In [12]:
#citygc = city['city'].apply(geocode).apply(pd.Series)
#citygc.to_pickle('geocoded.pickle')

In [13]:
citygc = pd.read_pickle('geocoded.pickle')
citygc['country_code'] = citygc['country_code'].str.upper()
citygc = citygc.rename(columns={'city': 'citycoded', 'country_code': 'ISO2'})

In [25]:
city

Unnamed: 0,city
0,bruxelles
1,charleroi
2,paris
3,casablanca
4,reading
...,...
1263,stadtoldendorf
1264,longueil-sainte-marie
1265,montpellie
1266,otsu


In [30]:
geocoded = pd.concat([city, citygc], axis=1).drop_duplicates('city')
geocoded.loc[geocoded['citycoded'].isnull(),:].to_excel("test.xlsx")

In [15]:
df['from_city'] = df['from_city'].fillna('bruxelles')

In [16]:
df = df.merge(geocoded.add_prefix('from_'), how='left')#, left_on='from_city', right_on='from_city')
df = df.merge(geocoded.add_prefix('dest_'), how='left')#, left_on='dest_city', right_on='from_city')

In [17]:
df = df.reset_index().rename(columns={'index': 'travel_id'})
df

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_lat,from_lng,from_accuracy,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence
0,0,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
1,1,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,Oxford,51.752013,-1.257850,0.760604,4.0
2,2,1,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
3,3,2,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
4,4,3,2010-07-12,2010-07-17,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6672,6672,6126,2020-02-23,2020-02-28,Belgique,Bruxelles,France,Toulouse/Perpignan/Montpellier,AgroParisTech [FR],train,...,50.843671,4.367437,0.777530,4.0,FR,Montpellier,43.611242,3.876734,0.741204,4.0
6673,6673,6127,2020-02-16,2020-02-18,Belgique,Bruxelles,France,Pessac,Université de Bordeaux-Montaigne,train,...,50.843671,4.367437,0.777530,4.0,FR,Pessac,44.805615,-0.630840,0.549654,4.0
6674,6674,6128,2020-03-02,2020-03-03,Pays-Bas,Amsterdam,France,Paris,OCDE,train,...,52.372760,4.893604,0.826813,1.0,FR,Paris,48.856697,2.351462,0.931710,2.0
6675,6675,6129,2020-01-28,2020-02-06,Belgique,Bruxelles,États-Unis,"Washington, Chapel Hill",Abilene Christian University [US],avion_long_courrier_eco,...,50.843671,4.367437,0.777530,4.0,US,Washington,38.894893,-77.036553,0.849289,1.0


### Compute distance using Geodesic

Avoid cartesian computation when dealing with geographic information unless your projection system is compatible with (reasonable on a small portion of the world uniquely).
Plane travels have a radius increased by 10 km when distance is appreciable, should quantify how it does affect precision before path uncertainty.

In [18]:
def point(x, lat, lon):
    if x[lat]: 
        return (x[lat], x[lon])

In [19]:
df['from_point'] = df.apply(lambda x: point(x, 'from_lat', 'from_lng'), axis=1)
df['dest_point'] = df.apply(lambda x: point(x, 'dest_lat', 'dest_lng'), axis=1)

In [20]:
def distance(x):
    try:
        return geodesic(x['from_point'], x['dest_point'], ellipsoid='WGS-84').kilometers
    except:
        pass

In [21]:
df['distance'] = df.apply(distance, axis=1)

In [22]:
df.sample(10)

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence,from_point,dest_point,distance
5731,5731,5244,2019-04-12,2019-04-14,Belgique,Bruxelles,Royaume-Uni,Londres,Royal Holloway University of London,,...,4.0,GB,London,51.507322,-0.127647,0.830783,1.0,"(50.8436709, 4.3674366933879565)","(51.5073219, -0.1276474)",322.849447
848,848,766,2012-10-08,2012-10-15,,,Canada,Calgary (Alberta),Mount Royal University,,...,4.0,CA,,51.053423,-114.062589,0.74148,1.0,"(50.8436709, 4.3674366933879565)","(51.0534234, -114.0625892)",7310.028569
3461,3461,3129,2017-11-07,2017-11-09,,,Italie,Rome,Sapienza university,,...,4.0,IT,Roma,41.89332,12.482932,0.789611,1.0,"(50.8436709, 4.3674366933879565)","(41.8933203, 12.4829321)",1173.045388
4172,4172,3784,2018-04-08,2018-04-18,Belgique,Bruxelles,Chine,Pekin,Beihang University,avion_long_courrier_eco,...,4.0,CN,东城区,39.906217,116.391276,0.734401,1.0,"(50.8436709, 4.3674366933879565)","(39.906217, 116.3912757)",7981.756237
6084,6084,5572,2019-06-22,2019-07-04,Belgique,Bruxelles,États-Unis,"Toulouse, Montpellier",TSE,train,...,4.0,FR,Montpellier,43.611242,3.876734,0.741204,4.0,"(50.8436709, 4.3674366933879565)","(43.6112422, 3.8767337)",804.919691
2962,2962,2678,2017-01-22,2017-01-25,,,france,"Paris, Compičgne, Lille",Agence nationale de la recherche ŕ ParisUniver...,,...,4.0,,,,,,,"(50.8436709, 4.3674366933879565)","(nan, nan)",
4015,4015,3637,2018-01-10,2018-01-12,,,Luxembourg,,30e colloque de l’ADMEE-Europe au Luxembourg1...,,...,4.0,GB,London,51.507322,-0.127647,0.830783,1.0,"(50.8436709, 4.3674366933879565)","(51.5073219, -0.1276474)",322.849447
2737,2737,2474,2016-07-10,2016-08-08,,,France,Egigny,Domaine de l'abbaye de Preuilly - 77126 Eglign...,,...,4.0,,,,,,,"(50.8436709, 4.3674366933879565)","(nan, nan)",
852,852,769,2012-10-10,2012-10-13,,,France,Rennes,IEP de Rennes,,...,4.0,FR,Rennes,48.111339,-1.68002,0.725112,4.0,"(50.8436709, 4.3674366933879565)","(48.1113387, -1.6800198)",533.036533
4500,4500,4081,2018-06-13,2018-06-15,Belgique,Bruxelles,France,Paris,Paris Sorbonne,train,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514


In [23]:
df.to_pickle("missions.pickle")