# SC/BEP First Analysis

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
# Country geometries. Pass the path straight to geopandas instead of opening
# the file by hand: the previous `.open().read()` never closed the handle and
# fed the raw GeoJSON text to an argument that is meant to be a path/URL/file.
country = gpd.read_file("country.geojson")

In [3]:
# Raw mission records: ';'-separated, cp1250-encoded, with start ('debut') and
# end ('fin') parsed as dates. The original row position is kept as an explicit
# `mission_id` column so it survives the row-multiplying steps further down.
df = (
    pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
      .rename_axis('mission_id')
      .reset_index()
)
df.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
3124,3124,2017-06-06,2017-07-03,,,Maroc,Marrakech et Rabat,Centre Jacques Berque (CNRS - Rabat),,ScSOC,Département de Sciences Sociales et des scienc...,PHILOSCSOC
1108,1108,2013-11-11,2013-11-14,,,Chine,Pékin,Communication University of China,,DECA,"Décanat de la Faculté de Lettres, Traduction e...",LTC
4679,4679,2018-11-23,2018-11-23,Belgique,Bruxelles,Royaume-Uni,Oxford,University of Oxford [GB],train,DECB,Décanat de la Faculté de Philosophie et Scienc...,PHILOSCSOC
286,286,2011-04-29,2011-05-28,,,Syrie,,,,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
5801,5801,2019-10-30,2019-11-05,Begique,Bruxelles,Maroc,Zagora,Structure : festival culturel Taragalte,avion_courte_distance,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [4]:
# Trip length in whole days; a negative value means start and end were entered
# the wrong way round.
df['elapsed'] = (df['fin'] - df['debut']).dt.days
inverted = df['elapsed'] < 0

# Snapshot BOTH columns before writing anything. Assigning one column and then
# reading it back for the other (as was done before) copies the same date into
# both columns instead of swapping them.
start_dates = df.loc[inverted, 'debut'].values
end_dates = df.loc[inverted, 'fin'].values
df.loc[inverted, 'debut'] = end_dates
df.loc[inverted, 'fin'] = start_dates

# Recompute now that every row has debut <= fin.
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    """Store a normalized copy of text column `fromkey` in column `tokey`.

    The text is NFKD-decomposed, stripped of every non-ASCII character (which
    removes the accents split off by the decomposition), trimmed of surrounding
    whitespace and lower-cased. Missing values stay missing.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to read from and write to; it is modified in place.
    fromkey : str
        Name of the source column.
    tokey : str
        Name of the column that receives the normalized text.

    Returns
    -------
    None
    """
    decomposed = x[fromkey].str.normalize('NFKD')
    ascii_only = decomposed.str.encode('ascii', errors='ignore').str.decode('utf-8')
    x[tokey] = ascii_only.str.strip().str.lower()

In [6]:
def unique(x, columns):
    """Return the set of distinct non-missing values found in the given columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str
        Names of the columns whose values are pooled together.

    Returns
    -------
    set
        Union of the distinct values of every listed column; empty when
        `columns` is empty.
    """
    return set().union(*(x[name].dropna().unique() for name in columns))

In [7]:
# Normalized country names: origin -> `from_country`, destination -> `dest_country`.
for raw_col, clean_col in (('pays_origine', 'from_country'), ('pays', 'dest_country')):
    norm(df, raw_col, clean_col)

In [8]:
# Normalized city names: origin -> `from_city`, destination -> `dest_city`.
for raw_col, clean_col in (('ville_origine', 'from_city'), ('ville', 'dest_city')):
    norm(df, raw_col, clean_col)

### Cleanse & split normalized city names

In [9]:
# Parenthesised remarks inside a city field, e.g. "paris (cnrs)".
regreplace = r"\((.*?)\)"
# Separators used when several destinations share one field.
regsplit = r'/|,|\&| et | puis |\+| and | - |;'

# `regex=True` is required: from pandas 2.0 `Series.str.replace` matches the
# pattern literally by default, which would leave the remarks untouched.
df['dest_city'] = (
    df['dest_city']
      .str.replace(regreplace, "", regex=True)
      .str.split(regsplit)
)

# One row per destination city; the other columns of the mission are repeated.
df = df.explode('dest_city')

# Remove digits, then the whitespace left around each name by the split.
# The pattern is a raw string so `\d` is not read as a (bad) string escape.
df['dest_city'] = df['dest_city'].str.replace(r"\d", "", regex=True)
df['dest_city'] = df['dest_city'].str.strip()

In [10]:
#df['dest_city'].sample(30)

### Geocode city names

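City names are geocoded with OpenStreetMap's Nominatim service (through `geocoder.osm`); its usage policy is at https://operations.osmfoundation.org/policies/nominatim/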

In [11]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    """Geocode one place name with `geocoder.osm` and keep selected fields.

    Parameters
    ----------
    x : str
        Place name to look up.
    keys : list of str
        Fields to copy from the geocoder's JSON answer.

    Returns
    -------
    dict
        One entry per requested key; the value is None when the answer is
        empty or does not contain that key.

    Notes
    -----
    Sleeps 2 seconds after every request — presumably to stay within the
    geocoding service's usage policy; confirm before lowering it.
    """
    payload = geocoder.osm(x).json
    if not payload:
        # No usable answer: fall back to an empty mapping so every key yields None.
        payload = {}
    time.sleep(2.0)
    return {field: payload.get(field) for field in keys}

In [12]:
# Lookup table of every city name to geocode: distinct origins first (missing
# values dropped), then distinct destinations (missing value kept, as before).
# A name used both as origin and destination therefore appears twice.
# Row order matters: the cached geocoding results are attached by position.
origin_names = list(df['from_city'].dropna().unique())
dest_names = list(df['dest_city'].unique())
city = pd.DataFrame(origin_names + dest_names, columns=['city'])
city['city'].sample(30).values

array(['uppsala', 'bale', 'espagne', 'limerick', 'braunschweig',
       'djibouti', 'stanford', 'palaikastro sitias', 'reaading', 'ulcinj',
       'hong kong', 'new york city', 'campomoro', 'differentes villes',
       'mainz', 'nazca', 'paray-le-monial', 'winterthur', 'marrakech',
       'kalmar', 'wadi mussa', 'pontresina', 'marseille', 'il', 'tijuana',
       'atlanta', 'nyiregyhaza', 'taichung', 'athenes', 'modena'],
      dtype=object)

In [13]:
# Display the index of `city` (its row labels, hence the number of names to geocode).
city.index

RangeIndex(start=0, stop=1268, step=1)

In [14]:
# Geocoding results are cached on disk: a full run issues one throttled
# request per row of `city` (see `geocode`), so it is only done when the cache
# file is absent. Delete the file to force a refresh.
geocode_cache = 'geocoded.pickle'
if pathlib.Path(geocode_cache).exists():
    # NOTE: unpickling can execute arbitrary code — only load a cache file
    # that was produced by this notebook.
    citygc = pd.read_pickle(geocode_cache)
else:
    citygc = city['city'].apply(geocode).apply(pd.Series)
    citygc.to_pickle(geocode_cache)

# Upper-case the country code and rename the geocoder's columns so they do not
# clash with the `city` column of the lookup table.
citygc['country_code'] = citygc['country_code'].str.upper()
citygc = citygc.rename(columns={'city': 'citycoded', 'country_code': 'ISO2'})

In [15]:
# Put each city name next to its geocoding result (rows are matched on the
# index), then keep the first occurrence of every name so that `city` can act
# as a unique lookup key.
geocoded = (
    pd.concat([city, citygc], axis=1)
      .drop_duplicates(subset='city')
)
geocoded

Unnamed: 0,city,ISO2,citycoded,lat,lng,accuracy,confidence
0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...
1263,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1264,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0
1265,montpellie,,,,,,
1266,otsu,JP,,35.004710,135.868674,0.532413,1.0


In [16]:
# Rows with no recorded origin city get 'bruxelles' as their origin.
origin_missing = df['from_city'].isna()
df.loc[origin_missing, 'from_city'] = 'bruxelles'

In [17]:
# Attach the geocoding columns twice: once for the origin city (prefix
# `from_`) and once for the destination city (prefix `dest_`).
# The join key is spelled out rather than left to "whatever columns overlap",
# so the cell cannot silently join on extra columns. `validate` asserts that
# each city name maps to at most one geocoded row, i.e. the left join never
# multiplies rows.
df = df.merge(geocoded.add_prefix('from_'), how='left', on='from_city', validate='many_to_one')
df = df.merge(geocoded.add_prefix('dest_'), how='left', on='dest_city', validate='many_to_one')

In [18]:
# Turn the current row index into an explicit `travel_id` column (first
# column) and start again from a fresh positional index.
df = df.rename_axis('travel_id').reset_index()
df

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_lat,from_lng,from_accuracy,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence
0,0,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
1,1,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,Oxford,51.752013,-1.257850,0.760604,4.0
2,2,1,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
3,3,2,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
4,4,3,2010-07-12,2010-07-17,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6672,6672,6126,2020-02-23,2020-02-28,Belgique,Bruxelles,France,Toulouse/Perpignan/Montpellier,AgroParisTech [FR],train,...,50.843671,4.367437,0.777530,4.0,FR,Montpellier,43.611242,3.876734,0.741204,4.0
6673,6673,6127,2020-02-16,2020-02-18,Belgique,Bruxelles,France,Pessac,Université de Bordeaux-Montaigne,train,...,50.843671,4.367437,0.777530,4.0,FR,Pessac,44.805615,-0.630840,0.549654,4.0
6674,6674,6128,2020-03-02,2020-03-03,Pays-Bas,Amsterdam,France,Paris,OCDE,train,...,52.372760,4.893604,0.826813,1.0,FR,Paris,48.856697,2.351462,0.931710,2.0
6675,6675,6129,2020-01-28,2020-02-06,Belgique,Bruxelles,États-Unis,"Washington, Chapel Hill",Abilene Christian University [US],avion_long_courrier_eco,...,50.843671,4.367437,0.777530,4.0,US,Washington,38.894893,-77.036553,0.849289,1.0


### Compute distance using Geodesic

Avoid Cartesian (planar) distance computations on geographic coordinates unless the projection in use is suited to them, which is only reasonable over a small portion of the world.
Plane travel takes place at a radius roughly 10 km larger than the Earth's surface when the distance is appreciable; how much this affects precision should be quantified before considering the uncertainty of the actual flight path.

In [19]:
def point(x, lat, lon):
    """Build a `(latitude, longitude)` tuple from two fields of a row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row holding the coordinates.
    lat : str
        Key of the latitude field.
    lon : str
        Key of the longitude field.

    Returns
    -------
    tuple or None
        `(x[lat], x[lon])`, or None when the latitude is None.

    Notes
    -----
    The latitude is tested against None rather than for truthiness: a latitude
    of exactly 0.0 (a point on the equator) is a valid coordinate and used to
    be discarded. NaN coordinates are passed through unchanged, as before, so
    a row without geocoding still yields `(nan, nan)`.
    """
    latitude = x[lat]
    if latitude is None:
        return None
    return (latitude, x[lon])
In [20]:
# Coordinate pair of each trip's origin and destination, built row by row
# with `point`.
for target_col, lat_col, lng_col in (
    ('from_point', 'from_lat', 'from_lng'),
    ('dest_point', 'dest_lat', 'dest_lng'),
):
    df[target_col] = df.apply(point, axis=1, args=(lat_col, lng_col))

In [21]:
def _has_coords(pt):
    """Return True when `pt` is an iterable whose items are all non-missing."""
    try:
        # NaN is the only value that is not equal to itself.
        return all(c is not None and c == c for c in pt)
    except TypeError:
        # None, a bare NaN or any other non-iterable placeholder.
        return False


def distance(x):
    """Geodesic distance in kilometres between a row's origin and destination.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row with `from_point` and `dest_point`, each a `(lat, lon)` pair or a
        missing value.

    Returns
    -------
    float or None
        Distance on the WGS-84 ellipsoid, or None when either point is missing
        (None / NaN coordinates) or is rejected by geopy.
    """
    origin, destination = x['from_point'], x['dest_point']

    # Missing coordinates are handled explicitly instead of relying on geopy
    # to raise. NOTE(review): depending on the geopy version a None point may
    # be coerced rather than rejected — confirm before removing this check.
    if not (_has_coords(origin) and _has_coords(destination)):
        return None

    try:
        return geodesic(origin, destination, ellipsoid='WGS-84').kilometers
    except (TypeError, ValueError):
        # Malformed or out-of-range coordinates: no distance for this row.
        # Anything else (KeyboardInterrupt, programming errors) now propagates
        # instead of being silently swallowed by a bare `except`.
        return None

In [22]:
# Per-trip distance in kilometres, computed row by row with `distance`.
trip_km = df.apply(distance, axis=1)
df['distance'] = trip_km

In [23]:
# Display 10 randomly chosen rows to eyeball the new point and distance columns.
df.sample(10)

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence,from_point,dest_point,distance
5318,5318,4850,2019-01-27,2019-01-29,Belgique,Bruxelles,Italie,Rome,"Universitŕ di Roma - ""La Sapienza""",avion_courte_distance,...,4.0,IT,Roma,41.89332,12.482932,0.789611,1.0,"(50.8436709, 4.3674366933879565)","(41.8933203, 12.4829321)",1173.045388
4132,4132,3746,2018-04-11,2018-04-16,Belgique,Bruxelles,Royaume-Uni,Cambridge,University of Cambridge [GB],train,...,4.0,GB,Cambridge,52.203482,0.123582,0.742767,4.0,"(50.8436709, 4.3674366933879565)","(52.2034823, 0.1235817)",331.054075
3176,3176,2866,2017-04-11,2017-04-20,,,France,Aleria,Site d'Aléria,,...,4.0,FR,Aléria,42.113572,9.514473,0.513821,4.0,"(50.8436709, 4.3674366933879565)","(42.1135718, 9.5144732)",1047.195535
3805,3805,3443,2017-10-19,2017-10-21,,,France,Paris,IHA - Institut historique allemand,,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514
4210,4210,3818,2018-03-15,2018-03-15,Belgique,Bruxelles,France,Paris,Eurogip,train,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514
6387,6387,5866,2018-11-04,2020-03-04,Belgique,Bruxelles,République Démocratique du Congo,"Kisangani, Wanie-Rukula",Université de Kisangani [CD],avion_long_courrier_eco,...,4.0,CD,,0.193749,25.527917,0.5,4.0,"(50.8436709, 4.3674366933879565)","(0.1937489, 25.5279174)",5958.903853
2581,2581,2328,2016-06-10,2016-06-13,,,France,Paris,FIDH,,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514
4137,4137,3751,2018-02-10,2018-02-15,Belgique,Bruxelles,Suisse,Genčve,OMC,avion_courte_distance,...,4.0,,,,,,,"(50.8436709, 4.3674366933879565)","(nan, nan)",
5071,5071,4622,2018-11-20,2018-11-22,Belgique,Bruxelles,Royaume-Uni,Londre,"University College London, University of Londo...",train,...,4.0,GB,London,51.507322,-0.127647,0.830783,1.0,"(50.8436709, 4.3674366933879565)","(51.5073219, -0.1276474)",322.849447
3515,3515,3175,2017-06-29,2017-07-06,,,France et Grčce (Cręte),"Paris, divers Cręte",na,,...,4.0,GR,,35.406788,25.018286,0.101,10.0,"(50.8436709, 4.3674366933879565)","(35.4067884, 25.0182857)",2385.025681
