# SC/BEP First Analysis

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
# Read the semicolon-separated, cp1250-encoded missions export, parsing the two
# date columns, and expose the original row position as `mission_id`.
df = (
    pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
      .reset_index()
      .rename(columns={'index': 'mission_id'})
)
df.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
4150,4150,2018-05-28,2018-05-28,Belgique,Bruxelles,France,Paris,Université Paris Nanterre (Paris X) [FR],train,SIC,Département des Sciences de l'information et d...,LTC
3456,3456,2017-12-08,2017-12-09,,,France,Paris,"auditorium du Jeu de Paume, Paris (France)",,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
3591,3591,2017-12-15,2017-12-15,,,France,Paris,Association Internationale Francophone de Rech...,,ScSOC,Département de Sciences Sociales et des scienc...,PHILOSCSOC
4574,4574,2018-10-26,2018-11-03,Bruxelles,Bucarest,Roumanie,Bucarest,Institut national d'administration,avion_courte_distance,ScPOL,Département de Sciences Politiques,PHILOSCSOC
97,97,2010-11-18,2010-11-19,,,,Bruxelles,Université Libre de BruxellesInstitut Royal du...,,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [3]:
# Rows whose end date precedes the start date were entered the wrong way round.
# Both replacement columns are computed from the untouched originals before
# anything is written back; assigning one column and then reading it to fix the
# other would leave both columns holding the same date.
inverted = df['fin'] < df['debut']
earliest = df['debut'].mask(inverted, df['fin'])
latest = df['fin'].mask(inverted, df['debut'])
df['debut'], df['fin'] = earliest, latest
# Mission duration in whole days (non-negative once the swap is applied).
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [4]:
def norm(x, fromkey, tokey):
    """Store a normalised copy of text column ``fromkey`` in column ``tokey``.

    The text is NFKD-decomposed, stripped of every non-ASCII character (which
    removes the accents split off by the decomposition), trimmed of
    surrounding whitespace and lower-cased.

    ``x`` is modified in place; nothing is returned.
    """
    decomposed = x[fromkey].str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    ascii_text = ascii_bytes.str.decode('utf-8')
    x[tokey] = ascii_text.str.strip().str.lower()

In [5]:
def unique(x, columns):
    """Return the set of distinct non-null values found across ``columns`` of ``x``."""
    per_column = (x[name].dropna().unique() for name in columns)
    return set().union(*per_column)

In [6]:
# Normalised country names: origin first, then destination.
for source_col, target_col in (('pays_origine', 'from_country'), ('pays', 'dest_country')):
    norm(df, source_col, target_col)

In [7]:
# Normalised city names: origin first, then destination.
for source_col, target_col in (('ville_origine', 'from_city'), ('ville', 'dest_city')):
    norm(df, source_col, target_col)

### Cleanse & split normalized city names

In [8]:
# Pattern for parenthesised remarks, e.g. "paris (france)".
regreplace = r"\((.*?)\)"
# Separators used when several destinations are typed into a single field.
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would leave the
# parentheses (and, below, the digits) in place.
df['dest_city'] = df['dest_city'].str.replace(regreplace, "", regex=True).str.split(regsplit)
# One row per destination city; the other columns of the mission are repeated.
df = df.explode('dest_city')
# Remove digits (raw string avoids the invalid "\d" escape warning).
df['dest_city'] = df['dest_city'].str.replace(r"\d", "", regex=True)
df['dest_city'] = df['dest_city'].str.strip()

In [9]:
#df['dest_city'].sample(30)

### Geocode city names

Geocoding goes through OpenStreetMap's Nominatim service; see its usage policy: https://operations.osmfoundation.org/policies/nominatim/

In [10]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    """Look up ``x`` with ``geocoder.osm`` and return the selected result fields.

    Parameters
    ----------
    x : query handed to ``geocoder.osm`` (here, a city name).
    keys : names of the fields to copy out of the JSON result.

    Returns
    -------
    dict mapping every entry of ``keys`` to the value found in the result, or
    to ``None`` when the field is absent or the lookup produced no result.

    The function pauses for two seconds after each lookup to throttle requests.
    """
    payload = geocoder.osm(x).json
    time.sleep(2.0)
    if not payload:
        payload = {}
    selected = {}
    for field in keys:
        selected[field] = payload.get(field)
    return selected

In [11]:
# All names to geocode: distinct non-null origin cities followed by distinct
# destination cities. The row order matters, because the geocoding results are
# later attached to this frame by position.
city = pd.DataFrame(
    [*df['from_city'].dropna().unique(), *df['dest_city'].unique()],
    columns=['city'],
)
city['city'].sample(30).values

array(['colchester', 'ouagadougou', 'zielona gora', 'brasov', 'lucca',
       'leon', 'binche', 'ramillies', 'leysin', 'saint germain en laye',
       'la chaise-dieu', 'saragosse', 'aiseau-presles', 'johannesburg',
       'differentes villes', 'dresde', 'varsovie', 'loughborough',
       'torino', 'napoli', 'nijni-novgorod', 'karlsruhe', 'waterloo',
       'hamburg', 'etterbeek', 'damas', 'antalya', 'rabat',
       'tokyo- osaka', 'leuven'], dtype=object)

In [12]:
# Inspect the index to see how many city entries were collected.
city.index

RangeIndex(start=0, stop=1268, step=1)

In [13]:
# Geocoding every name is slow (one throttled request per city), so the result
# is cached on disk and only recomputed when the cache file is missing.
geocode_cache = pathlib.Path('geocoded.pickle')
if geocode_cache.exists():
    # NOTE: pickle can execute arbitrary code on load; only read a cache file
    # that this notebook produced.
    citygc = pd.read_pickle(geocode_cache)
else:
    citygc = city['city'].apply(geocode).apply(pd.Series)
    citygc.to_pickle(geocode_cache)
# NOTE(review): a cache built from a different `city` list will not line up with
# the current one -- delete the file to force a refresh after changing the cleansing.
citygc['country_code'] = citygc['country_code'].str.upper()
citygc = citygc.rename(columns={'city': 'citycoded', 'country_code': 'ISO2'})

In [14]:
# Put each city name next to its geocoding result. The concatenation pairs rows
# by index, so `citygc` must hold one row per row of `city`, in the same order.
# Only the first occurrence of each city name is kept.
geocoded = pd.concat([city, citygc], axis='columns').drop_duplicates(subset='city')
geocoded

Unnamed: 0,city,ISO2,citycoded,lat,lng,accuracy,confidence
0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...
1263,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1264,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0
1265,montpellie,,,,,,
1266,otsu,JP,,35.004710,135.868674,0.532413,1.0


In [15]:
# Missions with no recorded origin city are given 'bruxelles' as origin.
df['from_city'] = df['from_city'].mask(df['from_city'].isna(), 'bruxelles')

In [16]:
# Attach the geocoding columns twice, once prefixed for the origin and once for
# the destination. No join key is given, so pandas joins on the columns the two
# frames share; on a fresh run that is the prefixed city column.
df = pd.merge(df, geocoded.add_prefix('from_'), how='left')
df = pd.merge(df, geocoded.add_prefix('dest_'), how='left')

In [17]:
# Number every (mission, destination) row: the current index becomes the
# first column, named `travel_id`.
df = df.rename_axis('travel_id').reset_index()
df

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_lat,from_lng,from_accuracy,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence
0,0,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
1,1,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,...,50.843671,4.367437,0.777530,4.0,GB,Oxford,51.752013,-1.257850,0.760604,4.0
2,2,1,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
3,3,2,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
4,4,3,2010-07-12,2010-07-17,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,...,50.843671,4.367437,0.777530,4.0,GB,London,51.507322,-0.127647,0.830783,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6672,6672,6126,2020-02-23,2020-02-28,Belgique,Bruxelles,France,Toulouse/Perpignan/Montpellier,AgroParisTech [FR],train,...,50.843671,4.367437,0.777530,4.0,FR,Montpellier,43.611242,3.876734,0.741204,4.0
6673,6673,6127,2020-02-16,2020-02-18,Belgique,Bruxelles,France,Pessac,Université de Bordeaux-Montaigne,train,...,50.843671,4.367437,0.777530,4.0,FR,Pessac,44.805615,-0.630840,0.549654,4.0
6674,6674,6128,2020-03-02,2020-03-03,Pays-Bas,Amsterdam,France,Paris,OCDE,train,...,52.372760,4.893604,0.826813,1.0,FR,Paris,48.856697,2.351462,0.931710,2.0
6675,6675,6129,2020-01-28,2020-02-06,Belgique,Bruxelles,États-Unis,"Washington, Chapel Hill",Abilene Christian University [US],avion_long_courrier_eco,...,50.843671,4.367437,0.777530,4.0,US,Washington,38.894893,-77.036553,0.849289,1.0


### Compute distance using Geodesic

Avoid Cartesian (planar) distance computations on geographic coordinates unless the projection system is suited to them, which is only reasonable over a small portion of the world.
Plane travel takes place at a radius about 10 km larger than ground level when the distance is appreciable; how much this affects precision should be quantified before considering the uncertainty on the actual path.

In [18]:
def point(x, lat, lon):
    """Build a ``(latitude, longitude)`` tuple from two fields of a row.

    Parameters
    ----------
    x : mapping or row supporting ``x[key]``.
    lat, lon : keys of the latitude and longitude fields.

    Returns
    -------
    ``(x[lat], x[lon])``, or ``None`` when either coordinate is ``None``.

    A coordinate of ``0`` is valid (equator / prime meridian) and is kept; a
    plain truthiness test would discard it. NaN coordinates are passed
    through unchanged as ``(nan, nan)``, so it is up to the distance
    computation to reject them.
    """
    if x[lat] is None or x[lon] is None:
        return None
    return (x[lat], x[lon])

In [19]:
# Pack the latitude/longitude columns into one point per row, for the origin
# and for the destination.
df['from_point'] = df.apply(point, axis=1, args=('from_lat', 'from_lng'))
df['dest_point'] = df.apply(point, axis=1, args=('dest_lat', 'dest_lng'))

In [20]:
def _point_is_missing(pt):
    """Return True when ``pt`` is None, not a sequence, or has a missing component."""
    if pt is None:
        return True
    try:
        return any(pd.isna(component) for component in pt)
    except TypeError:
        # Not iterable (e.g. a bare NaN stored in the column).
        return True


def distance(x):
    """Geodesic distance in kilometres between ``x['from_point']`` and ``x['dest_point']``.

    Parameters
    ----------
    x : mapping or row holding the two points under the keys ``'from_point'``
        and ``'dest_point'``.

    Returns
    -------
    The WGS-84 geodesic distance in kilometres, or ``None`` when either point
    is missing or the computation fails.

    Raises
    ------
    KeyError if ``x`` lacks one of the two point fields.
    """
    start, end = x['from_point'], x['dest_point']
    # Reject missing points explicitly instead of relying on geopy to raise.
    # NOTE(review): geopy may interpret a None point as the origin (0, 0)
    # rather than failing -- confirm against the installed version.
    if _point_is_missing(start) or _point_is_missing(end):
        return None
    try:
        return geodesic(start, end, ellipsoid='WGS-84').kilometers
    except Exception:
        # Best effort: one bad row must not abort the whole apply. Unlike a
        # bare `except`, this still lets KeyboardInterrupt through.
        return None

In [21]:
# Row-wise distance between origin and destination, stored in a `distance` column.
df = df.assign(distance=df.apply(distance, axis=1))

In [22]:
# Spot-check a random sample of rows, including the new point and distance columns.
df.sample(10)

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence,from_point,dest_point,distance
3395,3395,3066,2017-05-16,2017-05-16,,,FRANCE,Paris,Université Paris-Sorbonne,,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514
3106,3106,2802,2017-04-08,2017-04-22,,,USA,"NYC, Philadelphia, Boston",Drexel University/MIT/Harvard University,,...,4.0,US,Boston,42.360253,-71.058291,0.810251,1.0,"(50.8436709, 4.3674366933879565)","(42.3602534, -71.0582912)",5599.423915
4345,4345,3942,2018-04-22,2018-04-24,Allemagne,Fribourg,France,Bordeaux,Institut d'Etudes Politiques de Bordeaux [FR],avion_courte_distance,...,1.0,FR,Bordeaux,44.841225,-0.580036,0.774005,4.0,"(46.6789116, 7.1027113)","(44.841225, -0.5800364)",631.315721
3346,3346,3021,2017-04-26,2017-04-28,,,Belgique,Louvain-la-Neuve,Université Catholique de Louvain,,...,4.0,BE,,50.674169,4.614157,0.742712,6.0,"(50.8436709, 4.3674366933879565)","(50.6741689, 4.614156731721076)",25.663463
1401,1401,1269,2014-02-27,2014-03-04,,,Hongrie / Autriche,Budapest/Sopron/Vienne,Musée des arts décoratifs. Budapest.Musée des ...,,...,4.0,AT,Wien,48.208354,16.372504,0.769412,1.0,"(50.8436709, 4.3674366933879565)","(48.2083537, 16.3725042)",915.989356
2360,2360,2127,2016-04-04,2016-04-09,,,Italie,Bari,Universitŕ degli Studi di Bari Aldo Moro,,...,4.0,IT,Bari,41.125784,16.862029,0.718718,1.0,"(50.8436709, 4.3674366933879565)","(41.1257843, 16.8620293)",1446.582377
5239,5239,4775,2019-03-02,2019-03-11,Belgique,Quenast,France,Dzaoudi,AgroParisTech [FR],avion_long_courrier_eco,...,6.0,,,,,,,"(50.6719076, 4.162702140015513)","(nan, nan)",
6370,6370,5851,2019-11-12,2019-11-14,Belgique,Gembloux,Belgique,Louvain la Neuve,Université Catholique de Louvain [BE],train,...,3.0,BE,,50.674169,4.614157,0.742712,6.0,"(50.5597273, 4.6943126)","(50.6741689, 4.614156731721076)",13.937372
4942,4942,4500,2018-10-08,2018-12-09,Belgique,Bruxelles,Chine,"Pekin, Wuhan, Shanghai","Beiwai, Sisu, Wuhan, University etc",avion_long_courrier_eco,...,4.0,CN,武汉市,30.595105,114.299935,0.524886,1.0,"(50.8436709, 4.3674366933879565)","(30.5951051, 114.2999353)",8682.83113
3414,3414,3084,2017-06-11,2017-06-16,,,Irelande,Limerick,University of Limerick,,...,4.0,IE,Limerick,52.661252,-8.630124,0.658938,1.0,"(50.8436709, 4.3674366933879565)","(52.661252, -8.6301239)",918.687673


In [23]:
# Persist the enriched missions table.
pd.to_pickle(df, "missions.pickle")