# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load dataset

In [2]:
df = pd.read_csv("data/missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df = df.reset_index().rename(columns={'index': 'mission_id'})
df = df.drop('libelle', axis=1)
df.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,faculte
5002,5002,2019-01-16,2019-01-18,Belgique,Bruxelles,France,Paris,Université Panthéon-Sorbonne (Paris I) [FR],train,PHILO,PHILOSCSOC
4781,4781,2019-01-21,2019-01-22,Belgique,Bruxelles,France,Paris,Centre National de la Danse,train,HAA,PHILOSCSOC
198,198,2011-04-11,2011-04-29,,,Chine,Shanghai,Université FUDAN,,L&L,LTC
3878,3878,2018-04-18,2018-04-20,Belgique,Bruxelles,United Kingdom,"Guildford, Surrey",University of Surrey,train,DECS,SBS
5621,5621,2019-11-15,2019-11-15,Belgique,Charleroi,France,Paris,"France, Université Paris 13",train,ScSOC,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [3]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = ((df['fin'] - df['debut']) + pd.Timedelta("1D")).dt.days

### Correct subpopulated category

In [4]:
df['acronyme'] = df['acronyme'].replace({'PHILA': 'PHILO'}) 

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].dropna().unique())
    return keys

In [7]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [8]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

In [9]:
country = pd.DataFrame(list(df['from_country']) + list(df['dest_country']), columns=['country'])
country = country.dropna().drop_duplicates()

### Cleanse & split normalized city names

In [10]:
regreplace = r"\((.*?)\)"
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
df['dest_city'] = df['dest_city'].str.replace(regreplace,"").str.split(regsplit)
df = df.explode('dest_city')
df['dest_city'] = df['dest_city'].str.replace("\d", "")
df['dest_city'] = df['dest_city'].str.strip()

In [12]:
citytrans = pd.read_csv("data/city_trans.csv")
citytrans = {r.city: r.corrected for r in citytrans.itertuples()}

In [13]:
df['from_city'] = df['from_city'].replace(citytrans)
df['dest_city'] = df['dest_city'].replace(citytrans)

In [14]:
df['from_city'] = df['from_city'].fillna('bruxelles')
df['dest_city'] = df['dest_city'].fillna('bruxelles')

### Geocode city names

In [41]:
city = pd.DataFrame(list(df['from_city']) + list(df['dest_city']), columns=['city'])
city = city.dropna().drop_duplicates()
city.sample(5)

Unnamed: 0,city
4712,durham
7331,helsinki
11128,sain-denis
9441,amman petra
12806,tokyo-nagasaki


In [42]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    # https://operations.osmfoundation.org/policies/nominatim/
    res = geocoder.osm(x).json
    time.sleep(1.2)
    return {k: (res or {}).get(k) for k in keys}

In [None]:
citygc = city['city'].apply(geocode).apply(pd.Series)
citygc.to_pickle('geocoded.pickle')

### Create Travel Cycle

In [None]:
travels = df.groupby("mission_id").agg({'from_city': 'first', 'dest_city': list}, default='first')
travels

### Compute distance using Geodesic

Avoid cartesian computation when dealing with geographic information unless your projection system is compatible with (reasonable on a small portion of the world uniquely).
Plane travels have a radius increased by 10 km when distance is appreciable, should quantify how it does affect precision before path uncertainty.

In [28]:
def point(x, lat, lon):
    if x[lat]: 
        return (x[lat], x[lon])

In [29]:
df['from_point'] = df.apply(lambda x: point(x, 'from_lat', 'from_lng'), axis=1)
df['dest_point'] = df.apply(lambda x: point(x, 'dest_lat', 'dest_lng'), axis=1)

In [30]:
def distance(x):
    try:
        return geodesic(x['from_point'], x['dest_point'], ellipsoid='WGS-84').kilometers
    except:
        pass

In [31]:
df['distance'] = df.apply(distance, axis=1)

In [32]:
df.sample(10)

Unnamed: 0,travel_id,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,...,from_confidence,dest_ISO2,dest_citycoded,dest_lat,dest_lng,dest_accuracy,dest_confidence,from_point,dest_point,distance
988,988,895,2013-02-13,2013-02-14,,,Angleterre,Londres,,,...,4.0,GB,London,51.507322,-0.127647,0.830783,1.0,"(50.8436709, 4.3674366933879565)","(51.5073219, -0.1276474)",322.849447
4044,4044,3664,2018-02-24,2018-03-04,Belgique,Bruxelles,Haďti,Port au Prince,Université d'Etat ŕ Haďti.,,...,4.0,HT,Port-au-Prince,18.547327,-72.339593,0.885463,1.0,"(50.8436709, 4.3674366933879565)","(18.547327, -72.3395928)",7503.101423
5441,5441,4969,2019-06-05,2019-06-15,Belgique,Bruxelles,Japon,Osaka,Waseda University [JP],avion_courte_distance,...,4.0,JP,,34.619881,135.490357,0.672756,1.0,"(50.8436709, 4.3674366933879565)","(34.6198813, 135.490357)",9399.789057
3381,3381,3052,2017-06-14,2017-06-17,,,France,paris,ANR,,...,4.0,FR,Paris,48.856697,2.351462,0.93171,2.0,"(50.8436709, 4.3674366933879565)","(48.8566969, 2.3514616)",264.293514
3699,3699,3346,2017-10-18,2017-10-18,,,Finlande,Helsinki,Luckan Integration,,...,4.0,FI,Helsinki,60.16741,24.942577,0.8385,1.0,"(50.8436709, 4.3674366933879565)","(60.1674098, 24.9425769)",1651.653454
6060,6060,5549,2019-08-09,2019-08-15,Belgium,Brussels,United States of America,San Francisco,Hilton San Francisco Union Square,avion_long_courrier_eco,...,4.0,US,San Francisco,37.779026,-122.419906,0.925131,1.0,"(50.8436709, 4.3674366933879565)","(37.7790262, -122.4199061)",8902.13992
1321,1321,1201,2013-12-13,2013-12-14,,,France,Le Mans,Université Nantes Angers Le Mans,,...,4.0,FR,Le Mans,48.00735,0.196738,0.803604,4.0,"(50.8436709, 4.3674366933879565)","(48.0073498, 0.1967379)",436.964667
3775,3775,3418,2017-11-08,2017-11-11,,,France,Ajaccio,,,...,4.0,FR,Ajaccio,41.926399,8.737603,0.664071,2.0,"(50.8436709, 4.3674366933879565)","(41.9263991, 8.7376029)",1046.202609
668,668,606,2012-05-22,2012-05-29,,,Etats-Unis,"Phoenix, AZ",International Communication Association,,...,4.0,US,Phoenix,33.448437,-112.074142,0.733016,1.0,"(50.8436709, 4.3674366933879565)","(33.4484367, -112.0741417)",8792.22071
4885,4885,4449,2018-09-24,2018-09-28,Belgique,Bruxelles,Maroc,/,/,,...,4.0,,,,,,,"(50.8436709, 4.3674366933879565)","(nan, nan)",


In [33]:
#df.to_pickle("missions.pickle")