# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load dataset

In [2]:
df = pd.read_csv("data/missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df = df.reset_index().rename(columns={'index': 'mission_id'})
df = df.drop('libelle', axis=1)
df.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,faculte
4791,4791,2019-01-09,2019-01-09,Belgique,Bruxelles,Belgique,4540 Amay,Brexgata University Academy [BE],voiture_diesel,HAA,PHILOSCSOC
536,536,2012-03-16,2012-03-17,,,France,Strasbourg,Université Strasbourg,,SIC,LTC
3218,3218,2017-07-02,2017-07-05,,,BELGIQUE,BRUXELLES,CREA - ULB,,HAA,PHILOSCSOC
3411,3411,2017-10-21,2017-10-23,,,Italie,Rome,Centro Pio Rajna. Centro di studi per la ricer...,,L&L,LTC
4828,4828,2019-01-22,2019-01-22,Belgique,Bruxelles,France,Paris,Institut Henri Poincaré - SFdS,train,ScPOL,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [3]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = ((df['fin'] - df['debut']) + pd.Timedelta("1D")).dt.days

### Correct subpopulated category

In [4]:
df['acronyme'] = df['acronyme'].replace({'PHILA': 'PHILO'}) 

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].dropna().unique())
    return keys

In [7]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [8]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

In [9]:
country = pd.DataFrame(list(df['from_country']) + list(df['dest_country']), columns=['country'])
country = country.dropna().drop_duplicates()

### Cleanse & split normalized city names

In [10]:
regreplace = r"\((.*?)\)"
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
df['dest_city'] = df['dest_city'].str.replace(regreplace,"").str.split(regsplit)
df = df.explode('dest_city')
df['dest_city'] = df['dest_city'].str.replace("\d", "")
df['dest_city'] = df['dest_city'].str.strip()

In [11]:
citytrans = pd.read_csv("data/city_trans.csv")
citytrans = {r.city: r.corrected for r in citytrans.itertuples()}

In [12]:
df['from_city'] = df['from_city'].replace(citytrans)
df['dest_city'] = df['dest_city'].replace(citytrans)

In [13]:
df['from_city'] = df['from_city'].fillna('bruxelles')
df['dest_city'] = df['dest_city'].fillna('bruxelles')

### Geocode city names

In [14]:
city = pd.DataFrame(list(df['from_city']) + list(df['dest_city']), columns=['city'])
city = city.dropna().drop_duplicates().rename(columns={'city': 'citycleansed'})
city.sample(5)

Unnamed: 0,citycleansed
7053,palaikastro sitia
7408,sydney
8159,rio
7590,mont-saint-odile
4434,braine-lalleud


In [15]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    # https://operations.osmfoundation.org/policies/nominatim/
    res = geocoder.osm(x).json
    time.sleep(1.2)
    return {k: (res or {}).get(k) for k in keys}

In [16]:
geocode = pd.read_json('data/geocode.json')
geocode

Unnamed: 0,geocodeid,citycleansed,ISO2,city,lat,lon,accuracy,confidence
0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...,...
1138,1138,sitia,GR,,35.206625,26.104971,0.476402,4.0
1139,1139,szklarska poreba,PL,Szklarska Poręba,50.827585,15.521166,0.566010,3.0
1140,1140,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1141,1141,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0


### Create Travel Cycle

In [34]:
travels = df.groupby("mission_id").agg({'from_city': 'first', 'dest_city': list}, default='first')
travels = travels.reset_index().rename(columns={'from_city': 'from', 'dest_city': 'to'}).dropna()
travels

Unnamed: 0,mission_id,from,to
0,0,bruxelles,"[londres, oxford]"
1,1,bruxelles,[londres]
2,2,bruxelles,[londres]
3,3,bruxelles,[londres]
4,4,bruxelles,[paris]
...,...,...,...
6125,6125,bruxelles,[londres]
6126,6126,bruxelles,"[toulouse, perpignan, montpellier]"
6127,6127,bruxelles,[pessac]
6128,6128,amsterdam,[paris]


In [35]:
travels['chain'] = travels.apply(lambda x: [x['from']] + x['to'] + [x['from']], axis=1)
travels = travels.explode('chain')
travels['end'] = travels.groupby("mission_id")["chain"].shift(-1)
travels = travels.dropna(subset=['end']).reset_index(drop=True).reset_index().rename(columns={'index': 'travel_id'})
travels = travels.drop(['from', 'to'], axis=1).rename(columns={'chain': 'from', 'end': 'to'})

In [36]:
travels

Unnamed: 0,travel_id,mission_id,from,to
0,0,0,bruxelles,londres
1,1,0,londres,oxford
2,2,0,oxford,bruxelles
3,3,1,bruxelles,londres
4,4,1,londres,bruxelles
...,...,...,...,...
12802,12802,6128,amsterdam,paris
12803,12803,6128,paris,amsterdam
12804,12804,6129,bruxelles,washington
12805,12805,6129,washington,chapel hill


### Compute distance using Geodesic

Avoid cartesian computation when dealing with geographic information unless your projection system is compatible with (reasonable on a small portion of the world uniquely).
Plane travels have a radius increased by 10 km when distance is appreciable, should quantify how it does affect precision before path uncertainty.

In [18]:
def point(x, lat, lon):
    if x[lat]: 
        return (x[lat], x[lon])

In [19]:
df['from_point'] = df.apply(lambda x: point(x, 'from_lat', 'from_lng'), axis=1)
df['dest_point'] = df.apply(lambda x: point(x, 'dest_lat', 'dest_lng'), axis=1)

KeyError: 'from_lat'

In [None]:
def distance(x):
    try:
        return geodesic(x['from_point'], x['dest_point'], ellipsoid='WGS-84').kilometers
    except:
        pass

In [None]:
df['distance'] = df.apply(distance, axis=1)

In [None]:
df.sample(10)

In [None]:
#df.to_pickle("missions.pickle")