# SC/BEP Data Conditioning

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
from geopy.distance import geodesic
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load dataset

In [2]:
missions = pd.read_csv("data/missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
missions = missions.reset_index().rename(columns={'index': 'mission_id'})
missions = missions.drop('libelle', axis=1)
missions.sample(5)

Unnamed: 0,mission_id,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,faculte
1182,1182,2014-03-21,2014-03-29,,,Turquie,Istambul,Université Galatasaray,,PHILO,PHILOSCSOC
4575,4575,2018-10-28,2018-10-29,Belgique,Bruxelles,UK,London,DfiD et EBRD,train,DECS,SBS
541,541,2012-03-22,2012-03-23,,,France,Paris,Celsa. Paris-Sorbonne,,SIC,LTC
152,152,2010-12-03,2010-12-07,,,Italie,Rome,HERITY International,,HAA,PHILOSCSOC
1683,1683,2015-04-13,2015-04-19,,,Chine,Pékin,Communication University of China,,DECA,LTC


## Data Cleansing

### Swap inverted timestamps

In [3]:
missions['elapsed'] = (missions['fin'] - missions['debut']).dt.days
missions.loc[missions['elapsed']<0,'fin'] = missions.loc[missions['elapsed']<0,'debut'].values
missions.loc[missions['elapsed']<0,'debut'] = missions.loc[missions['elapsed']<0,'fin'].values
missions['elapsed'] = ((missions['fin'] - missions['debut']) + pd.Timedelta("1D")).dt.days

### Correct subpopulated category

In [4]:
missions['acronyme'] = missions['acronyme'].replace({'PHILA': 'PHILO'}) 

### Normalize city names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
norm(missions, 'ville_origine', 'from_city')
norm(missions, 'ville', 'dest_city')

### Cleanse & split normalized city names

In [7]:
regreplace = r"\((.*?)\)"
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
missions['dest_city'] = missions['dest_city'].str.replace(regreplace,"").str.split(regsplit)

In [8]:
df = missions.explode('dest_city')
df['dest_city'] = df['dest_city'].str.replace("\d", "")
df['dest_city'] = df['dest_city'].str.strip()

In [9]:
citytrans = pd.read_csv("data/city_trans.csv")
citytrans = {r.city: r.corrected for r in citytrans.itertuples()}

In [10]:
df['from_city'] = df['from_city'].replace(citytrans)
df['dest_city'] = df['dest_city'].replace(citytrans)

In [11]:
df['from_city'] = df['from_city'].fillna('bruxelles')
df['dest_city'] = df['dest_city'].fillna('bruxelles')

### Geocode city names

In [12]:
city = pd.DataFrame(list(df['from_city']) + list(df['dest_city']), columns=['city'])
city = city.dropna().drop_duplicates().rename(columns={'city': 'citycleansed'})
city.sample(5)

Unnamed: 0,citycleansed
8810,kiel
8445,mayotte
6801,gothenburg
6903,bonn
9350,iakoutsk


In [13]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    # https://operations.osmfoundation.org/policies/nominatim/
    res = geocoder.osm(x).json
    time.sleep(1.2)
    return {k: (res or {}).get(k) for k in keys}

In [14]:
geocode = pd.read_json('data/geocode.json')
geocode

Unnamed: 0,geocodeid,citycleansed,ISO2,city,lat,lon,accuracy,confidence
0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0
1,1,charleroi,BE,Charleroi,50.412033,4.443624,0.668664,3.0
2,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0
3,3,casablanca,MA,Casablanca ⵜⴰⴷⴷⴰⵔⵜ ⵜⵓⵎⵍⵉⵍⵜ الدار البيضاء,33.595063,-7.618777,0.717325,1.0
4,4,reading,GB,,51.451495,-0.983634,0.670144,4.0
...,...,...,...,...,...,...,...,...
1138,1138,sitia,GR,,35.206625,26.104971,0.476402,4.0
1139,1139,szklarska poreba,PL,Szklarska Poręba,50.827585,15.521166,0.566010,3.0
1140,1140,stadtoldendorf,DE,Stadtoldendorf,51.884211,9.625220,0.528807,4.0
1141,1141,longueil-sainte-marie,FR,Longueil-Sainte-Marie,49.357664,2.717691,0.849620,5.0


### Create Travel Cycle

In [15]:
travels = df.groupby("mission_id").agg({'from_city': 'first', 'dest_city': list}, default='first')
travels = travels.reset_index().rename(columns={'from_city': 'from', 'dest_city': 'to'}).dropna()
travels.sample(5)

Unnamed: 0,mission_id,from,to
5158,5158,bruxelles,[paris]
2923,2923,bruxelles,"[montreal, quebec, halifax]"
3763,3763,paris,[paris]
5928,5928,bruxelles,[paris]
4272,4272,bruxelles,[talinn]


In [16]:
travels['chain'] = travels.apply(lambda x: [x['from']] + x['to'] + [x['from']], axis=1)
travels = travels.explode('chain')
travels['end'] = travels.groupby("mission_id")["chain"].shift(-1)
travels = travels.dropna(subset=['end']).reset_index(drop=True).reset_index().rename(columns={'index': 'travel_id'})
travels = travels.drop(['from', 'to'], axis=1).rename(columns={'chain': 'from', 'end': 'to'})

In [17]:
travels.head()

Unnamed: 0,travel_id,mission_id,from,to
0,0,0,bruxelles,londres
1,1,0,londres,oxford
2,2,0,oxford,bruxelles
3,3,1,bruxelles,londres
4,4,1,londres,bruxelles


In [18]:
travels = travels.merge(geocode.add_prefix('from_'), left_on='from', right_on='from_citycleansed', how='left')
travels = travels.merge(geocode.add_prefix('to_'), left_on='to', right_on='to_citycleansed', how='left')

In [19]:
travels = travels.drop(['from', 'to'], axis=1)

In [20]:
travels.sample(5)

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,to_geocodeid,to_citycleansed,to_ISO2,to_city,to_lat,to_lon,to_accuracy,to_confidence
8109,8109,3857,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,2,paris,FR,Paris,48.856697,2.351462,0.93171,2.0
6907,6907,3278,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0,52,dusseldorf,DE,Düsseldorf,51.225402,6.776314,0.672609,1.0
12642,12642,6055,2,paris,FR,Paris,48.856697,2.351462,0.93171,2.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0
3200,3200,1521,527,munchen,DE,,48.137108,11.575382,0.746211,1.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0
3257,3257,1549,461,aix en provence,FR,Aix-en-Provence,43.529842,5.447474,0.891085,1.0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.77753,4.0


### Clean Missions

In [21]:
missions = missions.rename(columns={
    'debut': 'start', 'fin': 'stop', 'mode_deplacement': 'travel_type',
    'acronyme': 'acronym', 'faculte': 'faculty'
}).drop(['pays_origine', 'ville_origine', 'pays', 'lieu', 'ville', 'from_city', 'dest_city'], axis=1)
missions['travel_type'] = missions['travel_type'].replace({
    'avion_courte_distance': 'plane/short',
    'avion_long_courrier_eco': 'plane/eco',
    'avion_long_courrier_business': 'plane/first',
    'voiture_essence': 'car/gas',
    'voiture_diesel': 'car/diesel',
})
missions.sample(10)

Unnamed: 0,mission_id,start,stop,travel_type,acronym,faculty,elapsed
787,787,2012-11-28,2012-11-29,,L&L,LTC,2
94,94,2010-09-30,2010-10-01,,SIC,LTC,2
3927,3927,2018-03-29,2018-03-29,train,DECB,PHILOSCSOC,1
5820,5820,2019-11-05,2019-11-05,train,SIC,LTC,1
2933,2933,2017-04-10,2017-04-15,,DECA,LTC,6
1374,1374,2014-05-20,2014-05-24,,PHILO,PHILOSCSOC,5
1943,1943,2015-12-15,2015-12-17,,HAA,PHILOSCSOC,3
295,295,2011-06-25,2011-07-30,,HAA,PHILOSCSOC,36
530,530,2012-04-11,2012-04-14,,PHILO,PHILOSCSOC,4
1161,1161,2013-11-08,2013-11-10,,L&L,LTC,3


In [25]:
travels = travels.merge(missions)

In [26]:
travels

Unnamed: 0,travel_id,mission_id,from_geocodeid,from_citycleansed,from_ISO2,from_city,from_lat,from_lon,from_accuracy,from_confidence,...,to_lat,to_lon,to_accuracy,to_confidence,start,stop,travel_type,acronym,faculty,elapsed
0,0,0,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-05-28,2010-06-01,,DECA,LTC,5
1,1,0,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,51.752013,-1.257850,0.760604,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
2,2,0,99,oxford,GB,Oxford,51.752013,-1.257850,0.760604,4.0,...,50.843671,4.367437,0.777530,4.0,2010-05-28,2010-06-01,,DECA,LTC,5
3,3,1,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,51.507322,-0.127647,0.830783,1.0,2010-06-22,2010-07-08,,SIC,LTC,17
4,4,1,5,londres,GB,London,51.507322,-0.127647,0.830783,1.0,...,50.843671,4.367437,0.777530,4.0,2010-06-22,2010-07-08,,SIC,LTC,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12802,12802,6128,36,amsterdam,NL,Amsterdam,52.372760,4.893604,0.826813,1.0,...,48.856697,2.351462,0.931710,2.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12803,12803,6128,2,paris,FR,Paris,48.856697,2.351462,0.931710,2.0,...,52.372760,4.893604,0.826813,1.0,2020-03-02,2020-03-03,train,DECS,SBS,2
12804,12804,6129,0,bruxelles,BE,Ville de Bruxelles - Stad Brussel,50.843671,4.367437,0.777530,4.0,...,38.894893,-77.036553,0.849289,1.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10
12805,12805,6129,267,washington,US,Washington,38.894893,-77.036553,0.849289,1.0,...,35.913154,-79.055780,0.746859,4.0,2020-01-28,2020-02-06,plane/eco,L&L,LTC,10


### Compute distance using Geodesic

Avoid cartesian computation when dealing with geographic information unless your projection system is compatible with (reasonable on a small portion of the world uniquely).
Plane travels have a radius increased by 10 km when distance is appreciable, should quantify how it does affect precision before path uncertainty.

In [22]:
def point(x, lat, lon):
    if x[lat]: 
        return (x[lat], x[lon])

In [23]:
df['from_point'] = df.apply(lambda x: point(x, 'from_lat', 'from_lng'), axis=1)
df['dest_point'] = df.apply(lambda x: point(x, 'dest_lat', 'dest_lng'), axis=1)

KeyError: 'from_lat'

In [None]:
def distance(x):
    try:
        return geodesic(x['from_point'], x['dest_point'], ellipsoid='WGS-84').kilometers
    except:
        pass

In [None]:
df['distance'] = df.apply(distance, axis=1)

In [None]:
df.sample(10)

In [None]:
#df.to_pickle("missions.pickle")