# SC/BEP First Analysis

In [1]:
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
country = gpd.read_file(pathlib.Path("country.geojson").open().read())

In [3]:
df = pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df = df.reset_index()
df.sample(5)

Unnamed: 0,index,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
2852,2852,2017-06-01,2017-08-31,,,Canada,vancouver,University of British Columbia,,ScPOL,Département de Sciences Politiques,PHILOSCSOC
4861,4861,2019-01-23,2019-01-25,Belgique,Bruxelles,Espagne,Barcelone,Universitat Pompeu Fabra [ES],avion_courte_distance,ScSOC,Département de Sciences Sociales et des scienc...,PHILOSCSOC
4005,4005,2018-06-04,2018-07-13,Belgique,Bruxelles,Belgique,Thuin,Chantier école (ULB),,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
2206,2206,2016-04-27,2016-04-27,,,Belgique,Mons,,,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
5744,5744,2019-09-20,2019-09-20,Belgique,Bruxelles,Pays-Bas,Amsterdam,Vrije Universiteit Amsterdam,train,ScSOC,Département de Sciences Sociales et des scienc...,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [4]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].dropna().unique())
    return keys

In [7]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [8]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

In [9]:
s = unique(df, ['from_city', 'dest_city'])

In [10]:
#df['from_city'] = df['from_city'].str.replace(r"\((.*?)\)","").str.split(r'/|,|\&| et | puis |\+|;')
df['dest_city'] = df['dest_city'].str.replace(r"\((.*?)\)","").str.split(r'/|,|\&| et | puis |\+|;')
df = df.explode('dest_city')

In [11]:
df

Unnamed: 0,index,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte,elapsed,from_country,dest_country,from_city,dest_city
0,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,DECA,"Décanat de la Faculté de Lettres, Traduction e...",LTC,4,,grande bretagne,,londres
0,0,2010-05-28,2010-06-01,,,Grande Bretagne,Londres/Oxford,Oxford University — All Souls College,,DECA,"Décanat de la Faculté de Lettres, Traduction e...",LTC,4,,grande bretagne,,oxford
1,1,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,SIC,Département des Sciences de l'information et d...,LTC,16,,royaume-uni,,londres
2,2,2010-06-22,2010-07-08,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,SIC,Département des Sciences de l'information et d...,LTC,16,,royaume-uni,,londres
3,3,2010-07-12,2010-07-17,,,Royaume-Uni,Londres,Archives du Royal Opera House Covent GardenArc...,,SIC,Département des Sciences de l'information et d...,LTC,5,,royaume-uni,,londres
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6126,6126,2020-02-23,2020-02-28,Belgique,Bruxelles,France,Toulouse/Perpignan/Montpellier,AgroParisTech [FR],train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS,5,belgique,france,bruxelles,montpellier
6127,6127,2020-02-16,2020-02-18,Belgique,Bruxelles,France,Pessac,Université de Bordeaux-Montaigne,train,L&L,Département de Langues et Lettres,LTC,2,belgique,france,bruxelles,pessac
6128,6128,2020-03-02,2020-03-03,Pays-Bas,Amsterdam,France,Paris,OCDE,train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS,1,pays-bas,france,amsterdam,paris
6129,6129,2020-01-28,2020-02-06,Belgique,Bruxelles,États-Unis,"Washington, Chapel Hill",Abilene Christian University [US],avion_long_courrier_eco,L&L,Département de Langues et Lettres,LTC,9,belgique,etats-unis,bruxelles,washington


In [12]:
import time
def geocode(x):
    res = geocoder.osm(x).json
    time.sleep(1)
    return {k: (res or {}).get(k) for k in ['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']}

In [13]:
city = pd.DataFrame(list(df['from_city'].dropna().unique()) + list(df['dest_city'].unique()), columns=['city'])
city

Unnamed: 0,city
0,bruxelles
1,charleroi
2,paris
3,casablanca
4,reading
...,...
1451,montpellie
1452,tokyo and otsu
1453,athenes
1454,pessac


In [14]:
citygc = city.loc[:10,'city'].apply(geocode).apply(pd.Series)
citygc

Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=bruxelles&format=jsonv2&addressdetails=1&limit=1
Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=charleroi&format=jsonv2&addressdetails=1&limit=1
Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=paris&format=jsonv2&addressdetails=1&limit=1
Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=casablanca&format=jsonv2&addressdetails=1&limit=1
Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/s

KeyboardInterrupt: 

In [15]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Mission")
location = geolocator.geocode("brussels")

GeocoderQuotaExceeded: HTTP Error 429: Too Many Requests

x = df.loc[:20,:]
y = x['dest_city'].apply(geocode).apply(pd.Series)

pd.concat([x,y], axis=1)