# SC/BEP First Analysis

In [1]:
import sys
import time
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
country = gpd.read_file(pathlib.Path("country.geojson").open().read())

In [3]:
df = pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df = df.reset_index()
df.sample(5)

Unnamed: 0,index,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
2829,2829,2017-03-08,2017-06-09,,,Taiwan R.O.C.,Taipei,National Taiwan Normal University,,L&L,Département de Langues et Lettres,LTC
452,452,2011-11-21,2011-11-22,,,Espagne,Barcelone,"Fundació i2CAT, Internet i Innovació Digital a...",,SIC,Département des Sciences de l'information et d...,LTC
2339,2339,2016-07-03,2016-07-07,,,France,Rennes,Institut d'études politiques de Rennes,,ScPOL,Département de Sciences Politiques,PHILOSCSOC
2575,2575,2016-09-05,2016-09-13,,,Belgique,Hasselt,Hasselt Universiteit,,ScPOL,Département de Sciences Politiques,PHILOSCSOC
5353,5353,2019-06-02,2019-06-05,Belgique,Bruxelles,France,Paris,Université de Paris Dauphine,train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS


## Data Cleansing

### Swap inverted timestamps

In [4]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].dropna().unique())
    return keys

In [7]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [8]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

### Cleanse & split normalized city names

In [None]:
regreplace = r"\((.*?)\)"
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
df['dest_city'] = df['dest_city'].str.replace(regreplace,"").str.split(regsplit)
df = df.explode('dest_city')
df['dest_city'] = df['dest_city'].str.replace("\d", "")
df['dest_city'] = df['dest_city'].str.strip()

In [None]:
#df['dest_city'].sample(30)

### Geocode city names

https://operations.osmfoundation.org/policies/nominatim/

In [None]:
def geocode(x, keys=['country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence']):
    res = geocoder.osm(x).json
    time.sleep(2.0)
    return {k: (res or {}).get(k) for k in keys}

In [None]:
city = pd.DataFrame(list(df['from_city'].dropna().unique()) + list(df['dest_city'].unique()), columns=['city'])
city['city'].sample(30).values

array(['amiens', 'reading', 'bamberg', 'leeds', 'japan', 'munster',
       'agder', 'ramillies', 'tafileh', 'wellington', 'bloomington',
       'new delhi', 'reus', 'sheffield', 'le cap', 'arlon', 'innsbruck',
       'ljubljana', 'luang prabang', 'lodz', 'ville davray', 'montreal',
       'lannion', 'tourtour', 'dc', 'roma', 'la gacilly', 'pekin',
       'amman petra', 'le mans'], dtype=object)

In [None]:
city.index

RangeIndex(start=0, stop=1268, step=1)

In [None]:
citygc = city['city'].apply(geocode).apply(pd.Series)
citygc.to_pickle('geocoded.pickle')