# SC/BEP First Analysis

In [1]:
import pathlib
import pandas as pd
import geopandas as gpd
import geocoder
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
# Let pandas display up to 300 rows before truncating rendered frames/series.
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
# Load the country geometries. The path is handed straight to geopandas so the
# library opens and closes the file itself; the previous version opened the
# file manually, never closed the handle, and passed the raw text instead of
# the path.
country = gpd.read_file("country.geojson")

In [3]:
# Read the semicolon-separated, cp1250-encoded missions file, parsing the
# start ('debut') and end ('fin') columns as dates. reset_index() keeps the
# original row number as an explicit 'index' column.
df = pd.read_csv(
    "missions.csv",
    sep=';',
    header=0,
    encoding='cp1250',
    parse_dates=['debut', 'fin'],
).reset_index()
df.sample(5)

Unnamed: 0,index,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
4446,4446,2018-11-07,2018-11-11,Belgique,Bruxelles,Suisse,lausanne,Business and Hotel Management School [CH],train,PHILO,Département de Philosophie et de Sciences des ...,PHILOSCSOC
3162,3162,2017-06-19,2017-06-28,,,Colombie,Bogota et al.,/,,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
4831,4831,2019-01-24,2019-01-26,Belgique,Bruxelles,Royaume-Uni,Oxford,University of Oxford [GB],train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS
3764,3764,2018-03-30,2018-05-06,Royaume-Uni,Londres,Royaume-Uni,Londres,"Institue of Historical Research, University of...",,HAA,"Département d'Histoire, arts et archéologie",PHILOSCSOC
4143,4143,2018-09-04,2018-09-09,Belgique,Bruxelles,Suisse,Lausanne,IDHEAP,avion_courte_distance,ScPOL,Département de Sciences Politiques,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [4]:
# Rows whose end date precedes the start date have their two timestamps
# inverted; swap them so that debut <= fin.
df['elapsed'] = (df['fin'] - df['debut']).dt.days
inverted = df['elapsed'] < 0
# Both replacement columns are computed from the untouched originals before
# either is assigned. Overwriting 'fin' first and then copying it into 'debut'
# (as a sequential swap does) leaves both columns equal to the old 'debut'.
df['debut'], df['fin'] = (
    df['debut'].where(~inverted, df['fin']),
    df['fin'].where(~inverted, df['debut']),
)
# Recompute the duration in days now that the dates are ordered.
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    """Write a normalised copy of column ``fromkey`` into column ``tokey``.

    The text is NFKD-decomposed, stripped of every non-ASCII character
    (which removes the combining accents), trimmed of surrounding whitespace
    and lower-cased. ``x`` is modified in place; nothing is returned.
    """
    decomposed = x[fromkey].str.normalize('NFKD')
    ascii_only = decomposed.str.encode('ascii', errors='ignore').str.decode('utf-8')
    x[tokey] = ascii_only.str.strip().str.lower()

In [6]:
def unique(x, columns):
    """Return the set of distinct non-null values found in ``columns`` of ``x``.

    Values from all listed columns are pooled into a single set; an empty
    ``columns`` iterable yields an empty set.
    """
    return {value for col in columns for value in x[col].dropna().unique()}

In [7]:
# Normalised origin / destination country names go into new columns; the raw
# columns are left untouched.
norm(df, fromkey='pays_origine', tokey='from_country')
norm(df, fromkey='pays', tokey='dest_country')

In [8]:
# Same normalisation for the origin / destination city names.
norm(df, fromkey='ville_origine', tokey='from_city')
norm(df, fromkey='ville', tokey='dest_city')

### Cleanse & split city names

In [10]:
# Pattern for parenthesised remarks, e.g. "paris (cdg)".
regreplace = r"\((.*?)\)"
# Separators used when several destinations are listed in one field.
regsplit = r'/|,|\&| et | puis |\+| and | - |;'
# regex=True is required: recent pandas treats the pattern as a literal string
# by default, in which case nothing would be removed.
df['dest_city'] = df['dest_city'].str.replace(regreplace, "", regex=True).str.split(regsplit)
# One row per destination city; the other columns are repeated and the index
# keeps the (now duplicated) label of the originating row.
df = df.explode('dest_city')
# Strip digits, then surrounding whitespace left over from the split.
df['dest_city'] = df['dest_city'].str.replace(r"\d", "", regex=True)
df['dest_city'] = df['dest_city'].str.strip()

In [17]:
#df['dest_city'].sample(30)

### Geocode city names

In [12]:
import time
def geocode(x, keys=('country_code', 'city', 'lat', 'lng', 'accuracy', 'confidence'), delay=2):
    """Geocode the query ``x`` with ``geocoder.osm`` and keep selected fields.

    Parameters
    ----------
    x : the query passed to ``geocoder.osm`` (a place name in this notebook).
    keys : names of the fields to extract from the result's ``json`` dict.
    delay : seconds to sleep after each request (presumably to throttle
        consecutive calls to the service); a falsy value disables the pause.

    Returns
    -------
    dict mapping every key to the corresponding value of the result, or to
    ``None`` when the key is absent or the result is empty. Missing values
    (``None``/NaN) and blank strings are not sent to the service: they
    return an all-``None`` dict immediately, without sleeping.
    """
    is_missing = pd.api.types.is_scalar(x) and pd.isna(x)
    is_blank = isinstance(x, str) and not x.strip()
    if is_missing or is_blank:
        # Nothing to look up: skip the network round-trip and the pause.
        return {k: None for k in keys}
    res = geocoder.osm(x).json
    if delay:
        time.sleep(delay)
    # `res` may be falsy (e.g. None); fall back to an empty dict so that every
    # requested key is still present in the output.
    return {k: (res or {}).get(k) for k in keys}

In [16]:
# Pool origin and destination city names into one lookup table. Missing
# values and empty strings are dropped from BOTH columns, and names that occur
# in both (or repeatedly after the split above) are kept only once, so each
# city appears on a single row.
all_cities = pd.concat([df['from_city'], df['dest_city']], ignore_index=True).dropna()
all_cities = all_cities[all_cities != ''].drop_duplicates()
city = pd.DataFrame({'city': all_cities.to_numpy()})
# Peek at a random handful of names (capped so small tables do not raise).
city['city'].sample(min(30, len(city))).values

array(['calgary', 'tangers', 'valladolid', 'st pertersbourg', 'jerusalem',
       'mcze', 'carleton', 'ville davray', 'paris', 'sherbrooke',
       'copenhague', 'briuxelles', 'tanger', 'olloy-sur-viroin', 'apamee',
       'arras', 'monastir', 'goteborg', 'agrigente', 'sapporo',
       'louvain-la-neuve', 'loughborough', 'milan', 'kinshasa', 'al.',
       'ramillies', 'loveno di menaggio', 'tourcoing', 'binche',
       'karlsruhe'], dtype=object)