# SC/BEP First Analysis

In [1]:
import pathlib
import pandas as pd
import geopandas as gpd
#from fuzzymatcher import link_table, fuzzy_left_join
%matplotlib inline
pd.set_option('display.max_rows', 300)

## Load datasets

In [2]:
country = gpd.read_file(pathlib.Path("country.geojson").open().read())

In [3]:
df = pd.read_csv("missions.csv", sep=';', header=0, encoding='cp1250', parse_dates=['debut', 'fin'])
df.sample(5)

Unnamed: 0,debut,fin,pays_origine,ville_origine,pays,ville,lieu,mode_deplacement,acronyme,libelle,faculte
6125,2020-02-20,2020-02-20,Belgique,Bruxelles,Royaume-Uni,Londres,Aga Khan University [GB],train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS
958,2013-06-14,2013-06-23,,,Russie,Moscou-Iasnaďa Poliana,Musée d'Etat Leon Tolstoď,,L&L,Département de Langues et Lettres,LTC
5990,2019-12-13,2019-12-17,Belgique,Bruxelles,Royaume-Uni,London,"Birkbeck College, University of London [GB]",train,DECS,Dean's Office - Solvay Brussels School of Econ...,SBS
4402,2018-09-08,2018-09-20,Belgique,Bruxelles,Espagne,Madrid,Bircham International University [ES],voiture_diesel,PHILO,Département de Philosophie et de Sciences des ...,PHILOSCSOC
895,2013-02-13,2013-02-14,,,Angleterre,Londres,,,PHILO,Département de Philosophie et de Sciences des ...,PHILOSCSOC


## Data Cleansing

### Swap inverted timestamps

In [4]:
df['elapsed'] = (df['fin'] - df['debut']).dt.days
df.loc[df['elapsed']<0,'fin'] = df.loc[df['elapsed']<0,'debut'].values
df.loc[df['elapsed']<0,'debut'] = df.loc[df['elapsed']<0,'fin'].values
df['elapsed'] = (df['fin'] - df['debut']).dt.days

### Normalize country names

In [5]:
def norm(x, fromkey, tokey):
    x[tokey] = x[fromkey].str.normalize('NFKD')\
                         .str.encode('ascii', errors='ignore').str.decode('utf-8')\
                         .str.strip().str.lower()

In [6]:
def unique(x, columns):
    keys = set()
    for k in columns:
        keys.update(x[k].unique())
    return keys

In [7]:
norm(df, 'pays_origine', 'from_country')
norm(df, 'pays', 'dest_country')

In [8]:
unique(df, ['from_country', 'dest_country'])

{'',
 'abkhazie / georgie',
 'afrique du sud',
 'agentine et chile',
 'algerie',
 'allemagne',
 'allemagne, austria',
 'amsterdam',
 'angelterre',
 'angleterre',
 'arabie saoudite',
 'argentine',
 'armenie',
 'armenie georgie',
 'australie',
 'australie & nouvelle-zelande',
 'austria',
 'autriche',
 'azerbaddjan',
 'azerbaijan',
 'b',
 'bahrein',
 'bamako',
 'be',
 'begique',
 'belgien',
 'belgique',
 'belgique / france',
 'belgium',
 'benin',
 'benin / niger',
 'benin et niger',
 'bielorussie',
 'birmanie',
 'blegium',
 'bolivie',
 'bosnie-herzegovine',
 'bresil',
 'bresil et argentine',
 'britain',
 'brussels',
 'bruxelles',
 'bulgaria',
 'bulgarie',
 'burkina faso',
 'burundi',
 'burundi, rd congo',
 'burundi, rwanda',
 'buruni, rdcongo, rwanda',
 'cambodge',
 'cameroun',
 'canada',
 'canada & usa',
 'canada (quebec)',
 'canada + usa',
 'canada and usa',
 'canada/etats unis',
 'cap-vert',
 'cardiff',
 'catalogne',
 'chiine',
 'chili',
 'chili (ile de paques)',
 'china',
 'chine',
 '

In [12]:
norm(df, 'ville_origine', 'from_city')
norm(df, 'ville', 'dest_city')

In [16]:
unique(df, ['from_city', 'dest_city'])

{nan,
 'columbia, mo',
 'paris, compicgne, lille',
 'salerne',
 'waterloo',
 'bukarest',
 'bonifacio',
 'perpignan',
 'lausanne',
 'philadelphia et seattle',
 'anvers',
 'coimbra, lisbonne',
 'athcnes + sitia',
 'egligny (f-77126)',
 'warwick',
 'bangor, manchester',
 'buenos aires',
 'pierrefitte-sur-seine',
 'dinant/namur',
 'toulouse, perpignan, montpellier',
 'dakar / ziguinchor',
 'guadalajara',
 'toulouse/montepellier',
 'copenhague + malmo',
 'itanos (crcte orientale)',
 'delhi',
 'san jose',
 'issoire',
 'palma de mallorca',
 'puebla',
 'rabat, fes, meknes',
 'lille et paris',
 'bucarest, targoviste, braila',
 'reykjavik',
 'abomey',
 'washington',
 'karlsruhe',
 'turin',
 'aleria',
 'fribourg',
 'tours et dijon',
 'providence, ri; new york city, ny',
 'leyde',
 'napoli',
 'grainau',
 'poznan',
 'h? chi minh',
 'paris/st-die-des-vosges',
 'denver, co',
 'prais',
 'regensburg',
 'amsterdam',
 'dzaoudi',
 'conakry',
 'la havane, bogota',
 'nancy',
 'palaikastro, sitias (crcte)',


In [14]:
#pd.DataFrame(list(df['origin'].unique()) + list(df['origin'].unique())).drop_duplicates().dropna().to_csv("origin.csv", index=False)

In [15]:
ctrans = pd.read_csv("country_trans.csv", keep_default_na=False)
ctrans['city'] = ctrans['city'].fillna(0).astype(bool)
ctrans.sample(5)

Unnamed: 0,original,ISO2,city
40,senegal,SN,False
2,maroc,MA,False
13,angleterre,GB,False
11,grande-bretagne,GB,False
30,blegium,BE,False


In [11]:
df = df.merge(ctrans, left_on='origin', right_on='original', how='left')
df = df.drop(['origin', 'original'], axis=1).rename(columns={'ISO2': 'country_from'})
df['country_from'] = df['country_from'].fillna('BE')

KeyError: 'origin'

In [None]:
df = df.merge(ctrans, left_on='dest', right_on='original', how='left')
df = df.drop(['dest', 'original'], axis=1).rename(columns={'ISO2': 'country_to'})

In [None]:
df

In [None]:
df.groupby('faculte')['acronyme'].count().plot(kind='bar')

In [None]:
df.groupby('acronyme')['acronyme'].count().plot(kind='bar')

In [None]:
df.groupby('elapsed')['acronyme'].count().plot(kind='hist', bins=30)