# Preprocessing

## Libraries

In [43]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

import warnings
#from shapely.errors import ShapelyDeprecationWarning
#warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

## Create a Prefix Dictionary 
### (with help of Paris Opendata dataset)

In [44]:
#import the paris opendata dataset
voies = pd.read_csv("data/opendata_voie_paris.csv", sep=";")

#apply preprocessing to voies
#three different names for the streetnames
#example: L_VOIE: Malmaisons; L_COURTMIN: R. des Malmaisons; L_LONGMIN: Rue des Malmaisons
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    '''
    Takes a row of a dataframe and returns the difference between the entries of the columns
    "long" and "court".

    :row: row (with column names) of a dataframe
    :court: string, name of column with shorter entry (has to coincide with end of string of column "long")
    :long: string, name of column with longer entry

    :return: string of difference between court and long entry of row
    '''
    return row[long].split(row[court])[0]

# get prefixes, both abbreviated and long versions
voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
# candidate dictionary of all the prefixes
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it (otherwise would risk to get part of a name, not type of street)
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

# add prefixes by hand (after look into data)
prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st. "] =  "saint "
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st "] = "saint "
prefix_dict["ste "] = "sainte "
prefix_dict["sts "] = "saints "
prefix_dict["nve "] = "neuve "

In [45]:
voies.head(3)

Unnamed: 0,N_SQ_VO,C_COINSEE,C_DESI,C_LIAISON,L_VOIE,L_COURTMIN,L_LONGMIN,C_VOIE,B_FANTOIR,B_OFF,...,OBJECTID,C_DOMOFF,N_SQ_CO,LENGTH,Geometry X Y,street_short,street_abbr,street_long,prefix_court,prefix_long
0,750003123,75056,RUE,DES,MALMAISONS,R. des Malmaisons,Rue des Malmaisons,750565966,O,O,...,31,P,750001537,198.700914,"48.8214449824661,2.362110788580267",malmaisons,r. des malmaisons,rue des malmaisons,r. des,rue des
1,750005984,75056,RUE,,BUDE,R. Budé,Rue Budé,750561365,O,O,...,41,P,750001537,92.558635,"48.85171497797937,2.355557881870504",bude,r. bude,rue bude,r.,rue
2,750006284,75056,SQ,DE L',AVENUE FOCH,Sq. de l'Avenue Foch,Square de l'Avenue Foch,750560603,O,O,...,43,V,750001537,283.658466,"48.873171685606714,2.278467676214657",avenue foch,sq. de l'avenue foch,square de l'avenue foch,sq. de l',square de l'


In [46]:
#take a look at the first entries of the dictionary
for i, item in enumerate(prefix_dict.items()):
    print(item)
    if i>9:
        break

('r.', 'rue')
('sq.', 'square')
('pl.', 'place')
('av.', 'avenue')
('imp.', 'impasse')
('all.', 'allee')
('pas.', 'passage')
('ham.', 'hameau')
('vla.', 'villa')
('terr.', 'terrasse')
('rte.', 'route')


## Bottin Data

In [47]:
#load bottin data and preprocess it
bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", "street")

In [48]:
# get number of abbreviations before substitution
print("#abbr. in data before dict substitution:", len([street for street in list(bottins["street"]) if "." in street]))
# substitute abbreviations in bottin data
bottins["street"] = substitute_col_by_dict(bottins["street"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len([street for street in list(bottins["street"]) if "." in street]))

#abbr. in data before dict substitution: 1544991
#abbr. in data after dict substitution: 107280


In [49]:
#closer look into data -> substitute more abbreviations
word_dict = { "n. d.":"notre dame",
             "n. d ": "notre dame ", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "st.honore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "saint g. ": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs",
                "saint germ.": "saint germain",
                "faub.montmartre": "faubourg montmartre",
                "faub.poissonniere": "faubourg poissonniere",
                "j.j. rousseau": "jean jaques rousseau",
                "j. j.rousseau": "jean jaques rousseau",
                "fanb. ": "faubourg ",
                "fauh. ": "faubourg ",
                "faub.. ": "faubourg ",
                "faab. " : "faubourg ",
                "f. saint ": "faubourg saint ",
                "pet. champs": "petits champs",
                "ay.": "avenue",
                "av.de": "avenue de ",
                "r.du ": "rue du ",
                "carref. ": "carrefour ",
                "f. poissoniere": "faubourg poissoniere",
                "saint g.": "saint germain",
                "faub).": "faubourg",
                "taub. ": "faubourg ",
                "vol. taire": "voltaire",
                "faubourg. ": "faubourg ",
                "fd. poissonniere": "faubourg poissoniere",
                "stras. bourg": "strassbourg",
                "saintgerm.": "saint germain",
                "montmart.": "montmartre",
                "bouley. ": "boulevard ",
                "bouly.": "boulevard",
                "pois. sonniere": "poissonniere",
                "pe. tits": "petits",
                "boul. ": "boulevard ",
                "poissonn.": "poissonniere",
                "f. poissonniere": "faubourg poissonniere",
                "f. montmartre": "faubourg montmartre",
                "fb. ": "faubourg ",
                "haub. ": "faubourg ",
                "j. j. pousseau": "jean jaques rousseau",
                "houl. ": "boulevard ",
                "montmar. tre": "montmartre",
                "r.de": "rue de",
                "r.des": "rue des",
                "pass.du": "passage du",
                "boul.": "boulevard ",
                "faub.": "faubourg ",
                "laub. ": "faubourg ",
                "faub.": "faubourg ",
                "faub..": "faubourg ",
                "fauh.": "faubourg ",
                "fauh.. ": "faubourg ",
                "faul).": "faubourg",
                "r.st": "rue saint",
                "alle. magne": "allemagne",
                "faub.st.": "faubourg saint",
                "r.": "rue ",
                "faubourgdu.": "faubourg du ",
                "ams. terdam": "amsterdam",
                "boul.de ": "boulevard de ",
                "j j. rousseau": "jean jaques rousseau",
                "j. j.. rousseau": "jean jaques rousseau",
                "j. j rousseau": "jean jaques rousseau",
                "faul.": "faubourg ",
                "av.": "avenue ",
                "montm.": "montmartre",
                "mont. martre": "montmartre",
                "petites.": "petites ",
                "petits.": "petits",
                "hauss. mann": "haussmann",
                "tem. ple": "temple",
                "boul.": "boulevard ",
                "faub..du": "faubourg du ",
                "riche. lieu": "richelieu",
                "b. bonne nouvelle": "boulevard bonne nouvelle",
                "b. bonne. nouvelle": "boulevard bonne nouvelle",
                "b. bonne nouv.": "boulevard bonne nouvelle",
                "chauss.": "chaussee ",
                "che. min": "chemin",
                "impass. ": "impasse",

                
                }

# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], word_dict)
print("#abbr. in data after dict substitution:", len([street for street in list(bottins["street"]) if "." in street]))

#abbr. in data after dict substitution: 54957


In [50]:
#print most common abbreviations which are left
Counter([x for x in bottins["street"] if "." in x]).most_common(10)

[('n.d. de nazareth', 214),
 ("saint germain l'aux.", 155),
 ('j. j.  rousseau', 134),
 ('gren. saint germain', 130),
 ('rue n.d. de nazareth', 121),
 ('n.d. des victoires', 109),
 ('saint nicolas saint ant.', 106),
 ('f. du temple', 95),
 ('montagne sainte gen.', 94),
 ('faubourg saint ant.', 84)]

In [54]:
#replace double spaces with one space
bottins["street"] = bottins["street"].replace({"  ":" "}, regex=True)

In [55]:
#check if there are double spaces left
Counter([x for x in bottins["street"] if "  " in x])

Counter({'10.  a lyon. notre damede s victoires': 1, 'faab  tempie': 1})

In [56]:
#save prepared data
bottins.to_pickle("data/bottins_prep.pkl")
