# Preprocessing

## Libraries

In [22]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

import warnings
#from shapely.errors import ShapelyDeprecationWarning
#warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

## Create a Prefix Dictionary 
### (with help of Paris Opendata dataset)

In [23]:
voies = pd.read_csv("data/opendata_voie_paris.csv", sep=";")

#apply preprocessing to voies
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]

voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st. "] =  "saint "
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st "] = "saint "
prefix_dict["ste "] = "sainte "
prefix_dict["sts "] = "saints "
prefix_dict["nve "] = "neuve "

In [24]:
prefix_dict

{'r.': 'rue',
 'sq.': 'square',
 'pl.': 'place',
 'av.': 'avenue',
 'imp.': 'impasse',
 'all.': 'allee',
 'pas.': 'passage',
 'ham.': 'hameau',
 'vla.': 'villa',
 'terr.': 'terrasse',
 'rte.': 'route',
 'chem.': 'chemin',
 'qu.': 'quai',
 'sout.': 'souterrain',
 'prom.': 'promenade',
 'bd.': 'boulevard',
 'rpt.': 'rond',
 'port.': 'porte',
 'ptte.': 'placette',
 'gril.': 'grille',
 'gal.': 'galerie',
 'pass.': 'passage',
 'espl.': 'esplanade',
 'car.': 'carrefour',
 'parv.': 'parvis',
 'sent.': 'sentier',
 'crs.': 'cours',
 'v.': 'voie',
 'rle.': 'ruelle',
 'ron.': 'rond',
 'parvis': 'parvis',
 'p.': 'parc',
 'pr.': 'promenade',
 'bass.': 'bassin',
 'ech.': 'echangeur',
 'arc.': 'arcades',
 'chau.': 'chaussee',
 'per.': 'peristyle',
 'pa.': 'patio',
 'belv.': 'belvedere',
 'gav.': 'grande',
 'c.': 'cour',
 'aut.': 'autoroute',
 'jar.': 'jardin',
 'boul.': 'boulevard',
 'boulev.': 'boulevard',
 'boulv.': 'boulevard',
 'q.': 'quai',
 'aven.': 'avenue',
 'faub.': 'faubourg',
 'fau.': 'fau

## Bottin Data

In [25]:
bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", "street")

In [None]:
def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))
# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data before dict substitution: 76549
#abbr. in data after dict substitution: 33155


In [None]:
word_dict = { "n. d.":"notre dame",
             "n. d ": "notre dame ", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "st.honore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "saint g. ": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs",
                "saint germ.": "saint germain",
                "faub.montmartre": "faubourg montmartre",
                "faub.poissonniere": "faubourg poissonniere",
                "j.j. rousseau": "jean jaques rousseau",
                "j. j.rousseau": "jean jaques rousseau",
                "fanb. ": "faubourg ",
                "fauh. ": "faubourg ",
                "faub.. ": "faubourg ",
                "faab. " : "faubourg ",
                "f. saint ": "faubourg saint ",
                "pet. champs": "petits champs",
                "ay.": "avenue",
                "av.de": "avenue de ",
                "r.du ": "rue du ",
                "carref. ": "carrefour ",
                "f. poissoniere": "faubourg poissoniere",
                "saint g.": "saint germain",
                "faub).": "faubourg",
                "taub. ": "faubourg ",
                "vol. taire": "voltaire",
                "faubourg. ": "faubourg ",
                "fd. poissonniere": "faubourg poissoniere",
                "stras. bourg": "strassbourg",
                "saintgerm.": "saint germain",
                "montmart.": "montmartre",
                "bouley. ": "boulevard ",
                "bouly.": "boulevard",
                "pois. sonniere": "poissonniere",
                "pe. tits": "petits",
                "boul. ": "boulevard ",
                "poissonn.": "poissonniere",
                "f. poissonniere": "faubourg poissonniere",
                "f. montmartre": "faubourg montmartre",
                "fb. ": "faubourg ",
                "haub. ": "faubourg ",
                "j. j. pousseau": "jean jaques rousseau",
                "houl. ": "boulevard ",
                "montmar. tre": "montmartre",
                "r.de": "rue de",
                "r.des": "rue des",
                "pass.du": "passage du",
                "boul.": "boulevard ",
                "faub.": "faubourg ",
                "laub. ": "faubourg ",
                "faub.": "faubourg ",
                "faub..": "faubourg ",
                "fauh.": "faubourg ",
                "fauh.. ": "faubourg ",
                "faul).": "faubourg",
                "r.st": "rue saint",
                "alle. magne": "allemagne",
                "faub.st.": "faubourg saint",
                "r.": "rue ",
                "faubourgdu.": "faubourg du ",
                "ams. terdam": "amsterdam",
                "boul.de ": "boulevard de ",
                "j j. rousseau": "jean jaques rousseau",
                "j. j.. rousseau": "jean jaques rousseau",
                "j. j rousseau": "jean jaques rousseau",
                "faul.": "faubourg ",
                "av.": "avenue ",
                "montm.": "montmartre",
                "mont. martre": "montmartre",
                "petites.": "petites ",
                "petits.": "petits",
                "hauss. mann": "haussmann",
                "tem. ple": "temple",
                "boul.": "boulevard ",
                "faub..du": "faubourg du ",
                "riche. lieu": "richelieu",
                "b. bonne nouvelle": "boulevard bonne nouvelle",
                "b. bonne. nouvelle": "boulevard bonne nouvelle",
                "b. bonne nouv.": "boulevard bonne nouvelle",
                "chauss.": "chaussee ",
                "che. min": "chemin",
                "impass. ": "impasse",

                
                }

# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], word_dict)
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data after dict substitution: 29948


In [None]:
#replace double spaces with one space
bottins["street"] = bottins["street"].replace({"  ":" "}, regex=True)

In [None]:
Counter([x for x in bottins["street"] if "." in x]).most_common()

[('n.d. de nazareth', 214),
 ("saint germain l'aux.", 155),
 ('j. j.\\ rousseau', 134),
 ('gren. saint germain', 130),
 ('rue n.d. de nazareth', 121),
 ('n.d. des victoires', 109),
 ('saint nicolas saint ant.', 106),
 ('f. du temple', 95),
 ('montagne sainte gen.', 94),
 ('faubourg saint ant.', 84),
 ('grenelle saint hon.', 83),
 ('boulevard se. bastopol', 79),
 ("chaussee. d'antin", 78),
 ('m. le prince', 78),
 ('b. beaumarchais', 73),
 ('sainte marguerite saint ant.', 73),
 ('cherche. midi', 70),
 ('vaugi. rard', 69),
 ('rue n.d. des champs', 69),
 ('sts. peres', 67),
 ("fosses saint germain l'aux.", 66),
 ('boulevard sebas. topol', 65),
 ('grenelle saint h.', 64),
 ("ch. d'antin", 63),
 ('n.d. de lorette', 63),
 ('faubourg saint an. toine', 62),
 ('bourbon villen.', 61),
 ('traversiere saint ant.', 60),
 ('paradis. poissonniere', 58),
 ('boulevard de la vil. lette', 58),
 ('nve. des petits champs', 57),
 ('faubourg du. temple', 54),
 ('bercy saint ant.', 54),
 ('sainte croix de la b

In [None]:
Counter([x for x in bottins["street"] if "  " in x])

Counter({'paul\\  lelong': 3,
         'marais\\  saint martin': 21,
         'faubourg\\  poissonniere': 783,
         'croix des\\  petits champs': 38,
         'faubourg\\  saint honore': 185,
         'faubourg\\  montmartre': 471,
         'vieille du\\  temple': 9,
         'boulevard\\  beaumarchais': 4,
         'michel\\  lo comte': 1,
         "l'auberge\\  poissonniere": 3,
         'grenelle\\  saint germain': 51,
         'neuve des\\ petits\\  champs': 6,
         'neuve fontaine\\  saintgeorges': 5,
         'faubourg\\  saint antoine': 227,
         'bons\\  enfants': 17,
         'saint\\  antoine': 13,
         'faubourg\\  saint martin': 309,
         'faubourg\\ saint\\  antoine': 8,
         'faubourg\\  du temple': 128,
         'faubourg\\  saint denis': 207,
         'n.\\  d. des champs': 6,
         'faubourg\\  saint\\  antoine': 22,
         "pretres\\  saint germanl'auxerrois": 1,
         'place du marche\\  saintjean': 10,
         'f.\\  saint antoine': 

In [None]:
#save prepared data
bottins.to_pickle("data/bottins_prep.pkl")
