# Libraries

In [76]:
import pandas as pd
import numpy as np
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

import warnings
try:
    from shapely.errors import ShapelyDeprecationWarning
    warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 
except:
    print("Couldn't import ShapelyDeprecationWarning")



(with help of Paris Opendata dataset)

# Bottin Data

In [77]:
Bottins = pd.read_csv("data/strict_addressing.csv")
Bottins["Rue"] = Bottins["Rue"].fillna("")
Bottins = preprocess(Bottins, "Rue", "street")

Bottins = Bottins.rename(columns={"Rue":"rue","street":"rue_processed","Nom":"nom","Métier":"metier","Numéro": "numero"})

In [78]:
Bottins.head(10)

Unnamed: 0.1,Unnamed: 0,page,row,nom,metier,rue,numero,annee,rue_processed
0,bpt6k6282019m,144,0,Aaron,bronzes,passage Choiseal,72 et 74.,1855,passage choiseal
1,bpt6k6282019m,144,1,Aaron (Mic.),manuf. de porcelaines,Bondy,30.,1855,bondy
2,bpt6k6282019m,144,3,Abadie,architecte,Provence,7.,1855,provence
3,bpt6k6282019m,144,5,Abadie,tabac et estamin.,Ménilmontant,158.,1855,menilmontant
4,bpt6k6282019m,144,6,Abanse,instituteur,Sts-Pères,30.,1855,sts peres
5,bpt6k6282019m,144,7,Abat,rentier,Isly,10.,1855,isly
6,bpt6k6282019m,144,13,Abault et Coudray,charpentiers,Corbeau,23.,1855,corbeau
7,bpt6k6282019m,144,14,Abault (Paul),libraire,quai des Angustins,9.,1855,quai des angustins
8,bpt6k6282019m,144,15,Abavid,vins,Beaujolais-da-Temple,7.,1855,beaujolais da temple
9,bpt6k6282019m,144,16,Abazaer (Are),cristaux et porcelaines,Pei.Ecuries,26.,1855,pei.ecuries


## prefix substitution

#### create a prefix dictionary 


In [79]:
#import the paris opendata dataset
voies_raw = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies_raw.copy()[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN", "Geometry"]]
voies.rename(columns = {'N_SQ_VO':'id2022'}, inplace = True)

#apply preprocessing to voies
#three different names for the streetnames
#example: L_VOIE: Malmaisons; L_COURTMIN: R. des Malmaisons; L_LONGMIN: Rue des Malmaisons
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

In [80]:
#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    '''
    Takes a row of a dataframe and returns the difference between the entries of the columns
    "long" and "court".

    Parameters:
    ----------------
    :row: row (with column names) of a dataframe
    :court: string
        name of column with shorter entry (has to coincide with end of string of column "long")
    :long: string
        name of column with longer entry

    Returns:
    -----------------
    string of difference between court and long entry of row
    '''
    return row[long].split(row[court])[0]

# get prefixes, both abbreviated and long versions
voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
# candidate dictionary of all the prefixes
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it (otherwise would risk to get part of a name, not type of street)
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

# add prefixes by hand (after look into data)
prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st."] =  "saint"
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st"] = "saint"
prefix_dict["ste"] = "sainte"
prefix_dict["sts"] = "saints"
prefix_dict["nve"] = "neuve"

In [81]:
#take a look at the first entries of the dictionary
for i, item in enumerate(prefix_dict.items()):
    print(item)
    if i>9:
        break

('r.', 'rue')
('sq.', 'square')
('pl.', 'place')
('av.', 'avenue')
('imp.', 'impasse')
('all.', 'allee')
('pas.', 'passage')
('ham.', 'hameau')
('vla.', 'villa')
('terr.', 'terrasse')
('rte.', 'route')


#### substitute prefixes

In [82]:
# get number of abbreviations before substitution
print("#abbr. in data before dict substitution:", len([street for street in list(Bottins["rue_processed"]) if "." in street]))
# substitute abbreviations in bottin data
Bottins["rue_processed"] = substitute_col_by_dict(Bottins["rue_processed"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len([street for street in list(Bottins["rue_processed"]) if "." in street]))

#abbr. in data before dict substitution: 1544991
#abbr. in data after dict substitution: 106824


## manual substitution

#### create manual dictionary

In [83]:
#closer look into data -> substitute more abbreviations
manual_substitution = { 


                "alle. magne": "allemagne",
                "ams. terdam": "amsterdam",
                "av.": "avenue ",
                "av.de": "avenue de ",
                "ay.": "avenue",
                "b. beaumarchais": "boulevard beaumarchais",
                "b. bonne nouv.": "boulevard bonne nouvelle",
                "b. bonne nouvelle": "boulevard bonne nouvelle",
                "b. bonne. nouvelle": "boulevard bonne nouvelle",
                "b. du temple": "boulevard du temple",
                "b. poissonniere": "boulevard poissonniere",
                "boul. ": "boulevard ",
                "boul.": "boulevard ",
                "boul.": "boulevard ",
                "boul.de ": "boulevard de ",
                "bouley. ": "boulevard",
                "bouly.": "boulevard",
                "bourb. villeneuve": "rue bourbon villeneuve",
                "bourbon villen.": "rue bourbon villeneuve",
                "bretonn.": "bretonnerie",
                "carref. ": "carrefour ",
                "ch. d'antin": "rue de la chaussee d'antin",
                "chauss.": "chaussee ",
                "chaussee d'antin": "rue de la chaussee d'antin",
                "chaussee.": "chaussee",
                "che. min": "chemin",
                "cherche midi": "rue du cherche midi",
                "cherche. midi": "rue du cherche midi",
                "dame de": "damede ",
                "dame.": "dame",
                "denazareth": "de nazareth",
                "dutemple": "du temple",
                "echi. quier": "rue de l'echiquier",
                "ecole de med.": "rue de l'ecole de medecine",
                "eglise.": "eglise",
                "f. du temple": "faubourg du temple",
                "f. montmartre": "faubourg montmartre",
                "f. poissoniere": "faubourg poissoniere",
                "f. poissonniere": "faubourg poissonniere",
                "f. saint ": "faubourg saint",
                "faab. " : "faubourg",
                "fanb. ": "faubourg",
                "faub ": "faubourg",
                "faub.. ": "faubourg",
                "faub..": "faubourg ",
                "faub..du": "faubourg du ",
                "faub.": "faubourg ",
                "faub.": "faubourg ",
                "faub.du ": "faubourg du ",
                "faub.montmartre": "faubourg montmartre",
                "faub.poissonniere": "faubourg poissonniere",
                "faub.st.": "faubourg saint",
                "faub.st": "faubourg saint",
                "faub).": "faubourg",
                "faubourg du. temple": "rue du faubourg du temple",
                "faubourg saint an. toine": "rue du faubourg saint antoine",
                "faubourg saint ant.": "rue du faubourg saint antoine",
                "faubourg. ": "faubourg",
                "faubourg..": "faubourg",
                "faubourg.": "faubourg",
                "faubourg.du": "faubourg du",
                "faubourgdu.": "faubourg du ",
                "fauh. ": "faubourg",
                "fauh.. ": "faubourg ",
                "fauh.": "faubourg ",
                "faul.": "faubourg ",
                "faul).": "faubourg",
                "fb. ": "faubourg",
                "fd. poissonniere": "faubourg poissoniere",
                "germain l'aux.": "germain l\'auxerrois",
                "grande.": "grande",
                "haub. ": "faubourg",
                "hauss. mann": "haussmann",
                "houl. ": "boulevard ",
                "impass. ": "impasse",
                "j j. rousseau": "jean jaques rousseau",
                "j. j rousseau": "jean jaques rousseau",
                "j. j.   rousseau": "jean jaques rousseau",
                "j. j.  rousseau": "jean jaques rousseau",
                "j. j. pousseau": "jean jaques rousseau",
                "j. j. r": "jean jaques rousseau", 
                "j. j. rousseau": "jean jaques rousseau",
                "j. j.. rousseau": "jean jaques rousseau",
                "j. j.rousseau": "jean jaques rousseau",
                "j.j. rousseau": "jean jaques rousseau",
                "lafayette": "la fayette",
                "laub. ": "faubourg ",
                "le. compte": "le compte",
                "m. le prince": "rue monsieur le prince",
                "ma. genta": "magenta",
                "mar| tin": "martin",
                "meri. court": "mericourt",
                "mons. le prince": "rue monsieur le prince",
                "mont. martre": "montmartre",
                "montagne sainte gen.": "rue de la montagne sainte genevieve",
                "montm.": "montmartre",
                "montmar. tre": "montmartre",
                "montmart.": "montmartre",
                "n. d ": "notre dame",
                "n. d.":"notre dame",
                "n. da": "notre da",
                "n. de nazareth": "notre dame de nazareth",
                "n.d.": "notre dame",
                "naza. reth": "nazareth",
                "neuve. des petits champs": "rue neuve des petits champs",
                "notre d. de": "notre dame de",
                "notre damede naza. reth": "rue notre dame de nazareth",
                "pass.du": "passage du",
                "pe. tits": "petits",
                "pet. champs": "petits champs",
                "pet. ecuries":"petites ecuries",
                "petites.": "petites ",
                "petitesecuries": "petites ecuries",
                "petits.": "petits",
                "petitschamps": "petits champs",
                "pois. sonniere": "poissonniere",
                "poiss.": "poissonniere",
                "poissonn.": "poissonniere",
                "r.de": "rue de",
                "r.des": "rue des",
                "r.du ": "rue du ",
                "r.st": "rue saint",
                "rambu. teau": "rambouteau",
                "riche. lieu": "richelieu",
                "rue de vaugi. rard": "rue de vaugirard",
                "saint ant.": "saint antoine",
                "saint g. ": "saint germain",
                "saint g.": "saint germain",
                "saint germ.": "saint germain",
                "saint hon.": "saint honore",
                "saintgerm.": "saint germain",
                "saints. peres": "rue des saints peres",
                "se. basaintopol": "sebastopol",
                "sebas. topol": "sebastopol",
                "st.honore": "saint honore",
                "stdenis": "saint denis",
                "stgermain": "saint germain",
                "sthonore": "saint honore",
                "stmartin": "saint martin",
                "stmichel": "saint michel",
                "stras. bourg": "strassbourg",
                "taub. ": "faubourg",
                "tem. ple": "temple",
                "vaugi. rard": "vaugirard",
                "vi. vienne": "vivienne",
                "vil. lette": "villette",
                "vol. taire": "voltaire",
                "n.  d. de": "notre dame de",
                "b. poissonniere": "boulevard poissonniere",
                "ri. voli": "rue de rivoli",
                "croix des petits ch.": "rue croix des petits champs",
                "males. herbes": "malesherbes",
                "poisson. niere": "poissonniere",
                "rous. seau": "rousseau",
                "monsieur le. prince": "monsieur le prince",
                }


#### substitute

In [84]:
# substitute abbreviations
Bottins["rue_processed"] = substitute_col_by_dict(Bottins["rue_processed"], manual_substitution)
print("#abbr. in data after dict substitution:", len([street for street in list(Bottins["rue_processed"]) if "." in street]))

#abbr. in data after dict substitution: 49074


#### show streets with remaining prefixes

In [85]:
#print most common abbreviations which are left
Counter([x for x in Bottins["rue_processed"] if "." in x]).most_common(10)

[('gren. saint germain', 128),
 ('boulevard de saintras. bourg', 80),
 ('grenelle saint h.', 64),
 ("fosses saint germain l'aux.", 60),
 ('paradis. poissonniere', 58),
 ('grenelle. saint germain', 48),
 ('ferme des math.', 46),
 ('angouleme du t.', 43),
 ('michel le. comte', 43),
 ('saint domin. saint germain', 40)]

In [89]:
#replace double spaces with one space
Bottins["rue_processed"] = Bottins["rue_processed"].str.replace("  ", " ", regex=True)

#check if there are double spaces left
Counter([x for x in Bottins["rue_processed"] if "  " in x])

Counter({'10.  a lyon. notre damede s victoires': 1, 'faab  tempie': 1})

## Save preprocessed datasets

In [90]:
Bottins.to_pickle("data/bottins_prep.pkl")