# Preprocessing

## Libraries

In [2]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

## Create a prefix dictionary 


(with help of Paris Opendata dataset)

In [2]:
voies_raw = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies_raw.copy()[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN", "Geometry"]]
voies.rename(columns = {'N_SQ_VO':'id2022'}, inplace = True)

#apply preprocessing to voies
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

In [10]:
#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]

voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st."] =  "saint"
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st"] = "saint"
prefix_dict["ste"] = "sainte"
prefix_dict["sts"] = "saints"
prefix_dict["nve"] = "neuve"

## Bottin Data

In [10]:
Bottins = pd.read_csv("data/strict_addressing.csv")
Bottins["Rue"] = Bottins["Rue"].fillna("")
Bottins = preprocess(Bottins, "Rue", "street")


In [11]:
Bottins = Bottins.rename(columns={"Rue":"rue","street":"rue_processed","Nom":"nom","Métier":"metier","Numéro": "numero", "annee":"year"})

In [12]:
Bottins.head(10)

Unnamed: 0.1,Unnamed: 0,page,row,nom,metier,rue,numero,year,rue_processed
0,bpt6k6282019m,144,0,Aaron,bronzes,passage Choiseal,72 et 74.,1855,passage choiseal
1,bpt6k6282019m,144,1,Aaron (Mic.),manuf. de porcelaines,Bondy,30.,1855,bondy
2,bpt6k6282019m,144,3,Abadie,architecte,Provence,7.,1855,provence
3,bpt6k6282019m,144,5,Abadie,tabac et estamin.,Ménilmontant,158.,1855,menilmontant
4,bpt6k6282019m,144,6,Abanse,instituteur,Sts-Pères,30.,1855,sts peres
5,bpt6k6282019m,144,7,Abat,rentier,Isly,10.,1855,isly
6,bpt6k6282019m,144,13,Abault et Coudray,charpentiers,Corbeau,23.,1855,corbeau
7,bpt6k6282019m,144,14,Abault (Paul),libraire,quai des Angustins,9.,1855,quai des angustins
8,bpt6k6282019m,144,15,Abavid,vins,Beaujolais-da-Temple,7.,1855,beaujolais da temple
9,bpt6k6282019m,144,16,Abazaer (Are),cristaux et porcelaines,Pei.Ecuries,26.,1855,pei.ecuries


In [13]:
def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(Bottins["rue_processed"])])))
# substitute abbreviations
Bottins["rue_processed"] = substitute_col_by_dict(Bottins["rue_processed"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(Bottins["rue_processed"])])))

#abbr. in data before dict substitution: 76549


NameError: name 'prefix_dict' is not defined

In [14]:
manual_substitution = { "n. d.":"notre dame",
             "n. d ": "notre dame", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "st.honore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "saint g. ": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs",
                "saint germ.": "saint germain",
                "faub.montmartre": "faubourg montmartre",
                "faub.poissonniere": "faubourg poissonniere",
                "j.j. rousseau": "jean jaques rousseau",
                "j. j.rousseau": "jean jaques rousseau",
                "fanb. ": "faubourg",
                "fauh. ": "faubourg",
                "faub.. ": "faubourg",
                "faab. " : "faubourg",
                "f. saint ": "faubourg saint",
                "pet. champs": "petits champs",
                "ay.": "avenue",
                "av.de": "avenue de ",
                "r.du ": "rue du ",
                "carref. ": "carrefour ",
                "f. poissoniere": "faubourg poissoniere",
                "saint g.": "saint germain",
                "faub).": "faubourg",
                "taub. ": "faubourg",
                "vol. taire": "voltaire",
                "faubourg. ": "faubourg",
                "fd. poissonniere": "faubourg poissoniere",
                "stras. bourg": "strassbourg",
                "saintgerm.": "saint germain",
                "montmart.": "montmartre",
                "bouley. ": "boulevard",
                "bouly.": "boulevard",
                "pois. sonniere": "poissonniere",
                "pe. tits": "petits",
                "boul. ": "boulevard ",
                "poissonn.": "poissonniere",
                "f. poissonniere": "faubourg poissonniere",
                "f. montmartre": "faubourg montmartre",
                "fb. ": "faubourg",
                "haub. ": "faubourg",
                "j. j. pousseau": "jean jaques rousseau",
                "houl. ": "boulevard ",
                "montmar. tre": "montmartre",
                "r.de": "rue de",
                "r.des": "rue des",
                "pass.du": "passage du",
                "boul.": "boulevard ",
                "faub.": "faubourg ",
                "laub. ": "faubourg ",
                "faub.": "faubourg ",
                "faub..": "faubourg ",
                "fauh.": "faubourg ",
                "fauh.. ": "faubourg ",
                "faul).": "faubourg",
                "r.st": "rue saint",
                "alle. magne": "allemagne",
                "faub.st.": "faubourg saint",
                "faubourgdu.": "faubourg du ",
                "ams. terdam": "amsterdam",
                "boul.de ": "boulevard de ",
                "j j. rousseau": "jean jaques rousseau",
                "j. j.. rousseau": "jean jaques rousseau",
                "j. j rousseau": "jean jaques rousseau",
                "faul.": "faubourg ",
                "av.": "avenue ",
                "montm.": "montmartre",
                "mont. martre": "montmartre",
                "petites.": "petites ",
                "petits.": "petits",
                "hauss. mann": "haussmann",
                "tem. ple": "temple",
                "boul.": "boulevard ",
                "faub..du": "faubourg du ",
                "riche. lieu": "richelieu",
                "b. bonne nouvelle": "boulevard bonne nouvelle",
                "b. bonne. nouvelle": "boulevard bonne nouvelle",
                "b. bonne nouv.": "boulevard bonne nouvelle",
                "chauss.": "chaussee ",
                "che. min": "chemin",
                "impass. ": "impasse",

                
                }


In [None]:

# substitute abbreviations
Bottins["street"] = substitute_col_by_dict(Bottins["street"], manual_substitution)
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(Bottins["street"])])))

In [8]:
Counter([x for x in Bottins["street"] if "." in x]).most_common()

[('saint maur popinc.', 244),
 ('n.d. de nazareth', 216),
 ("saint germain l'aux.", 176),
 ('gren. saint germain', 140),
 ('rue n.d. de nazareth', 126),
 ('n.d. des victoires', 112),
 ('saint nicolas saint ant.', 106),
 ('f. du temple', 99),
 ("saint germain l'auxerr.", 95),
 ('montagne sainte gen.', 94),
 ('saint maur pop.', 88),
 ('grenelle saint hon.', 83),
 ('faubourg saint ant.', 79),
 ('m. le prince', 79),
 ('boulevard se. bastopol', 79),
 ('montagne sainte genev.', 78),
 ("chaussee. d'antin", 78),
 ('anc. comedie', 77),
 ('b. beaumarchais', 73),
 ('sainte marguerite saint ant.', 73),
 ("fosses saint germain l'aux.", 71),
 ('lavandieres sainte opp.', 71),
 ('cherche. midi', 70),
 ('vaugi. rard', 69),
 ('rue n.d. des champs', 69),
 ('sts. peres', 67),
 ('grenelle saint h.', 64),
 ("ch. d'antin", 64),
 ('boulevard sebas. topol', 64),
 ('mons. le prince', 63),
 ('n.d. de lorette', 63),
 ('traversiere saint ant.', 62),
 ('bourbon villen.', 61),
 ('faubourg saint an. toine', 61),
 ('n

#TODO hier noch letztes Preprocessing, wo mehrere Leerzeichen zu einem gemacht werden

## Group on short streetnames

In [20]:
# group streets based on their short name
grouped_streets = streets.groupby("streetname_short", as_index=False).agg({"id1791": list, "id2022": list, "streetname": list})

In [31]:
# split streets in those that are unique and those that aren't
unique_short_streets = grouped_streets[grouped_streets['streetname'].str.len() == 1]
unique_short_streets[["id1791", "id2022", "streetname"]] = unique_short_streets[["id1791", "id2022", "streetname"]].apply(lambda x: x[0])
multiple_short_streets = grouped_streets[grouped_streets['streetname'].str.len() > 1]

print(f"#streets with unique short streetname: {len(unique_short_streets)}, not unique: {len(multiple_short_streets)}")
multiple_short_streets.head(3)

#streets with unique short streetname: 5475, not unique: 817


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,streetname_short,id1791,id2022,streetname,geometry,Geometry
26,abbaye,"[nan, nan]","[750000427.0, 750005073.0]","[chemin de l'abbaye, rue de l'abbaye]","[None, None]","[{""coordinates"": [[2.234627741073616, 48.86518..."
43,abbesses,"[nan, nan, nan]","[750004135.0, 750004136.0, 750004137.0]","[passage des abbesses, place des abbesses, rue...","[None, None, None]","[{""coordinates"": [[2.33786331798002, 48.884676..."
58,acacias,"[nan, nan]","[750002087.0, 750006300.0]","[passage des acacias, rue des acacias]","[None, None]","[{""coordinates"": [[2.294369320857079, 48.87768..."


# TODO check overlapping geodata streets (above)
multiple_short_streets: same short streetname, check for overlaping geodata in streetname

In [22]:
unique_short_streets.head(3)

Unnamed: 0,streetname_short,id1791,id2022,streetname,geometry,Geometry
0,2 hermites,414.0,,rue des 2 hermites,[LINESTRING (652250.1766987587 6861766.4676413...,[nan]
1,22 novembre 1943,,,,[None],"[{""coordinates"": [[2.2936269461097294, 48.8856..."
2,260 enfants,,,,[None],"[{""coordinates"": [[2.358687508146259, 48.85783..."


## Save preprocessed datasets

In [19]:
Bottins.to_pickle("data/bottins_prep.pkl")
streets.to_pickle("data/streets_prep.pkl")
unique_short_streets.to_pickle("data/unique_short_streets.pkl")
multiple_short_streets.to_pickle("data/not_unique_short_streets.pkl")

In [26]:
streets

Unnamed: 0,id1791,nom_voie,geometry,unprocessed_voie,voie_long,voie_short,id2022,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,street_short,street_abbr,street_long,source,streetname,streetname_short
0,1162.0,deux Boules,"LINESTRING (651945.401 6862326.085, 651971.801...",rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,"{""coordinates"": [[2.346201988121505, 48.858853...",deux boules,r. des deux boules,rue des deux boules,both,rue des deux boules,deux boules
1,1531.0,Jean Lantier,"LINESTRING (651937.681 6862294.281, 651976.788...",rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,"{""coordinates"": [[2.3473051648434904, 48.85828...",jean lantier,r. jean lantier,rue jean lantier,both,rue jean lantier,jean lantier
2,3.0,Orfèvres,"LINESTRING (651951.576 6862209.676, 651976.788...",rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,"{""coordinates"": [[2.3451033049875294, 48.85822...",orfevres,r. des orfevres,rue des orfevres,both,rue des orfevres,orfevres
3,4.0,mauvaises Paroles,"LINESTRING (651944.783 6862381.819, 652004.994...",rue des mauvaises Paroles,rue des mauvaises paroles,mauvaises paroles,,,,,,,,,1791,rue des mauvaises paroles,mauvaises paroles
4,5.0,Plat d'Etain,"LINESTRING (652023.830 6862377.342, 652048.223...",rue du Plat d'Etain,rue du plat d'etain,plat d'etain,750005668.0,PLAT D'ETAIN,R. du Plat d'Etain,Rue du Plat d'Etain,"{""coordinates"": [[2.346726151545538, 48.859334...",plat d'etain,r. du plat d'etain,rue du plat d'etain,both,rue du plat d'etain,plat d'etain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7447,,,,,,,750005143.0,PETITOT,R. Petitot,Rue Petitot,"{""coordinates"": [[2.393098441596134, 48.876375...",petitot,r. petitot,rue petitot,2022,rue petitot,petitot
7448,,,,,,,750006122.0,PAUL-HENRI GRAUWIN,R. Paul-Henri Grauwin,Rue Paul-Henri Grauwin,"{""coordinates"": [[2.378008624929552, 48.844481...",paul henri grauwin,r. paul henri grauwin,rue paul henri grauwin,2022,rue paul henri grauwin,paul henri grauwin
7449,,,,,,,750006676.0,BF/17,Voie Bf/17,Voie Bf/17,"{""coordinates"": [[2.2817887366556784, 48.87897...",bf/17,voie bf/17,voie bf/17,2022,voie bf/17,bf/17
7450,,,,,,,750003865.0,CHARLES CROS,R. Charles Cros,Rue Charles Cros,"{""coordinates"": [[2.4074192235635143, 48.87642...",charles cros,r. charles cros,rue charles cros,2022,rue charles cros,charles cros
