# Preprocessing

## Libraries

In [2]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

## Street data

#### 2022 data

In [7]:
voies_raw = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies_raw.copy()[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN", "Geometry"]]
voies.rename(columns = {'N_SQ_VO':'id2022'}, inplace = True)

#apply preprocessing to voies
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

#### 1791 data

In [8]:
old = gpd.read_file("data/1791.zip", encoding='utf-8')
#old = old.drop(["geometry"], axis=1)
# drop rows that don't have street names
old = old.dropna(subset=["nom_voie"])
old.rename(columns = {'id':'id1791'}, inplace = True)
# replace None values by string ""
old = old.fillna("")
# create column with one long streetname
old["unprocessed_voie"] = old["type_voie"] + " " + old["particule"] + " " + old["nom_voie"]
old = old.drop(["type_voie", "particule"], axis=1)
old = preprocess(old, "unprocessed_voie", new_colname="voie_long")
old = preprocess(old, "nom_voie", new_colname="voie_short")

In [9]:
streets = old.merge(voies, how='outer', left_on="voie_long", right_on="street_long")
print("#total:", len(streets), "#old:", len(old), "#new:", len(voies), "#both old and new:", len(streets.dropna()))

# add a column to know where the data came from
streets["source"] = "1791"
streets.loc[streets["id2022"].notna(), "source"] = "2022"
streets.loc[(streets["id1791"].notna() & streets["id2022"].notna()), "source"] = "both"

# add both a column with (common) long and short streetnames
streets["streetname"] = streets["voie_long"].fillna(streets["street_long"])
streets["streetname_short"] = streets["street_short"]
streets["streetname_short"] = streets["streetname_short"].fillna(streets["voie_short"])
streets

#total: 7452 #old: 1292 #new: 6542 #both old and new: 433


Unnamed: 0,id1791,nom_voie,geometry,unprocessed_voie,voie_long,voie_short,id2022,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,street_short,street_abbr,street_long,source,streetname,streetname_short
0,1162.0,deux Boules,"LINESTRING (651945.401 6862326.085, 651971.801...",rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,"{""coordinates"": [[2.346201988121505, 48.858853...",deux boules,r. des deux boules,rue des deux boules,both,rue des deux boules,deux boules
1,1531.0,Jean Lantier,"LINESTRING (651937.681 6862294.281, 651976.788...",rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,"{""coordinates"": [[2.3473051648434904, 48.85828...",jean lantier,r. jean lantier,rue jean lantier,both,rue jean lantier,jean lantier
2,3.0,Orfèvres,"LINESTRING (651951.576 6862209.676, 651976.788...",rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,"{""coordinates"": [[2.3451033049875294, 48.85822...",orfevres,r. des orfevres,rue des orfevres,both,rue des orfevres,orfevres
3,4.0,mauvaises Paroles,"LINESTRING (651944.783 6862381.819, 652004.994...",rue des mauvaises Paroles,rue des mauvaises paroles,mauvaises paroles,,,,,,,,,1791,rue des mauvaises paroles,mauvaises paroles
4,5.0,Plat d'Etain,"LINESTRING (652023.830 6862377.342, 652048.223...",rue du Plat d'Etain,rue du plat d'etain,plat d'etain,750005668.0,PLAT D'ETAIN,R. du Plat d'Etain,Rue du Plat d'Etain,"{""coordinates"": [[2.346726151545538, 48.859334...",plat d'etain,r. du plat d'etain,rue du plat d'etain,both,rue du plat d'etain,plat d'etain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7447,,,,,,,750005143.0,PETITOT,R. Petitot,Rue Petitot,"{""coordinates"": [[2.393098441596134, 48.876375...",petitot,r. petitot,rue petitot,2022,rue petitot,petitot
7448,,,,,,,750006122.0,PAUL-HENRI GRAUWIN,R. Paul-Henri Grauwin,Rue Paul-Henri Grauwin,"{""coordinates"": [[2.378008624929552, 48.844481...",paul henri grauwin,r. paul henri grauwin,rue paul henri grauwin,2022,rue paul henri grauwin,paul henri grauwin
7449,,,,,,,750006676.0,BF/17,Voie Bf/17,Voie Bf/17,"{""coordinates"": [[2.2817887366556784, 48.87897...",bf/17,voie bf/17,voie bf/17,2022,voie bf/17,bf/17
7450,,,,,,,750003865.0,CHARLES CROS,R. Charles Cros,Rue Charles Cros,"{""coordinates"": [[2.4074192235635143, 48.87642...",charles cros,r. charles cros,rue charles cros,2022,rue charles cros,charles cros


## Create a Prefix Dictionary 
### (with help of Paris Opendata dataset)

In [10]:
#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]

voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st."] =  "saint"
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st"] = "saint"
prefix_dict["ste"] = "sainte"
prefix_dict["sts"] = "saints"
prefix_dict["nve"] = "neuve"

In [11]:
prefix_dict

{'imp.': 'impasse',
 'r.': 'rue',
 'bd.': 'boulevard',
 'av.': 'avenue',
 'pl.': 'place',
 'vla.': 'villa',
 'sq.': 'square',
 'pas.': 'passage',
 'qu.': 'quai',
 'all.': 'allee',
 'rte.': 'route',
 'sout.': 'souterrain',
 'chem.': 'chemin',
 'car.': 'carrefour',
 'ech.': 'echangeur',
 'gal.': 'galerie',
 'espl.': 'esplanade',
 'per.': 'peristyle',
 'crs.': 'cours',
 'port.': 'porte',
 'chau.': 'chaussee',
 'prom.': 'promenade',
 'ham.': 'hameau',
 'terr.': 'terrasse',
 'rpt.': 'rond',
 'ptte.': 'placette',
 'gril.': 'grille',
 'parv.': 'parvis',
 'pr.': 'promenade',
 'sent.': 'sentier',
 'rle.': 'ruelle',
 'c.': 'cour',
 'v.': 'voie',
 'aut.': 'autoroute',
 'arc.': 'arcades',
 'pa.': 'patio',
 'ron.': 'rond',
 'pass.': 'passerelle',
 'jar.': 'jardin',
 'gav.': 'grande',
 'parvis': 'parvis',
 'p.': 'parc',
 'bass.': 'bassin',
 'belv.': 'belvedere',
 'boul.': 'boulevard',
 'boulev.': 'boulevard',
 'boulv.': 'boulevard',
 'q.': 'quai',
 'aven.': 'avenue',
 'faub.': 'faubourg',
 'fau.': '

## Bottin Data

In [12]:
bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", "street")

In [13]:
def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))
# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data before dict substitution: 76549
#abbr. in data after dict substitution: 32964


In [14]:
word_dict = { "n. d.":"notre dame",
             "n. d ": "notre dame", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs"}

# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], word_dict)

In [15]:
Counter([x for x in bottins["street"] if "." in x]).most_common()

[('grenelle saint germ.', 1000),
 ('four saint germ.', 402),
 ('j. j.rousseau', 370),
 ('fanb. saint antoine', 295),
 ('f. saint martin', 289),
 ('saint dominique saint germ.', 282),
 ('fauh. saint martin', 280),
 ('fauh. saint antoine', 269),
 ('fanb. saint martin', 269),
 ('fanb. saint denis', 268),
 ('fauh. saint denis', 254),
 ('f. saint antoine', 248),
 ('neuve des pet. champs', 239),
 ('croix des pet. champs', 226),
 ('grenelle saint g.', 223),
 ('f. saint denis', 214),
 ('n.d. de nazareth', 214),
 ('faubourg. poissonniere', 208),
 ('sainte marguerite saint germ.', 207),
 ('ay. parmentier', 205),
 ('faubourg. saint denis', 196),
 ('j.j. rousseau', 196),
 ('fanb. du temple', 187),
 ('faubourg. saint martin', 181),
 ('boulevard vol. taire', 179),
 ('ay. wagram', 174),
 ("saint germ. l'auxerrois", 168),
 ('fauh. poissonniere', 164),
 ('fauh. saint honore', 162),
 ('paradis poissonn.', 158),
 ('fauh. montmartre', 152),
 ('fanb. poissonniere', 144),
 ('sainte marguerite saint g.', 142

## Group on short streetnames

In [20]:
# group streets based on their short name
grouped_streets = streets.groupby("streetname_short", as_index=False).agg({"id1791": list, "id2022": list, "streetname": list, "geometry": list, "Geometry":list})

In [21]:
# split streets in those that are unique and those that aren't
unique_short_streets = grouped_streets[grouped_streets['streetname'].str.len() == 1]
unique_short_streets[["id1791", "id2022", "streetname"]] = unique_short_streets[["id1791", "id2022", "streetname"]].apply(lambda x: x[0])
multiple_short_streets = grouped_streets[grouped_streets['streetname'].str.len() > 1]

print(f"#streets with unique short streetname: {len(unique_short_streets)}, not unique: {len(multiple_short_streets)}")
multiple_short_streets.head(3)

#streets with unique short streetname: 5475, not unique: 817


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,streetname_short,id1791,id2022,streetname,geometry,Geometry
26,abbaye,"[nan, nan]","[750000427.0, 750005073.0]","[chemin de l'abbaye, rue de l'abbaye]","[None, None]","[{""coordinates"": [[2.234627741073616, 48.86518..."
43,abbesses,"[nan, nan, nan]","[750004135.0, 750004136.0, 750004137.0]","[passage des abbesses, place des abbesses, rue...","[None, None, None]","[{""coordinates"": [[2.33786331798002, 48.884676..."
58,acacias,"[nan, nan]","[750002087.0, 750006300.0]","[passage des acacias, rue des acacias]","[None, None]","[{""coordinates"": [[2.294369320857079, 48.87768..."


# TODO check overlapping geodata streets (above)
multiple_short_streets: same short streetname, check for overlaping geodata in streetname

In [22]:
unique_short_streets.head(3)

Unnamed: 0,streetname_short,id1791,id2022,streetname,geometry,Geometry
0,2 hermites,414.0,,rue des 2 hermites,[LINESTRING (652250.1766987587 6861766.4676413...,[nan]
1,22 novembre 1943,,,,[None],"[{""coordinates"": [[2.2936269461097294, 48.8856..."
2,260 enfants,,,,[None],"[{""coordinates"": [[2.358687508146259, 48.85783..."


## Save preprocessed datasets

In [19]:
bottins.to_pickle("data/bottins_prep.pkl")
streets.to_pickle("data/streets_prep.pkl")
unique_short_streets.to_pickle("data/unique_short_streets.pkl")
multiple_short_streets.to_pickle("data/not_unique_short_streets.pkl")

In [26]:
streets

Unnamed: 0,id1791,nom_voie,geometry,unprocessed_voie,voie_long,voie_short,id2022,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,street_short,street_abbr,street_long,source,streetname,streetname_short
0,1162.0,deux Boules,"LINESTRING (651945.401 6862326.085, 651971.801...",rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,"{""coordinates"": [[2.346201988121505, 48.858853...",deux boules,r. des deux boules,rue des deux boules,both,rue des deux boules,deux boules
1,1531.0,Jean Lantier,"LINESTRING (651937.681 6862294.281, 651976.788...",rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,"{""coordinates"": [[2.3473051648434904, 48.85828...",jean lantier,r. jean lantier,rue jean lantier,both,rue jean lantier,jean lantier
2,3.0,Orfèvres,"LINESTRING (651951.576 6862209.676, 651976.788...",rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,"{""coordinates"": [[2.3451033049875294, 48.85822...",orfevres,r. des orfevres,rue des orfevres,both,rue des orfevres,orfevres
3,4.0,mauvaises Paroles,"LINESTRING (651944.783 6862381.819, 652004.994...",rue des mauvaises Paroles,rue des mauvaises paroles,mauvaises paroles,,,,,,,,,1791,rue des mauvaises paroles,mauvaises paroles
4,5.0,Plat d'Etain,"LINESTRING (652023.830 6862377.342, 652048.223...",rue du Plat d'Etain,rue du plat d'etain,plat d'etain,750005668.0,PLAT D'ETAIN,R. du Plat d'Etain,Rue du Plat d'Etain,"{""coordinates"": [[2.346726151545538, 48.859334...",plat d'etain,r. du plat d'etain,rue du plat d'etain,both,rue du plat d'etain,plat d'etain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7447,,,,,,,750005143.0,PETITOT,R. Petitot,Rue Petitot,"{""coordinates"": [[2.393098441596134, 48.876375...",petitot,r. petitot,rue petitot,2022,rue petitot,petitot
7448,,,,,,,750006122.0,PAUL-HENRI GRAUWIN,R. Paul-Henri Grauwin,Rue Paul-Henri Grauwin,"{""coordinates"": [[2.378008624929552, 48.844481...",paul henri grauwin,r. paul henri grauwin,rue paul henri grauwin,2022,rue paul henri grauwin,paul henri grauwin
7449,,,,,,,750006676.0,BF/17,Voie Bf/17,Voie Bf/17,"{""coordinates"": [[2.2817887366556784, 48.87897...",bf/17,voie bf/17,voie bf/17,2022,voie bf/17,bf/17
7450,,,,,,,750003865.0,CHARLES CROS,R. Charles Cros,Rue Charles Cros,"{""coordinates"": [[2.4074192235635143, 48.87642...",charles cros,r. charles cros,rue charles cros,2022,rue charles cros,charles cros
