# Preprocessing

## Libraries

In [1]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

## Street data

#### 2022 data

In [2]:
voies_raw = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies_raw.copy()[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN"]]
voies.rename(columns = {'N_SQ_VO':'id2022'}, inplace = True)

#apply preprocessing to voies
voies = preprocess(voies, "L_VOIE", new_colname="street_short")
voies = preprocess(voies, "L_COURTMIN", new_colname="street_abbr")
voies = preprocess(voies, "L_LONGMIN", new_colname="street_long")

#### 1791 data

In [3]:
old = gpd.read_file("data/1791.zip", encoding='utf-8')
old = old.drop(["geometry"], axis=1)
# drop rows that don't have street names
old = old.dropna(subset=["nom_voie"])
old.rename(columns = {'id':'id1791'}, inplace = True)
# replace None values by string ""
old = old.fillna("")
# create column with one long streetname
old["unprocessed_voie"] = old["type_voie"] + " " + old["particule"] + " " + old["nom_voie"]
old = old.drop(["type_voie", "particule"], axis=1)
old = preprocess(old, "unprocessed_voie", new_colname="voie_long")
old = preprocess(old, "nom_voie", new_colname="voie_short")

In [4]:
streets = old.merge(voies, how='outer', left_on="voie_long", right_on="street_long")
print("#total:", len(streets), "#old:", len(old), "#new:", len(voies), "#both old and new:", len(streets.dropna()))

# add a column to know where the data came from
streets["source"] = "1791"
streets.loc[streets["id2022"].notna(), "source"] = "2022"
streets.loc[(streets["id1791"].notna() & streets["id2022"].notna()), "source"] = "both"

# add both a column with (common) long and short streetnames
streets["streetname"] = streets["voie_long"].fillna(streets["street_long"])
streets["streetname_short"] = streets["street_short"]
streets["streetname_short"] = streets["streetname_short"].fillna(streets["voie_short"])
streets

#total: 7452 #old: 1292 #new: 6542 #both old and new: 433


Unnamed: 0,id1791,nom_voie,unprocessed_voie,voie_long,voie_short,id2022,L_VOIE,L_COURTMIN,L_LONGMIN,street_short,street_abbr,street_long,source,streetname,streetname_short
0,1162.0,deux Boules,rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,deux boules,r. des deux boules,rue des deux boules,both,rue des deux boules,deux boules
1,1531.0,Jean Lantier,rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,jean lantier,r. jean lantier,rue jean lantier,both,rue jean lantier,jean lantier
2,3.0,Orfèvres,rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,orfevres,r. des orfevres,rue des orfevres,both,rue des orfevres,orfevres
3,4.0,mauvaises Paroles,rue des mauvaises Paroles,rue des mauvaises paroles,mauvaises paroles,,,,,,,,1791,rue des mauvaises paroles,mauvaises paroles
4,5.0,Plat d'Etain,rue du Plat d'Etain,rue du plat d'etain,plat d'etain,750005668.0,PLAT D'ETAIN,R. du Plat d'Etain,Rue du Plat d'Etain,plat d'etain,r. du plat d'etain,rue du plat d'etain,both,rue du plat d'etain,plat d'etain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7447,,,,,,750005143.0,PETITOT,R. Petitot,Rue Petitot,petitot,r. petitot,rue petitot,2022,rue petitot,petitot
7448,,,,,,750006122.0,PAUL-HENRI GRAUWIN,R. Paul-Henri Grauwin,Rue Paul-Henri Grauwin,paul henri grauwin,r. paul henri grauwin,rue paul henri grauwin,2022,rue paul henri grauwin,paul henri grauwin
7449,,,,,,750006676.0,BF/17,Voie Bf/17,Voie Bf/17,bf/17,voie bf/17,voie bf/17,2022,voie bf/17,bf/17
7450,,,,,,750003865.0,CHARLES CROS,R. Charles Cros,Rue Charles Cros,charles cros,r. charles cros,rue charles cros,2022,rue charles cros,charles cros


## Create a Prefix Dictionary 
### (with help of Paris Opendata dataset)

In [5]:
#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]

voies["prefix_court"] = voies.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("street_short", "street_long"), axis=1)
prefix_candidates = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st."] =  "saint"
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st"] = "saint"
prefix_dict["ste"] = "sainte"
prefix_dict["sts"] = "saints"
prefix_dict["nve"] = "neuve"

## Bottin Data

In [6]:
bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", "street")

In [7]:
def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))
# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data before dict substitution: 76549
#abbr. in data after dict substitution: 37916


In [16]:
word_dict = { "n. d.":"notre dame",
             "n. d ": "notre dame", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs"}

# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], word_dict)

In [17]:
Counter([x for x in bottins["street"] if "." in x]).most_common()

[('faub.poissonniere', 1394),
 ('grenelle saint germ.', 1014),
 ('faub.dutemple', 872),
 ('faub.montmartre', 765),
 ('four saint germ.', 404),
 ('j. j.rousseau', 371),
 ('fanb. saint martin', 362),
 ('fauh. saint martin', 345),
 ('fanb. saint antoine', 344),
 ('fanb. saint denis', 343),
 ('cloitre n. d', 327),
 ('fauh. saint antoine', 303),
 ('f. saint martin', 302),
 ('fauh. saint denis', 296),
 ('saint dominique saint germ.', 285),
 ('neuve des pet. champs', 263),
 ('f. saint antoine', 258),
 ('saint maur popinc.', 244),
 ('croix des pet. champs', 239),
 ('grenelle saint g.', 223),
 ('f. saint denis', 222),
 ('n.d. de nazareth', 216),
 ('fanb. saint honore', 215),
 ('fanb. poissonniere', 213),
 ('sainte marguerite saint germ.', 209),
 ('ay. parmentier', 205),
 ('faub.. poissonniere', 204),
 ('fanb. du temple', 200),
 ("saint germ. l'auxerrois", 200),
 ('j.j. rousseau', 196),
 ('faub.. saint denis', 190),
 ('fauh. saint honore', 187),
 ('fauh. poissonniere', 183),
 ('faub.. saint mart

## Group on short streetnames

In [20]:
# group streets based on their short name
grouped_streets = streets.groupby("streetname_short", as_index=False).agg({"id1791": list, "id2022": list, "streetname": list})

In [31]:
# split streets in those that are unique and those that aren't
unique_short_streets = grouped_streets[grouped_streets['streetname'].str.len() == 1]
unique_short_streets[["id1791", "id2022", "streetname"]] = unique_short_streets[["id1791", "id2022", "streetname"]].apply(lambda x: x[0])
multiple_short_streets = grouped_streets[grouped_streets['streetname'].str.len() > 1]

print(f"#streets with unique short streetname: {len(unique_short_streets)}, not unique: {len(multiple_short_streets)}")
multiple_short_streets.head(3)

#streets with unique short streetname: 5475, not unique: 817


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,streetname_short,id1791,id2022,streetname
26,abbaye,"[nan, nan]","[750000427.0, 750005073.0]","[chemin de l'abbaye, rue de l'abbaye]"
43,abbesses,"[nan, nan, nan]","[750004135.0, 750004136.0, 750004137.0]","[passage des abbesses, place des abbesses, rue..."
58,acacias,"[nan, nan]","[750002087.0, 750006300.0]","[passage des acacias, rue des acacias]"


In [32]:
unique_short_streets.head(3)

Unnamed: 0,streetname_short,id1791,id2022,streetname
0,2 hermites,414.0,,rue des 2 hermites
1,22 novembre 1943,,,
2,260 enfants,,,


## Save preprocessed datasets

In [33]:
bottins.to_pickle("data/bottins_prep.pkl")
streets.to_pickle("data/streets_prep.pkl")
unique_short_streets.to_pickle("data/unique_short_streets.pkl")
multiple_short_streets.to_pickle("data/not_unique_short_streets.pkl")