# Preprocessing

## Libraries

In [1]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from preprocessing import preprocess, substitute_col_by_dict

import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

## Street data

#### 2022 data

In [2]:
voies = pd.read_pickle('data/all.pkl')

#apply preprocessing to voies
voies = preprocess(voies, "voie", new_colname="street_short")
voies = preprocess(voies, "court", new_colname="street_abbr")
voies = preprocess(voies, "long", new_colname="street_long")

In [3]:
voies = voies[['voie_origin','street_long', 'street_abbr', 'street_short','type','prefix','voie','geometry','centroid','length']]
voies.fillna('')
voies = gpd.GeoDataFrame(voies)
voies

Unnamed: 0,voie_origin,street_long,street_abbr,street_short,type,prefix,voie,geometry,centroid,length
0,both_dup,rue de bourgogne,r. de bourgogne,bourgogne,RUE,DE,BOURGOGNE,"LINESTRING (258069.893 6251198.938, 258055.700...",POINT (257997.1935295925 6250805.128055042),526.555511
1,both_dup,rue de paradis,r. de paradis,paradis,RUE,DE,PARADIS,"LINESTRING (262201.246 6253568.943, 261987.169...",POINT (261805.15289758582 6253696.406718903),548.404392
2,both_dup,rue des rosiers,r. des rosiers,rosiers,RUE,DES,ROSIERS,"LINESTRING (262860.427 6250498.429, 262805.718...",POINT (262670.0479667235 6250643.545885425),315.81374
3,both_dup,rue du renard,r. du renard,renard,RUE,DU,RENARD,"LINESTRING (261761.419 6250714.489, 261772.617...",POINT (261848.03797914274 6250954.008558707),335.100184
4,both_dup,avenue de breteuil,av. de breteuil,breteuil,AV,DE,BRETEUIL,"MULTILINESTRING ((257355.179 6248936.845, 2573...",POINT (257355.1096247492 6249338.670018175),1640.881061
...,...,...,...,...,...,...,...,...,...,...
7607,old_only,quay de la rapee,,rapee,quay,,Rapée,"LINESTRING (263495.393 6248758.551, 263644.526...",POINT (264233.78363613127 6247918.108184653),
7608,old_only,quay de bercy,,bercy,quay,,Bercy,"LINESTRING (264907.034 6247021.082, 264974.916...",POINT (265623.83601906087 6246215.960157622),
7609,old_only,quay des galleries du louvre,,galleries du louvre,quay,,Galleries du Louvre,"LINESTRING (260061.338 6251037.306, 259778.305...",POINT (259740.86549458705 6251137.881114531),
7610,old_only,quay des thuileries,,thuileries,quay,,Thuileries,"LINESTRING (259419.213 6251234.635, 258831.636...",POINT (258851.61092649246 6251522.205990931),


## Create a Prefix Dictionary 
### (with help of Paris Opendata dataset)

In [4]:

#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]


for_dict = voies[voies['voie_origin'].isin(['new_only', 'both', 'both_dup'])]
for_dict = for_dict.dropna(subset=["street_short", "street_abbr","street_long"])

for_dict["prefix_court"] = for_dict.apply(get_prefix, args=("street_short", "street_abbr"), axis=1)
for_dict["prefix_long"] = for_dict.apply(get_prefix, args=("street_short", "street_long"), axis=1)
prefix_candidates = dict(zip(for_dict["prefix_court"], for_dict["prefix_long"]))
# only get prefixes with . in it
prefix_dict = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_candidates.items() if "." in key}

prefix_dict["boul."] = "boulevard"
prefix_dict["boulev."] = "boulevard"
prefix_dict["boulv."] = "boulevard"
prefix_dict["q."] = "quai"
prefix_dict["aven."] = "avenue"
prefix_dict["faub."] = "faubourg"
prefix_dict["fau."] = "faubourg"
prefix_dict["st."] =  "saint"
prefix_dict["impas."] = "impasse"
prefix_dict["l'aub."] = "l'auberge"
prefix_dict["laub"] = "l'auberge"
prefix_dict["st"] = "saint"
prefix_dict["ste"] = "sainte"
prefix_dict["sts"] = "saints"
prefix_dict["nve"] = "neuve"

## Bottin Data

In [5]:
bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", "street")

In [6]:
def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))
# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], prefix_dict)
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data before dict substitution: 76549
#abbr. in data after dict substitution: 37868


In [36]:
word_dict = { "n. d.":"notre dame",
             "n. d ": "notre dame", 
                "n. da": "notre da",
                "lafayette": "la fayette",
                "j. j. r": "jean jacques r", 
                "stmartin": "saint martin",
                "stdenis": "saint denis",
                "stmichel": "saint michel",
                "dutemple": "du temple",
                "faub.st": "faubourg saint",
                "faub.du ": "faubourg du ",
                "sthonore": "saint honore",
                "st.honore": "saint honore",
                "denazareth": "de nazareth",
                "stgermain": "saint germain",
                "saint g. ": "saint germain",
                "mar| tin": "martin",
                "dame de": "damede ",
                "petitesecuries": "petites ecuries",
                "faub ": "faubourg",
                "petitschamps": "petits champs",
                "saint germ.": "saint germain",
                "faub.montmartre": "faubourg montmartre",
                "faub.poissonniere": "faubourg poissonniere",
                "j.j. rousseau": "jean jaques rousseau",
                "j. j.rousseau": "jean jaques rousseau",
                "fanb. ": "faubourg",
                "fauh. ": "faubourg",
                "faub.. ": "faubourg",
                "faab. " : "faubourg",
                "f. saint ": "faubourg saint",
                "pet. champs": "petits champs",
                "ay.": "avenue",
                "av.de": "avenue de ",
                "r.du ": "rue du ",
                "carref. ": "carrefour ",
                "f. poissoniere": "faubourg poissoniere",
                "saint g.": "saint germain",
                "faub).": "faubourg",
                "taub. ": "faubourg",
                "vol. taire": "voltaire",
                "faubourg. ": "faubourg",
                "fd. poissonniere": "faubourg poissoniere",
                "stras. bourg": "strassbourg",
                "saintgerm.": "saint germain",
                "montmart.": "montmartre",
                "bouley. ": "boulevard",
                "bouly.": "boulevard",
                "pois. sonniere": "poissonniere",
                "pe. tits": "petits",
                "boul. ": "boulevard ",
                "poissonn.": "poissonniere",
                "f. poissonniere": "faubourg poissonniere",
                "f. montmartre": "faubourg montmartre",
                "fb. ": "faubourg",
                "haub. ": "faubourg",
                "j. j. pousseau": "jean jaques rousseau",
                "houl. ": "boulevard ",
                "montmar. tre": "montmartre",
                "r.de": "rue de",
                "r.des": "rue des",
                "pass.du": "passage du",
                "boul.": "boulevard ",
                "faub.": "faubourg ",
                "laub. ": "faubourg ",
                "faub.": "faubourg ",
                "faub..": "faubourg ",
                "fauh.": "faubourg ",
                "fauh.. ": "faubourg ",
                "faul).": "faubourg",
                "r.st": "rue saint",
                "alle. magne": "allemagne",
                "faub.st.": "faubourg saint",
                "r.": "rue ",
                "faubourgdu.": "faubourg du ",
                "ams. terdam": "amsterdam",
                "boul.de ": "boulevard de ",
                "j j. rousseau": "jean jaques rousseau",
                "j. j.. rousseau": "jean jaques rousseau",
                "j. j rousseau": "jean jaques rousseau",
                "faul.": "faubourg ",
                "av.": "avenue ",
                "montm.": "montmartre",
                "mont. martre": "montmartre",
                "petites.": "petites ",
                "petits.": "petits",
                "hauss. mann": "haussmann",
                "tem. ple": "temple",
                "boul.": "boulevard ",
                "faub..du": "faubourg du ",
                "riche. lieu": "richelieu",
                "b. bonne nouvelle": "boulevard bonne nouvelle",
                "b. bonne. nouvelle": "boulevard bonne nouvelle",
                "b. bonne nouv.": "boulevard bonne nouvelle",
                "chauss.": "chaussee ",
                "che. min": "chemin",
                "impass. ": "impasse",

                
                }

# substitute abbreviations
bottins["street"] = substitute_col_by_dict(bottins["street"], word_dict)
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["street"])])))

#abbr. in data after dict substitution: 32727


In [37]:
Counter([x for x in bottins["street"] if "." in x]).most_common()

[('saint maur popinc.', 244),
 ('n.d. de nazareth', 216),
 ("saint germain l'aux.", 176),
 ('gren. saint germain', 140),
 ('rue n.d. de nazareth', 126),
 ('n.d. des victoires', 112),
 ('saint nicolas saint ant.', 106),
 ('f. du temple', 99),
 ("saint germain l'auxerr.", 95),
 ('montagne sainte gen.', 94),
 ('saint maur pop.', 88),
 ('grenelle saint hon.', 83),
 ('faubourg saint ant.', 79),
 ('m. le prince', 79),
 ('boulevard se. bastopol', 79),
 ('montagne sainte genev.', 78),
 ("chaussee. d'antin", 78),
 ('anc. comedie', 77),
 ('b. beaumarchais', 73),
 ('sainte marguerite saint ant.', 73),
 ("fosses saint germain l'aux.", 71),
 ('lavandieres sainte opp.', 71),
 ('cherche. midi', 70),
 ('vaugi. rard', 69),
 ('rue n.d. des champs', 69),
 ('sts. peres', 67),
 ('grenelle saint h.', 64),
 ("ch. d'antin", 64),
 ('boulevard sebas. topol', 64),
 ('mons. le prince', 63),
 ('n.d. de lorette', 63),
 ('traversiere saint ant.', 62),
 ('bourbon villen.', 61),
 ('faubourg saint an. toine', 61),
 ('n

#TODO hier noch letztes Preprocessing, wo mehrere Leerzeichen zu einem gemacht werden

## Group on short streetnames

In [20]:
# group streets based on their short name
grouped_streets = streets.groupby("streetname_short", as_index=False).agg({"id1791": list, "id2022": list, "streetname": list})

In [31]:
# split streets in those that are unique and those that aren't
unique_short_streets = grouped_streets[grouped_streets['streetname'].str.len() == 1]
unique_short_streets[["id1791", "id2022", "streetname"]] = unique_short_streets[["id1791", "id2022", "streetname"]].apply(lambda x: x[0])
multiple_short_streets = grouped_streets[grouped_streets['streetname'].str.len() > 1]

print(f"#streets with unique short streetname: {len(unique_short_streets)}, not unique: {len(multiple_short_streets)}")
multiple_short_streets.head(3)

#streets with unique short streetname: 5475, not unique: 817


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,streetname_short,id1791,id2022,streetname
26,abbaye,"[nan, nan]","[750000427.0, 750005073.0]","[chemin de l'abbaye, rue de l'abbaye]"
43,abbesses,"[nan, nan, nan]","[750004135.0, 750004136.0, 750004137.0]","[passage des abbesses, place des abbesses, rue..."
58,acacias,"[nan, nan]","[750002087.0, 750006300.0]","[passage des acacias, rue des acacias]"


In [32]:
unique_short_streets.head(3)

Unnamed: 0,streetname_short,id1791,id2022,streetname
0,2 hermites,414.0,,rue des 2 hermites
1,22 novembre 1943,,,
2,260 enfants,,,


## Save preprocessed datasets

In [33]:
bottins.to_pickle("data/bottins_prep.pkl")
streets.to_pickle("data/streets_prep.pkl")
unique_short_streets.to_pickle("data/unique_short_streets.pkl")
multiple_short_streets.to_pickle("data/not_unique_short_streets.pkl")