In [180]:
import geopandas as gpd
import pandas as pd
import numpy as np
import re
from collections import Counter
from tqdm import tqdm

# Street Data

## Preprocessing (from other Notebook)

In [239]:
voies = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN"]]

#preprocessing
def preprocess(df, column, map_dict):
    data = df.copy()
    new_colname = f"{column}_prep"
    data[new_colname] = data[column].str.lower()
    data[new_colname] = data[new_colname].replace(map_dict, regex=True)
    return data

map_dict = {"é": "e", "è": "e", "ê":"e", "à":"a", "â":"a", "ô":"o", "î":"i", "û":"u", "ç":"c", 
                "\-":" ", "\_":" ", "' ":"'", "  ":" "}

#apply preprocessing to voies
voies = preprocess(voies, "L_VOIE", map_dict)
voies = preprocess(voies, "L_COURTMIN", map_dict)
voies = preprocess(voies, "L_LONGMIN", map_dict)

#add column to remember data source
voies["source"] = "2022"

In [240]:
old = gpd.read_file("data/1791.zip", encoding='utf-8')
old = old.drop(["geometry"], axis=1)
# drop rows that don't have street names
old = old.dropna(subset=["nom_voie"])
# replace None values by string ""
old = old.fillna("")
old["voie"] = old["type_voie"] + " " + old["particule"] + " " + old["nom_voie"]
old = old.drop(["type_voie", "particule"], axis=1)
old = preprocess(old, "voie", map_dict)
old = preprocess(old, "nom_voie", map_dict)

#add column to remember data source
old["source"] = "1791"

In [241]:
streets = old.merge(voies, how='outer', left_on="voie_prep", right_on="L_LONGMIN_prep")
print("#total:", len(streets), "#old:", len(old), "#new:", len(voies), "#both old and new:", len(streets.dropna()))
streets["streetname"] = streets["voie_prep"].fillna(streets["L_LONGMIN_prep"])
# add a column to know where the data came from
streets["source"] = streets["source_x"]
streets.loc[streets["source_y"].notna(), "source"] = streets["source_y"]
streets.loc[(streets["source_y"].notna() & streets["source_x"].notna()), "source"] = "both"
streets = streets.drop(["source_x", "source_y"], axis=1)
# add a column with short streetnames
streets["streetname_short"] = streets["L_VOIE_prep"]
streets["streetname_short"] = streets["streetname_short"].fillna(streets["nom_voie_prep"])
streets

#total: 7452 #old: 1292 #new: 6542 #both old and new: 433


Unnamed: 0,id,nom_voie,voie,voie_prep,nom_voie_prep,N_SQ_VO,L_VOIE,L_COURTMIN,L_LONGMIN,L_VOIE_prep,L_COURTMIN_prep,L_LONGMIN_prep,streetname,source,streetname_short
0,1162.0,deux Boules,rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,deux boules,r. des deux boules,rue des deux boules,rue des deux boules,both,deux boules
1,1531.0,Jean Lantier,rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,jean lantier,r. jean lantier,rue jean lantier,rue jean lantier,both,jean lantier
2,3.0,Orfèvres,rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,orfevres,r. des orfevres,rue des orfevres,rue des orfevres,both,orfevres
3,4.0,mauvaises Paroles,rue des mauvaises Paroles,rue des mauvaises paroles,mauvaises paroles,,,,,,,,rue des mauvaises paroles,1791,mauvaises paroles
4,5.0,Plat d'Etain,rue du Plat d'Etain,rue du plat d'etain,plat d'etain,750005668.0,PLAT D'ETAIN,R. du Plat d'Etain,Rue du Plat d'Etain,plat d'etain,r. du plat d'etain,rue du plat d'etain,rue du plat d'etain,both,plat d'etain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7447,,,,,,750005143.0,PETITOT,R. Petitot,Rue Petitot,petitot,r. petitot,rue petitot,rue petitot,2022,petitot
7448,,,,,,750006122.0,PAUL-HENRI GRAUWIN,R. Paul-Henri Grauwin,Rue Paul-Henri Grauwin,paul henri grauwin,r. paul henri grauwin,rue paul henri grauwin,rue paul henri grauwin,2022,paul henri grauwin
7449,,,,,,750006676.0,BF/17,Voie Bf/17,Voie Bf/17,bf/17,voie bf/17,voie bf/17,voie bf/17,2022,bf/17
7450,,,,,,750003865.0,CHARLES CROS,R. Charles Cros,Rue Charles Cros,charles cros,r. charles cros,rue charles cros,rue charles cros,2022,charles cros


# Bottin Data
## Preprocessing

In [229]:
#compute prefix dictionary with help of voies data
def get_prefix(row, court, long):
    return row[long].split(row[court])[0]

voies["prefix_court"] = voies.apply(get_prefix, args=("L_VOIE_prep", "L_COURTMIN_prep"), axis=1)
voies["prefix_long"] = voies.apply(get_prefix, args=("L_VOIE_prep", "L_LONGMIN_prep"), axis=1)
prefix_dict = dict(zip(voies["prefix_court"], voies["prefix_long"]))
# only get prefixes with . in it
pfx_select = {key.split(" ")[0]:value.split(" ")[0] for key,value in prefix_dict.items() if "." in key}

pfx_select["boul."] = "boulevard"
pfx_select["boulev."] = "boulevard"
pfx_select["boulv."] = "boulevard"
pfx_select["q."] = "quai"
pfx_select["aven."] = "avenue"
pfx_select["faub."] = "faubourg"
pfx_select["fau."] = "faubourg"
pfx_select["st."] =  "saint"
pfx_select["impas."] = "impasse"
pfx_select["l'aub."] = "l'auberge"
pfx_select["laub"] = "l'auberge"
pfx_select["st"] = "saint"
pfx_select["ste"] = "sainte"
pfx_select["sts"] = "saints"
pfx_select["nve"] = "neuve"


In [230]:
# load bottin data and preprocess
# change as many abbreviations as possible

bottins = pd.read_csv("data/strict_addressing.csv")
bottins["Rue"] = bottins["Rue"].fillna("")
bottins = preprocess(bottins, "Rue", map_dict)

def get_abbreviation(street):
    # returns streets which include a "."
    try:
        if "." in street:
            return street
    except:
        print(street)
      
# get number of abbreviations
print("#abbr. in data before dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["Rue_prep"])])))
# substitute abbreviations
bottins["Rue_no_abbr"] = bottins["Rue_prep"].str.split().apply(lambda x: ' '.join([pfx_select.get(e, e) for e in x]))
# see how many abbreviations are left
print("#abbr. in data after dict substitution:", len(set([get_abbreviation(street) for street in list(bottins["Rue_no_abbr"])])))

#abbr. in data before dict substitution: 76549
#abbr. in data after dict substitution: 37916


## Align data

In [231]:
def align_on_column(df_not_aligned, df_streets, df_aligned=pd.DataFrame(), mergeOnLeft="Rue", mergeOnRight=["L_VOIE"]):
    # make copies to avoid alteration of source dataframes
    not_aligned = df_not_aligned.copy()
    streets = df_streets.copy()
    if not df_aligned.empty:
        aligned = df_aligned.copy()
    else:
        aligned = df_aligned
        
    for rkey in mergeOnRight:
        # format to make match possible
        streets[rkey] = streets[rkey].str.lower()
        not_aligned[mergeOnLeft] = not_aligned[mergeOnLeft].str.lower()
        # merge
        merged = not_aligned.merge(streets, how="left", left_on=mergeOnLeft, right_on=rkey, suffixes=(False, False))
        # append newly aligned cases to aligned dataframe
        newly_aligned = merged[merged[rkey].notna()]
        # add column to know what alignment was on
        newly_aligned["aligned_on"] = rkey
        aligned = pd.concat([aligned, newly_aligned])
        
        # update not aligned rows
        not_aligned = merged[merged[rkey].isna()]
        not_aligned = not_aligned.drop(list(streets.columns), axis=1)
        print(f"Joining on {rkey}\n#total aligned: {len(aligned)}, newly aligned: {len(newly_aligned)}, not aligned: {len(not_aligned)}")
    
    return aligned, not_aligned

In [242]:
long_aligned, not_aligned = align_on_column(bottins, streets, mergeOnLeft="Rue_no_abbr", mergeOnRight=["streetname"])
short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    mergeOnLeft="Rue_no_abbr", mergeOnRight=["streetname_short"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on streetname
#total aligned: 1271276, newly aligned: 1271276, not aligned: 3212434
Joining on streetname_short
#total aligned: 3765026, newly aligned: 3765026, not aligned: 1137368


## Substitute words by dictionary

In [245]:
#handle "notre dame" differently (other method cannot handle spaces)

def substitute_words_by_dict(row, word_dict):
    # dict of form {"correct word": ["misspelled/abbreviated words"]}
    for correct, substitute_list in word_dict.items():
        for incorrect in substitute_list:
            if incorrect in row:
                row = row.replace(incorrect, correct)
    return row

word_dict = {"notre dame": ["n. d.", "n. d "], 
                "notre da": ["n. da"],
                "boulevard de": ["boulevard"],
                "boulevard de ": ["boulevard de de "],
                "boulevard d'": ["boulevard de d'"],
                "la fayette": ["lafayette"],
                "jean jacques r": ["j. j. r"], 
                "saint martin": ["stmartin"]}

# substitute abbreviations
not_aligned["Rue_no_abbr"] = not_aligned["Rue_no_abbr"].apply(substitute_words_by_dict, args=(word_dict,))

In [246]:
long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="Rue_no_abbr", mergeOnRight=["streetname"])
short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = short_aligned, mergeOnLeft="Rue_no_abbr", mergeOnRight=["streetname_short"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on streetname
#total aligned: 1323607, newly aligned: 52331, not aligned: 1085037
Joining on streetname_short
#total aligned: 3807992, newly aligned: 42966, not aligned: 1048035


In [259]:
most_common = Counter(not_aligned["Rue_no_abbr"]).most_common(200)
for t in most_common:
    rue, _ = t
    last = rue.split()[-1]
    print(last, [x for x in streets["streetname"] if last in x], "\n")

germain ["rue saint germain l'auxerrois", 'rue des fosses saint germain des pres', 'foire saint germain', 'preau de la foire saint germain', "rue des fosses saint germain l'auxerrois", "cloitre saint germain l'auxerrois", 'route de saint germain a paris', "rue des pretres saint germain l'auxerrois", 'rue germaine tailleferre', 'rue du docteur germain see', 'cite germain pilon', 'rue germain pilon', 'rue germaine richier', 'rue germaine krull', 'rue sophie germain', 'place saint germain des pres', 'rue germaine de stael', 'boulevard saint germain', 'rue germaine poinso chapuis'] 

allemagne [] 

montparnasse ['boulevard du montparnasse', 'rue du montparnasse', 'souterrain maine montparnasse'] 

rochechouart ['rue marguerite de rochechouart', 'boulevard marguerite de rochechouart'] 

d'orleans ["rue d'orleans", "rue d'orleans", "quay d'orleans", "rue neuve d'orleans", "rue neuve d'orleans", "route d'orleans", "square d'orleans", "les portiques d'orleans", "quai d'orleans", "galerie d'orl

In [260]:
most_common

[('grenelle saint germain', 8446),
 ('allemagne', 6737),
 ('boulevard de montparnasse', 6119),
 ('rochechouart', 5863),
 ("avenue d'orleans", 5696),
 ('quai jemmapes', 5620),
 ('rue de flandre', 5072),
 ('paradis poissonniere', 4544),
 ('saint louis marais', 4218),
 ('bourbon villeneuve', 3833),
 ('boulevard de la gare', 3829),
 ('saint maur popincourt', 3823),
 ('quai valmy', 3672),
 ('michodiere', 3599),
 ('grenetat', 3493),
 ('boulevard de rochechouart', 3381),
 ('four saint germain', 3340),
 ('petit carreau', 2866),
 ("rue d'allemagne", 2845),
 ('vanves', 2797),
 ('albouy', 2766),
 ('avenue wagram', 2708),
 ('grenelle saint honore', 2571),
 ('phelippeaux', 2552),
 ('la harpe', 2513),
 ('fontaine saint georges', 2485),
 ('cite trevise', 2459),
 ('marais saint martin', 2408),
 ('chaussee d’antin', 2367),
 ('aux ours', 2359),
 ('angouleme du temple', 2355),
 ('rue de vanves', 2341),
 ('faubourg stdenis', 2153),
 ('breda', 2151),
 ('corbeau', 2138),
 ('commerce grenelle', 2104),
 ('lux

In [263]:
[x for x in streets["streetname"] if "ville neuve" in x]

['rue de bourbon ville neuve', 'rue de la ville neuve']

In [249]:
streets[streets["streetname_short"]=="la fayette"]

Unnamed: 0,id,nom_voie,voie,voie_prep,nom_voie_prep,N_SQ_VO,L_VOIE,L_COURTMIN,L_LONGMIN,L_VOIE_prep,L_COURTMIN_prep,L_LONGMIN_prep,streetname,source,streetname_short
2621,,,,,,750004617.0,LA FAYETTE,R. la Fayette,Rue la Fayette,la fayette,r. la fayette,rue la fayette,rue la fayette,2022,la fayette


In [250]:
set([x for x in not_aligned["Rue_no_abbr"] if "j. j." in x])

{'31. henner (j. j.)',
 'baude (baron j. j.)',
 'faubourg saint honore. 234. perraud (j. j.)',
 'j. j. . rousseau',
 'j. j. 1 rousseau',
 'j. j. ?ousseau',
 'j. j. bousseau',
 'j. j. hoursseau',
 'j. j. kous. seau',
 'j. j. kousseau',
 'j. j. liousseau',
 'j. j. pousseau',
 'j. j. | rousseau',
 'j. j." rousseau',
 'j. j.. foussean',
 'j. j.. roussean',
 'j. j.. rousseau',
 'j. j... rousseau',
 'j. j.bosseau',
 'j. j.housseau',
 'j. j.ronsseau',
 'j. j.ropssean',
 'j. j.rosseau',
 'j. j.rouse.',
 'j. j.rousscau',
 'j. j.roussean',
 'j. j.rousseau',
 'j. j.sseau',
 'j. j.· rousseau',
 'le frotter de la garenne (chevalier ch. j. j.)',
 'le frotter de la garenno (chevalier ch. j. j.)',
 'le frotter dela garemne (chevalier ch. j. j.)',
 'oberkampf. bienayme (j. j.)',
 'r.j. j.roussean',
 'r.j. j.rousseau',
 'rue j. j. 1 rousseau',
 'rue j. j. bousseau',
 'rue j. j. louw. 46.',
 'rue j. j. pousseau',
 'rue j. j.. rousseau',
 'rue j. j.bousseau',
 'rue j. j.pousseau',
 'rue j. j.roussean',
 '