# Alignment Functions

#### Importing Libraries

In [1]:
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm

#### Importing Data

In [2]:
bottins = pd.read_csv("data/strict_addressing.csv")
voies = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN", "Geometry", "LENGTH", "Geometry X Y"]]

## conservative

In [3]:
def align_on_column(df_not_aligned, df_streets, df_aligned=pd.DataFrame(), mergeOnLeft="Rue", mergeOnRight=["L_VOIE"]):
    # make copies to avoid alteration of source dataframes
    not_aligned = df_not_aligned.copy()
    streets = df_streets.copy()
    if not df_aligned.empty:
        aligned = df_aligned.copy()
    else:
        aligned = df_aligned
        
    for rkey in mergeOnRight:
        # format to make match possible
        streets[rkey] = streets[rkey].str.lower()
        not_aligned[mergeOnLeft] = not_aligned[mergeOnLeft].str.lower()
        # merge
        merged = not_aligned.merge(streets, how="left", left_on=mergeOnLeft, right_on=rkey, suffixes=(False, False))
        # append newly aligned cases to aligned dataframe
        newly_aligned = merged[merged[rkey].notna()]
        # add column to know what alignment was on
        newly_aligned["aligned_on"] = rkey
        aligned = pd.concat([aligned, newly_aligned])
        
        # update not aligned rows
        not_aligned = merged[merged[rkey].isna()]
        not_aligned = not_aligned.drop(list(streets.columns), axis=1)
        print(f"Joining on {rkey}\n#total aligned: {len(aligned)}, newly aligned: {len(newly_aligned)}, not aligned: {len(not_aligned)}")
    
    return aligned, not_aligned

In [4]:
aligned, not_aligned = align_on_column(bottins, voies, mergeOnRight=["L_VOIE", "L_COURTMIN", "L_LONGMIN"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on L_VOIE
#total aligned: 1933426, newly aligned: 1933426, not aligned: 3191199
Joining on L_COURTMIN
#total aligned: 2545967, newly aligned: 612541, not aligned: 2578666
Joining on L_LONGMIN
#total aligned: 2713723, newly aligned: 167756, not aligned: 2410910


## substituting accents (characters in general)

In [4]:
# function to replace different characters at once in a string, based on a dictionary
def replace_chars_by_dict(string, map_dict):
    #https://www.geeksforgeeks.org/python-replace-different-characters-in-string-at-once/
    return re.compile("|".join(map_dict.keys())).sub(lambda ele: map_dict[re.escape(ele.group(0))], string)

def substitute_on_streets_and_align(df_not_aligned, df_streets, map_dict, df_aligned=pd.DataFrame(), mergeOnLeft="Rue", mergeOnRight=["L_VOIE"]):
    streets = df_streets.copy()
    mergeOnRight_new = list()
    for rkey in mergeOnRight:
        rkey_new = f"{rkey}_new"
        streets[rkey_new] = streets[rkey].apply(replace_chars_by_dict, args=(map_dict,))
        mergeOnRight_new.append(rkey_new)
    
    aligned, not_aligned = align_on_column(df_not_aligned=df_not_aligned, df_streets=streets, df_aligned=df_aligned, mergeOnLeft=mergeOnLeft, mergeOnRight=mergeOnRight_new)
    aligned = aligned.drop(mergeOnRight_new, axis=1)
    return aligned, not_aligned 

In [6]:
dict_accents = {"é": "e", "è": "e", "ê":"e", "à":"a", "â":"a", "ô":"o", "î":"i", "û":"u"}
aligned, not_aligned = substitute_on_streets_and_align(df_not_aligned=not_aligned, df_streets=voies, 
                                                                 map_dict=dict_accents, df_aligned=aligned,
                                                                mergeOnRight=["L_VOIE", "L_COURTMIN", "L_LONGMIN"])

Joining on L_VOIE_new
#total aligned: 2713723, newly aligned: 0, not aligned: 2410910


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on L_COURTMIN_new
#total aligned: 2718069, newly aligned: 4346, not aligned: 2406564
Joining on L_LONGMIN_new
#total aligned: 2718972, newly aligned: 903, not aligned: 2405661


## substituting spaces and .-

In [7]:
not_aligned["no_spaces"] = not_aligned["Rue"].str.replace(".", "", regex=False)
not_aligned["no_spaces"] = not_aligned["no_spaces"].str.replace(" ", "", regex=False)
not_aligned["no_spaces"] = not_aligned["no_spaces"].str.replace("-", "", regex=False)

In [8]:
voies_copy = voies.copy()
voies_copy["no_spaces_long"] = voies_copy["L_LONGMIN"].str.replace(".", "", regex=False)
voies_copy["no_spaces_long"] = voies_copy["no_spaces_long"].str.replace(" ", "", regex=False)
voies_copy["no_spaces_long"] = voies_copy["no_spaces_long"].str.replace("-", "", regex=False)

In [9]:
aligned, not_aligned = align_on_column(not_aligned, voies_copy, aligned, mergeOnLeft="no_spaces", mergeOnRight=["no_spaces_long"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on no_spaces_long
#total aligned: 2747447, newly aligned: 28475, not aligned: 2377194


In [10]:
voies_copy = voies.copy()
voies_copy["no_spaces_court"] = voies_copy["L_COURTMIN"].str.replace(".", "", regex=False)
voies_copy["no_spaces_court"] = voies_copy["no_spaces_court"].str.replace(" ", "", regex=False)
voies_copy["no_spaces_court"] = voies_copy["no_spaces_court"].str.replace("-", "", regex=False)
aligned, not_aligned = align_on_column(not_aligned, voies_copy, aligned, mergeOnLeft="no_spaces", mergeOnRight=["no_spaces_court"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on no_spaces_court
#total aligned: 2856095, newly aligned: 108648, not aligned: 2268694


In [11]:
aligned.tail()

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,N_SQ_VO,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,LENGTH,Geometry X Y,aligned_on,no_spaces,no_spaces_long,no_spaces_court
2377325,bpt6k9780089g,1607,192,Zullig (Arnold),fourreur,r. croix-des-petitschamps,33.0,1922,750005891.0,CROIX DES PETITS CHAMPS,R. Croix des Petits Champs,Rue Croix des Petits Champs,"{""coordinates"": [[2.339184280672447, 48.862218...",395.247666,"48.863930647003784,2.340033235528675",no_spaces_court,rcroixdespetitschamps,,rcroixdespetitschamps
2377326,bpt6k9780089g,1607,194,Zuloaga,peintre-art.,r.caulaincourt,54.0,1922,750006458.0,CAULAINCOURT,R. Caulaincourt,Rue Caulaincourt,"{""coordinates"": [[[2.3295839496506456, 48.8846...",1377.350313,"48.888012381700015,2.335330632931254",no_spaces_court,rcaulaincourt,,rcaulaincourt
2377334,bpt6k9780089g,1607,227,Zusman (S.),meubles,r. joseph-dijon,16.0,1922,750003300.0,JOSEPH DIJON,R. Joseph Dijon,Rue Joseph Dijon,"{""coordinates"": [[2.34789895265875, 48.8936934...",199.038155,"48.893993137898086,2.3466298590464247",no_spaces_court,rjosephdijon,,rjosephdijon
2377336,bpt6k9780089g,1607,233,Zutter,électricité,av. de la grande-armée,85.0,1922,750001383.0,GRANDE ARMEE,Av. de la Grande Armée,Avenue de la Grande Armée,"{""coordinates"": [[[2.294022093593509, 48.87410...",944.223697,"48.87588684710302,2.288398613762114",no_spaces_court,avdelagrandearmée,,avdelagrandearmée
2377337,bpt6k9780089g,1607,235,Zvang,blanchiss.,r. jules-vallès,5.0,1922,750003237.0,JULES VALLES,R. Jules Vallès,Rue Jules Vallès,"{""coordinates"": [[2.385126163672793, 48.852695...",213.882502,"48.853659996235805,2.385053516055706",no_spaces_court,rjulesvallès,,rjulesvallès


## intermediary statistics

In [13]:
#showing that there are still a lot of streets to match
print("streets matched:", len(aligned["Rue"].unique()), "streets not matched:", len(not_aligned["Rue"].unique()))

16203 174827


In [14]:
#a lot of frequent streets are not matched yet!
Counter(not_aligned["Rue"]).most_common(200)

[('st-honoré', 29537),
 ('st-denis', 29477),
 ('st-martin', 25716),
 ('faub.-st-martin', 19295),
 ('faub.-st-antoine', 18336),
 ('faub.-st-denis', 17799),
 ('boul. voltaire', 16675),
 ('st-jacques', 13365),
 ('lafayette', 12842),
 ('boul. sébastopol', 12819),
 ('faub.-st-honoré', 12452),
 ('st-antoine', 12158),
 ('sèvres', 11995),
 ('st-lazare', 11802),
 ('faub.-poissonnière', 11485),
 ('boul. beaumarchais', 11236),
 ('boul. magenta', 11227),
 ('faub.-du-temple', 11038),
 ('boul. st-germain', 10978),
 ('cléry', 9850),
 ('vieille-du-temple', 9038),
 ('boul. haussmann', 8868),
 ('bondy', 8713),
 ('université', 8549),
 ('boul. malesherbes', 8458),
 ('ste-anne', 8297),
 ('ménilmontant', 8098),
 ('faub.-montmartre', 8032),
 ('st-dominique', 7994),
 ('grenelle-st-germain', 7946),
 ('boul. st-michel', 7794),
 ('st-maur', 7391),
 ('boul. de la villette', 6984),
 ('boul. de strasbourg', 6966),
 ('boul. du temple', 6875),
 ('allemagne', 6736),
 ('st-sauveur', 6645),
 ('aumaire', 6354),
 ('boul. 

In [15]:
# this is also the case for streets from 1883 on, when Paris was already remodelled -> old map data won't help
not_aligned.iloc[-1000000] #1883
Counter(not_aligned[-100000:]["Rue"]).most_common(10)

[('boul. voltaire', 1475),
 ('r. st-honoré', 1248),
 ('faub. poissonnière', 1205),
 ('faub. st-antoine', 1152),
 ('faub. st-denis', 1130),
 ('r. st-maur', 1102),
 ('faub. st-martin', 1040),
 ('r. st-denis', 1020),
 ('boul. st-germain', 1012),
 ('r. de flandre', 986)]

In [16]:
#statistics about how often the least frequent streets appear in the not_aligned data
counted_rues = Counter(not_aligned["Rue"])
Counter(counted_rues.values()).most_common(10) #-> 121746 street names appear only once, 19080 only twice etc.

[(1, 121746),
 (2, 19080),
 (3, 8284),
 (4, 4698),
 (5, 3088),
 (6, 2234),
 (7, 1549),
 (8, 1217),
 (9, 1011),
 (10, 854)]

## creating a list of most common prefixes and aligning without them

In [17]:
#get street names of not_aligned data
rue_names = list(counted_rues.keys())

In [19]:
#if there is a space or a "-" in the name, match everything before it (greedy matching)
def match_prefix(street):
    result = re.match(r".*[ -]", street)
    if result:
        return result.group()
    else:
        return ""

In [20]:
match_prefix(rue_names[0])

'passage '

In [21]:
# get all the prefixes with the above defined function
prefixes = [match_prefix(street) for street in rue_names[:17827]] 
Counter(prefixes).most_common(100)

[('', 3208),
 ('st-', 265),
 ('pass. ', 99),
 ('nve-', 91),
 ('et ', 87),
 ('neuve-', 75),
 ('faub.-', 70),
 ('ste-', 64),
 ('boul. ', 60),
 ('passage ', 58),
 ('nve-st-', 54),
 ('place ', 52),
 ('nye-', 49),
 ('faub.-st-', 45),
 ('quai ', 45),
 ('q. ', 42),
 ('petit-', 35),
 ('nve-des-', 33),
 ('pl. ', 32),
 ('saint-', 31),
 ('avenue ', 31),
 ('trois-', 30),
 ('grenelle-st-', 30),
 ('boul. des ', 29),
 ('vieille-', 29),
 ('geoffroy-', 29),
 ('faub. -', 29),
 ('faub. ', 28),
 ('four-st-', 28),
 ('faub. -st-', 27),
 ('impasse ', 27),
 ('nye-st-', 26),
 ('neuve-st-', 26),
 ('boulev. ', 25),
 ('cité ', 23),
 ('st-maur-', 22),
 ('chaussée-', 22),
 ('fossés-', 22),
 ('si-', 22),
 ('vieux-', 21),
 ('st. ', 21),
 ('marché-', 21),
 ('rue ', 21),
 ('imp. ', 21),
 ('grand-', 20),
 ('st ', 20),
 ('paradis-', 19),
 ('q. de ', 19),
 ('pass. du ', 19),
 ('petits-', 19),
 ('bourbon-', 18),
 ('cour ', 18),
 ('deux-', 18),
 ('quai de ', 18),
 ('· ', 18),
 ('grande-', 18),
 ('st-louis-', 17),
 ('faub.-d

In [22]:
# take 300 most common prefixes (appearing down to 5 or 6 times in data)
most_common_prefixes = list(zip(*Counter(prefixes).most_common(300)))[0]
# convert to list and delete the empty entry
most_common_prefixes = list(most_common_prefixes)[1:]
most_common_prefixes

['st-',
 'pass. ',
 'nve-',
 'et ',
 'neuve-',
 'faub.-',
 'ste-',
 'boul. ',
 'passage ',
 'nve-st-',
 'place ',
 'nye-',
 'faub.-st-',
 'quai ',
 'q. ',
 'petit-',
 'nve-des-',
 'pl. ',
 'saint-',
 'avenue ',
 'trois-',
 'grenelle-st-',
 'boul. des ',
 'vieille-',
 'geoffroy-',
 'faub. -',
 'faub. ',
 'four-st-',
 'faub. -st-',
 'impasse ',
 'nye-st-',
 'neuve-st-',
 'boulev. ',
 'cité ',
 'st-maur-',
 'chaussée-',
 'fossés-',
 'si-',
 'vieux-',
 'st. ',
 'marché-',
 'rue ',
 'imp. ',
 'grand-',
 'st ',
 'paradis-',
 'q. de ',
 'pass. du ',
 'petits-',
 'bourbon-',
 'cour ',
 'deux-',
 'quai de ',
 '· ',
 'grande-',
 'st-louis-',
 'faub.-du-',
 'st-germain-',
 'grange-',
 'fontaine-',
 'neuve-des-',
 'boul. de ',
 'q. de la ',
 'jean-',
 'pl. st-',
 'pass. des ',
 'notre-dame-de-',
 'petites-',
 'vieille-du-',
 'st-nicolas-',
 'faubourg-',
 'quai de la ',
 'notre-dame-',
 'quai des ',
 'boul. st-',
 'amandiers-',
 'faub. st-',
 'la ',
 'pierre-',
 'anjou-',
 'faub. - ',
 'q. des ',
 

In [23]:
#creating new column "sans_pref" in not_aligned data, where all prefixes are deleted
not_aligned["sans_pref"] = not_aligned['Rue']
for prefix in tqdm(most_common_prefixes):
    not_aligned["sans_pref"] = not_aligned["sans_pref"].str.replace(prefix, "", regex=False)

100%|████████████████████████████████████████████████████████████████████████████████| 299/299 [10:08<00:00,  2.04s/it]


In [24]:
#aligning on this new column (sans_pref)
aligned, not_aligned = align_on_column(not_aligned, voies, aligned, mergeOnLeft="sans_pref", mergeOnRight=["L_VOIE", "L_COURTMIN", "L_LONGMIN"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on L_VOIE
#total aligned: 3558483, newly aligned: 702388, not aligned: 1948817
Joining on L_COURTMIN
#total aligned: 3558483, newly aligned: 0, not aligned: 1948817
Joining on L_LONGMIN
#total aligned: 3558485, newly aligned: 2, not aligned: 1948815


In [25]:
aligned.tail(50)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,N_SQ_VO,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,LENGTH,Geometry X Y,aligned_on,no_spaces,no_spaces_long,no_spaces_court,sans_pref
2651104,bpt6k9780089g,1606,61,Zelikoswki,meubles,boul. voltaire,30.,1922,750002515.0,voltaire,Cité Voltaire,Cité Voltaire,"{""coordinates"": [[[2.3890787361900028, 48.8524...",276.553915,"48.8527066801208,2.3902033886928113",L_VOIE,boulvoltaire,,,voltaire
2651105,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750003852.0,chapelle,Bd. de la Chapelle,Boulevard de la Chapelle,"{""coordinates"": [[[2.3512555061916722, 48.8837...",2405.956587,"48.884196292924635,2.357063542137758",L_VOIE,bouldelachapelle,,,chapelle
2651106,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750002391.0,chapelle,Sout. Chapelle,Souterrain Chapelle,"{""coordinates"": [[2.3621049852972154, 48.89859...",445.554776,"48.898525303304226,2.359068291678964",L_VOIE,bouldelachapelle,,,chapelle
2651107,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750003853.0,chapelle,Cité de la Chapelle,Cité de la Chapelle,"{""coordinates"": [[2.3597183866608917, 48.88721...",172.899667,"48.88730632922413,2.358730766590166",L_VOIE,bouldelachapelle,,,chapelle
2651108,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750006453.0,chapelle,Imp. de la Chapelle,Impasse de la Chapelle,"{""coordinates"": [[2.359718070687783, 48.892703...",84.194412,"48.892698728117594,2.3591442172030948",L_VOIE,bouldelachapelle,,,chapelle
2651109,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750003854.0,chapelle,Pl. de la Chapelle,Place de la Chapelle,"{""coordinates"": [[2.360791435980116, 48.884530...",253.536635,"48.88480648972644,2.3594297765903955",L_VOIE,bouldelachapelle,,,chapelle
2651110,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750001777.0,chapelle,Av. de la Chapelle,Avenue de la Chapelle,"{""coordinates"": [[[2.286812272456765, 48.88084...",133.54217,"48.88111616566177,2.286533313908099",L_VOIE,bouldelachapelle,,,chapelle
2651111,bpt6k9780089g,1606,63,Zelinsky (L.),tailleur pour dames,boul. de la chapelle,118.,1922,750003855.0,chapelle,R. de la Chapelle,Rue de la Chapelle,"{""coordinates"": [[[2.3598807896913803, 48.8908...",1984.740679,"48.894529443910386,2.359500975638578",L_VOIE,bouldelachapelle,,,chapelle
2651115,bpt6k9780089g,1606,97,Zēmor,publicite,boul. des italiens,8.,1922,750005764.0,italiens,Bd. des Italiens,Boulevard des Italiens,"{""coordinates"": [[[2.339546440002341, 48.87189...",451.371193,"48.87151413382537,2.3373363688341864",L_VOIE,bouldesitaliens,,,italiens
2651116,bpt6k9780089g,1606,97,Zēmor,publicite,boul. des italiens,8.,1922,750004662.0,italiens,R. des Italiens,Rue des Italiens,"{""coordinates"": [[2.335954790381124, 48.871278...",115.898439,"48.87178443346516,2.335875254820215",L_VOIE,bouldesitaliens,,,italiens


In [26]:
not_aligned.head(50)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,no_spaces,sans_pref
0,bpt6k6282019m,144,0,Aaron,bronzes,passage choiseal,72 et 74.,1855,passagechoiseal,choiseal
1,bpt6k6282019m,144,1,Aaron (Mic.),manuf. de porcelaines,bondy,30.,1855,bondy,bondy
2,bpt6k6282019m,144,5,Abadie,tabac et estamin.,ménilmontant,158.,1855,ménilmontant,ménilmontant
3,bpt6k6282019m,144,6,Abanse,instituteur,sts-pères,30.,1855,stspères,pères
4,bpt6k6282019m,144,13,Abault et Coudray,charpentiers,corbeau,23.,1855,corbeau,corbeau
5,bpt6k6282019m,144,14,Abault (Paul),libraire,quai des angustins,9.,1855,quaidesangustins,angustins
6,bpt6k6282019m,144,15,Abavid,vins,beaujolais-da-temple,7.,1855,beaujolaisdatemple,da-temple
7,bpt6k6282019m,144,16,Abazaer (Are),cristaux et porcelaines,pei.ecuries,26.,1855,peiecuries,pei.ecuries
8,bpt6k6282019m,144,17,Abbadie (A.) et Montagnan,draps,neuve-desbons-enfants,1.,1855,neuvedesbonsenfants,desenfants
9,bpt6k6282019m,144,20,Abbal,coiffeur,parc-royal,8.,1855,parcroyal,parc-royal
