# Alignment Functions

#### Importing Libraries

In [1]:
import pandas as pd
import re

#### Importing Data

In [2]:
bottins = pd.read_csv("data/strict_addressing.csv")
voies = pd.read_csv("data/opendata_voie_paris.csv", sep=";")
#keep only columns that might be useful further on
voies = voies[["N_SQ_VO", "L_VOIE", "L_COURTMIN", "L_LONGMIN", "Geometry", "LENGTH", "Geometry X Y"]]

## conservative

In [3]:
def align_on_column(df_not_aligned, df_streets, df_aligned=pd.DataFrame(), mergeOnLeft="Rue", mergeOnRight=["L_VOIE"]):
    # make copies to avoid alteration of source dataframes
    not_aligned = df_not_aligned.copy()
    streets = df_streets.copy()
    if not df_aligned.empty:
        aligned = df_aligned.copy()
    else:
        aligned = df_aligned
        
    for rkey in mergeOnRight:
        # format to make match possible
        streets[rkey] = streets[rkey].str.lower()
        not_aligned[mergeOnLeft] = not_aligned[mergeOnLeft].str.lower()
        # merge
        merged = not_aligned.merge(streets, how="left", left_on=mergeOnLeft, right_on=rkey, suffixes=(False, False))
        # append newly aligned cases to aligned dataframe
        newly_aligned = merged[merged[rkey].notna()]
        # add column to know what alignment was on
        newly_aligned["aligned_on"] = rkey
        aligned = pd.concat([aligned, newly_aligned])
        
        # update not aligned rows
        not_aligned = merged[merged[rkey].isna()]
        not_aligned = not_aligned.drop(list(streets.columns), axis=1)
        print(f"Joining on {rkey}\n#total aligned: {len(aligned)}, newly aligned: {len(newly_aligned)}, not aligned: {len(not_aligned)}")
    
    return aligned, not_aligned

## substituting accents (characters in general)

In [4]:
# function to replace different characters at once in a string, based on a dictionary
def replace_chars_by_dict(string, map_dict):
    #https://www.geeksforgeeks.org/python-replace-different-characters-in-string-at-once/
    return re.compile("|".join(map_dict.keys())).sub(lambda ele: map_dict[re.escape(ele.group(0))], string)

def substitute_on_streets_and_align(df_not_aligned, df_streets, map_dict, df_aligned=pd.DataFrame(), mergeOnLeft="Rue", mergeOnRight=["L_VOIE"]):
    streets = df_streets.copy()
    mergeOnRight_new = list()
    for rkey in mergeOnRight:
        rkey_new = f"{rkey}_new"
        streets[rkey_new] = streets[rkey].apply(replace_chars_by_dict, args=(map_dict,))
        mergeOnRight_new.append(rkey_new)
    
    aligned, not_aligned = align_on_column(df_not_aligned=df_not_aligned, df_streets=streets, df_aligned=df_aligned, mergeOnLeft=mergeOnLeft, mergeOnRight=mergeOnRight_new)
    aligned = aligned.drop(mergeOnRight_new, axis=1)
    return aligned, not_aligned 

In [5]:
aligned, not_aligned = align_on_column(bottins, voies, mergeOnRight=["L_VOIE", "L_COURTMIN", "L_LONGMIN"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on L_VOIE
#total aligned: 1933426, newly aligned: 1933426, not aligned: 3191199
Joining on L_COURTMIN
#total aligned: 2545967, newly aligned: 612541, not aligned: 2578666
Joining on L_LONGMIN
#total aligned: 2713723, newly aligned: 167756, not aligned: 2410910


In [6]:
dict_accents = {"é": "e", "è": "e", "ê":"e", "à":"a", "â":"a", "ô":"o", "î":"i", "û":"u"}
aligned, not_aligned = substitute_on_streets_and_align(df_not_aligned=not_aligned, df_streets=voies, 
                                                                 map_dict=dict_accents, df_aligned=aligned,
                                                                mergeOnRight=["L_VOIE", "L_COURTMIN", "L_LONGMIN"])

Joining on L_VOIE_new
#total aligned: 2713723, newly aligned: 0, not aligned: 2410910


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on L_COURTMIN_new
#total aligned: 2718069, newly aligned: 4346, not aligned: 2406564
Joining on L_LONGMIN_new
#total aligned: 2718972, newly aligned: 903, not aligned: 2405661


## substituting spaces and .-

In [7]:
not_aligned["no_spaces"] = not_aligned["Rue"].str.replace(".", "", regex=False)
not_aligned["no_spaces"] = not_aligned["no_spaces"].str.replace(" ", "", regex=False)
not_aligned["no_spaces"] = not_aligned["no_spaces"].str.replace("-", "", regex=False)

In [8]:
voies_copy = voies.copy()
voies_copy["no_spaces_long"] = voies_copy["L_LONGMIN"].str.replace(".", "", regex=False)
voies_copy["no_spaces_long"] = voies_copy["no_spaces_long"].str.replace(" ", "", regex=False)
voies_copy["no_spaces_long"] = voies_copy["no_spaces_long"].str.replace("-", "", regex=False)

In [9]:
aligned, not_aligned = align_on_column(not_aligned, voies_copy, aligned, mergeOnLeft="no_spaces", mergeOnRight=["no_spaces_long"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on no_spaces_long
#total aligned: 2747447, newly aligned: 28475, not aligned: 2377194


In [10]:
voies_copy = voies.copy()
voies_copy["no_spaces_court"] = voies_copy["L_COURTMIN"].str.replace(".", "", regex=False)
voies_copy["no_spaces_court"] = voies_copy["no_spaces_court"].str.replace(" ", "", regex=False)
voies_copy["no_spaces_court"] = voies_copy["no_spaces_court"].str.replace("-", "", regex=False)
aligned, not_aligned = align_on_column(not_aligned, voies_copy, aligned, mergeOnLeft="no_spaces", mergeOnRight=["no_spaces_court"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["aligned_on"] = rkey


Joining on no_spaces_court
#total aligned: 2856095, newly aligned: 108648, not aligned: 2268694


In [13]:
aligned.tail()

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,N_SQ_VO,L_VOIE,L_COURTMIN,L_LONGMIN,Geometry,LENGTH,Geometry X Y,aligned_on,no_spaces,no_spaces_long,no_spaces_court
2377325,bpt6k9780089g,1607,192,Zullig (Arnold),fourreur,r. croix-des-petitschamps,33.0,1922,750005891.0,CROIX DES PETITS CHAMPS,R. Croix des Petits Champs,Rue Croix des Petits Champs,"{""coordinates"": [[2.339184280672447, 48.862218...",395.247666,"48.863930647003784,2.340033235528675",no_spaces_court,rcroixdespetitschamps,,rcroixdespetitschamps
2377326,bpt6k9780089g,1607,194,Zuloaga,peintre-art.,r.caulaincourt,54.0,1922,750006458.0,CAULAINCOURT,R. Caulaincourt,Rue Caulaincourt,"{""coordinates"": [[[2.3295839496506456, 48.8846...",1377.350313,"48.888012381700015,2.335330632931254",no_spaces_court,rcaulaincourt,,rcaulaincourt
2377334,bpt6k9780089g,1607,227,Zusman (S.),meubles,r. joseph-dijon,16.0,1922,750003300.0,JOSEPH DIJON,R. Joseph Dijon,Rue Joseph Dijon,"{""coordinates"": [[2.34789895265875, 48.8936934...",199.038155,"48.893993137898086,2.3466298590464247",no_spaces_court,rjosephdijon,,rjosephdijon
2377336,bpt6k9780089g,1607,233,Zutter,électricité,av. de la grande-armée,85.0,1922,750001383.0,GRANDE ARMEE,Av. de la Grande Armée,Avenue de la Grande Armée,"{""coordinates"": [[[2.294022093593509, 48.87410...",944.223697,"48.87588684710302,2.288398613762114",no_spaces_court,avdelagrandearmée,,avdelagrandearmée
2377337,bpt6k9780089g,1607,235,Zvang,blanchiss.,r. jules-vallès,5.0,1922,750003237.0,JULES VALLES,R. Jules Vallès,Rue Jules Vallès,"{""coordinates"": [[2.385126163672793, 48.852695...",213.882502,"48.853659996235805,2.385053516055706",no_spaces_court,rjulesvallès,,rjulesvallès


In [12]:
not_aligned

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,no_spaces
0,bpt6k6282019m,144,0,Aaron,bronzes,passage choiseal,72 et 74.,1855,passagechoiseal
1,bpt6k6282019m,144,1,Aaron (Mic.),manuf. de porcelaines,bondy,30.,1855,bondy
2,bpt6k6282019m,144,5,Abadie,tabac et estamin.,ménilmontant,158.,1855,ménilmontant
3,bpt6k6282019m,144,6,Abanse,instituteur,sts-pères,30.,1855,stspères
4,bpt6k6282019m,144,13,Abault et Coudray,charpentiers,corbeau,23.,1855,corbeau
...,...,...,...,...,...,...,...,...,...
2377335,bpt6k9780089g,1607,229,Zussy,coiffeur,r. st-vincent-de-paul,10.,1922,rstvincentdepaul
2377338,bpt6k9780089g,1607,246,Zweiger (Vve),vins-restaur.,r. des hospitalières-st-gervais,12.,1922,rdeshospitalièresstgervais
2377339,bpt6k9780089g,1607,252,Zwilling,déchets de cuir,r. de buffon,67.,1922,rdebuffon
2377340,bpt6k9780089g,1607,254,Zwinger (Henri),agent de fabriques,faub. poissonnière,65.,1922,faubpoissonnière


In [22]:
print(len(aligned["Rue"].unique()), len(not_aligned["Rue"].unique()))


16203 174827


In [35]:
from collections import Counter
Counter(not_aligned["Rue"]).most_common(20)

[('st-honoré', 29537),
 ('st-denis', 29477),
 ('st-martin', 25716),
 ('faub.-st-martin', 19295),
 ('faub.-st-antoine', 18336),
 ('faub.-st-denis', 17799),
 ('boul. voltaire', 16675),
 ('st-jacques', 13365),
 ('lafayette', 12842),
 ('boul. sébastopol', 12819),
 ('faub.-st-honoré', 12452),
 ('st-antoine', 12158),
 ('sèvres', 11995),
 ('st-lazare', 11802),
 ('faub.-poissonnière', 11485),
 ('boul. beaumarchais', 11236),
 ('boul. magenta', 11227),
 ('faub.-du-temple', 11038),
 ('boul. st-germain', 10978),
 ('cléry', 9850)]

In [46]:
not_aligned.iloc[-1000000] #1883
Counter(not_aligned[-100000:]["Rue"]).most_common(100)

[('boul. voltaire', 1475),
 ('r. st-honoré', 1248),
 ('faub. poissonnière', 1205),
 ('faub. st-antoine', 1152),
 ('faub. st-denis', 1130),
 ('r. st-maur', 1102),
 ('faub. st-martin', 1040),
 ('r. st-denis', 1020),
 ('boul. st-germain', 1012),
 ('r. de flandre', 986),
 ('faub. st-honoré', 976),
 ('faub. du temple', 860),
 ('r. st-martin', 852),
 ('boul. haussmann', 847),
 ('r. st-lazare', 832),
 ('boul. sébastopol', 786),
 ('boul. malesherbes', 716),
 ('boul. de la villette', 632),
 ('boul. du montparnasse', 623),
 ('boul. de strasbourg', 610),
 ("r. d'angoulême", 581),
 ('boul. de magenta', 576),
 ('boul. raspail', 563),
 ('faub. montmartre', 557),
 ('r. st-jacques', 545),
 ('boul. de clichy', 534),
 ('boul. beaumarchais', 526),
 ('r. de bondy', 518),
 ("av. d'orléans", 507),
 ('r. de vanves', 497),
 ('boul. st-michel', 491),
 ('av. de st-ouen', 482),
 ('r. st-antoine', 459),
 ('r. st-dominique', 457),
 ('r. caumartin', 448),
 ('boul. diderot', 439),
 ('boul. péreire', 432),
 ('r. st-c