# Alignment
## Load libraries and data

In [78]:
import pandas as pd
from collections import Counter

from alignment import align_on_column
from preprocessing import substitute_col_by_dict

In [79]:
# Load the preprocessed inputs: the entries to be aligned (bottins), the street
# reference table, and two lookup tables of short street names (unique vs.
# not unique, going by the file names).
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load files that were produced by this project's own preprocessing step.
bottins = pd.read_pickle("data/bottins_prep.pkl")
streets = pd.read_pickle("data/streets_prep.pkl")
unique_short_s = pd.read_pickle("data/unique_short_streets.pkl")
non_unique_short_s = pd.read_pickle("data/not_unique_short_streets.pkl")

In [80]:
# Preview the first three rows of the street reference table.
streets.head(3)

Unnamed: 0,id1791,nom_voie,unprocessed_voie,voie_long,voie_short,id2022,L_VOIE,L_COURTMIN,L_LONGMIN,street_short,street_abbr,street_long,source,streetname,streetname_short
0,1162.0,deux Boules,rue des deux Boules,rue des deux boules,deux boules,750005865.0,DEUX BOULES,R. des Deux Boules,Rue des Deux Boules,deux boules,r. des deux boules,rue des deux boules,both,rue des deux boules,deux boules
1,1531.0,Jean Lantier,rue Jean Lantier,rue jean lantier,jean lantier,750005921.0,JEAN LANTIER,R. Jean Lantier,Rue Jean Lantier,jean lantier,r. jean lantier,rue jean lantier,both,rue jean lantier,jean lantier
2,3.0,Orfèvres,rue des Orfèvres,rue des orfevres,orfevres,750006633.0,ORFEVRES,R. des Orfèvres,Rue des Orfèvres,orfevres,r. des orfevres,rue des orfevres,both,rue des orfevres,orfevres


## Align data

In [81]:
# First alignment pass: three exact-match stages on the "street" column.
# Every call returns a pair (aligned rows, rows still unmatched); the unmatched
# remainder is handed to the next stage.

# Stage 1: match against the full street names of the reference table.
long_aligned, not_aligned = align_on_column(
    df_not_aligned=bottins,
    df_streets=streets,
    mergeOnLeft="street",
    mergeOnRight="streetname",
    align_method="perfect",
)

# Stage 2: match against the short names in the unique-short-name table.
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    mergeOnLeft="street",
    mergeOnRight="streetname_short",
    align_method="perfect short",
)

# Stage 3: match against the short names in the non-unique-short-name table.
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    mergeOnLeft="street",
    mergeOnRight="streetname_short",
    align_method="perfect short",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on street and streetname, method:perfect
#total aligned: 1217153, newly aligned: 1217153, not aligned: 3269042
Joining on street and streetname_short, method:perfect short
#total aligned: 1053221, newly aligned: 1053221, not aligned: 2215821
Joining on street and streetname_short, method:perfect short
#total aligned: 914125, newly aligned: 914125, not aligned: 1301696


## Substitute words by dictionary

In [82]:
# Manual corrections for frequent street strings that failed exact alignment:
# missing prepositions ("quai valmy"), fused words ("faubourgsaint"), plural
# forms ("boulevards"), and similar variants.
# NOTE(review): the entries look order-dependent. "boulevard" -> "boulevard de"
# is followed by rules that undo the doubled "de de " / "de d'" it can create.
# How keys are matched (whole word vs. substring, sequential vs. simultaneous)
# is decided by substitute_col_by_dict, which is not visible here; confirm
# before reordering or adding entries.
word_dict = {"boulevard": "boulevard de",
                "boulevard de de ": "boulevard de ",
                "boulevard de d'": "boulevard d'",
                "boulevards": "boulevard des",
                "damede": "dame de",
                "damedes": "dame des",
                "faubourgsaint": "faubourg saint",
                "faubourgpoissonniere": "faubourg poissonniere",
                "faubourgdu": "faubourg du",
                "faubourgmontmartre": "faubourg montmartre",
                "quai jemmapes": "quai de jemmapes",
                "boulevards italiens": "boulevard des italiens",
                "villeneuve": "ville neuve",
                "quai valmy": "quai de valmy",
                "avenue wagram": "avenue de wagram",
                "boulevard de montparnasse": "boulevard du montparnasse"
                }

# Apply the corrections to the street strings that are still unaligned.
# The column is overwritten in place.
not_aligned["street"] = substitute_col_by_dict(not_aligned["street"], word_dict)

In [83]:
# Second alignment pass: repeat the three exact-match stages on the corrected
# street strings. Passing the earlier result as df_aligned makes each call
# return the accumulated aligned rows together with the new remainder.

# Stage 1: full street names.
long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    mergeOnLeft="street",
    mergeOnRight="streetname",
    align_method="perfect",
)

# Stage 2: short names from the unique-short-name table.
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    mergeOnLeft="street",
    mergeOnRight="streetname_short",
    align_method="perfect short",
)

# Stage 3: short names from the non-unique-short-name table.
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    mergeOnLeft="street",
    mergeOnRight="streetname_short",
    align_method="perfect short",
)

Joining on street and streetname, method:perfect
#total aligned: 1350888, newly aligned: 133735, not aligned: 1167961
Joining on street and streetname_short, method:perfect short
#total aligned: 1200079, newly aligned: 146858, not aligned: 1021103
Joining on street and streetname_short, method:perfect short
#total aligned: 940384, newly aligned: 26259, not aligned: 994844


In [84]:
# For the 100 most frequent street strings that are still unaligned, print the
# string, its count, and every reference street name that contains the
# string's last word (plain substring test, so "germain" also hits "germaine").
most_common = Counter(not_aligned["street"]).most_common(100)
for rue, occur in most_common:
    # An empty / whitespace-only string or a non-string value (e.g. NaN) has no
    # last word; report it with no candidates instead of failing on split()[-1].
    words = rue.split() if isinstance(rue, str) else []
    if words:
        last = words[-1]
        candidates = [x for x in streets["streetname"] if last in x]
    else:
        candidates = []
    print(rue, occur, candidates, "\n")

grenelle saint germain 9182 ["rue saint germain l'auxerrois", 'rue des fosses saint germain des pres', 'foire saint germain', 'preau de la foire saint germain', "rue des fosses saint germain l'auxerrois", "cloitre saint germain l'auxerrois", 'route de saint germain a paris', "rue des pretres saint germain l'auxerrois", 'rue germaine tailleferre', 'rue du docteur germain see', 'cite germain pilon', 'rue germain pilon', 'rue germaine richier', 'rue germaine krull', 'rue sophie germain', 'place saint germain des pres', 'rue germaine de stael', 'boulevard saint germain', 'rue germaine poinso chapuis'] 

allemagne 6737 [] 

rochechouart 5863 ['rue marguerite de rochechouart', 'boulevard marguerite de rochechouart'] 

avenue d'orleans 5696 ["rue d'orleans", "rue d'orleans", "quay d'orleans", "rue neuve d'orleans", "rue neuve d'orleans", "route d'orleans", "square d'orleans", "les portiques d'orleans", "quai d'orleans", "galerie d'orleans", "villa d'orleans", "avenue de la porte d'orleans"] 


In [85]:
# Spot checks against the street reference table.
# 1) Does any full street name contain "boulevards"?
boulevards_hits = [name for name in streets["streetname"] if "boulevards" in name]
print(boulevards_hits)
# 2) Which rows carry the short name "la fayette"?
la_fayette_rows = streets[streets["streetname_short"] == "la fayette"]
print(la_fayette_rows)

[]
      id1791 nom_voie unprocessed_voie voie_long voie_short       id2022  \
2621     NaN      NaN              NaN       NaN        NaN  750004617.0   

          L_VOIE     L_COURTMIN       L_LONGMIN street_short    street_abbr  \
2621  LA FAYETTE  R. la Fayette  Rue la Fayette   la fayette  r. la fayette   

         street_long source      streetname streetname_short  
2621  rue la fayette   2022  rue la fayette       la fayette  


In [86]:
# Tally the still-unaligned street strings that contain a period and show the
# 100 most frequent ones.
dotted_streets = (street for street in not_aligned["street"] if "." in street)
Counter(dotted_streets).most_common(100)

[('faub.poissonniere', 1394),
 ('grenelle saint germ.', 1014),
 ('faub.dutemple', 872),
 ('faub.montmartre', 765),
 ('four saint germ.', 404),
 ('j. j.rousseau', 371),
 ('fanb. saint martin', 362),
 ('fauh. saint martin', 345),
 ('fanb. saint antoine', 344),
 ('fanb. saint denis', 343),
 ('cloitre n. d', 327),
 ('fauh. saint antoine', 303),
 ('f. saint martin', 302),
 ('fauh. saint denis', 296),
 ('saint dominique saint germ.', 285),
 ('neuve des pet. champs', 263),
 ('f. saint antoine', 258),
 ('saint maur popinc.', 244),
 ('croix des pet. champs', 239),
 ('grenelle saint g.', 223),
 ('f. saint denis', 222),
 ('n.d. de nazareth', 216),
 ('fanb. saint honore', 215),
 ('fanb. poissonniere', 213),
 ('sainte marguerite saint germ.', 209),
 ('ay. parmentier', 205),
 ('faub.. poissonniere', 204),
 ('fanb. du temple', 200),
 ("saint germ. l'auxerrois", 200),
 ('j.j. rousseau', 196),
 ('faub.. saint denis', 190),
 ('fauh. saint honore', 187),
 ('fauh. poissonniere', 183),
 ('faub.. saint mart

### Alignment without spaces

In [87]:
# Build comparison keys with separators removed, so that strings differing only
# in spacing or stray punctuation can be matched exactly. Each key is a regex
# and every match is replaced by the empty string: space, "|", ".", ":" and "'".
# Raw strings are used because "\ ", "\|", "\." and "\:" are invalid escape
# sequences in ordinary string literals (a warning today, an error in a future
# Python version); the resulting patterns are unchanged.
replace_spaces = {r"\ ": "", r"\|": "", r"\.": "", r"\:": "", "'": ""}

# Key for the unaligned entries.
not_aligned["no_spaces"] = not_aligned["street"].replace(replace_spaces, regex=True)
# Matching keys for the full street names and for both short-name tables.
streets["no_spaces_long"] = streets["streetname"].replace(replace_spaces, regex=True)
unique_short_s["no_spaces_short"] = unique_short_s["streetname_short"].replace(replace_spaces, regex=True)
non_unique_short_s["no_spaces_short"] = non_unique_short_s["streetname_short"].replace(replace_spaces, regex=True)

In [88]:
# Preview the first five unaligned rows, including the new "no_spaces" key.
not_aligned.head(5)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,street,no_spaces
0,bpt6k6282019m,144,0,Aaron,bronzes,passage Choiseal,72 et 74.,1855,passage choiseal,passagechoiseal
1,bpt6k6282019m,144,13,Abault et Coudray,charpentiers,Corbeau,23.,1855,corbeau,corbeau
2,bpt6k6282019m,144,14,Abault (Paul),libraire,quai des Angustins,9.,1855,quai des angustins,quaidesangustins
3,bpt6k6282019m,144,15,Abavid,vins,Beaujolais-da-Temple,7.,1855,beaujolais da temple,beaujolaisdatemple
4,bpt6k6282019m,144,16,Abazaer (Are),cristaux et porcelaines,Pei.Ecuries,26.,1855,pei.ecuries,peiecuries


In [89]:
# Third alignment pass: the same three stages, this time joining on the
# separator-stripped keys instead of the original street strings. Results keep
# accumulating through df_aligned.

# Stage 1: stripped full street names.
long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_long",
    align_method="no spaces perfect",
)

# Stage 2: stripped short names from the unique-short-name table.
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_short",
    align_method="no spaces perfect short",
)

# Stage 3: stripped short names from the non-unique-short-name table.
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_short",
    align_method="no spaces perfect short",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on no_spaces and no_spaces_long, method:no spaces perfect
#total aligned: 1396808, newly aligned: 45920, not aligned: 950374
Joining on no_spaces and no_spaces_short, method:no spaces perfect short
#total aligned: 1246571, newly aligned: 46492, not aligned: 904127
Joining on no_spaces and no_spaces_short, method:no spaces perfect short
#total aligned: 962404, newly aligned: 22020, not aligned: 882118


In [90]:
# Inspect the last ten rows aligned through the non-unique short-name table.
nu_short_aligned.tail(10)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,street,streetname_short,id1791,id2022,streetname,align_method,no_spaces,no_spaces_short
885684,bpt6k9780089g,788,97,Bossuat,couronnes funéraires,.Montgallet,23.0,1922,.montgallet,montgallet,"[1172.0, nan]","[750003056.0, 750003055.0]","[rue montgallet, passage montgallet]",no spaces perfect short,montgallet,montgallet
886597,bpt6k9780089g,827,238,Caillat (Edmond),éténisterie,Faub. St-An. toine,24.0,1922,faubourg saint an. toine,faubourg saint antoine,"[1170.0, nan]","[nan, 750003526.0]","[grande rue du faubourg saint antoine, rue du ...",no spaces perfect short,faubourgsaintantoine,faubourgsaintantoine
887326,bpt6k9780089g,858,79,Charpentier & Lemaitre,bar,Faub. St-An. toine,161.0,1922,faubourg saint an. toine,faubourg saint antoine,"[1170.0, nan]","[nan, 750003526.0]","[grande rue du faubourg saint antoine, rue du ...",no spaces perfect short,faubourgsaintantoine,faubourgsaintantoine
890384,bpt6k9780089g,997,208,Blectro-Stock (L'),appareillage électrique,. Coquillière,31.0,1922,. coquilliere,coquilliere,"[273.0, nan]","[750005879.0, 750006048.0]","[rue coquilliere, souterrain coquilliere]",no spaces perfect short,coquilliere,coquilliere
893706,bpt6k9780089g,1149,270,Jamaux (F.),art. de literie,Faub. St-An. toine,283.0,1922,faubourg saint an. toine,faubourg saint antoine,"[1170.0, nan]","[nan, 750003526.0]","[grande rue du faubourg saint antoine, rue du ...",no spaces perfect short,faubourgsaintantoine,faubourgsaintantoine
893835,bpt6k9780089g,1156,228,Jollain (A.),margueterie,Faub. St-Antoi. ne,95.0,1922,faubourg saint antoi. ne,faubourg saint antoine,"[1170.0, nan]","[nan, 750003526.0]","[grande rue du faubourg saint antoine, rue du ...",no spaces perfect short,faubourgsaintantoine,faubourgsaintantoine
894148,bpt6k9780089g,1169,277,Klémann (F.),ameublements,Faub. St-An. toine,62.0,1922,faubourg saint an. toine,faubourg saint antoine,"[1170.0, nan]","[nan, 750003526.0]","[grande rue du faubourg saint antoine, rue du ...",no spaces perfect short,faubourgsaintantoine,faubourgsaintantoine
899450,bpt6k9780089g,1401,92,Pont (Mme),couturière,. Lamarck,145.0,1922,. lamarck,lamarck,"[nan, nan]","[750003262.0, 750003263.0]","[rue lamarck, square lamarck]",no spaces perfect short,lamarck,lamarck
900604,bpt6k9780089g,1448,161,Bofflix,rasoirs et repasseurs de lames,. Rambuteau,14.0,1922,. rambuteau,rambuteau,"[nan, nan, nan]","[750005592.0, 750005591.0, 750005593.0]","[porte rambuteau, passage rambuteau, rue rambu...",no spaces perfect short,rambuteau,rambuteau
901202,bpt6k9780089g,1475,18,Sarrault,cordonnier,. Meslay,60.0,1922,. meslay,meslay,"[nan, nan]","[750006630.0, 750005711.0]","[rue meslay, passage meslay]",no spaces perfect short,meslay,meslay
