# Alignment
## Load libraries and data

In [36]:
import pandas as pd
import geopandas as gpd
from collections import Counter

from alignment import align_on_column, get_fuzzy_dict
from preprocessing import substitute_col_by_dict, preprocess
import pickle

In [37]:
bottins = pd.read_pickle("data/bottins_prep.pkl")
streets = pd.read_pickle("data/FinalUnique.pkl")
unique_short_s = pd.read_pickle("data/unique_short_streets.pkl")
non_unique_short_s = pd.read_pickle("data/not_unique_short_streets.pkl")

In [38]:
streets.head(3)

Unnamed: 0,rowid,type,name,article,streetname,geometry,streetname_prep,year,buffer,matching,streetname_short,filter,name_prep
932,51359,ALL,ADRIENNE LECOUVREUR,,Allée Adrienne Lecouvreur,"MULTILINESTRING ((255943.171 6250595.050, 2559...",allee adrienne lecouvreur,[2022],"POLYGON ((255822.8359876057 6250854.330169575,...",[51359],All. Adrienne Lecouvreur,True,adrienne lecouvreur
6273,14196,ALL,ALEXANDRE VIALATTE,,Allée Alexandre Vialatte,"LINESTRING (262386.931 6244735.002, 262394.045...",allee alexandre vialatte,[2022],"POLYGON ((262493.678321503 6244660.720869429, ...",[14196],All. Alexandre Vialatte,True,alexandre vialatte
5500,139590,ALL,ALQUIER-DEBROUSSE,,Allée Alquier-Debrousse,"LINESTRING (267885.248 6251328.904, 267898.247...",allee alquier debrousse,[2022],"POLYGON ((267849.993863189 6251423.652873451, ...",[139590],All. Alquier-Debrousse,True,alquier debrousse


## Align data

In [39]:
long_aligned, not_aligned = align_on_column(df_not_aligned = bottins, df_streets = streets, 
                    mergeOnLeft="street", mergeOnRight="streetname_prep", align_method="perfect")
u_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= unique_short_s, 
                    mergeOnLeft="street", mergeOnRight="name_prep", align_method="perfect short")
nu_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= non_unique_short_s, 
                    mergeOnLeft="street", mergeOnRight="name_prep", align_method="perfect short")                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on street and streetname_prep, method:perfect
#total aligned: 1226222, newly aligned: 1226222, not aligned: 3179942
Joining on street and name_prep, method:perfect short
#total aligned: 1225539, newly aligned: 1225539, not aligned: 1954403
Joining on street and name_prep, method:perfect short
#total aligned: 880536, newly aligned: 880536, not aligned: 1073867


## Substitute words by dictionary

In [40]:
word_dict = {"boulevard": "boulevard de",
                "boulevard de de ": "boulevard de ",
                "boulevard de d'": "boulevard d'",
                "boulevards": "boulevard des",
                "damede": "dame de",
                "damedes": "dame des",
                "faubourgsaint": "faubourg saint",
                "faubourgpoissonniere": "faubourg poissonniere",
                "faubourgdu": "faubourg du",
                "faubourgmontmartre": "faubourg montmartre",
                "quai jemmapes": "quai de jemmapes",
                "boulevards italiens": "boulevard des italiens",
                "villeneuve": "ville neuve",
                "quai valmy": "quai de valmy",
                "avenue wagram": "avenue de wagram",
                "boulevard de montparnasse": "boulevard du montparnasse"
                }

# substitute abbreviations
not_aligned["street"] = substitute_col_by_dict(not_aligned["street"], word_dict)

In [41]:
long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="street", mergeOnRight="streetname_prep", 
                    align_method="perfect")
u_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= unique_short_s, 
                    df_aligned= u_short_aligned, mergeOnLeft="street", mergeOnRight="name_prep", 
                    align_method="perfect short")
nu_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= non_unique_short_s, 
                    df_aligned= nu_short_aligned, mergeOnLeft="street", mergeOnRight="name_prep", 
                    align_method="perfect short")                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on street and streetname_prep, method:perfect
#total aligned: 1281614, newly aligned: 55392, not aligned: 1018475
Joining on street and name_prep, method:perfect short
#total aligned: 1252504, newly aligned: 26965, not aligned: 991510
Joining on street and name_prep, method:perfect short
#total aligned: 880536, newly aligned: 0, not aligned: 991510


In [42]:
most_common = Counter(not_aligned["street"]).most_common(100)
for t in most_common:
    rue, occur = t
    last = rue.split()[-1]
    print(rue, occur, [x for x in streets["streetname_prep"] if last in x], "\n")

vieille du temple 10961 ['boulevard du temple', 'rue des fontaines du temple', 'rue des fosses du temple', 'rue du faubourg du temple', 'rue du temple', 'rue vieille du temple'] 

grenelle saint germain 10422 ['boulevard saint germain', 'cite germain pilon', 'passage saint germain le vieux', 'place saint germain des pres', 'rue des fosses saint germain des pres', "rue des fosses saint germain l'auxerrois", "rue des pretres saint germain l'auxerrois", 'rue du docteur germain see', 'rue germain pilon', 'rue germaine de stael', 'rue germaine krull', 'rue germaine poinso chapuis', 'rue germaine richier', 'rue germaine tailleferre', 'rue saint germain des pres', "rue saint germain l'auxerrois", 'rue sophie germain'] 

allemagne 6748 [] 

neuve des petits champs 6188 ['arcades des champs elysees', 'avenue des champs elysees', 'cul de sac du bouquet des champs', 'passage ormeaux grands champs', 'port des champs elysees', 'rond point des champs elysees marcel dassault', 'rue croix des petits c

In [43]:
# to check if something exists in the streets dataset
print([x for x in streets["streetname_prep"] if "boulevards" in x])
print(streets[streets["name_prep"]=="la fayette"])

[]
       rowid type        name article      streetname  \
1585  182043  RUE  LA FAYETTE    None  Rue la Fayette   

                                               geometry streetname_prep  \
1585  MULTILINESTRING ((259753.125 6253359.817, 2598...  rue la fayette   

        year                                             buffer  matching  \
1585  [2022]  POLYGON ((259829.2462873084 6253497.303816207,...  [182043]   

     streetname_short filter   name_prep  
1585    R. la Fayette   True  la fayette  


In [44]:
# get most common in not aligned data
Counter([x for x in not_aligned["street"] if "." in x]).most_common(100)

[('n.d. de nazareth', 216),
 ('j. j. rousseau', 187),
 ("saint germain l'aux.", 176),
 ('gren. saint germain', 140),
 ('rue n.d. de nazareth', 126),
 ('n.d. des victoires', 112),
 ('saint nicolas saint ant.', 106),
 ('f. du temple', 99),
 ('montagne sainte gen.', 94),
 ('faubourg saint ant.', 89),
 ('n. d. de nazareth', 87),
 ('boulevard de se. bastopol', 85),
 ('grenelle saint hon.', 83),
 ('m. le prince', 79),
 ("chaussee. d'antin", 78),
 ('b. beaumarchais', 73),
 ('sainte marguerite saint ant.', 73),
 ("fosses saint germain l'aux.", 71),
 ('cherche. midi', 70),
 ('vaugi. rard', 69),
 ('boulevard de sebas. topol', 69),
 ('rue n.d. des champs', 69),
 ('sts. peres', 67),
 ('grenelle saint h.', 64),
 ("ch. d'antin", 64),
 ('mons. le prince', 63),
 ('n.d. de lorette', 63),
 ('faubourg saint an. toine', 63),
 ('traversiere saint ant.', 62),
 ('bourbon villen.', 61),
 ('nve. des petits champs', 59),
 ('paradis. poissonniere', 58),
 ('boulevard de la vil. lette', 58),
 ('faubourg du. temple

### Alignment without spaces

In [45]:
replace_spaces = {"\ ":"", "\|":"", "\.":"", "\:":"", "\'":""}
not_aligned["no_spaces"] = not_aligned["street"].replace(replace_spaces, regex=True)
streets["no_spaces_long"] = streets["streetname_prep"].replace(replace_spaces, regex=True)
unique_short_s["no_spaces_short"] = unique_short_s["name_prep"].replace(replace_spaces, regex=True)
non_unique_short_s["no_spaces_short"] = non_unique_short_s["name_prep"].replace(replace_spaces, regex=True)

In [46]:
not_aligned.head(5)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,street,no_spaces
0,bpt6k6282019m,144,0,Aaron,bronzes,passage Choiseal,72 et 74.,1855,passage choiseal,passagechoiseal
1,bpt6k6282019m,144,14,Abault (Paul),libraire,quai des Angustins,9.,1855,quai des angustins,quaidesangustins
2,bpt6k6282019m,144,15,Abavid,vins,Beaujolais-da-Temple,7.,1855,beaujolais da temple,beaujolaisdatemple
3,bpt6k6282019m,144,16,Abazaer (Are),cristaux et porcelaines,Pei.Ecuries,26.,1855,pei.ecuries,peiecuries
4,bpt6k6282019m,144,17,Abbadie (A.) et Montagnan,draps,Neuve-desBons-Enfants,1.,1855,neuve desbons enfants,neuvedesbonsenfants


In [47]:
long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="no_spaces", mergeOnRight="no_spaces_long", 
                    align_method="no spaces perfect")
u_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= unique_short_s, 
                    df_aligned = u_short_aligned, mergeOnLeft="no_spaces", mergeOnRight="no_spaces_short",
                    align_method="no spaces perfect short")
nu_short_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= non_unique_short_s, 
                    df_aligned = nu_short_aligned, mergeOnLeft="no_spaces", mergeOnRight="no_spaces_short",
                    align_method="no spaces perfect short")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on no_spaces and no_spaces_long, method:no spaces perfect
#total aligned: 1332707, newly aligned: 51093, not aligned: 940772
Joining on no_spaces and no_spaces_short, method:no spaces perfect short
#total aligned: 1301724, newly aligned: 49220, not aligned: 891960
Joining on no_spaces and no_spaces_short, method:no spaces perfect short
#total aligned: 899920, newly aligned: 19384, not aligned: 872613


In [48]:
nu_short_aligned.tail(4)

Unnamed: 0.1,Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,street,name_prep,streetname,year,rowid,align_method,no_spaces,no_spaces_short
888526,bpt6k9780089g,1448,161,Bofflix,rasoirs et repasseurs de lames,. Rambuteau,14.,1922,. rambuteau,rambuteau,"[Passage Rambuteau, Porte Rambuteau, Rue Rambu...","[[2022], [2022], [2022]]","[60059, 163039, 36486]",no spaces perfect short,rambuteau,rambuteau
889125,bpt6k9780089g,1475,18,Sarrault,cordonnier,. Meslay,60.,1922,. meslay,meslay,"[Passage Meslay, Rue Meslay]","[[2022], [1836, 2022]]","[161625, 2311]",no spaces perfect short,meslay,meslay
889622,bpt6k9780089g,1494,124,Simon & fils,courtiers en marchandises,.. Chevert,22.,1922,.. chevert,chevert,"[Rue Chevert, Rue de Chevert]","[[2022], [1836, 1836]]","[27666, 738]",no spaces perfect short,chevert,chevert
890293,bpt6k9780089g,1533,109,Suzanne (Mme),lingerie p. dames,MarchéSt Honoré,306 et 307.,1922,marchesaint honore,marche saint honore,"[Place du Marché Saint-Honoré, Rue du Marché S...","[[2022], [1836, 1836, 1836, 2022]]","[53268, 2275]",no spaces perfect short,marchesainthonore,marchesainthonore


In [49]:
Counter(u_short_aligned["Métier"]).most_common(50)

[('vins', 73562),
 ('épicier', 35121),
 ('propriétaire', 25119),
 ('médecin', 20332),
 ('tailleur', 18023),
 ('coiffeur', 17186),
 ('boulanger', 16388),
 ('boucher', 15618),
 ('architecte', 13229),
 ('rentier', 12664),
 ('ébéniste', 12569),
 ('serrurier', 11226),
 ('menuisier', 10973),
 ('chaussures', 10201),
 ('horloger', 9622),
 ('peintre-artiste', 9094),
 ('modes', 9046),
 ('limonadier', 8846),
 ('mercerie', 8491),
 ('charcutier', 8067),
 ('pharmacien', 7060),
 ('sage-femme', 6962),
 ('bottier', 6889),
 ('tapissier', 6623),
 ('avocat', 6453),
 ('couturière', 6048),
 ('lingerie', 6014),
 ('peintre-vitrier', 5330),
 ('vins-traiteur', 5225),
 ('tonnelier', 4810),
 ('vins en gros', 4595),
 ('chapelier', 4534),
 ('fleuriste', 4525),
 ('bijoutier en or', 4416),
 ('tabac', 4379),
 ('pâtissier', 4364),
 ('herboriste', 4340),
 ('libraire', 4215),
 ('cordonnier', 3905),
 ('propr.', 3717),
 ('bois et charbons', 3612),
 ('relieur', 3564),
 ('mécanicien', 3517),
 ('papetier', 3490),
 ('restaurat

In [50]:
print("Aligned data so far:", (len(long_aligned)+len(u_short_aligned)+len(nu_short_aligned))/len(bottins))

Aligned data so far: 0.8021378686766992


In [51]:
long_aligned[["Nom", "Métier", "Rue", "Numéro", "year", "streetname"]].head(2)

Unnamed: 0,Nom,Métier,Rue,Numéro,year,streetname
12,Abbatucci (Séverin),député de la Corse,place Vendôme,11-13.,[2022],Place Vendôme
13,Abbatucci # (Th.),maître des requêtes au conseil d'Etat,pl. Vendôme,ll et 13.,[2022],Place Vendôme


In [52]:
long_aligned[["Nom", "Métier", "Rue", "Numéro", "year", "streetname", "no_spaces"]].tail(2)

Unnamed: 0,Nom,Métier,Rue,Numéro,year,streetname,no_spaces
991857,Zuloaga,peintre-art.,r.Caulaincourt,54.0,[2022],Rue Caulaincourt,ruecaulaincourt
991860,Zurconi (P.),fourreur,quai de l'Hôtel-deVille,46.0,[2022],Quai de l'Hôtel de Ville,quaidelhoteldeville


In [53]:
non_unique_short_s[["name_prep", "streetname"]].head(5)

Unnamed: 0,name_prep,streetname
18,abbaye,"[Chemin de l'Abbaye, Rue de l'Abbaye]"
35,abbesses,"[Passage des Abbesses, Place des Abbesses, Rue..."
50,acacias,"[Passage des Acacias, Petite Rue des Acacias]"
73,adrienne,"[Cité Adrienne, Villa Adrienne]"
110,albert bartholome,"[Avenue Albert Bartholomé, Square Albert Barth..."


In [54]:
[x for x in not_aligned["Rue"] if ("5" in x)][10:20]

['52. Lauriston (Marquis de)',
 '54. Petiet',
 '5. Saint-Projet(chev. de)',
 '15',
 '15',
 '56 et q. des Augustins',
 '57. Baudon de Mony',
 '65. Chasteigner (Cte)',
 'Popincourt 5. Genie',
 '105. Lagrange (Cte C. de)']

## fuzzy matching

In [55]:
from fuzzywuzzy import process, fuzz

def simple_processor(token: str) -> str:
    """A string processor to return the same string as input.
        This dummy processor is used to avoid the default processor of the Rapidfuzz module to calculate string similarity.

    Parameters
    ----------
    token : str
        The input string to process.

    
    Returns
    -------
    str
        The output string same as the input string.
    """
    return token

In [57]:
#get a subset of all non-aligned rows, because otherwise computation is too heavy
not_aligned_rues = not_aligned["street"].unique().tolist()
not_aligned_selected = [street for street, _ in Counter(not_aligned["street"].tolist()).most_common(10000)]
#first 100 streets for first analysis
not_aligned_selected100 = not_aligned_rues[:100]

In [58]:
streets_all_vars = streets["streetname_prep"]
#streets_all_vars.append(streets["streetname_short"])
#streets_all_vars.append(streets["streetname_short_prep"])
streets_all_vars = list(set(streets_all_vars))
streets_all_vars[:10]

['porte rambuteau',
 'villa copernic',
 'rue maurice loewy',
 'rue dalou',
 "rue d'aix",
 'cours des marechaux',
 'cour jasmin',
 'cul de sac croix bussiere',
 'rue emile pierre casel',
 'rue de toulouse']

In [59]:
#trying a fuzzy matching for similarity value 85%
for x in not_aligned_selected100:
    best_one = process.extractOne(x, streets_all_vars, processor=simple_processor, scorer=fuzz.ratio,
    score_cutoff=85)
    #if there is a matching street with similarity > 85 %, print it
    if best_one:
        print(x, best_one)

passage choiseal ('passage choiseul', 94)
neuve desbons enfants ('rue neuve des bons enfants', 89)
neuvesaint eustache ('rue neuve saint eustache', 88)
neape saint eustache ('porte saint eustache', 85)
place de l'hotel de ville ("rue de l'hotel de ville", 88)
neuve des petits champs ('rue neuve des petits champs', 92)
place saint germain l'auxerrois ("rue saint germain l'auxerrois", 90)
passage des pet. ecuries ('passage des petites ecuries', 90)
passage de venise ('passage venise', 90)
neuve sainte cathe.rine ('rue neuve sainte catherine', 90)
nre sainte catherine ('rue sainte catherine', 95)
val sainte catherine ('rue sainte catherine', 85)
neuve saint medard ('rue neuve saint medard', 90)
vieille du temple ('rue vieille du temple', 89)
place du vieuxmarche saint martin ('rue du vieux marche saint martin', 89)
neuve bourg l'abbe ("rue neuve bourg l'abbe", 90)
chemin de ronde de la barriere montreuil ('chemin de ronde de la barriere de pantin', 85)
neuve saint augustin ('rue neuve sai

In [60]:
#trying matching with similarity value of 80%
for x in not_aligned_selected100:
    best_one = process.extractOne(x, streets_all_vars, processor=simple_processor, scorer=fuzz.ratio,
    score_cutoff=80)
    # look at those matches between 80 and 90%
    if best_one:
        if best_one[1]<90:
            print(x, best_one)

quai des angustins ('quai des grands augustins', 84)
neuve desbons enfants ('rue neuve des bons enfants', 89)
saint maar popincourt ('rue saint andre popincourt', 81)
neuvesaint eustache ('rue neuve saint eustache', 88)
neape saint eustache ('porte saint eustache', 85)
place de l'hotel de ville ("rue de l'hotel de ville", 88)
orleans saint marcel ('rue saint marcel', 83)
saint maur popincourt ('rue saint andre popincourt', 81)
neave saint denis ('rue neuve saint denis', 84)
aux ours ('rue aux ours', 80)
boulevoie mont parnasse ('boulevard du mont parnasse', 82)
bonlevoie poissonniere ('boulevard poissonniere', 82)
anjou saint honore ('rue saint honore', 82)
cháteau landon ('rue chateau landon', 81)
val sainte catherine ('rue sainte catherine', 85)
vieille du temple ('rue vieille du temple', 89)
place du vieuxmarche saint martin ('rue du vieux marche saint martin', 89)
neuve steustache ('rue neuve saint eustache', 80)
nye saint eustache ('porte saint eustache', 84)
chemin de ronde de la

In [61]:
get_fuzzy_dict(streets_all_vars, not_aligned_selected100, score_cutoff=85)

{"neuve bourg l'abbe": "rue neuve bourg l'abbe",
 'passage de venise': 'passage venise',
 'neuve saint augustin': 'rue neuve saint augustin',
 "place saint germain l'auxerrois": "rue saint germain l'auxerrois",
 'passage des pet. ecuries': 'passage des petites ecuries',
 'place du vieuxmarche saint martin': 'rue du vieux marche saint martin',
 'neuve desbons enfants': 'rue neuve des bons enfants',
 'passage choiseal': 'passage choiseul',
 'vieille du temple': 'rue vieille du temple',
 'neuve saint denis': 'rue neuve saint denis',
 'neuve sainte cathe.rine': 'rue neuve sainte catherine',
 'val sainte catherine': 'rue sainte catherine',
 'chemin de ronde de la barriere montreuil': 'chemin de ronde de la barriere de pantin',
 'neuve des petits champs': 'rue neuve des petits champs',
 'passage delorme': 'passage de rome',
 'boulevard de capucines': 'boulevard des capucines',
 'passage sainte avoye': 'passage sainte avoie',
 'neuvesaint eustache': 'rue neuve saint eustache',
 'neuve saint m

In [69]:
#make two seperate dictionaries, one with score cutoff value 85 and one with 80

# if dictionary with cutoff 85 is already there, load it. If not, compute it
try:
    with open('data/fuzzy_dict10000with85.pkl', 'rb') as f:
        fuzzy_dict85 = pickle.load(f)
except:
    fuzzy_dict85 = get_fuzzy_dict(streets_all_vars, not_aligned_selected, score_cutoff=85)
    with open("data/fuzzy_dict10000with85.pkl","wb") as f:
        pickle.dump(fuzzy_dict85,f)

# same for dictionary with cutoff 80
try:
    with open('data/fuzzy_dict10000with80.pkl', 'rb') as f:
        fuzzy_dict80 = pickle.load(f)
except:
    fuzzy_dict80 = get_fuzzy_dict(streets_all_vars, not_aligned_selected, score_cutoff=80)
    with open("data/fuzzy_dict10000with80.pkl","wb") as f:
        pickle.dump(fuzzy_dict80,f)

In [70]:
print("# entries for cutoff 80:", len(fuzzy_dict80), "cutoff 85:", len(fuzzy_dict85))

# entries for cutoff 80: 4566 cutoff 85: 3015


In [75]:
for item in fuzzy_dict85.items():
    if "." in item[0]:
        print(item)

('sainte croix de la bretonn.', 'rue sainte croix de la bretonnerie')
('boulevard de des bati. gnolles', 'boulevard des batignolles')
('vieil. le du temple', 'rue vieille du temple')
('rue des sts.peres', 'rue des saints peres')
('boulevard de des filles. du calvaire', 'boulevard des filles du calvaire')
('esplan. des invalides', 'place des invalides')
('boulevard de des filles du cal. vaire', 'boulevard des filles du calvaire')
('nre. saint merri', 'rue saint merri')
('nre.des petits champs', 'rue des petits champs')
('nve. des petits champs', 'rue des petits champs')
('nre.saint augustin', 'rue saint augustin')
('boulevard de des filles du. calvaire', 'boulevard des filles du calvaire')
('boulevard de poisson. niere', 'boulevard poissonniere')
('9. de la tournelle', 'rue de la tournelle')
('f. saint antoine', 'rue saint antoine')
('boulevard de richard. lenoir', 'boulevard richard lenoir')
('boulevard de mont. parnasse', 'boulevard du mont parnasse')
('rue ste. croix de la bretonneri

In [71]:
#create a new column in not_aligned dataset and map the fuzzy matched streetnames to the entries
not_aligned["street_fuzzy80"] = not_aligned["street"].map(fuzzy_dict80)
not_aligned["street_fuzzy85"] = not_aligned["street"].map(fuzzy_dict85)

In [72]:
#align on the newly created columns
long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="street_fuzzy85", mergeOnRight="streetname_prep", 
                    align_method="fuzzy 85")

long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="street_fuzzy80", mergeOnRight="streetname_prep", 
                    align_method="fuzzy 80")

Joining on street_fuzzy85 and streetname_prep, method:fuzzy 85
#total aligned: 1552755, newly aligned: 0, not aligned: 652565


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_aligned["align_method"] = align_method


Joining on street_fuzzy80 and streetname_prep, method:fuzzy 80
#total aligned: 1684612, newly aligned: 131857, not aligned: 520708


In [104]:
print("Aligned data:", (len(long_aligned)+len(u_short_aligned)+len(nu_short_aligned))/len(bottins))

Aligned data: 0.8820043920289848


In [73]:
unique_aligned = pd.concat([long_aligned, u_short_aligned])



In [91]:
unique_aligned_selection = unique_aligned[["page", "row", "Nom", "Métier", "Rue", "Numéro", 
        "annee", "streetname", "geometry", "name", "year", "align_method"]]
unique_aligned_selection.to_pickle("data/unique_aligned.pkl")

In [92]:
unique_aligned_selection.tail(10)

Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,streetname,geometry,name,year,align_method
934623,1314,315,Modiano,commissionnaire,Faub. Montmar tre,13.,1922,,,,,no spaces perfect short
934656,1316,109,Joline (Lucien),représentant d'adragante,: Etienne-Dolet,34.,1922,,,,,no spaces perfect short
934715,1318,125,Monier,couleurs,. . Gustave-Doré,4.,1922,,,,,no spaces perfect short
937380,1434,164,Reydet,couleurs,. Poulet,1.,1922,,,,,no spaces perfect short
937537,1441,111,Ripaux,vins et hôtel,. Compans,25 el 27.,1922,,,,,no spaces perfect short
937618,1444,113,Robert,tailleur pour dames,Faub. St-Ho noré,103.,1922,,,,,no spaces perfect short
938882,1497,123,Smil,cordonnier,.. Dutot,65.,1922,,,,,no spaces perfect short
938948,1501,150,Société anonyme française l'Industrie Musicale,instruments de musique,Faub. Pois- sonnière,11.,1922,,,,,no spaces perfect short
939029,1505,207,Société centrale de fournitures industrielles,fournitures générales pour usines,. Philippe-de-Girard,54.,1922,,,,,no spaces perfect short
940689,1587,111,Villain (Mue),mercerie,. Corbeau,3.,1922,,,,,no spaces perfect short


In [93]:
unique_aligned.columns

Index(['Unnamed: 0', 'page', 'row', 'Nom', 'Métier', 'Rue', 'Numéro', 'annee',
       'street', 'rowid', 'type', 'name', 'article', 'streetname', 'geometry',
       'streetname_prep', 'year', 'buffer', 'matching', 'streetname_short',
       'filter', 'name_prep', 'align_method', 'no_spaces', 'no_spaces_long',
       'street_fuzzy80', 'street_fuzzy85', 'no_spaces_short'],
      dtype='object')

In [103]:
unique_aligned_selection.tail(4)

Unnamed: 0,page,row,Nom,Métier,Rue,Numéro,annee,streetname,geometry,name,year,align_method
938882,1497,123,Smil,cordonnier,.. Dutot,65.0,1922,,,,,no spaces perfect short
938948,1501,150,Société anonyme française l'Industrie Musicale,instruments de musique,Faub. Pois- sonnière,11.0,1922,,,,,no spaces perfect short
939029,1505,207,Société centrale de fournitures industrielles,fournitures générales pour usines,. Philippe-de-Girard,54.0,1922,,,,,no spaces perfect short
940689,1587,111,Villain (Mue),mercerie,. Corbeau,3.0,1922,,,,,no spaces perfect short
