# Alignment
## Load libraries and data

In [None]:
import pandas as pd
import geopandas as gpd
from collections import Counter
import pickle
from matplotlib import pyplot as plt

from alignment import align_on_column, get_fuzzy_dict
from preprocessing import substitute_col_by_dict

In [None]:
# Set to True to work on the dataset variant that carries profession tags
# (preprocessed with parts of Ravi's code). The flag selects which Bottin pickle
# is loaded and which columns / output file are used when exporting the result.
USE_TAGGED_DATASET = False

In [None]:
# Choose the preprocessed Bottin pickle that matches the tagged/untagged switch.
bottins_path = (
    "data/bottins_tagged_prep.pkl" if USE_TAGGED_DATASET else "data/bottins_prep.pkl"
)
bottins = pd.read_pickle(bottins_path)

# Street reference tables: the full table plus two short-name tables
# (unique / not unique, going by the file names).
streets = pd.read_pickle("data/FinalUnique.pkl")
unique_short_s = pd.read_pickle("data/unique_short_streets.pkl")
non_unique_short_s = pd.read_pickle("data/not_unique_short_streets.pkl")

In [None]:
# Preview the first three rows of the loaded `bottins` frame.
bottins.head(3)

## Align data

In [None]:
# First pass: align on the processed street string, first against the full
# street names, then against the unique and the non-unique short names.
# Each call returns the aligned rows plus a second frame that is passed on as
# `df_not_aligned` to the next call (presumably the rows still unmatched).
on_long_name = dict(
    mergeOnLeft="rue_processed",
    mergeOnRight="streetname_prep",
    align_method="perfect",
)
on_short_name = dict(
    mergeOnLeft="rue_processed",
    mergeOnRight="name_prep",
    align_method="perfect short",
)

long_aligned, not_aligned = align_on_column(
    df_not_aligned=bottins, df_streets=streets, **on_long_name
)
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned, df_streets=unique_short_s, **on_short_name
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned, df_streets=non_unique_short_s, **on_short_name
)

## Substitute words by dictionary

In [None]:
# Manual corrections for spellings found among the still-unaligned rows:
# missing particles ("boulevard" -> "boulevard de", "quai valmy" -> "quai de valmy"),
# fused words ("faubourgsaint" -> "faubourg saint") and a few individual street names.
# NOTE(review): the entries look order-dependent (e.g. "boulevard" -> "boulevard de"
# is followed by clean-up rules for "boulevard de de " and "boulevard de d'").
# How substitute_col_by_dict applies them (order, substring vs. whole word) is not
# visible here; if it replaces plain substrings, strings such as "boulevard du ..."
# would become "boulevard de du ..." -- confirm against the helper.
word_dict = {"boulevard": "boulevard de",
                "boulevard de de ": "boulevard de ",
                "boulevard de d'": "boulevard d'",
                "boulevards": "boulevard des",
                "damede": "dame de",
                "damedes": "dame des",
                "faubourgsaint": "faubourg saint",
                "faubourgpoissonniere": "faubourg poissonniere",
                "faubourgdu": "faubourg du",
                "faubourgmontmartre": "faubourg montmartre",
                "quai jemmapes": "quai de jemmapes",
                "boulevards italiens": "boulevard des italiens",
                "villeneuve": "ville neuve",
                "quai valmy": "quai de valmy",
                "avenue wagram": "avenue de wagram",
                "boulevard de montparnasse": "boulevard du montparnasse"
                }

# Apply the corrections to the processed street strings of the unaligned rows only;
# rows aligned in the first pass are left untouched.
not_aligned["rue_processed"] = substitute_col_by_dict(not_aligned["rue_processed"], word_dict)

In [None]:
# Second exact pass after the dictionary substitution. The previously aligned
# frames are handed in via `df_aligned` and the returned frames replace them.
on_long_name = dict(
    mergeOnLeft="rue_processed",
    mergeOnRight="streetname_prep",
    align_method="perfect",
)
on_short_name = dict(
    mergeOnLeft="rue_processed",
    mergeOnRight="name_prep",
    align_method="perfect short",
)

long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    **on_long_name,
)
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    **on_short_name,
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    **on_short_name,
)

In [None]:
# For the 100 most frequent still-unaligned street strings, print the string,
# its number of occurrences and every reference street name that contains the
# string's last whitespace-separated word (a quick way to spot near misses).
most_common = Counter(not_aligned["rue_processed"]).most_common(100)
# Materialise the reference column once instead of re-reading it per iteration.
reference_names = streets["streetname_prep"].tolist()
for rue, occur in most_common:
    words = rue.split()
    # An empty or whitespace-only string has no last word; indexing [-1] would
    # raise IndexError, so such entries are printed without candidates.
    candidates = [x for x in reference_names if words[-1] in x] if words else []
    print(rue, occur, candidates, "\n")

In [None]:
# Ad-hoc lookup: list every reference street name that contains a given
# substring (here "boulevards") to check whether it exists in the streets dataset.
print([x for x in streets["streetname_prep"] if "boulevards" in x])
# Exact-match variant, kept for reference.
# NOTE(review): in the visible cells "name_prep" is only used on the short-street
# tables, not on `streets` -- confirm the column exists before enabling this line.
#print(streets[streets["name_prep"]=="la fayette"])

In [None]:
# The 100 most frequent still-unaligned street strings that contain a period.
with_period = (rue for rue in not_aligned["rue_processed"] if "." in rue)
Counter(with_period).most_common(100)

### Alignment without spaces

In [None]:
# Regex patterns (-> replacement) that strip spaces, pipes, periods, colons and
# apostrophes, so that strings differing only in these characters compare equal.
# Raw strings keep the backslashes as regex escapes without relying on Python's
# deprecated handling of invalid escape sequences in normal string literals;
# the resulting pattern strings are identical to the previous ones.
replace_spaces = {r"\ ": "", r"\|": "", r"\.": "", r"\:": "", "'": ""}

# Add the stripped variant as an extra column to the unaligned rows and to all
# three street reference tables; the original columns are left unchanged.
not_aligned["no_spaces"] = not_aligned["rue_processed"].replace(replace_spaces, regex=True)
streets["no_spaces_long"] = streets["streetname_prep"].replace(replace_spaces, regex=True)
unique_short_s["no_spaces_short"] = unique_short_s["name_prep"].replace(replace_spaces, regex=True)
non_unique_short_s["no_spaces_short"] = non_unique_short_s["name_prep"].replace(replace_spaces, regex=True)

In [None]:
# Preview the first five unaligned rows, now including the "no_spaces" column.
not_aligned.head(5)

In [None]:
# Third pass: align on the variants with spaces/punctuation removed, again in
# the order full names -> unique short names -> non-unique short names.
on_long_name = dict(
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_long",
    align_method="no spaces perfect",
)
on_short_name = dict(
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_short",
    align_method="no spaces perfect short",
)

long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    **on_long_name,
)
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    **on_short_name,
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    **on_short_name,
)

In [None]:
# Show the last four rows of the non-unique-short-name alignment.
nu_short_aligned.tail(4)

In [None]:
# Number of rows in the three aligned frames divided by the number of rows in
# `bottins`, i.e. the share of data aligned by the exact and no-spaces passes.
print("Aligned data so far:", (len(long_aligned)+len(u_short_aligned)+len(nu_short_aligned))/len(bottins))

In [None]:
# First two rows aligned on full street names: Bottin fields next to the matched street name.
long_aligned[["nom", "metier", "rue", "numero", "year", "streetname"]].head(2)

In [None]:
# Last two rows of the full-name alignment, including the "no_spaces" key column.
long_aligned[["nom", "metier", "rue", "numero", "year", "streetname", "no_spaces"]].tail(2)

In [None]:
# First five non-unique short names together with their full street names.
non_unique_short_s[["name_prep", "streetname"]].head(5)

In [None]:
# Entries 10-19 of the unaligned raw "rue" values that contain the digit 5.
# NOTE(review): assumes every "rue" value is a string; a missing (NaN) value
# would make the `in` test raise TypeError -- confirm there are none.
[x for x in not_aligned["rue"] if ("5" in x)][10:20]

## Fuzzy matching

In [None]:
from fuzzywuzzy import process, fuzz

def simple_processor(token: str) -> str:
    """Return ``token`` unchanged (identity string processor).

    Passed as ``processor=`` to the fuzzy-matching calls in this notebook so
    that strings are scored exactly as given instead of being run through the
    matching library's default preprocessing.

    Parameters
    ----------
    token : str
        String handed over by the matcher.

    Returns
    -------
    str
        The very same string, untouched.
    """
    return token

In [None]:
#get a subset of all non-aligned rows, because otherwise computation is too heavy
not_aligned_rues = not_aligned["rue_processed"].unique().tolist()
not_aligned_selected = [street for street, _ in Counter(not_aligned["rue_processed"].tolist()).most_common(10000)]
#first 100 streets for first analysis
not_aligned_selected100 = not_aligned_rues[:100]

In [None]:
# Candidate list for fuzzy matching: the distinct preprocessed full street names.
# De-duplicate while keeping the order of first appearance. The previous
# list(set(...)) produced an order that can change between interpreter sessions
# (string hashing is randomised), which makes any order-dependent behaviour of
# the matcher -- presumably tie-breaking between equally scored candidates --
# and the cached fuzzy dictionaries non-reproducible.
# Short-name variants ("streetname_short", "streetname_short_prep") are
# currently not added to the candidates.
streets_all_vars = streets["streetname_prep"].drop_duplicates().tolist()
streets_all_vars[:10]

In [None]:
#trying a fuzzy matching for similarity value 85%
for x in not_aligned_selected100:
    best_one = process.extractOne(x, streets_all_vars, processor=simple_processor, scorer=fuzz.ratio,
    score_cutoff=85)
    #if there is a matching street with similarity > 85 %, print it
    if best_one:
        print(x, best_one)

In [None]:
#trying matching with similarity value of 80%
for x in not_aligned_selected100:
    best_one = process.extractOne(x, streets_all_vars, processor=simple_processor, scorer=fuzz.ratio,
    score_cutoff=80)
    # look at those matches between 80 and 90%
    if best_one:
        if best_one[1]<90:
            print(x, best_one)

In [None]:
# Run get_fuzzy_dict on the 100-string subset with cutoff 85 and display its result.
get_fuzzy_dict(streets_all_vars, not_aligned_selected100, score_cutoff=85)

In [None]:
# Build two separate dictionaries, one with score cutoff 85 and one with 80.
# Each is loaded from its pickle cache if present and computed (then cached) otherwise.

def _load_or_build_fuzzy_dict(cache_path, score_cutoff):
    """Return the fuzzy dictionary for ``score_cutoff``, using a pickle cache.

    Parameters
    ----------
    cache_path : str
        Location of the pickle file used as cache.
    score_cutoff : int
        Cutoff forwarded to ``get_fuzzy_dict`` when the cache must be built.

    Returns
    -------
    object
        Whatever ``get_fuzzy_dict`` returned (either just now or when the
        cache file was written).

    Notes
    -----
    The cache is keyed by file name only: an existing file is used even if
    ``streets_all_vars`` or ``not_aligned_selected`` have changed since it was
    written. Delete the file to force a rebuild.
    """
    try:
        # NOTE: pickle must only be used on trusted files; this is a local cache.
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Only a missing or unreadable cache triggers the expensive rebuild;
        # anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        fuzzy_dict = get_fuzzy_dict(streets_all_vars, not_aligned_selected, score_cutoff=score_cutoff)
        with open(cache_path, "wb") as f:
            pickle.dump(fuzzy_dict, f)
        return fuzzy_dict


fuzzy_dict85 = _load_or_build_fuzzy_dict("data/fuzzy_dict10000with85.pkl", 85)
fuzzy_dict80 = _load_or_build_fuzzy_dict("data/fuzzy_dict10000with80.pkl", 80)

In [None]:
# Compare how many entries each cutoff produced.
print("# entries for cutoff 80:", len(fuzzy_dict80), "cutoff 85:", len(fuzzy_dict85))

In [None]:
# Show the cutoff-85 entries whose key contains a period.
for street, match in fuzzy_dict85.items():
    if "." not in street:
        continue
    print((street, match))

In [None]:
#create a new column in not_aligned dataset and map the fuzzy matched streetnames to the entries
not_aligned["street_fuzzy80"] = not_aligned["rue_processed"].map(fuzzy_dict80)
not_aligned["street_fuzzy85"] = not_aligned["rue_processed"].map(fuzzy_dict85)

In [None]:
#align on the newly created columns
long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="street_fuzzy85", mergeOnRight="streetname_prep", 
                    align_method="fuzzy 85")

long_aligned, not_aligned = align_on_column(df_not_aligned = not_aligned, df_streets= streets, 
                    df_aligned = long_aligned, mergeOnLeft="street_fuzzy80", mergeOnRight="streetname_prep", 
                    align_method="fuzzy 80")

In [None]:
# Share of aligned rows after all passes, relative to the number of rows in `bottins`.
print("Aligned data:", (len(long_aligned)+len(u_short_aligned)+len(nu_short_aligned))/len(bottins))

In [None]:
# Combine the full-name and unique-short-name alignments; the rows aligned on
# non-unique short names are left out of this frame.
unique_aligned = pd.concat([long_aligned, u_short_aligned])

In [None]:
# Columns shared by both exports. The tagged export appends "tags" and has no
# "page" column; the untagged export puts "page" first.
shared_columns = [
    "row", "nom", "metier", "rue", "numero", "annee",
    "streetname", "geometry", "name", "year", "align_method",
]
if USE_TAGGED_DATASET:
    export_columns = shared_columns + ["tags"]
    export_path = "data/unique_aligned_tagged.pkl"
else:
    export_columns = ["page"] + shared_columns
    export_path = "data/unique_aligned.pkl"

unique_aligned_selection = unique_aligned[export_columns]
unique_aligned_selection.to_pickle(export_path)

# Alignment Assessment

### Get relevant data

In [None]:
# Stack all three alignment results and keep only the columns needed for the
# assessment.
assessment_columns = [
    "page", "row", "nom", "metier", "rue", "numero",
    "annee", "streetname", "geometry", "name", "year", "align_method",
]
all_aligned = pd.concat([long_aligned, u_short_aligned, nu_short_aligned])[assessment_columns]

# All three street reference tables in one frame.
all_streets = pd.concat([streets, unique_short_s, non_unique_short_s])

## Overview of the alignment

In [None]:
# Alignment methods in the order in which they were applied.
methods = ["perfect", "perfect short", "no spaces perfect", "no spaces perfect short", "fuzzy 85", "fuzzy 80"]
# Number of aligned rows contributed by each method.
dev_aligned = {
    method: len(all_aligned[all_aligned["align_method"] == method])
    for method in methods
}

In [None]:
# Bar chart of the number of aligned rows per method; the bars follow the
# insertion order of `dev_aligned`, i.e. the order in which the methods were applied.
plt.bar(dev_aligned.keys(), dev_aligned.values())
# Rotate the method names so the long labels do not overlap.
plt.xticks(rotation = 90)
plt.title("Number of alignment entries per method\n(alignment in order from left to right)")

## Quality of Alignment

In [None]:
def print_sample(df, align_methods, sample_size, random_state=42):
    """Print a random sample of aligned rows for manual quality inspection.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``rue``, ``streetname`` and ``align_method``.
    align_methods : list of str
        Only rows whose ``align_method`` is one of these values are sampled.
    sample_size : int
        Number of rows to print. If fewer rows match, all matching rows are
        printed (in sampled order) instead of raising ``ValueError``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the sample is reproducible.

    Returns
    -------
    None
        The sample is printed as numbered lines, one per row.
    """
    candidates = df[df["align_method"].isin(align_methods)]
    # DataFrame.sample (without replacement) refuses n larger than the number
    # of rows, so cap the request at what is actually available.
    n_rows = min(sample_size, len(candidates))
    picked = candidates.sample(n=n_rows, random_state=random_state)
    rows = zip(picked["rue"], picked["streetname"], picked["align_method"])
    for position, (rue, streetname, method) in enumerate(rows, start=1):
        print(f"{position}. bottin: {rue}  -  matched: {streetname}   ({method})")

### Perfect Alignment
Based on a manual check of the sample printed by the code below, the alignment is correct in **100%** of the cases.

However, the alignment on the short street names is by nature sometimes ambiguous, because only the short name was used in the Bottin data.

In [None]:
# Print a seeded random sample of 100 rows aligned by the exact-match methods for manual review.
print_sample(all_aligned, ["perfect", "perfect short"], 100)

### Alignment without spaces
The alignment of the samples below is correct in **100%** of the cases.

In [None]:
# Print a seeded random sample of 100 rows aligned by the no-spaces methods for manual review.
print_sample(all_aligned, ["no spaces perfect", "no spaces perfect short"], 100)

### Fuzzy alignment
#### Alignment with threshold 85 (similarity score of at least 85%)

Correct: **80%**

Unclear if correct: **5%**
- 6 (two possibilities), 
- 11 ("avenue" matched with "rue" -> no avenue in data),
- 15 ("royale" matched with "rue" -> no royale in data),
- 21 ("cité" matched with "route" -> no cité in data),
- 26 ("route" matched with "rue" -> no route in data)

Incorrect matches: **15%**
- 3, 14, 22, 23, 24, 25, 30, 42, 47, 60, 65, 70, 78, 80, 82

(Most of the incorrect matches occurred because the correct street type was not available (e.g. "avenue" instead of "rue"), so the entry was matched to an entirely different street.)



Idea for improvement: write a custom similarity ratio that penalizes mismatches in the last part of the string more heavily.

In [None]:
# Print a seeded random sample of 100 rows aligned with the "fuzzy 85" method; the
# numbers in the assessment above refer to the positions in this printout.
print_sample(all_aligned, ["fuzzy 85"], 100)

#### Alignment with threshold between 80 and 85

correct: **46%**

incorrect: **39%**
- 2, 6, 8, 11, 12, 14, 17, 18, 24, 25, 26, 29, 31, 34, 38, 39, 42, 43, 44, 46, 51, 56, 59, 62, 66, 67, 69, 70, 78, 85, 86, 87, 88, 92, 95, 96, 97, 98, 100
- of these, matches with an incorrect street type: **7%** (26, 29, 34, 42, 78, 87, 100)

unclear: **15%**
- 3 ("rue" instead of "chaussée")
- 4 ("rue" instead of "plâtre")
- 10 ("rue" instead of "grenelle")
- 37 ("cite" instead of "rue")
- 40 ("rue" instead of "cloître")
- 41 ("rue alphonse karr" instead of "rue alphonse")
- 48 ("rue" instead of "boulevard")
- 53 ("rue" instead of "rotonde")
- 54 ("rue" instead of "place")
- 57 ("rue" instead of "cloître")
- 73 ("rue" instead of "grénelle")
- 77 ("rue alphonse karr" instead of "rue alphonse")
- 91 ("rue" instead of "impasse")
- 93 ("rue" instead of "cité")
- 99 ("rue" instead of "cité")

-> Many entries were matched wrongly because the "correct" street was not in the dataset.

In [None]:
#check for names in street dataset
#print([x for x in streets["streetname_prep"] if "honore" in x])

In [None]:
# Print a seeded random sample of 100 rows aligned with the "fuzzy 80" method; the
# numbers in the assessment above refer to the positions in this printout.
print_sample(all_aligned, ["fuzzy 80"], 100)

## Not Aligned Data

In [None]:
# The 15 most frequent raw "rue" values among the rows that remain unaligned.
Counter(not_aligned["rue"]).most_common(15)