# Alignment

The goal of this Notebook is to
* align the bottin data with the street network data with the following methods:
    1. perfect matching
    2. perfect matching without spaces
    3. fuzzy matching
* assess the quality of the alignment

## Load libraries and data

In [None]:
import pandas as pd
import geopandas as gpd
from collections import Counter
import pickle
from matplotlib import pyplot as plt
from fuzzywuzzy import process, fuzz

from alignment import align_on_column, get_fuzzy_dict, simple_processor, print_sample
from preprocessing import substitute_col_by_dict

In [None]:
# Switch between the two prepared Bottin datasets used by this notebook.
# True  -> load "data/bottins_tagged_prep.pkl" (entries with profession tags,
#          preprocessed with parts of Ravi's code) and save the tagged result.
# False -> load "data/bottins_prep.pkl" and save the untagged result.
USE_TAGGED_DATASET = False

In [None]:
# Pick the Bottin pickle that matches the USE_TAGGED_DATASET switch.
bottins_path = "data/bottins_tagged_prep.pkl" if USE_TAGGED_DATASET else "data/bottins_prep.pkl"
bottins = pd.read_pickle(bottins_path)

# Street network tables: the full street names plus two tables of short names
# (file names suggest unique vs. non-unique short names -- confirm against the
# preprocessing notebook).
streets = pd.read_pickle("data/FinalUnique.pkl")
unique_short_s = pd.read_pickle("data/unique_short_streets.pkl")
non_unique_short_s = pd.read_pickle("data/not_unique_short_streets.pkl")

In [None]:
# Preview the first three rows of the loaded Bottin data.
bottins.head(3)

## 1. Align data: Perfect matching

In [None]:
# Perfect alignment: match the processed Bottin street name exactly, first
# against the full street names, then against the two short-name tables.
# As used here, align_on_column returns (aligned rows, rows still unaligned);
# the remainder of each call is the input of the next one.
long_aligned, not_aligned = align_on_column(
    df_not_aligned=bottins,
    df_streets=streets,
    mergeOnLeft="rue_processed",
    mergeOnRight="streetname_prep",
    align_method="perfect",
)

# Both short-name passes share the same merge columns and method label.
short_name_match = {
    "mergeOnLeft": "rue_processed",
    "mergeOnRight": "name_prep",
    "align_method": "perfect short",
}
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned, df_streets=unique_short_s, **short_name_match
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned, df_streets=non_unique_short_s, **short_name_match
)

### Some alignment by hand

In [None]:
# Hand-made corrections for frequent OCR errors, missing spaces and missing
# particles ("de", "des", "du") in the still unaligned Bottin street names.
# Keys are the text to replace, values the replacement.
# NOTE(review): the entries look order-dependent -- "boulevard" is expanded to
# "boulevard de" and the following two entries clean up the resulting
# "boulevard de de " / "boulevard de d'". How substitute_col_by_dict applies
# the entries (substring vs. whole word, in dict order or not) is not visible
# here; confirm before reordering or adding entries.
word_dict = {"boulevard": "boulevard de",
                "boulevard de de ": "boulevard de ",
                "boulevard de d'": "boulevard d'",
                "boulevards": "boulevard des",
                "damede": "dame de",
                "damedes": "dame des",
                "faubourgsaint": "faubourg saint",
                "faubourgpoissonniere": "faubourg poissonniere",
                "faubourgdu": "faubourg du",
                "faubourgmontmartre": "faubourg montmartre",
                "quai jemmapes": "quai de jemmapes",
                "boulevards italiens": "boulevard des italiens",
                "villeneuve": "ville neuve",
                "quai valmy": "quai de valmy",
                "avenue wagram": "avenue de wagram",
                "boulevard de montparnasse": "boulevard du montparnasse"
                }

# Apply the substitutions to the processed street names of the rows that are
# not aligned yet; the column is overwritten in place.
not_aligned["rue_processed"] = substitute_col_by_dict(not_aligned["rue_processed"], word_dict)

In [None]:
# Repeat the perfect alignment on the corrected street names. The previously
# aligned frames are passed in as df_aligned and the returned frames replace
# them (presumably extended by the new matches -- see align_on_column).
long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    mergeOnLeft="rue_processed",
    mergeOnRight="streetname_prep",
    align_method="perfect",
)

# Both short-name passes share the same merge columns and method label.
short_name_match = {
    "mergeOnLeft": "rue_processed",
    "mergeOnRight": "name_prep",
    "align_method": "perfect short",
}
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    **short_name_match
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    **short_name_match
)

In [None]:
# Helper snippets for manually inspecting the alignment so far.
# They are wrapped in a string literal so that they are NOT executed when the
# cell runs; copy the lines you need into a separate cell to use them.
'''
# print the most common not aligned streets from the bottin data and possible counterparts in the street network data
most_common = Counter(not_aligned["rue_processed"]).most_common(100)
for t in most_common:
    rue, occur = t
    last = rue.split()[-1]
    print(rue, occur, [x for x in streets["streetname_prep"] if last in x], "\n")

# code to check if something exists in the streets dataset
print([x for x in streets["streetname_prep"] if "boulevards" in x])
print(streets[streets["name_prep"]=="la fayette"])

# get most common streetnames in not aligned data
Counter([x for x in not_aligned["rue_processed"] if "." in x]).most_common(100)
'''

## 2. Alignment without spaces

In [None]:
# Create a "no spaces" variant of the street-name column in every dataset:
# spaces and the characters | . : ' are removed so that names differing only
# in spacing or punctuation can be matched exactly.
#
# The keys are regular expressions (regex=True below). They are written as raw
# strings so the backslashes reach the regex engine as intended; the previous
# non-raw literals ("\ ", "\.", ...) relied on Python passing invalid escape
# sequences through unchanged, which triggers a warning on current Python
# versions. The resulting patterns are identical to the old ones.
replace_spaces = {r"\ ": "", r"\|": "", r"\.": "", r"\:": "", "'": ""}
not_aligned["no_spaces"] = not_aligned["rue_processed"].replace(replace_spaces, regex=True)
streets["no_spaces_long"] = streets["streetname_prep"].replace(replace_spaces, regex=True)
unique_short_s["no_spaces_short"] = unique_short_s["name_prep"].replace(replace_spaces, regex=True)
non_unique_short_s["no_spaces_short"] = non_unique_short_s["name_prep"].replace(replace_spaces, regex=True)

In [None]:
# Preview the first five unaligned rows, including the new "no_spaces" column.
not_aligned.head(5)

In [None]:
# Perfect alignment on the space-free columns, in the same order as before:
# full street names first, then the two short-name tables. Each call receives
# the rows left over by the previous one.
long_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=streets,
    df_aligned=long_aligned,
    mergeOnLeft="no_spaces",
    mergeOnRight="no_spaces_long",
    align_method="no spaces perfect",
)

# Both short-name passes share the same merge columns and method label.
no_spaces_short_match = {
    "mergeOnLeft": "no_spaces",
    "mergeOnRight": "no_spaces_short",
    "align_method": "no spaces perfect short",
}
u_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=unique_short_s,
    df_aligned=u_short_aligned,
    **no_spaces_short_match
)
nu_short_aligned, not_aligned = align_on_column(
    df_not_aligned=not_aligned,
    df_streets=non_unique_short_s,
    df_aligned=nu_short_aligned,
    **no_spaces_short_match
)

In [None]:
# Share of all Bottin entries aligned by the exact-matching steps so far.
aligned_so_far = len(long_aligned) + len(u_short_aligned) + len(nu_short_aligned)
print("Aligned data so far:", aligned_so_far / len(bottins))

## 3. Fuzzy Matching

In [None]:
# Distinct street names that are still unaligned, in order of first appearance.
unaligned_street_names = not_aligned["rue_processed"]
not_aligned_rues = unaligned_street_names.unique().tolist()

# Small subset (first 100 distinct names) for a quick look at fuzzy matching.
not_aligned_selected100 = not_aligned_rues[:100]

In [None]:
# Collect every distinct street name from the street network data in one list;
# this is the candidate pool for fuzzy matching.
# Series.unique() keeps the order of first appearance, so the list is the same
# on every run. The previous list(set(...)) produced an order that can change
# between interpreter runs for strings, which may influence which candidate
# wins when several have the same fuzzy score (and thus what gets cached).
# This also mirrors how not_aligned_rues is built.
streets_all_vars = streets["streetname_prep"].unique().tolist()
streets_all_vars[:10]

### Check for different cutoff scores

In [None]:
# Try fuzzy matching with a similarity cutoff of 85 on the 100-name subset.
for bottin_street in not_aligned_selected100:
    candidate = process.extractOne(
        bottin_street,
        streets_all_vars,
        processor=simple_processor,
        scorer=fuzz.ratio,
        score_cutoff=85,
    )
    # nothing reached the cutoff for this name -> nothing to show
    if not candidate:
        continue
    print(bottin_street, candidate)

In [None]:
# Try fuzzy matching with a similarity cutoff of 80 on the 100-name subset and
# show only the weaker matches, i.e. those scoring below 90.
for bottin_street in not_aligned_selected100:
    candidate = process.extractOne(
        bottin_street,
        streets_all_vars,
        processor=simple_processor,
        scorer=fuzz.ratio,
        score_cutoff=80,
    )
    if candidate and candidate[1] < 90:
        print(bottin_street, candidate)

### Creation of fuzzy dictionaries

In [None]:
# Build two separate fuzzy dictionaries, one with score cutoff 85 and one with 80.
# Each has the form {bottin street: most similar street in street data}.
# Computing them takes around 3 hours; if this is too long, make a selection like
# not_aligned_selected = [street for street, _ in Counter(not_aligned["rue_processed"].tolist()).most_common(10000)]
# which still yields good results (1-2% less data aligned).


def _load_or_build_fuzzy_dict(cache_path, score_cutoff):
    """Return the fuzzy dictionary for *score_cutoff*, using a pickle cache.

    If *cache_path* can be read, the pickled dictionary is returned as is.
    Otherwise the dictionary is computed with ``get_fuzzy_dict`` from
    ``streets_all_vars`` and ``not_aligned_rues`` and written to *cache_path*.

    Only cache-related failures (missing/unreadable file, truncated or invalid
    pickle) trigger the recomputation. Anything else -- in particular a
    KeyboardInterrupt or a NameError -- propagates instead of silently
    starting an hours-long computation, as the former bare ``except:`` did.

    NOTE(review): the cache is keyed by file name only. It is reused even if
    the street lists have changed since it was written (e.g. after toggling
    USE_TAGGED_DATASET); delete the file to force a rebuild.
    """
    try:
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except (OSError, EOFError, pickle.UnpicklingError) as err:
        print(f"No usable cache at {cache_path} ({err!r}); computing fuzzy dictionary ...")

    fuzzy_dict = get_fuzzy_dict(streets_all_vars, not_aligned_rues, score_cutoff=score_cutoff)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(fuzzy_dict, cache_file)
    return fuzzy_dict


fuzzy_dict85 = _load_or_build_fuzzy_dict("data/fuzzy_dictwith85.pkl", 85)
fuzzy_dict80 = _load_or_build_fuzzy_dict("data/fuzzy_dictwith80.pkl", 80)

In [None]:
# How many Bottin street names are covered by each fuzzy dictionary.
n_entries80, n_entries85 = len(fuzzy_dict80), len(fuzzy_dict85)
print("# entries for cutoff 80:", n_entries80, "cutoff 85:", n_entries85)

### Fuzzy Alignment

In [None]:
# Add one column per cutoff holding the fuzzy-matched street name for each
# unaligned entry (names without a dictionary entry map to NaN).
for fuzzy_col, fuzzy_lookup in (("street_fuzzy80", fuzzy_dict80), ("street_fuzzy85", fuzzy_dict85)):
    not_aligned[fuzzy_col] = not_aligned["rue_processed"].map(fuzzy_lookup)

In [None]:
# Align on the fuzzy columns against the full street names: the stricter
# cutoff (85) first, then cutoff 80 on whatever is still unaligned.
for fuzzy_col, method_label in (("street_fuzzy85", "fuzzy 85"), ("street_fuzzy80", "fuzzy 80")):
    long_aligned, not_aligned = align_on_column(
        df_not_aligned=not_aligned,
        df_streets=streets,
        df_aligned=long_aligned,
        mergeOnLeft=fuzzy_col,
        mergeOnRight="streetname_prep",
        align_method=method_label,
    )

# Save Data

In [None]:
# Combine the matches on full street names with those on the unique short
# names; matches on non-unique short names (nu_short_aligned) are left out.
uniquely_aligned_parts = [long_aligned, u_short_aligned]
unique_aligned = pd.concat(uniquely_aligned_parts)

In [None]:
# Columns written in both modes; the tagged variant appends "tags" and has no
# "page" column in its selection, the untagged variant starts with "page".
shared_columns = ["row", "nom", "metier", "rue", "numero",
                  "annee", "streetname", "geometry", "name", "year", "align_method"]
if USE_TAGGED_DATASET:
    selected_columns = shared_columns + ["tags"]
    output_path = "data/unique_aligned_tagged.pkl"
else:
    selected_columns = ["page"] + shared_columns
    output_path = "data/unique_aligned.pkl"

unique_aligned_selection = unique_aligned[selected_columns]
unique_aligned_selection.to_pickle(output_path)

# Assessment of the Alignment

### Concatenate relevant data

In [None]:
# All aligned entries (including matches on non-unique short names), reduced
# to the columns needed for the assessment.
# NOTE(review): "page" is selected unconditionally here, while the tagged save
# branch omits it -- confirm that the tagged dataset has a "page" column.
assessment_columns = ["page", "row", "nom", "metier", "rue", "numero",
                      "annee", "streetname", "geometry", "name", "year", "align_method"]
all_aligned = pd.concat([long_aligned, u_short_aligned, nu_short_aligned])[assessment_columns]

# All three street tables stacked into one frame.
all_streets = pd.concat([streets, unique_short_s, non_unique_short_s])

### Ratio of overall aligned data

In [None]:
# Share of all Bottin entries that ended up aligned by any method.
aligned_share = len(all_aligned) / len(bottins)
print("Aligned data:", aligned_share)

## Overview of alignment per method

In [None]:
# Count the aligned entries per method (in the order the methods were applied)
# and report each count as a share of the aligned data and of all Bottin data.
methods = ["perfect", "perfect short", "no spaces perfect", "no spaces perfect short", "fuzzy 85", "fuzzy 80"]
dev_aligned = {}
n_aligned, n_bottins = len(all_aligned), len(bottins)
for method in methods:
    dev_aligned[method] = len(all_aligned[all_aligned["align_method"] == method])
    share_of_aligned = round(100 * dev_aligned[method] / n_aligned, 2)
    share_of_all = round(100 * dev_aligned[method] / n_bottins, 2)
    # Adjacent f-strings instead of a backslash continuation inside the
    # literal: the continuation pulled the next line's indentation into the
    # printed text ("...method: perfect,    (...").
    print(f"{share_of_aligned}% of aligned data was aligned by method: {method}, "
          f"({share_of_all}% of all data)")

In [None]:
# Bar chart: number of aligned entries per method, in application order.
method_labels = list(dev_aligned.keys())
method_counts = list(dev_aligned.values())
plt.bar(method_labels, method_counts)
plt.xticks(rotation=90)
plt.title("Number of alignment entries per method\n(alignment in order from left to right)")

## Quality Assessment of Alignment

### Perfect Alignment
Checking the code below, the alignment is correct in **100%** of the cases.

However, the alignment on the short streetnames is by nature sometimes ambiguous, because the short name was used in the Bottin Data.

In [None]:
# Print entries aligned by the two perfect-matching methods for manual review
# (presumably a sample of 100 rows -- see print_sample in the alignment module).
print_sample(all_aligned, ["perfect", "perfect short"], 100)

### Alignment without spaces
The alignment of the samples below is correct in **100%** of the cases.

In [None]:
# Print entries aligned by the two no-spaces methods for manual review
# (presumably a sample of 100 rows -- see print_sample in the alignment module).
print_sample(all_aligned, ["no spaces perfect", "no spaces perfect short"], 100)

### Fuzzy alignment
#### Alignment with threshold 85 (85% of the two strings matched)

Correct: **82%**

Unclear if correct: **5%**
- 3 ("place" matched with "rue" -> no place in data),
- 8 ("payée" matched with "place" -> no payée in data),
- 65 ("moreau" matched with "rue" -> no moreau in data), 
- 88 ("caron" matched with "rue" -> no caron in data),
- 92 ("route"(ronte) matched with "rue" -> no route in data)


Incorrect matches: **13%**
- 2, 5, 7, 18, 20, 21, 34, 39, 41, 49, 82, 85, 100

(Many of the incorrect matches occurred because the correct street type was not available (e.g. "avenue" instead of "rue"), so the entry was matched to a different street entirely.)

In [None]:
# Print entries aligned by fuzzy matching with cutoff 85 for manual review
# (presumably a sample of 100 rows -- see print_sample in the alignment module).
print_sample(all_aligned, ["fuzzy 85"], 100)

#### Alignment with threshold between 80 and 85

correct: **49%**

incorrect: **30%**
- 3, 4, 6, 9, 13, 15, 23, 28, 31, 38, 39, 40, 41, 43, 44, 45, 47, 49, 53, 55, 56, 58, 65, 67, 78, 79, 85, 86, 94, 95
- of these, caused by an incorrect street type: **3%** (4, 6, 13)

unclear: **21%**
- 1 ("rue" instead of "quai")
- 8 ("impasse" instead of "marais")
- 10 ("rue" instead of "place")
- 11 ("cité" instead of "rue")
- 16 ("impasse" instead of "marais")
- 17 ("rue" instead of "cloitre")
- 19 ("cité" instead of "rue")
- 24
- 27 ("rue" instead of "pavée")
- 32 ("cité" instead of "rue")
- 33 ("place" instead of "rue")
- 52 ("rue" instead of "passage")
- 59 ("rue" instead of "quai")
- 60 ("place" instead of "rue")
- 69 ("cité" instead of "poteau")
- 70 ("place" instead of "rue")
- 72 ("rue" instead of "square")
- 80 ("cité" instead of "place")
- 82 ("villa" instead of "place")
- 83 ("cité" instead of "rue")
- 99 ("boulevard" instead of "boucherie")

-> many wrongly matched because "correct" street (type) was not in dataset

In [None]:
# Print entries aligned by fuzzy matching with cutoff 80 for manual review
# (presumably a sample of 100 rows -- see print_sample in the alignment module).
print_sample(all_aligned, ["fuzzy 80"], 100)

In [None]:
#helper code: check for names in street dataset
#print([x for x in streets["streetname_prep"] if "invalides" in x])

## Plotting quality of alignment

In [None]:
# Build a two-row frame per method: row 0 holds the absolute number of aligned
# entries, row 1 the share judged correct in the manual assessment.
# The ratios are positional: they must be listed in the same order as the
# methods in dev_aligned.
alignment_ratios = [1, 1, 1, 1, 0.82, 0.49]
df_aligned = {
    method: [count, alignment_ratios[position]]
    for position, (method, count) in enumerate(dev_aligned.items())
}
statistics = pd.DataFrame.from_dict(df_aligned)

# Plot the counts as bars and the quality as a red line on a secondary axis.
entry_counts = statistics.iloc[0]
quality_ratios = statistics.iloc[1]
entry_counts.plot(kind='bar')
quality_ratios.plot(secondary_y=True, ylim=(0,1.02), rot=90, color="red")
plt.title("Number of alignment entries (bars)\nand quality of alignment (line) per method\n[alignment in order from left to right]")

## Not Aligned Data

In [None]:
# The 15 most frequent original street strings ("rue") among the entries that
# could not be aligned, with their counts.
Counter(not_aligned["rue"]).most_common(15)

In [None]:
# Number of entries that remain to be aligned.
len(not_aligned)