In [1]:
%reload_ext autoreload
%autoreload 2

In [55]:
!pwd

/mnt/c/Users/Iyed/Projects/Work/MarsadMajles/seshat/notebooks/votes


In [2]:
import copy
import pickle
from datetime import datetime
from itertools import product
from pprint import pprint
from random import shuffle
from typing import Any, Callable, List

import jellyfish
import numpy as np
import pandas as pd
from glom import T, glom
from loguru import logger
from scipy.optimize import linear_sum_assignment

import textdistance
from seshat.extractors.db_data import (
    RemoteDB,
    get_active_deputies_on_date,
    remote_db_config,
)
from seshat.models import Deputy, get_session

In [3]:
pd.set_option("display.max_rows", 100)

## Get the names of active deputies

In [4]:
session_date = datetime(2020, 6, 29)
session_date

datetime.datetime(2020, 6, 29, 0, 0)

In [5]:
deps = pd.DataFrame(get_active_deputies_on_date(session_date))

In [6]:
pd.set_option("display.max_row", 218)
deps.sort_values("name_ar").head()

Unnamed: 0,_id,name_ar,lastname_ar,joined,left
34,5dcbd190a7986d066f9d197c,حسان,بن الحاج إبراهيم,2019-11-13,
212,5dcbd190a7986d066f9d19ef,زهير,مخلوف,2019-11-13,
24,5dcbd18fa7986d066f9d196c,شكري,بلحاج عمارة,2019-11-13,
210,5dcbd190a7986d066f9d19e7,طارق,براهمي,2019-11-13,
31,5dcbd190a7986d066f9d1976,فيصل,الطاهري,2019-11-13,


## Get names extracted from PDFs

In [7]:
with open(
    "/mnt/c/Users/Iyed/Projects/Work/MarsadMajles/seshat/votes.pickle", mode="rb"
) as f:
    raw_names = [df.iloc[:, 0].to_list() for df in pickle.load(f)]

## Transform reference names

In [8]:
reference_names = (
    deps.assign(fullname_ar=deps["name_ar"].str.cat(deps["lastname_ar"], sep=" "))
    .loc[:, "fullname_ar"]
    .to_list()
)

In [9]:
assert len(reference_names) == 217

In [10]:
source_diff = list(set(raw_names[1]).difference(reference_names))
target_diff = list(set(reference_names).difference(raw_names[1]))

In [11]:
len(source_diff)

70

In [12]:
len(target_diff)

70

In [13]:
def get_closest(
    query: str,
    candidates: List[str],
    n_closest: int = 1,
    scorer: Callable = textdistance.jaro_winkler.normalized_similarity,
):
    scores = [{"candidate": c, "similarity": scorer(query, c),} for c in candidates]
    if n_closest > 1:
        return sorted(scores, key=lambda x: x["similarity"], reverse=True)[:n_closest]
    else:
        return max(scores, key=lambda x: x["similarity"])

In [14]:
len(sorted(reference_names))

217

### Naïve approach

In [15]:
def fuzzy_match_naive(
    source: List[str], target: List[str], *args, **kwargs
) -> pd.DataFrame:
    matches_list = []
    source_diff = set(source).difference(
        target
    )  # Consider only queries that do not have an exact match the target list
    target_diff = set(target).difference(
        source
    )  # Consider only queries that do not have an exact match the sources list

    for q in source_diff:
        match = get_closest(q, target_diff, *args, **kwargs)
        matches_list.append((q, match))

    normalized = glom(
        matches_list,
        [{"source": T[0], "target": T[1]["candidate"], "score": T[1]["similarity"],}],
    )
    matches_df = pd.DataFrame(normalized).sort_values("target")
    return matches_df

In [16]:
%%time
matches_df = fuzzy_match_naive(raw_names[0], reference_names,)
matches_df.query("score < 1").sort_values("score")

CPU times: user 172 ms, sys: 0 ns, total: 172 ms
Wall time: 159 ms


Unnamed: 0,source,target,score
55,الطاهر فارس بلال,فارس بلال,0.743056
16,اسامة الصغير,أسامة علية الصغير,0.763666
15,أحمد الصافي سعيد,أمال السعيدي,0.772159
5,امال الورتاني,آمال الورتاتني,0.787851
47,حسان بالحاج ابراهيم,حسان بن الحاج إبراهيم,0.79222
42,الناجي الجراحي,ناجي الجراحي,0.813492
65,طارق ابراهمي,طارق براهمي,0.823232
12,فاضل الوج,الفاضل الوج,0.828283
60,المنجي الرحوي,منجي الرحوي,0.857809
48,الأزهر الشملي,لزهر الشملي,0.857809


In [17]:
matches_df.pipe(lambda df: df.loc[df["target"].duplicated(keep=False)])

Unnamed: 0,source,target,score
25,أمل السعيدي,أمال السعيدي,0.953535
15,أحمد الصافي سعيد,أمال السعيدي,0.772159
44,علي البجاوي,علي الهرماسي,0.878788
50,علي هرماسي,علي الهرماسي,0.926667


In [18]:
pd.DataFrame(get_closest("علي البجاوي", reference_names, n_closest=3))

Unnamed: 0,candidate,similarity
0,علي الهرماسي,0.878788
1,علي الطياشي,0.865909
2,علي بنعون,0.842424


## Slightly less naïve

In [19]:
def fuzzy_match_less_naive(
    source: List[str], target: List[str], *args, **kwargs
) -> pd.DataFrame:
    matches_list = []
    source_diff = ordered_difference(
        source, target
    )  # Consider only queries that do not have an exact match the target list
    target_diff = ordered_difference(
        target, source
    )  # Consider only candidates that not have an exact match in the source list
    for q in source_diff:
        match = get_closest(q, target_diff, *args, **kwargs)
        matches_list.append((q, match))
        target_diff.remove(match["candidate"])

    normalized = glom(
        matches_list,
        [{"source": T[0], "target": T[1]["candidate"], "score": T[1]["similarity"],}],
    )
    matches_df = pd.DataFrame(normalized).sort_values("target")
    return matches_df

In [20]:
%%time
matches_df = fuzzy_match_less_naive(raw_names[0], reference_names)
matches_df.sort_values("score")

NameError: name 'ordered_difference' is not defined

In [21]:
matches_df.pipe(lambda df: df.loc[df["target"].duplicated(keep=False)])

Unnamed: 0,source,target,score
25,أمل السعيدي,أمال السعيدي,0.953535
15,أحمد الصافي سعيد,أمال السعيدي,0.772159
44,علي البجاوي,علي الهرماسي,0.878788
50,علي هرماسي,علي الهرماسي,0.926667


## Alis

In [22]:
source_alis = list(filter(lambda x: "علي" in x.split()[0], raw_names[0]))
target_alis = list(filter(lambda x: "علي" in x.split()[0], reference_names))

In [23]:
source_alis

['علي بن عون', 'علي البجاوي', 'علي هرماسي']

In [24]:
target_alis

['علي بنعون', 'علي الهرماسي', 'علي الطياشي']

In [25]:
def ordered_difference(left: List[Any], right: List[Any]) -> List[Any]:
    return [i for i in left if i not in right]

In [26]:
def experiment(source, target, n=10):
    source_copy = copy.deepcopy(source)
    target_copy = copy.deepcopy(target)
    results = []
    for _ in range(n):
        shuffle(source_copy)
        shuffle(target_copy)
        results.append(fuzzy_match_less_naive(source_copy, target_copy))
    return results

In [27]:
results = experiment(source_alis, target_alis, n=100)

In [28]:
shuffle(source_alis)
fuzzy_match_less_naive(source_alis, target_alis)

Unnamed: 0,source,target,score
2,علي البجاوي,علي الطياشي,0.865909
0,علي هرماسي,علي الهرماسي,0.926667
1,علي بن عون,علي بنعون,0.98


In [29]:
source = ordered_difference(raw_names[1], reference_names)
target = ordered_difference(reference_names, raw_names[1])

In [30]:
def fuzzy_matching_best(
    source, target, scorer=textdistance.jaro_winkler.normalized_distance, maximize=False
):
    diff_source = ordered_difference(source, target)
    diff_target = ordered_difference(target, source)

    pairs = product(diff_source, diff_target)
    scores = np.array([scorer(q, c) for q, c in pairs]).reshape(
        (len(diff_source), len(diff_target))
    )
    row_ind, col_ind = linear_sum_assignment(scores, maximize)
    return pd.DataFrame(
        [
            {
                "source": diff_source[i],
                "target": diff_target[j],
                "distance": scores[i, j],
            }
            for i, j in zip(row_ind, col_ind)
        ]
    )

In [35]:
%%time
fuzzy_matching_best(
    raw_names[0], reference_names, textdistance.ratcliff_obershelp.distance
).sort_values("source")

CPU times: user 422 ms, sys: 15.6 ms, total: 438 ms
Wall time: 426 ms


Unnamed: 0,source,target,distance
61,أحمد الصافي سعيد,الصافي سعيد,0.185185
15,أمل السعيدي,أمال السعيدي,0.043478
32,أمين ميساوي,أمين الميساوي,0.083333
26,اسامة الخليفي,أسامة الخليفي,0.076923
0,اسامة الصغير,أسامة علية الصغير,0.241379
16,الأزهر الشملي,لزهر الشملي,0.083333
33,الحبيب بن سيدهم,حبيب بن سيدهم,0.071429
1,السيد الفرجاني,السيد فرجاني,0.076923
30,الطاهر فارس بلال,فارس بلال,0.28
51,المكي زغدود,مكي زغدود,0.1


In [36]:
source_samiras = ["سميرة السميعي", "سميرة بعيزيق سلامة", "سميرة سايحي"]
target_samiras = ["سميرة السايحي", "سميرة بعيزيق", "سميرة سميعي"]

In [49]:
fuzzy_matching_best(
    [source_samiras[0]],
    target_samiras,
    scorer=textdistance.levenshtein.normalized_distance,
)

Unnamed: 0,source,target,distance
0,سميرة السميعي,سميرة السايحي,0.153846


In [53]:
get_closest(
    source_samiras[0],
    target_samiras,
    n_closest=3,
    scorer=textdistance.ratcliff_obershelp.normalized_distance,
)

[{'candidate': 'سميرة بعيزيق', 'similarity': 0.36},
 {'candidate': 'سميرة السايحي', 'similarity': 0.15384615384615385},
 {'candidate': 'سميرة سميعي', 'similarity': 0.08333333333333337}]

In [54]:
get_closest(
    source_samiras[0],
    target_samiras,
    n_closest=3,
    scorer=textdistance.jaro_winkler.normalized_distance,
)

[{'candidate': 'سميرة بعيزيق', 'similarity': 0.13376068376068373},
 {'candidate': 'سميرة السايحي', 'similarity': 0.06153846153846154},
 {'candidate': 'سميرة سميعي', 'similarity': 0.03076923076923077}]