### Here we test 5000 × 5000 rows, i.e. 25 million combinations. We exclude the address feature.

In [13]:
import pandas as pd 
from similarity_engine import SimilarityEngine 
from preprocessing import Preprocessing 
from visualization_utils import SimilarityVisualizer

# End-to-end run: preprocess both data sets, score the record pairs, and
# (optionally) plot the results.

# Configuration for this cell.
# NOTE(review): these are absolute, machine-specific paths; consider making
# them relative to a configurable project directory.
PREPROCESS_CONFIG = "/home/jmar/matching_project/similarity_engine/yml_examples/preprocess.yml"
ENGINE_CONFIG = "/home/jmar/matching_project/similarity_engine/yml_examples/engine.yml"
SAMPLE_SIZE = 5000  # head() takes at most this many rows from each frame
RUN_VISUALIZATIONS = False  # plots were switched off for this run; set True to draw them
SEPARATOR = "\n" + "-" * 50

print("Beginning preprocessing")
print(SEPARATOR)
# process_data() returns three values; the third is not used here.
source_df, target_df, _ = Preprocessing(PREPROCESS_CONFIG).process_data()
# Keep a single row per id in each frame before comparing.
source_df = source_df.drop_duplicates(subset="id")
target_df = target_df.drop_duplicates(subset="id")
print(SEPARATOR)
print("Beginning similarity caclulations")
print(SEPARATOR)
engine = SimilarityEngine(ENGINE_CONFIG)
scores = engine.compare_multiple_records(
    source_df.head(SAMPLE_SIZE), target_df.head(SAMPLE_SIZE)
)
print(SEPARATOR)
# The banner is printed even when plotting is disabled, as in the original run.
print("Beginning visualizations")
print(SEPARATOR)
# The plotting code used to be disabled by wrapping it in a string literal,
# which made the notebook echo that string as the cell's output. It is now
# real code behind a flag, so nothing is echoed when it is skipped.
if RUN_VISUALIZATIONS:
    visualizer = SimilarityVisualizer(scores)
    visualizer.plot_overall_similarity_hist()
    visualizer.plot_rule_based_vs_similarity()
    feature_columns = [
        "first_name_similarity",
        "last_name_similarity",
        "subtype_similarity",
        "type_similarity",
    ]
    visualizer.plot_feature_histograms(feature_columns)
    visualizer.plot_similarity_trends(feature_columns)
    print(SEPARATOR)

Beginning preprocessing

--------------------------------------------------


Iterating through folder: 100%|██████████| 19/19 [00:04<00:00,  3.92it/s]



--------------------------------------------------
Beginning similarity caclulations

--------------------------------------------------


Processing records: 100%|██████████| 3060/3060 [09:25<00:00,  5.42it/s]



--------------------------------------------------
Beginning visualizations

--------------------------------------------------


'visualizer = SimilarityVisualizer(scores)\nvisualizer.plot_overall_similarity_hist()\nvisualizer.plot_rule_based_vs_similarity()\nfeature_columns = [\n    "first_name_similarity",\n    "last_name_similarity",\n    "subtype_similarity",\n    "type_similarity",\n]\nvisualizer.plot_feature_histograms(feature_columns)\nvisualizer.plot_similarity_trends(feature_columns)\nprint("\n" + "-" * 50)'

In [17]:
# Rows of `scores` whose source and target ids are equal, i.e. the pairs that
# are known to refer to the same record.
# NOTE(review): despite the name, `true_positives` holds ALL same-id pairs,
# whatever label they received; the ratio below is therefore the share of
# known pairs labelled "Match" (closer to recall than to overall accuracy).
# Confirm the wording of the printed label.
same_id_mask = scores["source_id"] == scores["target_id"]
true_positives = scores.loc[same_id_mask]
# value_counts() skips missing labels by default, so total_records only counts
# same-id pairs that actually received a match_label.
value_counts = true_positives["match_label"].value_counts()
true_matches = value_counts.get("Match", 0)
total_records = value_counts.sum()
# Guard the empty case: with no same-id pairs the ratio is undefined, so
# report NaN explicitly rather than dividing zero by zero.
accuracy = true_matches / total_records if total_records else float("nan")
print(f"Accuracy: True Positives / All = {true_matches} / {total_records} = {accuracy:.2f}")

Accuracy: True Positives / All = 88 / 107 = 0.82


In [16]:
# Inner-join the first 5000 rows of each frame on "id", then tally the
# MATCH_CATEGORY_HCP values of the ids present in both samples.
merged = source_df.head(5000).merge(target_df.head(5000), on="id", how="inner")
merged.loc[:, "MATCH_CATEGORY_HCP"].value_counts()

MATCH_CATEGORY_HCP
match manual    107
Name: count, dtype: int64

In [19]:
def get_source_record(id: str):
    """
    Fetch the row(s) with a given id so a potential false negative can be inspected.

    NOTE(review): despite its name, this function reads from ``target_df``.
    Confirm whether the function name or the frame is the intended one.

    Params:
        - id: The value to look for in the "id" column
    Returns:
        - The row(s) of target_df whose "id" equals ``id``, restricted to the
          columns listed below (empty if no row matches)
    """
    shown_columns = [
        "id",
        "first_name__v",
        "standardized_last_name",
        "address_full",
        "cleaned_phonenumber",
        "cleaned_email",
        "type",
        "specialty_1_label",
    ]
    id_matches = target_df["id"] == id
    return target_df.loc[id_matches, shown_columns]
# Spot-check one id: the value returned by get_source_record is the cell's output.
get_source_record("942572098556267894")

Unnamed: 0,id,first_name__v,standardized_last_name,address_full,cleaned_phonenumber,cleaned_email,type,specialty_1_label
494,942572098556267894,Beena,thampy,Next to Royal Rose Hotel Pink Building(501) ...,97126767366,,DOCTOR,Obstetrics Gynecology Womens Health


In [20]:
def get_target_record(id: str):
    """
    Fetch the row(s) with a given id so a potential false negative can be inspected.

    NOTE(review): despite its name, this function reads from ``source_df``.
    Confirm whether the function name or the frame is the intended one.

    Params:
        - id: The value to look for in the "id" column
    Returns:
        - The row(s) of source_df whose "id" equals ``id``, restricted to the
          columns listed below (empty if no row matches)
    """
    shown_columns = [
        "standardized_last_name",
        "address_full",
        "cleaned_phonenumber",
        "cleaned_email",
        "id",
        "FIRST_NAME",
        "HCP_TYPE_V__LABEL",
        "SPCLTY1",
    ]
    id_matches = source_df["id"] == id
    return source_df.loc[id_matches, shown_columns]
# Spot-check the same id with get_target_record; its return value is the cell's output.
get_target_record("942572098556267894")

Unnamed: 0,standardized_last_name,address_full,cleaned_phonenumber,cleaned_email,id,FIRST_NAME,HCP_TYPE_V__LABEL,SPCLTY1
29,kader,Electra St Beside Royal Rose Hotel Pin Abu Dha...,,,942572098556267894,Bina,DOCTOR,Obstetrics Gynecology Womens Health
