# Examples for Name Matching (using Pandas)

This notebook illustrates the basic usage of the name-matching algorithm from the `entity_matching_model` package.

(The code below also works with the Spark version.)

In [None]:
import emm
import matplotlib

In [None]:
import pandas as pd
from emm import PandasEntityMatching, resources
from emm.data.create_data import pandas_create_noised_data
from emm.helper.blocking_functions import first as first_character
from emm.threshold.threshold_decision import get_threshold_curves_parameters
import warnings
warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline

In [None]:
# create noised names, based on Dutch chamber of commerce data.
# a fixed random_seed is passed (presumably so the generated data is reproducible - confirm in emm docs).
# the second returned element is not used in this notebook and is discarded into `_`.
ground_truth, _, positive_noised_pd, negative_pd = pandas_create_noised_data(random_seed=42)

In [None]:
# sizes of the ground truth, the positive (noised) names and the negative names, in that order
tuple(len(dataset) for dataset in (ground_truth, positive_noised_pd, negative_pd))

In [None]:
# have a look at the names in the ground truth
# (the bare expression at the end of the cell makes the notebook display the object)
ground_truth

In [None]:
# and now have a look at the names in the noised (positive) dataset
positive_noised_pd

Next we configure an EntityMatching object that only looks at names.

In [None]:
# Example indexer configurations; each entry describes one way of selecting candidates.
indexers = [
    # word-based cosine similarity
    dict(
        type="cosine_similarity",
        tokenizer="words",
        ngram=1,
        num_candidates=5,           # max 5 candidates per name-to-match
        cos_sim_lower_bound=0.2,    # lower bound on cosine similarity
    ),
    # 2-character-based cosine similarity, using first_character as blocking function
    dict(
        type="cosine_similarity",
        tokenizer="characters",
        ngram=2,
        num_candidates=5,
        cos_sim_lower_bound=0.2,
        blocking_func=first_character,
    ),
    # sorted neighbouring indexing with a window of size 3
    dict(type="sni", window_length=3),
]

In [None]:
# Configuration of the entity-matching object; only the first example indexer is used.
em_params = dict(
    name_only=True,                       # only consider name information for matching
    entity_id_col="Index",                # important to set index and name columns
    name_col="Name",
    indexers=[indexers[0]],
    supervised_on=True,                   # without specifying a model, this option adds an untrained supervised model
    return_sm_features=True,              # when calling transform, return the features used by the supervised model
    without_rank_features=False,
    with_legal_entity_forms_match=False,  # add feature with match of legal entity forms, e.g. ltd != co
)
p = PandasEntityMatching(em_params)


In [None]:
# this fits the tfidf matrix of the indexer(s), based on the ground truth names.
# NOTE(review): copy_ground_truth=True presumably makes the model work on its own copy of
# ground_truth - confirm in the emm documentation.
p.fit(ground_truth, copy_ground_truth=True)

In [None]:
# note that return_sm_features = True and the supervised model is still untrained:
# when calling transform(), the features used by the supervised model are returned (X_feat_*)
# alongside the candidates found for the positive (noised) names.
resp = p.transform(positive_noised_pd)

In [None]:
# peek at the first candidate rows returned by transform()
resp.head()

In [None]:
# number of names to match vs. number of candidate rows returned;
# their ratio is the average number of candidates per name to match (roughly 3).
len(positive_noised_pd), len(resp)

In [None]:
# flag a candidate as correct when its ground-truth entity id equals the id of the name to match
resp['correct'] = resp['gt_entity_id'].eq(resp['entity_id'])

In [None]:
# histogram of the rank_0 column over all candidates
resp['rank_0'].hist()

In [None]:
# distribution of score_0 for the candidates flagged as correct
resp.loc[resp['correct'], 'score_0'].hist(bins=40)

In [None]:
# distribution of score_0 for the candidates with rank_0 equal to 1
resp.loc[resp['rank_0'] == 1, 'score_0'].hist(bins=40)

In [None]:
# candidates found for the negative names
resn = p.transform(negative_pd)

In [None]:
# distribution of score_0 for the rank_0 == 1 candidates of the negative names
resn.loc[resn['rank_0'] == 1, 'score_0'].hist(bins=40)

In [None]:
# turn off returning of the supervised-model features in subsequent transform() calls.
p.set_return_sm_features(False)

In [None]:
# in more detail: internally the supervised model is trained on the following name-pairs,
# created here from the first 2267 positive names
name_pairs = p.create_training_name_pairs(positive_noised_pd.iloc[:2267])

In [None]:
# peek at the first generated training name-pairs
name_pairs.head()

In [None]:
# fit the supervised model part of the PandasEntityMatching object (this takes a while).
# the training name-pairs are generated automatically internally, from the first 2267 positive names.
p.fit_classifier(positive_noised_pd.iloc[:2267])

In [None]:
# alternatively, one can fit the classifier on the explicitly created name-pairs:
# p.fit_classifier(train_name_pairs=name_pairs)

In [None]:
# candidates for the remaining positive names, i.e. those after the first 2267 used for training
resp2 = p.transform(positive_noised_pd.iloc[2267:])

In [None]:
# flag a candidate as correct when its ground-truth entity id equals the id of the name to match
resp2['correct'] = resp2['gt_entity_id'].eq(resp2['entity_id'])

In [None]:
# display the candidates, now including the `correct` flag
resp2

In [None]:
# number of candidate rows flagged as best match
int((resp2['best_match'] == True).sum())

In [None]:
# nm_score distribution of the best-match candidates, split into incorrect and correct matches.
# Both conditions are combined into one mask on resp2 instead of chaining a second boolean
# filter onto an already-filtered Series (which relies on pandas silently re-aligning the mask).
# The labels and legend tell the two overlaid histograms apart.
# (to plot all candidates instead: resp2['nm_score'].hist(bins=40, log=True, alpha=0.5))
is_best = resp2['best_match'] == True
ax = resp2.loc[is_best & ~resp2['correct'], 'nm_score'].hist(bins=40, log=True, alpha=0.5, label='incorrect')
resp2.loc[is_best & resp2['correct'], 'nm_score'].hist(ax=ax, bins=40, log=True, alpha=0.5, label='correct')
ax.legend();


In [None]:
# candidates for the negative names, now scored with the trained supervised model
resn2 = p.transform(negative_pd)

In [None]:
# note: we have trained without negative names!
# nm_score distribution of the best-match candidates found for the negative names
resn2.loc[resn2['best_match'], 'nm_score'].hist(bins=40, log=True, alpha=0.5)

In [None]:
# try training with negative names:
# either add negative names to the positive ones and retrain,
# or, in case negative names are missing, pass create_negative_sample_fraction:
p.fit_classifier(
    positive_noised_pd.iloc[:2267],
    create_negative_sample_fraction=0.5,
)
# look at the impact!

In [None]:
# persist the fitted entity-matching object; the relative path resolves against the current working directory
p.save('trained_em.pickle')

In [None]:
# copy of the negative names with lower-case column names
neg_names = negative_pd.rename(columns=dict(Name='name', Index='index'))

In [None]:
# reload the model saved above, overriding the name and entity-id column parameters
# so that they refer to the lower-case column names 'name' and 'index'.
nm = PandasEntityMatching.load("trained_em.pickle", 
                               override_parameters={'name_col': 'name', 'entity_id_col': 'index'})

In [None]:
# candidates for the renamed negative names, using the reloaded model
resn3 = nm.transform(neg_names)

In [None]:
# nm_score distribution of the best-match candidates from the reloaded model
resn3.loc[resn3['best_match'], 'nm_score'].hist(bins=40, log=True, alpha=0.5)

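Ideas to try:
- different indexers
- with and without rank features (`without_rank_features`)
- returning the supervised-model features (`return_sm_features`)
- training with variations of the above
- training with a negative sample fraction (`create_negative_sample_fraction`)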


## Discrimination threshold determination

In [None]:
# Test sets: the positive names after the first 2267 (which were used for training),
# and an equally sized slice of the negative names.
positive_test = positive_noised_pd.iloc[2267:]
negative_test = negative_pd.iloc[:len(positive_test)]

# Generate candidates for both sets and tag every row with the set it came from.
candidates_pos = p.transform(positive_test)
candidates_neg = p.transform(negative_test)
candidates_pos['positive_set'] = True
candidates_neg['positive_set'] = False

# Stack both candidate sets and flag the rows whose ground-truth id equals the id of the name to match.
candidates = pd.concat([candidates_pos, candidates_neg], axis=0)
candidates['correct'] = candidates['gt_entity_id'] == candidates['entity_id']

# Keep only the rows flagged as best match.
best_candidates = candidates.loc[candidates['best_match']]

In [None]:
# get discrimination threshold curves for the best candidates
# (best_candidates contains best-match rows from both the positive and the negative test set)
curves = get_threshold_curves_parameters(best_candidates)

In [None]:
# only name-matching is done here, so there is no aggregation;
# list the keys available under 'threshold_curves'
curves['threshold_curves'].keys()

In [None]:
# add the curves to the parameters of the EMM model (updated in place).
# this is needed to run nm.calc_threshold() below.
nm.parameters.update(curves)

### Get threshold scores

In [None]:
# discrimination threshold for positive names only, with minimum precision of 95%
threshold1 = nm.calc_threshold(
    agg_name='non_aggregated',
    type_name='positive',
    metric_name='precision',
    min_value=0.95,
)

In [None]:
# show the threshold computed for positive names at minimum precision 0.95
print(threshold1)

In [None]:
# discrimination threshold for positive and negative names, with minimum precision of 80%
threshold2 = nm.calc_threshold(
    agg_name='non_aggregated',
    type_name='all',
    metric_name='precision',
    min_value=0.80,
)

In [None]:
# show the threshold computed for all (positive and negative) names at minimum precision 0.80
print(threshold2)

In [None]:
# persist the model, whose parameters now include the threshold curves, under a new file name
nm.save('trained_em_with_thresholds.pickle')