# Examples of using the aggregation method, with Pandas

The EMM package can be used to match a group of company names that belong together
to a single company name in the ground truth; for example, all names used to address one external bank account.

This notebook illustrates basic usage of the `entity_matching_model` package,
and how to use the aggregation layer.

(The examples below also work with the Spark version.)

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from emm import PandasEntityMatching, resources
from emm.data.create_data import pandas_create_noised_data
from emm.threshold.threshold_decision import get_threshold_curves_parameters
import warnings
warnings.filterwarnings("ignore")

## Train a model

In [None]:
# Create noised names, based on Dutch chamber of commerce data.
ground_truth, _, positive_noised_pd, negative_pd = pandas_create_noised_data(random_seed=42)

# Number of positive noised names used for training; the remainder is held out for testing.
n_train = 2267

# Take explicit copies of the slices: columns are assigned to the test sets further on,
# and assigning to a bare slice of another DataFrame is the chained-assignment pattern
# that pandas reports as SettingWithCopyWarning.
train_set = positive_noised_pd.iloc[:n_train].copy()
positive_test_set = positive_noised_pd.iloc[n_train:].copy()
# Use as many negative names as there are positive test names.
negative_test_set = negative_pd.iloc[:len(positive_test_set)].copy()

In [None]:
# Candidate selection: a single word-level cosine-similarity indexer.
indexers = [
    dict(
        type="cosine_similarity",
        tokenizer="words",           # word-based cosine similarity
        ngram=1,
        num_candidates=5,            # at most 5 candidates per name-to-match
        cos_sim_lower_bound=0.2,     # lower bound on cosine similarity
    ),
]

# Settings for the entity-matching model.
em_params = dict(
    name_only=True,                      # only consider name information for matching
    entity_id_col="Index",               # important to set the index and name columns
    name_col="Name",
    indexers=indexers[:1],               # use the first (and only) example indexer
    supervised_on=True,                  # without specifying a model, this adds an untrained supervised model
    return_sm_features=True,             # when calling transform, return the features used by the supervised model
    without_rank_features=False,
    with_legal_entity_forms_match=True,  # add a feature with the match of legal entity forms, e.g. ltd != co
    aggregation_layer=True,              # aggregation layer: aggregates names on an account level
    aggregation_method="mean_score",     # aggregation method
)
p = PandasEntityMatching(em_params)

In [None]:
# Fit the model on the ground truth names.
# This fits the tf-idf matrix of the indexer(s), based on the ground truth names.
p.fit(ground_truth)

In [None]:
# Fit the supervised model part of the PandasEntityMatching object (this takes a while).
# The name-pairs the classifier trains on are generated automatically internally.
# The aggregation layer does not need fitting, so no special training set is required.
p.fit_classifier(train_set)

### Scoring for name aggregation

In [None]:
# For aggregation of name-scores, the input needs to have:
# - an 'account' column, which indicates which names belong together, and
# - a frequency column, here called 'counterparty_account_count_distinct',
#   which indicates how frequently each name occurs.

# Below we add these columns with dummy values.
# Each name belongs to a single account and is used just once.

In [None]:
# Dummy account ids as strings: one account per name, numbered 0, 1, 2, ...
positive_test_set['account'] = [str(i) for i in range(len(positive_test_set))]
# Dummy frequency: every name occurs exactly once.
positive_test_set['counterparty_account_count_distinct'] = 1

# Negative names get account ids starting at 10000
# (presumably to keep them apart from the positive account ids).
negative_test_set['account'] = [
    str(i) for i in range(10000, 10000 + len(negative_test_set))
]
negative_test_set['counterparty_account_count_distinct'] = 1

In [None]:
# Generate candidate matches for both test sets (this can take some time),
# and flag which set each candidate row originates from.
candidates_pos = p.transform(positive_test_set).assign(positive_set=True)
candidates_neg = p.transform(negative_test_set).assign(positive_set=False)

In [None]:
# Stack the positive and negative candidates into one frame.
candidates = pd.concat([candidates_pos, candidates_neg])
# A candidate is correct when its ground-truth id equals the matched entity id.
candidates['correct'] = candidates.gt_entity_id == candidates.entity_id
# Keep only the rows flagged as best match.
best_candidates = candidates.loc[candidates['best_match']]

In [None]:
# With one name per account, the name score and the aggregated score coincide.
best_candidates.loc[:, ['nm_score', 'agg_score']].head()

In [None]:
# For the threshold curves (below), scores cannot contain NaNs, so drop those rows.
# Re-assign instead of using inplace=True: best_candidates is a boolean-mask selection
# of `candidates`, and modifying such a selection in place is the pattern pandas
# reports as SettingWithCopyWarning.
best_candidates = best_candidates.dropna(subset=['agg_score'])

In [None]:
# Get the discrimination threshold curves for the best candidates, using the
# `agg_score` column as the score (original note: this does clustering of `agg_score`).
# The aggregation settings passed here match those given to the model ('mean_score').
curves = get_threshold_curves_parameters(best_candidates, score_col='agg_score', 
                                         aggregation_layer=True, aggregation_method="mean_score")

In [None]:
# Show the keys under which the threshold curves are stored
# (original note: aggregation here).
curves['threshold_curves'].keys()

In [None]:
# curves['threshold_curves']

In [None]:
# Add the threshold curves to the parameters of the EMM model.
p.parameters.update(curves)

In [None]:
# Save the model, whose parameters now include the threshold curves, as 'am_curves.pkl'.
p.save('am_curves.pkl')

## Load pretrained model

In [None]:
# Load the model from the 'am_curves.pkl' file written by the save step.
# NOTE(review): the .pkl extension suggests a pickle-based format - only load files you trust.
am = PandasEntityMatching.load('am_curves.pkl')

## Get thresholds

In [None]:
# Discrimination threshold for positive names only, with a minimum precision of 95%,
# taken from the 'mean_score' aggregation curves.
threshold1 = am.calc_threshold(agg_name="mean_score", type_name='positive', metric_name='precision', min_value=0.95)

In [None]:
# Display the threshold computed for positive names only.
threshold1

In [None]:
# Discrimination threshold for positive and negative names, with a minimum precision of 80%,
# taken from the 'mean_score' aggregation curves.
threshold2 = am.calc_threshold(agg_name="mean_score", type_name='all', metric_name='precision', min_value=0.80)

In [None]:
# Display the threshold computed for positive and negative names together.
threshold2