# Examples for Name Matching (using Spark)

This notebook illustrates basic usage of the name matching algorithm from the `entity_matching_model` package.

In [None]:
import emm

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from emm import SparkEntityMatching

import warnings
# Silence all Python warnings so the example output stays compact.
warnings.filterwarnings("ignore")

In [None]:
# Create the Spark session shared by every example in this notebook.
spark_settings = {
    "spark.driver.memory": "4G",
    "spark.driver.memoryOverhead": "4G",
    "spark.driver.maxResultSize": "1G",
    "spark.executor.memory": "4G",
    "spark.executor.memoryOverhead": "4G",
    # One shuffle partition is enough: the example datasets are very small.
    "spark.sql.shuffle.partitions": 1,
}

# SparkConf.setAll takes the settings as (key, value) pairs.
conf = list(spark_settings.items())
config = SparkConf().setAll(conf)

spark_session = (
    SparkSession.builder
    .appName("Spark EMM Example")
    .config(conf=config)
)
spark = spark_session.getOrCreate()


In [None]:
# Ground truth: the (id, name) records that the matchers below are fitted on.
gt_rows = [
    (1, "John Smith LLC"),
    (2, "ING LLC"),
    (3, "John Doe LLC"),
    (4, "Zhe Sun G.M.B.H"),
    (5, "Random GMBH"),
]
gt = spark.createDataFrame(gt_rows, ["id", "name"])

# Print up to 10 rows without truncating long values.
gt.show(n=10, truncate=False)

Prepare a very simple supervised model (for illustration purposes only).
For production usage, use a model trained by Core Algo or train your own on real data.

In [None]:
from emm.supervised_model.base_supervised_model import train_test_model
from emm.helper.io import save_file
from emm.data import create_training_data

# Build the training data and vocabulary with emm's own helper.
df, vocabulary = create_training_data()
# Train a model with name_only=False and store it as sem.pkl
# (the second return value of train_test_model is discarded).
sem, _= train_test_model(df, vocabulary, name_only=False)
save_file("sem.pkl", sem)
# Train the name_only=True variant and store it as sem_nm.pkl; the supervised
# examples below load this file through 'supervised_model_filename'.
sem_nm, _ = train_test_model(df, vocabulary, name_only=True)
save_file("sem_nm.pkl", sem_nm)

In [None]:
# Name-only matcher with the supervised model enabled (sem_nm.pkl in the
# current directory). It is persisted as configured; fit() is not called here.
supervised_nm_params = {
    "name_only": True,
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {
            "type": "cosine_similarity",
            "tokenizer": "words",
            "ngram": 1,
            "num_candidates": 5,
            "cos_sim_lower_bound": 0.2,
        },
    ],
    "supervised_on": True,
    "supervised_model_filename": "sem_nm.pkl",
    "supervised_model_dir": ".",
}
nm = SparkEntityMatching(supervised_nm_params)

# overwrite() is requested on the writer before saving to this path.
nm.write().overwrite().save("serialized_em_nm.pkl")

## Name matching without supervised model

Name matching using basic preprocessing, word tokenization and cosine similarity. 
This example does not use any supervised model. The candidate score is just the cosine similarity value.

In [None]:
# Single word-token cosine-similarity indexer; supervised model switched off.
word_cosine_params = {
    "name_only": True,
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {
            "type": "cosine_similarity",
            "tokenizer": "words",
            "ngram": 1,
            "num_candidates": 5,
            "cos_sim_lower_bound": 0.2,
        },
    ],
    "supervised_on": False,
}
nm = SparkEntityMatching(word_cosine_params)

# Fit on the ground truth names.
nm.fit(gt)

In [None]:
# Names to match against the ground truth using the matcher fitted above.
names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        (12, "Jon DOEE LLC"),  # this will not be matched due to misspellings
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

Name matching using basic preprocessing, character 2-gram tokenization and cosine similarity. 
This example does not use any supervised model. The candidate score is just the cosine similarity value.

In [None]:
# Single character 2-gram cosine-similarity indexer; supervised model switched off.
char_bigram_params = {
    "name_only": True,
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {
            "type": "cosine_similarity",
            "tokenizer": "characters",
            "ngram": 2,
            "num_candidates": 5,
            "cos_sim_lower_bound": 0.2,
        },
    ],
    "supervised_on": False,
}
nm = SparkEntityMatching(char_bigram_params)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        # NOTE(review): the original comment said this row will not be matched
        # due to misspellings -- confirm against the displayed output for
        # character 2-grams.
        (12, "Jon DOEE LLC"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

Name matching using basic preprocessing and two indexers (word & ngram cosine similarity). 
This example is not using any supervised model.

In [None]:
# Two cosine-similarity indexers side by side: word tokens and character 2-grams.
word_indexer = {
    "type": "cosine_similarity",
    "tokenizer": "words",
    "ngram": 1,
    "num_candidates": 5,
    "cos_sim_lower_bound": 0.2,
}
char_bigram_indexer = {
    "type": "cosine_similarity",
    "tokenizer": "characters",
    "ngram": 2,
    "num_candidates": 5,
    "cos_sim_lower_bound": 0.2,
}
nm = SparkEntityMatching(
    {
        "name_only": True,
        "entity_id_col": "id",
        "name_col": "name",
        "preprocessor": "preprocess_merge_abbr",
        "indexers": [word_indexer, char_bigram_indexer],
        "supervised_on": False,
    }
)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        (12, "Jon DOEE LLC"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

Name matching using basic preprocessing with Sorted Neighbourhood Indexing (SNI). 
This example does not use any supervised model. The candidate score is just the SNI distance (normalized to the range 0-1).

In [None]:
# Single 'sni' indexer with a window length of 3; supervised model switched off.
sni_params = {
    "name_only": True,
    "uid_col": "uid",
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {"type": "sni", "window_length": 3},
    ],
    "supervised_on": False,
}
nm = SparkEntityMatching(sni_params)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (10, "Jo S"),
        (11, "InG. LLC"),
        (12, "Jon DOEE LLC"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

You can also define a custom function that transforms names before SNI is applied, for example one that reverses the names.

In [None]:
def reverse_name(x):
    """Return ``x`` with its characters in reverse order.

    Passed to the 'sni' indexer below as its ``mapping_func``. Written as a
    ``def`` instead of a lambda bound to a name (PEP 8), so the function has a
    real ``__name__`` in tracebacks and reprs.
    """
    return x[::-1]
# 'sni' indexer that applies reverse_name to the names via 'mapping_func'.
reversed_sni_params = {
    "name_only": True,
    "uid_col": "uid",
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {"type": "sni", "window_length": 3, "mapping_func": reverse_name},
    ],
    "supervised_on": False,
}
nm = SparkEntityMatching(reversed_sni_params)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (11, "a InG. LLC"),
        (12, "ING. LLC ZZZ"),
        (13, "John Smith LLC"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

Name matching using a blocking function: candidate pairs are generated only between names that have the same blocking-function value.

In [None]:
def first_character(x):
    """Return the first character of ``x``, or ``'?'`` when ``x`` is empty.

    Passed to the cosine-similarity indexer below as its ``blocking_func``.
    Written as a ``def`` instead of a lambda bound to a name (PEP 8).
    """
    return x[0] if len(x) > 0 else '?'

# Character 1-gram cosine-similarity indexer with first_character as 'blocking_func'.
blocking_params = {
    "name_only": True,
    "uid_col": "uid",
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {
            "type": "cosine_similarity",
            "tokenizer": "characters",
            "ngram": 1,
            "blocking_func": first_character,
        },
    ],
    "supervised_on": False,
}
nm = SparkEntityMatching(blocking_params)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        # it will not be matched due to different value of blocking function (first character)
        (10, "!notING"),
        (11, "ING"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

## Name matching with supervised model

In [None]:
# Character 2-gram indexer with the supervised model enabled; the model is
# read from sem_nm.pkl in the current directory.
supervised_params = {
    "name_only": True,
    "entity_id_col": "id",
    "name_col": "name",
    "preprocessor": "preprocess_merge_abbr",
    "indexers": [
        {
            "type": "cosine_similarity",
            "tokenizer": "characters",
            "ngram": 2,
            "num_candidates": 5,
            "cos_sim_lower_bound": 0.2,
        },
    ],
    "supervised_on": True,
    "supervised_model_filename": "sem_nm.pkl",
    "supervised_model_dir": ".",
}
nm = SparkEntityMatching(supervised_params)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        (12, "Jon DOEE LLC"),
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

## Name matching using multiple indexers

In [None]:
# Three indexers combined: word cosine similarity, character 2-gram cosine
# similarity, and SNI. The supervised model is switched off.
multi_indexers = [
    {
        "type": "cosine_similarity",
        "tokenizer": "words",
        "ngram": 1,
        "num_candidates": 5,
        "cos_sim_lower_bound": 0.2,
    },
    {
        "type": "cosine_similarity",
        "tokenizer": "characters",
        "ngram": 2,
        "num_candidates": 5,
        "cos_sim_lower_bound": 0.2,
    },
    {"type": "sni", "window_length": 3},
]
nm = SparkEntityMatching(
    {
        "name_only": True,
        "entity_id_col": "id",
        "name_col": "name",
        "preprocessor": "preprocess_merge_abbr",
        "indexers": multi_indexers,
        "supervised_on": False,
    }
)
nm.fit(gt)

names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        (12, "Jon DOEE LLC"),
        (14, "Z"),  # this will be matched only by SNI
    ],
    ["id", "name"],
)
res = nm.transform(names_to_match)
res.show(n=10, truncate=False)

## Name matching from serialized model

The persisted model is Spark-only. A loaded model no longer needs to be fit to the ground-truth data.
Particularly for large datasets (e.g. > 100k names in the ground truth), this can save a lot of time when reusing a trained entity-matching model.

In [None]:
# Persist the matcher currently bound to `nm` (when run top to bottom, the
# fitted multi-indexer matcher from the previous cell) under this path.
# NOTE(review): not visible here whether save() overwrites an existing path,
# unlike the explicit write().overwrite().save(...) used earlier -- confirm
# that re-running the notebook works.
nm.save('name_matching_spark_model')

In [None]:
# Load the matcher back from the path it was saved to above; it is used
# below without another fit() call.
nm2 = SparkEntityMatching.load('name_matching_spark_model')

In [None]:
# Match new names with the loaded matcher; fit() is not called on nm2.
names_to_match = spark.createDataFrame(
    [
        (10, "John Smith"),
        (11, "I.n.G. LLC"),
        (12, "Jon DOE LLC"),
    ],
    ["id", "name"],
)
res2 = nm2.transform(names_to_match)
res2.show(n=10, truncate=False)