# Maintain a persistent UUID for groups

The output from the first deduplication of records looks like

|group_UUID||

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Fixed-size cluster settings for this notebook: dynamic allocation is turned
# off and the executor/driver resources are set explicitly.
SPARK_SETTINGS = [
    ('spark.dynamicAllocation.enabled', False),
    ('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp'),
    ('spark.executor.instances', 4),
    ('spark.executor.cores', 13),
    ('spark.executor.memory', '14g'),
    ('spark.driver.memory', '7g'),
]

session_builder = SparkSession.builder.appName("NameMatching_Notebook")
for setting_key, setting_value in SPARK_SETTINGS:
    # every .config() call returns the builder, so the settings accumulate
    session_builder = session_builder.config(setting_key, setting_value)

# Reuse an already running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [2]:
import pyspark.sql.functions as sf

from pyspark.sql.window import Window

## Prepare data

Since we don't have an example of new incoming data, we will split the data into two dataframes: one treated as the current data and the other as the newly ingested data. I expect that there will be some overlapping operators, which is a realistic case.

**Load current operator data and keep 5%**

In [3]:
operators_old_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS_MERGED.parquet'

# Current (already merged) Danish operators, reduced to a 5% sample.
# The sample is seeded so that "Restart & Run All" selects the same rows
# every time; without a seed every run yields different rows and counts.
oprs_old = (
    spark
    .read.parquet(operators_old_dir)
    .filter(sf.col('countryCode') == 'DK')
    .sample(False, 0.05, seed=42)
    # one output row per entry of refIds, paired with the group's ohubOperatorId
    .withColumn('id', sf.explode(sf.col('refIds')))
    .select('ohubOperatorId', 'id')
)
oprs_old.show(2, truncate=False)

+------------------------------------+------------------------+
|ohubOperatorId                      |id                      |
+------------------------------------+------------------------+
|03c07042-5646-4cb6-bb59-fc4fd9a4d91a|DK~MM-INIT-OPER~O~489081|
|03c07042-5646-4cb6-bb59-fc4fd9a4d91a|DK~mellowmessage~489081 |
+------------------------------------+------------------------+
only showing top 2 rows



**Get sample of source data 10%**

In [4]:
# read operators and assign id
operators_matched_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS.parquet'

# 10% sample of the Danish source operators, standing in for newly ingested
# data. The sample is seeded so that re-running the notebook selects the same
# rows; without a seed the overlap counts below change on every run.
input_oprs_raw = (
    spark
    .read.parquet(operators_matched_dir)
    .filter(sf.col('COUNTRY_CODE') == 'DK')
    .sample(False, 0.1, seed=42)
)

input_oprs = (
    input_oprs_raw
    # id is built as COUNTRY_CODE~SOURCE~REF_OPERATOR_ID
    .withColumn('id', sf.concat_ws('~',
                                   sf.col('COUNTRY_CODE'),
                                   sf.col('SOURCE'),
                                   sf.col('REF_OPERATOR_ID')))
    .select('id', 'NAME_CLEANSED')
    .repartition('id')
)

**New and overlapping operators**

In [5]:
# Left join keeps every input operator; ohubOperatorId stays null for inputs
# whose id is not found among the known operators.
df = input_oprs.join(oprs_old, on='id', how='left')

known_uuid = sf.col('ohubOperatorId')

n_known = oprs_old.count()
n_input = input_oprs_raw.count()
n_overlapping = df.filter(known_uuid.isNotNull()).count()
n_new = df.filter(known_uuid.isNull()).count()

print('Known Operators:', n_known)
print('Input Operators:', n_input)
print('Overlapping:', n_overlapping)
print('New:', n_new)

Known Operators: 1995
Input Operators: 4202
Overlapping: 202
New: 4000


## Matching incoming input operators

In [6]:
# Characters removed from the matching strings. The string is placed verbatim
# inside a regex character class, so regex-special characters are escaped:
#   "\\\\" -> an escaped backslash, "\\^" -> a literal caret,
#   "\\-"  -> a literal hyphen.
# The hyphen used to be unescaped ("+-/"), which the regex engine reads as the
# RANGE '+' to '/', silently matching ',' and '.' as well. The set is now
# spelled out explicitly ("+,\\-./"), so exactly the same characters are
# removed but the intent is visible and reordering can no longer create an
# accidental range.
DROP_CHARS = (
    "\\\\!#%&()*+,\\-./:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0"
    "\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE"
    "\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A"
    "\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011"
    "\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1"
    "\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264"
    "\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B"
    "\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2"
    "\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
)
# Character class matching any single character listed above.
REGEX = "[{}]".format(DROP_CHARS)

**preprocess input operators**

In [7]:
# Rows are numbered 0..n-1 per country in ascending id order; this number is
# the row index handed to the matcher.
w = Window.partitionBy('COUNTRY_CODE').orderBy(sf.asc('id'))

input_oprs = (
    input_oprs_raw
    # operators without a cleansed name are dropped
    .na.drop(subset=['NAME_CLEANSED'])
    # create unique ID: COUNTRY_CODE~SOURCE~REF_OPERATOR_ID
    .withColumn('id', sf.concat_ws('~',
                                   sf.col('COUNTRY_CODE'),
                                   sf.col('SOURCE'),
                                   sf.col('REF_OPERATOR_ID')))
    .fillna('')
    # create the string column to be matched: name, city, street and zip code
    .withColumn('matching_string',
                sf.concat_ws(' ',
                             sf.col('NAME_CLEANSED'),
                             sf.col('CITY_CLEANSED'),
                             sf.col('STREET_CLEANSED'),
                             sf.col('ZIP_CODE_CLEANSED')))
    # remove every character matched by REGEX
    .withColumn('matching_string', sf.regexp_replace('matching_string', REGEX, ''))
    # collapse whitespace runs to one space, trim and lowercase.
    # Raw string: '\s' is not a valid Python escape sequence and triggers a
    # warning in a normal string literal; the regex itself is unchanged.
    .withColumn('matching_string', sf.lower(sf.trim(sf.regexp_replace('matching_string', r'\s+', ' '))))
    .withColumn('string_index', sf.row_number().over(w) - 1)
    .select('COUNTRY_CODE', 'string_index', 'id', 'matching_string')
    .repartition('id')
)

# Persisted because the frame is used again by later cells.
input_oprs.persist()
input_oprs.sort('string_index').show(5, truncate=False)

+------------+------------+---------------+--------------------------------------------------------------+
|COUNTRY_CODE|string_index|id             |matching_string                                               |
+------------+------------+---------------+--------------------------------------------------------------+
|DK          |0           |DK~DEX~10003323|jacob kongsbak lassen københavnv flæsketorvet19 1711          |
|DK          |1           |DK~DEX~10003413|kwik sparsupergros rebate brøndby gammelager11 2605           |
|DK          |2           |DK~DEX~10003428|sugrolekkerland rebate tåstrup helgeshøj alle20 2630          |
|DK          |3           |DK~DEX~10003429|nærkøb rebate vejle bugattivej18 7100                         |
|DK          |4           |DK~DEX~10003464|dansk cater as dansk cater promotion svenstrup vidalsvej6 9230|
+------------+------------+---------------+--------------------------------------------------------------+
only showing top 5 rows



**preprocess known operators**

In [8]:
# Create string name used originally for the matching
# Rows are numbered 0..n-1 per country in ascending id order; this number is
# the row index handed to the matcher.
w = Window.partitionBy('countryCode').orderBy(sf.asc('id'))

# NOTE: this reloads ALL Danish known operators (no sampling) and replaces the
# earlier `oprs_old` frame.
oprs_old = (
    spark
    .read.parquet(operators_old_dir)
    .filter(sf.col('countryCode') == 'DK')
    .withColumn('id', sf.col('operator').getItem('operatorConcatId'))
    # matching string is built from the same four fields as for the input
    # operators: name, city, street and zip code
    .withColumn('matching_string', sf.concat_ws(' ',
                                                sf.col('operator').getItem('nameCleansed'),
                                                sf.col('operator').getItem('cityCleansed'),
                                                sf.col('operator').getItem('streetCleansed'),
                                                sf.col('operator').getItem('zipCodeCleansed')))
    # remove every character matched by REGEX
    .withColumn('matching_string', sf.regexp_replace('matching_string', REGEX, ''))
    # collapse whitespace runs to one space, trim and lowercase.
    # Raw string: '\s' is not a valid Python escape sequence and triggers a
    # warning in a normal string literal; the regex itself is unchanged.
    .withColumn('matching_string', sf.lower(sf.trim(sf.regexp_replace('matching_string', r'\s+', ' '))))
    .withColumn('string_index', sf.row_number().over(w) - 1)
    .select('countryCode', 'ohubOperatorId', 'string_index', 'id', 'matching_string')
)

# Persisted because the frame is used again by later cells.
oprs_old.persist()
oprs_old.sort('string_index').show(5, truncate=False)

+-----------+------------------------------------+------------+----------------------------+----------------------------------------------------------------------------+
|countryCode|ohubOperatorId                      |string_index|id                          |matching_string                                                             |
+-----------+------------------------------------+------------+----------------------------+----------------------------------------------------------------------------+
|DK         |1a7d6ece-83dd-41d0-860a-345555629f5a|0           |DK~COMPLAINTS~2503250       |unknown                                                                     |
|DK         |602930a3-59b9-4910-8dfa-5cf2f9dab5c3|1           |DK~COMPLAINTS~OP_USD_5067_15|                                                                            |
|DK         |97ad8b8b-8505-42df-b691-982e1dc02752|2           |DK~DEX~10001401             |boursin fromagerie su pacy sur eure croisysureure route de

Now that the data is in a format that is ready for matching, we apply the matching algorithm.

In [9]:
import os
from glob import glob

# Locate the packaged egg under ../dist and ship it to the executors so that
# the package can be imported on the workers.
# sorted() makes the choice deterministic when several eggs are present, and
# the explicit check replaces an opaque IndexError when none has been built.
egg_candidates = sorted(glob(os.path.join('..', 'dist', '*.egg')))
if not egg_candidates:
    raise FileNotFoundError(
        "No .egg file found in {}; build the package before running this cell".format(
            os.path.join('..', 'dist')))
egg_file = egg_candidates[0]
sc.addPyFile(egg_file)

from string_matching.spark_string_matching import match_strings

In [13]:
# Only the single best match is requested for each input string.
n_top = 1

# Both sides hand the matcher the same two columns: the row index and the text.
matching_columns = ['string_index', 'matching_string']

# Tuning parameters forwarded unchanged to match_strings.
match_options = dict(
    string_column='matching_string',
    row_number_column='string_index',
    n_top=n_top,
    threshold=0.8,
    n_gram=2,
    min_document_frequency=2,
    max_vocabulary_size=1500,
)

similarity = match_strings(
    spark,
    input_oprs.select(*matching_columns),
    df2=oprs_old.select(*matching_columns),
    **match_options
)
similarity.show(5)

+---+---+----------+
|  i|  j|SIMILARITY|
+---+---+----------+
|  0|  5|       1.0|
|  1|  7|       1.0|
|  2| 11|       1.0|
|  3| 12|       1.0|
|  4| 25|       1.0|
+---+---+----------+
only showing top 5 rows



In [22]:
# Step 1: attach the similarity result (columns i, j, SIMILARITY) to every
# input operator, joining the input row index on column i.
input_with_scores = (
    input_oprs
    .join(similarity, on=input_oprs['string_index'] == similarity['i'], how='left')
    .drop('string_index')
    .selectExpr('j',
                'SIMILARITY',
                'matching_string as matching_string_input',
                'id as id_input')
)

# Step 2: look up the known operator whose row index equals j.
matched_pairs = (
    input_with_scores
    .join(oprs_old, on=sf.col('j') == oprs_old['string_index'], how='left')
    .drop('string_index')
    .selectExpr('SIMILARITY',
                'matching_string_input',
                'matching_string as matching_string_old',
                'id_input',
                'id as id_old')
)

# Step 3: we don't want matches where both the id and the matching string
# remained the same, so keep a pair only if at least one of them differs.
string_changed = sf.col('matching_string_input') != sf.col('matching_string_old')
id_changed = sf.col('id_input') != sf.col('id_old')

matches = (
    matched_pairs
    .withColumn('matching_string_input', sf.lower(sf.col('matching_string_input')))
    .withColumn('matching_string_old', sf.lower(sf.col('matching_string_old')))
    .filter(string_changed | id_changed)
)
matches.persist()
matches.count()

1252

In [23]:
# Preview three matched pairs; truncate=False prints the full strings and ids.
matches.show(3, truncate=False)

+----------+------------------------------------------+---------------------------------------------------------+-------------------------+-----------------------+
|SIMILARITY|matching_string_input                     |matching_string_old                                      |id_input                 |id_old                 |
+----------+------------------------------------------+---------------------------------------------------------+-------------------------+-----------------------+
|0.9053102 |center bageriet tune tune centret1 b 4030 |helle bjergtrup center bageriet tune tune centret1 b 4030|DK~DEX~10344242          |DK~DEX~10344243        |
|1.0       |antropologerne                            |antropologerne                                           |DK~MM-INIT-OPER~O~1468874|DK~WEBUPDATER~O~1468874|
|0.93287545|ågård efterskole ågård kirkebakken 13 6040|ågård efterskole egtved kirkebakken13 ågård 6040         |DK~mellowmessage~486452  |DK~DEX~1004121735      |
+----------+----

In [24]:
# Sanity check: this should be empty. A pair with identical ids can only have
# survived the filter on `matches` if its two matching strings differ, which
# is presumably not expected for the same operator record.
matches.filter(sf.col('id_input') == sf.col('id_old')).show(20, truncate=False)

+----------+---------------------+-------------------+--------+------+
|SIMILARITY|matching_string_input|matching_string_old|id_input|id_old|
+----------+---------------------+-------------------+--------+------+
+----------+---------------------+-------------------+--------+------+

