# Maintain a persistent UUID for groups

The output from the first deduplication of records looks like

|group_UUID||

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used throughout this notebook.
# The following options are currently disabled (not applied):
#   spark.dynamicAllocation.enabled = False
#   spark.executor.instances        = 4
#   spark.executor.cores            = 13
#   spark.executor.memory           = '14g'
spark = (
    SparkSession.builder
    .appName("NameMatching_Notebook")
    .config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
    .config('spark.driver.memory', '7g')
    .getOrCreate()
)

# Shortcut to the underlying SparkContext.
sc = spark.sparkContext

In [33]:
import pyspark.sql.functions as sf

from pyspark.sql.window import Window

## Prepare data

Since we don't have an example of new incoming data, we will work with two dataframes: one standing in for the current data and another standing in for the newly ingested data. I expect some operators to appear in both, which is a realistic case.

**Load current operator data and keep 5%**

In [87]:
operators_old_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS_MERGED.parquet'

# "Current" operators: Danish rows only, 5% sample, one output row per entry in
# refIds (exploded into the `id` column) paired with its ohubOperatorId.
# The seed is fixed so that a fresh kernel run draws the same sample, provided
# the underlying parquet data has not changed.
oprs_old = (
    spark
    .read.parquet(operators_old_dir)
    .filter(sf.col('countryCode') == 'DK')
    .sample(withReplacement=False, fraction=0.05, seed=42)
    .withColumn('id', sf.explode(sf.col('refIds')))
    .select('ohubOperatorId', 'id')
)
oprs_old.show(2, truncate=False)

+------------------------------------+------------------------+
|ohubOperatorId                      |id                      |
+------------------------------------+------------------------+
|0935427a-dc99-4ae9-b9d2-bee072edf474|DK~DEX~10080940         |
|0935427a-dc99-4ae9-b9d2-bee072edf474|DK~MM-INIT-OPER~O~622625|
+------------------------------------+------------------------+
only showing top 2 rows



**Get a 10% sample of the source data**

In [85]:
# read operators and assign id
# NOTE(review): despite its name, this variable points at OPERATORS.parquet here;
# it is reassigned to the OPERATORS_MATCHED path in a later cell.
operators_matched_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS.parquet'

# "Incoming" operators: Danish rows only, 10% sample. The `id` column is built as
# COUNTRY_CODE~SOURCE~REF_OPERATOR_ID so it can be joined against `id` values
# of the same shape. The seed is fixed so that a fresh kernel run draws the same
# sample, provided the underlying parquet data has not changed.
input_oprs = (
    spark
    .read.parquet(operators_matched_dir)
    .filter(sf.col('COUNTRY_CODE') == 'DK')
    .sample(withReplacement=False, fraction=0.1, seed=42)
    .withColumn('id', sf.concat_ws('~',
                                   sf.col('COUNTRY_CODE'),
                                   sf.col('SOURCE'),
                                   sf.col('REF_OPERATOR_ID')))
    .select('id', 'NAME_CLEANSED')
)

**New and overlapping operators**

In [86]:
# Left join keeps every input operator; ohubOperatorId stays null for ids
# that have no counterpart in oprs_old.
df = input_oprs.join(oprs_old, 'id', 'left')

print('Known Operators:', oprs_old.count())
print('Input Operators:', input_oprs.count())
# Rows with an ohubOperatorId matched an already-known operator ...
print('Overlapping:', df.where(sf.col('ohubOperatorId').isNotNull()).count())
# ... rows without one are new.
print('New:', df.where(sf.col('ohubOperatorId').isNull()).count())

Known Operators: 2002
Input Operators: 4136
Overlapping: 176
New: 3960


Create the matching string that was originally used for the matching

In [30]:
# Characters to strip from the matching string, written so they can be dropped
# straight into a regex character class (backslash and caret are pre-escaped).
# Note: inside the class, `+-/` is a range, so `,` and `.` are matched as well
# as `+`, `-` and `/`.
DROP_CHARS = (
    "\\\\!#%&()*+-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0"
    "\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE"
    "\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A"
    "\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011"
    "\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1"
    "\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264"
    "\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B"
    "\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2"
    "\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
)

# Character class matching any single character listed in DROP_CHARS.
REGEX = "[" + DROP_CHARS + "]"

In [41]:
# at the moment only with Denmark

# Window used to number rows: one partition per countryCode, ordered by id.
# After the DK filter below, all remaining rows fall into a single partition.
w = Window.partitionBy('countryCode').orderBy(sf.asc('id'))

operators_old = (
    spark
    .read.parquet(operators_old_dir)
    .filter(sf.col('countryCode') == 'DK')
    .withColumn('id', sf.col('operator').getItem('operatorConcatId'))
    # Build the matching string from the cleansed name, city, street and zip code.
    .withColumn('matching_string', sf.concat_ws(' ',
                                                sf.col('operator').getItem('nameCleansed'),
                                                sf.col('operator').getItem('cityCleansed'),
                                                sf.col('operator').getItem('streetCleansed'),
                                                sf.col('operator').getItem('zipCodeCleansed')))
    # Remove every character matched by REGEX.
    .withColumn('matching_string', sf.regexp_replace('matching_string', REGEX, ''))
    # Collapse whitespace runs to a single space and trim the ends. The pattern is a
    # raw string so that `\s` is not treated as an (invalid) Python escape sequence.
    .withColumn('matching_string', sf.trim(sf.regexp_replace(sf.col('matching_string'), r'\s+', ' ')))
    # Zero-based running index in id order.
    .withColumn('string_index', sf.row_number().over(w) - 1)
    .select('ohubOperatorId', 'string_index','id', 'matching_string')
)

operators_old.show(5, truncate=False)

+------------------------------------+------------+----------------------------+----------------------------------------------------------------------------+
|ohubOperatorId                      |string_index|id                          |matching_string                                                             |
+------------------------------------+------------+----------------------------+----------------------------------------------------------------------------+
|1a7d6ece-83dd-41d0-860a-345555629f5a|0           |DK~COMPLAINTS~2503250       |unknown                                                                     |
|602930a3-59b9-4910-8dfa-5cf2f9dab5c3|1           |DK~COMPLAINTS~OP_USD_5067_15|                                                                            |
|97ad8b8b-8505-42df-b691-982e1dc02752|2           |DK~DEX~10001401             |boursin fromagerie su pacy sur eure croisysureure route de st aquilin3 27120|
|3830d696-c16f-4039-badd-6b584de1aedd|3           |D

In [17]:
operators_matched_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS_MATCHED.parquet'

# Matched operator pairs for Denmark, reduced to the id and name columns of
# both sides of each pair.
oprs_matched = (
    spark
    .read.parquet(operators_matched_dir)
    .where(sf.col('COUNTRY_CODE') == 'DK')
    .select('SOURCE_ID', 'TARGET_ID', 'SOURCE_NAME', 'TARGET_NAME')
)

# Preview the first 20 rows, highest SOURCE_ID first.
oprs_matched.orderBy(sf.col('SOURCE_ID').desc()).show(20)

+--------------------+--------------------+--------------------+--------------------+
|           SOURCE_ID|           TARGET_ID|         SOURCE_NAME|         TARGET_NAME|
+--------------------+--------------------+--------------------+--------------------+
|DK~mellowmessage~...|DK~mellowmessage~...|intet odder rudeh...|elev odder rudeha...|
|DK~mellowmessage~...|DK~mellowmessage~...|æblehaven hillerø...|æblehaven 3400 tu...|
|DK~mellowmessage~...|DK~mellowmessage~...|as storebælt kors...|sund bælt holding...|
|DK~mellowmessage~...|DK~mellowmessage~...|ulvsund lave ikke...|        ulvsund 4780|
|DK~mellowmessage~...|DK~mellowmessage~...|     sødisbakke 9550|     sødisbakke 9550|
|DK~mellowmessage~...|DK~mellowmessage~...|    seb huset 2 1577|      seb hus 1 1577|
|DK~mellowmessage~...|DK~mellowmessage~...|first hotel moles...|first hotel moles...|
|DK~mellowmessage~...|DK~mellowmessage~...|   djurs mad is 8963|   djurs mad is 8963|
|DK~mellowmessage~...|DK~mellowmessage~...|okæ madserv