# Maintain a persistent UUID for groups

The idea is that there are already grouped operators and each group has a unique UUID. When new operator data comes in, we need to match it against the currently grouped operators and, where necessary, add the new operators to an existing group, keeping that group's UUID persistent.

The new input operator data can be of three forms:

 - Completely new and not yet in our records: In this case the operators should be added with a new unique UUID
 - Known operator with no change: In this case the operator could still be matched to the group it is already in, or be reassigned to a new group.
 - Known operator with a change in its data: Operator should remain in the same group.

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Spark session for this notebook: dynamic allocation is disabled and the
# executor/driver resources are set explicitly.
spark_settings = {
    'spark.dynamicAllocation.enabled': False,
    'spark.executorEnv.PYTHON_EGG_CACHE': '/tmp',
    'spark.executor.instances': 4,
    'spark.executor.cores': 13,
    'spark.executor.memory': '14g',
    'spark.driver.memory': '7g',
}

session_builder = SparkSession.builder.appName("NameMatching_Notebook")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [None]:
import pyspark.sql.functions as sf

from pyspark.sql.window import Window

## Define helper functions

Each group should have a unique UUID. So I need a function to create this unique ID for each completely new operator that doesn't match any group (it will become its own group). Since concatenating the columns as `COUNTRY_CODE~SOURCE~REF_OPERATOR_ID` gives a unique operator ID, we can use it to generate the group ID. I do this by passing this ID to a hashing algorithm.

In [None]:
import hashlib

def create_32_char_hash(string):
    """Return the MD5 hex digest of *string* split into dash-separated chunks.

    The 32 hexadecimal characters of the digest are grouped as 8-4-4-16, so
    the same input always produces the same identifier.
    """
    digest = hashlib.md5(string.encode('utf-8')).hexdigest()
    # Slice boundaries of the four chunks: [0:8], [8:12], [12:16], [16:].
    boundaries = (0, 8, 12, 16, len(digest))
    chunks = [digest[start:stop] for start, stop in zip(boundaries, boundaries[1:])]
    return '-'.join(chunks)

# Wrap the hash function as a Spark UDF so it can be applied to DataFrame
# columns later. No explicit return type is passed, so sf.udf's default applies.
udf_create_32_char_hash = sf.udf(create_32_char_hash)

I need functions to preprocess the input operator data and the existing grouped operator data.

First I declare a regex made of unwanted characters (given by Roderik), this will be filtered out during the preprocessing.

In [None]:
# Characters to strip from the matching strings. The string is placed inside a
# regex character class (see REGEX), so backslash escapes and `-` follow
# character-class rules.
# NOTE(review): inside the class, `+-/` is read as a range from `+` to `/`,
# which also covers `,` and `.`. Confirm that dropping commas and full stops
# is intended; if not, the `-` needs to be escaped or moved to the end.
DROP_CHARS = "\\\\!#%&()*+-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0" \
             "\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE" \
             "\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A" \
             "\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011" \
             "\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1" \
             "\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264" \
             "\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B" \
             "\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2" \
             "\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
# Character class that matches any single character listed in DROP_CHARS.
REGEX = "[{}]".format(DROP_CHARS)

The preprocessing of known grouped operators is different from that of new input operator data. Below I create the matching string for the group, which will be used for matching against new input operator data. I also add a row-number column, which is needed later to join the matches back to the original information of the groups.

In [None]:
def preprocess_grouped_kown_operators(ddf):
    """Build the matching string and a per-country row index for grouped operators.

    Parameters
    ----------
    ddf : pyspark.sql.DataFrame
        Grouped operators. The code reads the columns ``countryCode``,
        ``ohubOperatorId`` and ``operator`` (fields ``nameCleansed``,
        ``cityCleansed``, ``streetCleansed`` and ``zipCodeCleansed``).

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``countryCode``, ``string_index``, ``ohubOperatorId`` and
        ``matching_string``.
    """
    # Row numbers restart for every country and follow the group id order;
    # subtracting 1 below makes the index zero-based.
    w = Window.partitionBy('countryCode').orderBy(sf.asc('ohubOperatorId'))
    operator = sf.col('operator')
    return (ddf
            # Create string name used originally for the matching
            .withColumn('matching_string', sf.concat_ws(' ',
                                                        operator.getItem('nameCleansed'),
                                                        operator.getItem('cityCleansed'),
                                                        operator.getItem('streetCleansed'),
                                                        operator.getItem('zipCodeCleansed')))
            # Strip every character listed in the module-level REGEX class.
            .withColumn('matching_string', sf.regexp_replace('matching_string', REGEX, ''))
            # Collapse whitespace runs, trim and lower-case. The pattern is a raw
            # string because '\s' is not a valid Python escape sequence.
            .withColumn('matching_string',
                        sf.lower(sf.trim(sf.regexp_replace(sf.col('matching_string'), r'\s+', ' '))))
            .withColumn('string_index', sf.row_number().over(w) - 1)
            .select('countryCode', 'string_index', 'ohubOperatorId', 'matching_string'))


The preprocessing of new input operators consists of building the unique refId and the string to be matched on.

In [None]:
def preprocess_input_operatros(ddf):
    """Build the refId, matching string and per-country row index for input operators.

    Parameters
    ----------
    ddf : pyspark.sql.DataFrame
        New input operators. The code reads the columns ``COUNTRY_CODE``,
        ``SOURCE``, ``REF_OPERATOR_ID``, ``NAME_CLEANSED``, ``CITY_CLEANSED``,
        ``STREET_CLEANSED`` and ``ZIP_CODE_CLEANSED``.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``COUNTRY_CODE``, ``string_index``, ``refId`` and
        ``matching_string``. Rows without ``NAME_CLEANSED`` are dropped.
    """
    # `refId` is created further down the chain, before the window is applied.
    # Row numbers restart for every country; subtracting 1 makes them zero-based.
    w = Window.partitionBy('COUNTRY_CODE').orderBy(sf.asc('refId'))
    return (ddf
            .na.drop(subset=['NAME_CLEANSED'])
            .withColumn('refId', sf.concat_ws('~',
                                              sf.col('COUNTRY_CODE'),
                                              sf.col('SOURCE'),
                                              sf.col('REF_OPERATOR_ID')))
            .fillna('')
            # create string columns to matched
            .withColumn('matching_string',
                        sf.concat_ws(' ',
                                     sf.col('NAME_CLEANSED'),
                                     sf.col('CITY_CLEANSED'),
                                     sf.col('STREET_CLEANSED'),
                                     sf.col('ZIP_CODE_CLEANSED')))
            # Strip every character listed in the module-level REGEX class.
            .withColumn('matching_string', sf.regexp_replace('matching_string', REGEX, ''))
            # Collapse whitespace runs, trim and lower-case. The pattern is a raw
            # string because '\s' is not a valid Python escape sequence.
            .withColumn('matching_string',
                        sf.lower(sf.trim(sf.regexp_replace('matching_string', r'\s+', ' '))))
            .withColumn('string_index', sf.row_number().over(w) - 1)
            .select('COUNTRY_CODE', 'string_index', 'refId', 'matching_string'))

## Load all operators

At the end of the matching I will need to join back the matches to the current operators to have the complete data of known and new input operators.

In [None]:
operators_old_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/OPERATORS_MERGED.parquet'

In [None]:
# Read the current groups and flatten each group's `refIds` array so that
# there is one row per refId; cache the result and preview a few rows.
merged_groups = spark.read.parquet(operators_old_dir)
all_oprs = merged_groups.withColumn('refId', sf.explode('refIds')).drop('refIds')
all_oprs.persist()
all_oprs.show(3)

## Load and preprocess grouped current operator data

In [None]:
operators_old_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/OPERATORS_MERGED.parquet'

In [None]:
# Build the matching strings for the already grouped operators, cache them
# and preview the first rows by index.
grouped_operators_raw = spark.read.parquet(operators_old_dir)
oprs_old = preprocess_grouped_kown_operators(grouped_operators_raw)
oprs_old.persist()

oprs_old.orderBy('string_index').show(5, truncate=False)

## Load and preprocess new input data

In [None]:
input_operators_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/OPERATORS.parquet'

In [None]:
# Build the matching strings for the new input operators, cache them and
# preview the first rows by index.
input_operators_raw = spark.read.parquet(input_operators_dir)
input_oprs = preprocess_input_operatros(input_operators_raw)
input_oprs.persist()

input_oprs.orderBy('string_index').show(5, truncate=False)

## Match new input vs group operators

Now that the data is in the right format, we run the matching algorithm.

In [None]:
import os
from glob import glob

# Locate the packaged egg and register it with the Spark context so the
# `string_matching` import below resolves. Fail with an explicit message when
# no egg is present instead of an IndexError from `[0]`.
egg_pattern = os.path.join('..', 'dist', '*.egg')
egg_files = glob(egg_pattern)
if not egg_files:
    raise FileNotFoundError('No .egg file found matching {!r}'.format(egg_pattern))
egg_file = egg_files[0]
sc.addPyFile(egg_file)

from string_matching.spark_string_matching import match_strings

In [None]:
# Distinct country codes present in the new input data, collected to the driver.
distinct_countries = input_oprs.select('COUNTRY_CODE').distinct()
country_codes = [row[0] for row in distinct_countries.collect()]
country_codes

In [None]:
country_code = 'DK'

In [None]:
n_top = 1 # we get only the top match

In [None]:
# Destination of the updated groups; every country is appended to this dataset.
output_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/test/OPERATORS_MERGED_t2.parquet'

# Process one country at a time: match the new input operators against the
# existing groups, carry over or create the group id, and write the result.
for country_code in country_codes:
    print('\nStart:', country_code)

    input_oprs_ctr = input_oprs.filter(sf.col('COUNTRY_CODE') == country_code).repartition('refId')
    oprs_old_ctr = oprs_old.filter(sf.col('countryCode') == country_code).repartition('ohubOperatorId')

    print('Calculating similarity')

    similarity = match_strings(
        spark,
        input_oprs_ctr.select('string_index', 'matching_string'),
        df2=oprs_old_ctr.select('string_index', 'matching_string'),
        string_column='matching_string', row_number_column='string_index',
        n_top=n_top, threshold=0.8, n_gram=2, min_document_frequency=2, max_vocabulary_size=2000
    )

    print('Join to original refId and ohubOperatorId')

    # `i` is joined to the input row index and `j` to the grouped-operator row
    # index. Both joins are left joins, so input operators without a match are
    # kept with null match columns.
    matches = (
        input_oprs_ctr
        .join(similarity, input_oprs_ctr['string_index'] == similarity['i'], how='left')
        .drop('string_index')
        .selectExpr('j', 'SIMILARITY',
                    'matching_string as matching_string_input', 'refId')
        .join(oprs_old_ctr, sf.col('j') == oprs_old_ctr['string_index'], how='left')
        .drop('string_index')
        # Unmatched rows have no countryCode from the right side; set it for all rows.
        .withColumn('countryCode', sf.lit(country_code))
        .selectExpr('SIMILARITY',
                    'countryCode',
                    'matching_string_input',
                    'matching_string as matching_string_old',
                    'refId',
                    'ohubOperatorId as ohubOperatorId_matched')
    )
    matches.persist()
    matches.count()  # action that materialises the cached matches

    print('Updating UUID')

    # Group id priority: id of the matched group, else the id the operator
    # already has, else a new id derived from the refId hash.
    # NOTE(review): `operator` is part of the grouping key and `matches` has no
    # such column, so rows that exist only in `matches` get a null `operator`
    # from the outer join and end up in a separate output row from the group
    # they matched. Confirm this is intended.
    updated_matching = (
        matches
        .join(all_oprs.filter(sf.col('countryCode') == country_code), on=['refId', 'countryCode'], how='outer')
        .withColumn('ohubOperatorId',
                    sf.coalesce(sf.col('ohubOperatorId_matched'),
                                sf.col('ohubOperatorId'),
                                udf_create_32_char_hash(sf.col('refId'))))
        .groupBy('ohubOperatorId', 'operator', 'countryCode').agg(sf.collect_list('refId').alias('refIds'))
    )
    updated_matching.persist()
    print('Number of groups:', oprs_old_ctr.count(), '-->', updated_matching.count())

    print('Writing to parquet')

    (updated_matching
     .coalesce(20)
     .write
     .partitionBy('countryCode')
     .parquet(output_dir, mode='append'))

    # Release this country's cached data so it does not accumulate across
    # iterations of the loop.
    updated_matching.unpersist()
    matches.unpersist()

    print('Done:', country_code)