In [None]:
import os
import math

from glob import glob
from os import path

In [None]:
# Run the project's build script through the shell. Presumably this produces the
# .egg under ../dist that the following cell looks up -- confirm against the script.
!bash ../compile_library.sh

In [None]:
# Locate the packaged library that is later shipped to Spark via sc.addPyFile().
egg_candidates = glob(os.path.join('..', 'dist', '*.egg'))
if not egg_candidates:
    # Fail with an actionable message instead of a bare IndexError on [0].
    raise FileNotFoundError(
        "No .egg file found in {}; run ../compile_library.sh first".format(
            os.path.join('..', 'dist')))

# glob() returns matches in arbitrary order, so pick the most recently built egg
# explicitly rather than whichever file happens to come first.
egg_file = max(egg_candidates, key=os.path.getmtime)
egg_file

In [None]:
# findspark.init() is called before pyspark is imported (presumably so that the
# local Spark installation ends up on sys.path -- confirm for your environment).
import findspark
findspark.init()
import pyspark

# Assemble the session configuration step by step.
session_builder = pyspark.sql.SparkSession.builder
session_builder = session_builder.appName("NameMatching_Notebook")
session_builder = session_builder.config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
session_builder = session_builder.config('spark.driver.memory', '7g')
# Executor tuning that is currently switched off:
#   .config('spark.dynamicAllocation.enabled', False)
#   .config('spark.executor.instances', 4)
#   .config('spark.executor.cores', 13)
#   .config('spark.executor.memory', '14g')

spark = session_builder.getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Register the packaged library with the Spark context.
sc.addPyFile(egg_file)

In [None]:
from pyspark.sql import functions as sf
from pyspark.sql.window import Window

In [None]:
# Known (input, output) parquet locations, keyed by where the data lives.
_data_locations = {
    'adl': ('adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS.parquet',
            'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS_MATCHED.parquet'),
    'local': ('../../data/OPERATORS.parquet',
              '../../data/OPERATORS_MATCHED.parquet'),
}

# The notebook currently runs against the local copies.
input_file, output_file = _data_locations['local']

# save_output: write the matches to output_file instead of only displaying them.
# mode: save mode handed to the parquet writer.
save_output, mode = False, 'overwrite'

In [None]:
# Tunables for the matching run:
#   ntop      -> passed to match_strings as n_top
#   threshold -> passed to match_strings as threshold
#   fraction  -> fraction handed to DataFrame.sample() when loading the operators
ntop, threshold, fraction = 10, 0.8, 0.001

In [None]:
# Characters stripped from the concatenated name before matching. The string is
# embedded in a regex character class below, so regex-special characters are
# escaped: the backslash, the caret and -- importantly -- the hyphen. Left
# unescaped, "+-/" is read as the range U+002B..U+002F and silently removes
# ',' and '.' as well.
drop_chars = "\\\\!#%&()*+\\-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
# Character class matching any single character listed above.
regex = "[{}]".format(drop_chars)

In [None]:
print("Driver Memory: ", sc._conf.get('spark.driver.memory'))

# Rows are numbered per country in ascending `id` order; the numbering is used
# below to derive a zero-based `name_index`.
w = Window.partitionBy('COUNTRY_CODE').orderBy(sf.asc('id'))

all_operators = (
    spark
    .read.parquet(input_file)
    # NOTE: no seed is passed, so every run draws a different sample.
    .sample(False, fraction)
    .na.drop(subset=['NAME_CLEANSED'])
    # create unique ID from country, source system and operator reference
    .withColumn('id', sf.concat_ws('~',
                                   sf.col('COUNTRY_CODE'),
                                   sf.col('SOURCE'),
                                   sf.col('REF_OPERATOR_ID')))
    .fillna('')
    # build the string that is going to be matched
    .withColumn('name',
                sf.concat_ws(' ',
                             sf.col('NAME_CLEANSED'),
                             sf.col('CITY_CLEANSED'),
                             sf.col('STREET_CLEANSED'),
                             sf.col('ZIP_CODE_CLEANSED')))
    # strip the unwanted characters, then collapse whitespace runs
    .withColumn('name', sf.regexp_replace('name', regex, ''))
    # raw string: '\s' is not a valid Python escape sequence
    .withColumn('name', sf.trim(sf.regexp_replace('name', r'\s+', ' ')))
    # zero-based index within each country
    .withColumn('name_index', sf.row_number().over(w) - 1)
    .select('name_index', 'id', 'name', 'COUNTRY_CODE')
)
# Cached because the following cells and the per-country loop all reuse it.
all_operators.persist()

In [None]:
# How often does each cleaned name string occur per country? Most frequent first.
name_frequencies = all_operators.groupby('COUNTRY_CODE', 'name').count()
all_opr_count = name_frequencies.sort('count', ascending=False)
all_opr_count.show(200, truncate=False)

In [None]:
# Number of (sampled) operators per country.
opr_count = (
    all_operators
    .groupby('COUNTRY_CODE')
    .count()
)

# Show the ten countries with the fewest rows.
(opr_count
 .sort('count', ascending=True)
 .show(10))

In [None]:
# Only countries with more than 100 rows are carried into the matching loop.
countries_with_enough_rows = opr_count.filter(opr_count['count'] > 100)
code_rows = (
    countries_with_enough_rows
    .select('COUNTRY_CODE')
    .distinct()
    .collect()
)
# Unwrap the single-column rows into a plain Python list on the driver.
country_codes = [code_row[0] for code_row in code_rows]

print(country_codes)

In [None]:
from string_matching.spark_string_matching import match_strings

In [None]:
# Ranks, for every `j`, the candidate rows by ascending `i`. It does not depend
# on the country, so it is built once outside the loop.
grouping_window = (
    Window
    .partitionBy('j')
    .orderBy(sf.asc('i')))

# Run the string matching country by country and either store or display the
# resulting (source, target) pairs.
for country_position, country_code in enumerate(country_codes):
    operators = (
        all_operators
        .filter(sf.col('COUNTRY_CODE') == country_code)
        .drop('COUNTRY_CODE')
        .repartition('id')
        .sort('id', ascending=True)
    )

    # The code below relies on the result exposing columns `i`, `j` and
    # `SIMILARITY`, where `i`/`j` are joined against `name_index`.
    similarity = match_strings(spark, operators,
                               string_column='name', row_number_column='name_index',
                               n_top=ntop, threshold=threshold, n_gram=2, min_document_frequency=2, max_vocabulary_size=1500)

    # for every j keep only the row with the smallest i
    grp_sim = (
        similarity
        .withColumn("rn", sf.row_number().over(grouping_window))
        .filter(sf.col("rn") == 1)
        .drop('rn')
    )

    # keep only rows whose j never occurs as an i, i.e. drop the rows of
    # records that themselves act as the group ID of other records
    grouped_similarity =  grp_sim.join(
        grp_sim.select('j').subtract(grp_sim.select('i')),
        on='j', how='inner'
    )

    # Resolve i -> SOURCE_* and j -> TARGET_* by joining back onto the operators.
    matches = (
        grouped_similarity
        .join(operators, grouped_similarity['i'] == operators['name_index'],
              how='left').drop('name_index')
        .selectExpr('i', 'j', 'id as SOURCE_ID',
                    'SIMILARITY', 'name as SOURCE_NAME')
        .join(operators, grouped_similarity['j'] == operators['name_index'],
              how='left').drop('name_index')
        .withColumn('COUNTRY_CODE', sf.lit(country_code))
        .selectExpr('COUNTRY_CODE', 'SOURCE_ID', 'id as TARGET_ID',
                    'SIMILARITY', 'SOURCE_NAME', 'name as TARGET_NAME')
    )

    if save_output:
        # Every country is written to the same path. Re-using 'overwrite' on
        # each iteration would delete the countries written before (Spark
        # replaces the whole path unless dynamic partition overwrite is
        # enabled), so only the first write honours the configured mode and
        # the following ones append.
        if mode == 'overwrite' and country_position > 0:
            write_mode = 'append'
        else:
            write_mode = mode

        (matches
         .coalesce(20)
         .write
         .partitionBy('country_code')
         .parquet(output_file, mode=write_mode))
    else:
        # Cached because several actions below reuse the same result.
        matches.persist()
        n_matches = matches.count()

        print('\n\nNr. Similarities:\t', n_matches)
        print('Threshold:\t', threshold)
        print('NTop:\t', ntop)
        (matches
         .select('SOURCE_ID', 'TARGET_ID',
                 'SIMILARITY', 'SOURCE_NAME', 'TARGET_NAME')
         .sort('SIMILARITY', ascending=True)
         .show(50, truncate=False))

        (matches
         .groupBy(['SOURCE_ID', 'SOURCE_NAME'])
         .count()
         .sort('count', ascending=False).show(50, truncate=False))

        matches.describe('SIMILARITY').show()

        # Release the cache so it does not pile up across countries.
        matches.unpersist()
    print("Done, country code:", country_code)