In [None]:
import os
import math

from glob import glob
from os import path

import sys
# Add the parent directory to the module search path so that code located
# there can be imported from this notebook.
sys.path.append('..')

In [None]:
# Run the compile_library.sh script located in the parent directory.
# NOTE(review): presumably this builds the .egg under ../dist that the next
# cell looks up -- confirm against the script.
!bash ../compile_library.sh

In [None]:
# Locate the packaged library (.egg) under ../dist so it can be shipped to Spark.
egg_candidates = glob(os.path.join('..', 'dist', '*.egg'))
if not egg_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No .egg file found in {}; run ../compile_library.sh first.".format(
            os.path.join('..', 'dist')))

# glob() returns matches in arbitrary order, so pick the most recently
# modified egg explicitly rather than whichever happens to come first.
egg_file = max(egg_candidates, key=os.path.getmtime)
egg_file

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Configure the Spark session step by step, then create (or reuse) it.
builder = SparkSession.builder.appName("Contacts_Matching_Notebook")

# Executor-side environment variable PYTHON_EGG_CACHE is pointed at /tmp.
builder = builder.config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
builder = builder.config('spark.driver.memory', '6g')

# Cluster sizing options, currently disabled:
#   builder.config('spark.dynamicAllocation.enabled', False)
#   builder.config('spark.executor.instances', 4)
#   builder.config('spark.executor.cores', 13)
#   builder.config('spark.executor.memory', '14g')

spark = builder.getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Register the packaged library with the Spark context so tasks can import it.
sc.addPyFile(egg_file)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.window import Window

In [None]:
# Matching parameters.
# ntop      -> forwarded to match_strings as n_top
# threshold -> forwarded to match_strings as threshold
ntop, threshold = 10, 0.7

# Fraction of the input rows to sample when loading the contacts (1.0 = all).
fraction = 1.0

In [None]:
# Azure Data Lake locations. These are overridden by the local paths right
# below; swap the two assignments to run against the data lake.
input_file, output_file = (
    'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/CONTACTPERSONS.parquet',
    'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/CONTACTPERSONS_MATCHED.parquet',
)

# Local copies of the data; these are the values that take effect.
input_file, output_file = (
    '../../data/CONTACTPERSONS.parquet',
    '../../data/CONTACTPERSONS_MATCHED.parquet',
)

# Whether the matching loop writes its result to output_file, and the write
# mode it uses when it does.
save_output, mode = False, 'append'

In [None]:
# Characters that are stripped from names before matching. The string is
# embedded in a regex character class, so characters with a special meaning
# inside [...] are escaped:
#   "\\\\" -> a literal backslash
#   "\\-"  -> a literal hyphen. Left unescaped between '+' and '/', it formed
#             the range U+002B..U+002F and silently removed ',' and '.' too,
#             although neither is listed here.
#   "\\^"  -> a literal caret
drop_chars = (
    "\\\\!#%&()*+\\-/:;<=>?@\\^|~"
    "\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE"
    "\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A\u2543\u2605\u2606"
    "\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F"
    "{}"
    "\u00AE\u00F7\u02F1\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264\u2DE2\u2DF2"
    "\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED"
    "\uFDAB\uFFB7\u007F\u24D2\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
)

# Character class matching any single character listed above.
regex = "[{}]".format(drop_chars)

In [None]:
# Peek at the input: list the column names of the parquet file.
(spark
 .read
 .parquet(input_file)
 .columns)

In [None]:
# Report the driver memory setting (read through the public SparkConf accessor).
print("Driver Memory: ", sc.getConf().get('spark.driver.memory'))

# Numbering window: rows are numbered per country, ordered by the generated id.
w = Window.partitionBy('COUNTRY_CODE').orderBy(sf.asc('id'))

# Load the contacts, keep only the rows that have to be matched on name and
# address, and derive the columns used by the matching loop:
#   id         - COUNTRY_CODE~SOURCE~REF_CONTACT_PERSON_ID
#   name       - cleaned "<first name> <last name>" string
#   name_index - zero-based row number within the country (ordered by id)
all_contacts = (
    spark
    .read.parquet(input_file)
    # sampling without replacement; `fraction` is set in the parameter cell
    .sample(False, fraction)
    # keep only if no email nor phone
    .filter(sf.isnull(sf.col('EMAIL_ADDRESS')) & sf.isnull(sf.col('MOBILE_PHONE_NUMBER')))
    # drop if no first name and no last name
    .na.drop(subset=['FIRST_NAME_CLEANSED', 'LAST_NAME_CLEANSED'], how='all')
    # drop if no street
    .na.drop(subset=['STREET_CLEANSED'], how='any')
    # same logic but for an empty string
    .filter((sf.trim(sf.col('STREET_CLEANSED')) != '') &
            ((sf.trim(sf.col('FIRST_NAME_CLEANSED')) != '') | (sf.trim(sf.col('LAST_NAME_CLEANSED')) != '')))
    # create unique ID
    .withColumn('id', sf.concat_ws('~',
                                   sf.col('COUNTRY_CODE'),
                                   sf.col('SOURCE'),
                                   sf.col('REF_CONTACT_PERSON_ID')))
    # replace the remaining nulls with empty strings
    .fillna('')
    # create the string column to be matched
    .withColumn('name',
                sf.concat_ws(' ',
                             sf.col('FIRST_NAME_CLEANSED'),
                             sf.col('LAST_NAME_CLEANSED')))
    # strip the unwanted characters, then collapse runs of whitespace
    .withColumn('name', sf.regexp_replace('name', regex, ''))
    # raw string: '\s' is not a valid Python escape sequence in a normal literal
    .withColumn('name', sf.trim(sf.regexp_replace('name', r'\s+', ' ')))
    .withColumn('name_index', sf.row_number().over(w) - 1)
    .select('name_index', 'id', 'name', 'COUNTRY_CODE', 'FIRST_NAME_CLEANSED', 'LAST_NAME_CLEANSED', 'STREET_CLEANSED', 'HOUSENUMBER', 'ZIP_CODE_CLEANSED', 'CITY_CLEANSED')
)

# Cache the result (it is reused by the following cells) and materialise it;
# the row count is displayed as the cell output.
all_contacts.persist()
all_contacts.count()

In [None]:
# Occurrences of each cleaned name per country, most frequent first.
all_contacts_count = (
    all_contacts
    .groupBy('COUNTRY_CODE', 'name')
    .count()
    .orderBy(sf.desc('count'))
)
all_contacts_count.show(n=5, truncate=False)

In [None]:
# Number of contacts per country.
contacts_count = all_contacts.groupBy('COUNTRY_CODE').count()

# Show the 15 smallest countries first, then the 15 largest.
for ascending_order in (True, False):
    contacts_count.sort('count', ascending=ascending_order).show(15)

In [None]:
# Country codes with more than 100 contacts; only these are matched.
large_countries = (
    contacts_count
    .filter(contacts_count['count'] > 100)
    .select('COUNTRY_CODE')
    .distinct()
)
country_codes = [row[0] for row in large_countries.collect()]

print(country_codes)

In [None]:
from string_matching.spark_string_matching import match_strings

In [None]:
# Window that orders, for every target index `j`, the candidate rows by
# ascending source index `i`. It does not depend on the country, so it is
# built once instead of on every loop iteration.
grouping_window = Window.partitionBy('j').orderBy(sf.asc('i'))

# Match contacts within each country: compute name similarities, keep pairs
# that share a zip code or city and have a similar street, reduce them to one
# source per target, and then either write or display the result.
for country_code in country_codes:
    print('Start:', country_code)

    # DataFrames persisted during this iteration. They are released in the
    # `finally` block so cached data does not accumulate from country to
    # country (and is also released when an iteration fails half-way).
    cached_frames = []
    try:
        # Contacts of the current country only.
        contacts = (
            all_contacts
            .filter(sf.col('COUNTRY_CODE') == country_code)
            .drop('COUNTRY_CODE')
            .repartition('id')
            .sort('id', ascending=True)
        )

        print('Number of contacts:', contacts.count())

        print('Vectorizing names')
        # NOTE(review): match_strings is defined outside this notebook; from
        # the joins below its result is expected to have columns `i`, `j`
        # (both name_index values) and `SIMILARITY` -- confirm.
        similarity = match_strings(
            spark, contacts,
            string_column='name', row_number_column='name_index',
            n_top=ntop, threshold=threshold, n_gram=2,
            min_document_frequency=2, max_vocabulary_size=1500)
        similarity.persist()
        cached_frames.append(similarity)
        print('Nr. of similarities:', similarity.count())

        # Attach the contact details of both sides of each pair: `i` supplies
        # the SOURCE_* columns, `j` the TARGET_* columns.
        matches = (
            similarity
            .join(contacts, similarity['i'] == contacts['name_index'],
                  how='left').drop('name_index')
            .selectExpr('i', 'j', 'id as SOURCE_ID',
                        'SIMILARITY', 'name as SOURCE_NAME',
                        'STREET_CLEANSED as SOURCE_STREET',
                        'ZIP_CODE_CLEANSED as SOURCE_ZIP_CODE', 'CITY_CLEANSED as SOURCE_CITY')
            .join(contacts, similarity['j'] == contacts['name_index'],
                  how='left').drop('name_index')
            .withColumn('COUNTRY_CODE', sf.lit(country_code))
            .selectExpr('i', 'j', 'COUNTRY_CODE', 'SOURCE_ID',
                        'id as TARGET_ID','SIMILARITY',
                        'SOURCE_NAME', 'STREET_CLEANSED as TARGET_STREET',
                        'SOURCE_STREET', 'name as TARGET_NAME',
                        'SOURCE_ZIP_CODE', 'ZIP_CODE_CLEANSED as TARGET_ZIP_CODE',
                        'SOURCE_CITY', 'CITY_CLEANSED as TARGET_CITY')
            # keep pairs that share the zip code or the city
            .filter((sf.col('SOURCE_ZIP_CODE') == sf.col('TARGET_ZIP_CODE')) | (sf.col('SOURCE_CITY') == sf.col('TARGET_CITY')))
            .withColumn('street_lev_distance', sf.levenshtein(sf.col('SOURCE_STREET'), sf.col('TARGET_STREET')))
        )
        matches.persist()
        cached_frames.append(matches)
        print('Nr. of matches same city or Zipcode', matches.count())

        # keep pairs whose street names differ by fewer than 5 edits
        matches_lev = matches.filter(sf.col('street_lev_distance') < 5)

        print('Nr. of matches Levenshtein filter:', matches_lev.count())

        # for every target `j` keep only the row with the smallest source index `i`
        grp_match = (
            matches_lev
            .withColumn("rn", sf.row_number().over(grouping_window))
            .filter(sf.col("rn") == 1)
            .drop('rn')
        )

        # keep only the rows whose `j` never occurs as a source index `i`
        grouped_matches = grp_match.join(
            grp_match.select('j').subtract(grp_match.select('i')),
            on='j', how='inner'
        )

        if save_output:
            (grouped_matches
             .coalesce(5)
             .write
             # partition by the column under its actual name (added above)
             .partitionBy('COUNTRY_CODE')
             .parquet(output_file, mode=mode))
        else:
            grouped_matches.persist()
            cached_frames.append(grouped_matches)
            n_matches = grouped_matches.count()

            print('\n\nNr. grouped matches:\t', n_matches)
            print('Threshold:\t', threshold)
            print('NTop:\t', ntop)
            # weakest matches first, to eyeball the quality near the threshold
            (grouped_matches
             .select('SIMILARITY',
                     'SOURCE_NAME', 'TARGET_NAME',
                     'SOURCE_STREET', 'TARGET_STREET',
                     'SOURCE_ZIP_CODE', 'TARGET_ZIP_CODE',
                     'SOURCE_CITY', 'TARGET_CITY')
             .sort('SIMILARITY', ascending=True)
             .show(50, truncate=False))

            # sources with the most matched targets
            (grouped_matches
             .groupBy(['SOURCE_ID', 'SOURCE_NAME'])
             .count()
             .sort('count', ascending=False).show(50, truncate=False))

            grouped_matches.describe('SIMILARITY').show()
        print("Done, country code:", country_code)
    finally:
        # Release everything cached for this country before moving on.
        for cached_frame in cached_frames:
            cached_frame.unpersist()