# OUniverse
**Match Phase 1 Input vs Phase 2 output**

In [None]:
import os
import math

from glob import glob
from os import path

import sys
sys.path.append('..')

Path pointing to cython sparse_dot directory

In [None]:
!bash ../compile_library.sh

In [None]:
# Locate the packaged egg (presumably produced by ../compile_library.sh) so it
# can later be shipped to the Spark executors with sc.addPyFile.
egg_candidates = glob(os.path.join('..', 'dist', '*.egg'))
if not egg_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No .egg file found in {}; run ../compile_library.sh first.".format(
            os.path.join('..', 'dist')))
egg_file = egg_candidates[0]
egg_file

In [None]:
import findspark
# Locate the local Spark installation and make pyspark importable.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session one setting at a time.
# Executor tuning options that are currently left disabled:
#   spark.dynamicAllocation.enabled = False
#   spark.executor.instances        = 4
#   spark.executor.cores            = 13
#   spark.executor.memory           = '14g'
builder = SparkSession.builder.appName("NameMatching_Notebook")
builder = builder.config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
builder = builder.config('spark.driver.memory', '4g')
spark = builder.getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Distribute the packaged library to the executors.
sc.addPyFile(egg_file)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.window import Window

In [None]:
# Input locations: the Phase I operator CSV (read with a header row) and the
# Phase II output, given as a glob over Avro files.
# NOTE(review): the directory is dated 20181101 while the file name is dated
# 20180111 — verify that this path is intentional.
phase_1_file = '../../data/Phase_I/Phase_I_Input/20181101/OPERATORS_20180111.csv'
phase_2_file = '../../data/Phase_II/Phase_II_Output/avro/*.avro'

# True: write the matches of each country to '<country_code>.csv'.
# False: only count, sample and summarise the matches in the notebook output.
save_output = False

In [None]:
# Parameters of the matching run.
NTOP = 3  # forwarded to match_strings as n_top
THRESHOLD = 0.5  # forwarded to match_strings as threshold
MATRIX_CHUNK_ROWS = 750  # NOTE(review): not referenced anywhere else in the visible notebook
FRACTION = 1.  # sampling fraction applied to both inputs when they are read

In [None]:
# Characters removed from the concatenated name strings during preprocessing.
# The sequence is wrapped in a regex character class below, so it is read with
# character-class rules rather than as a plain list of characters.
# NOTE(review): inside the class, `+-/` forms a range (U+002B..U+002F), which
# also removes ',' and '.'. If only '+', '-' and '/' were meant, the hyphen
# must be escaped or moved to the end of the class — confirm before changing,
# since it alters the cleaned strings.
drop_chars = "\"\\\\!#%&()*+-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
# Character class matching any single one of the characters above.
REGEX = "[{}]".format(drop_chars)

In [None]:
# Cleaned name strings whose rows are filtered out during preprocessing.
DROP_NAMES = ['unknown', '', ' ', 'zzdummy es080']

In [None]:
def read_opr1_csv(spark, filepath):
    """Load the Phase I operator CSV and keep the columns used for matching.

    The file is read with a header row and ',' as separator, sampled without
    replacement at the module-level ``FRACTION``, and reduced to the country,
    name, address, source and integration-id columns.

    :param spark: active SparkSession
    :param filepath: path of the CSV file to read
    :return: Spark DataFrame with the selected ``OPR_*`` columns
    """
    wanted_columns = [
        'OPR_COUNTRY_CODE', 'OPR_NAME', 'OPR_REGION',
        'OPR_STREET', 'OPR_HOUSE_NUMBER', 'OPR_ZIP_CODE',
        'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID',
    ]
    raw = spark.read.csv(filepath, header=True, sep=',')
    sampled = raw.sample(False, FRACTION)
    return sampled.select(*wanted_columns)

def read_opr2_csv(spark, filepath):
    """Load the Phase II output and keep the columns used for matching.

    Despite the ``_csv`` suffix, the data is read through the
    ``com.databricks.spark.avro`` format. Rows are sampled without
    replacement at the module-level ``FRACTION``.

    :param spark: active SparkSession
    :param filepath: path or glob of the Avro files to load
    :return: Spark DataFrame with ``inputCountry``, ``name``,
        ``formattedAddress`` and ``placeId``
    """
    avro_reader = spark.read.format("com.databricks.spark.avro")
    sampled = avro_reader.load(filepath).sample(False, FRACTION)
    return sampled.select('inputCountry', 'name', 'formattedAddress', 'placeId')
    

def preprocess_phase1(ddf):
    """Build the cleaned, indexed match string for the Phase I operators.

    Nulls in string columns are replaced by ``''``. The name and address
    columns are then joined with single spaces, lower-cased, stripped of every
    character matched by ``REGEX``, and whitespace-normalised. Rows whose
    resulting string is listed in ``DROP_NAMES`` are removed.

    ``name_index`` is a zero-based row number per ``OPR_COUNTRY_CODE``, ordered
    by the cleaned string; the window does not define an order among rows with
    identical strings.

    :param ddf: Spark DataFrame holding the ``OPR_*`` columns used below
    :return: Spark DataFrame with ``name_index``, ``name``, ``country_code``,
        ``OPR_SOURCE`` and ``OPR_OPR_ORIG_INTEGRATION_ID``
    """
    # The window refers to the cleaned 'name' column that is created below.
    w = Window.partitionBy('OPR_COUNTRY_CODE').orderBy(sf.asc('name'))
    return (ddf
            .fillna('')
            # create the string column to be matched
            .withColumn('name',
                        sf.concat_ws(' ',
                                     sf.col('OPR_NAME'),
                                     sf.col('OPR_REGION'),
                                     sf.col('OPR_STREET'),
                                     sf.col('OPR_HOUSE_NUMBER'),
                                     sf.col('OPR_ZIP_CODE')))
            .withColumn('name', sf.lower(sf.col('name')))
            .withColumn('name', sf.regexp_replace(sf.col('name'), REGEX, ''))
            # raw string: '\s' is not a valid Python escape sequence
            .withColumn('name', sf.trim(sf.regexp_replace(sf.col('name'), r'\s+', ' ')))
            .filter(~sf.col('name').isin(*DROP_NAMES))
            # row_number() starts at 1; shift to a zero-based index
            .withColumn('name_index', sf.row_number().over(w) - 1)
            .selectExpr('name_index', 'name', 'OPR_COUNTRY_CODE as country_code',
                        'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID'))

def preprocess_phase2(ddf):
    """Build the cleaned, indexed match string for the Phase II records.

    Rows are de-duplicated on ``placeId`` and nulls in string columns are
    replaced by ``''``. ``name`` and ``formattedAddress`` are then joined with
    a single space, lower-cased, stripped of every character matched by
    ``REGEX``, and whitespace-normalised. Rows whose resulting string is
    listed in ``DROP_NAMES`` are removed.

    ``name_index`` is a zero-based row number per ``inputCountry``, ordered by
    the cleaned string; the window does not define an order among rows with
    identical strings.

    :param ddf: Spark DataFrame with ``inputCountry``, ``name``,
        ``formattedAddress`` and ``placeId``
    :return: Spark DataFrame with ``name_index``, ``name``, ``country_code``
        and ``placeId``
    """
    # The window orders by the cleaned 'name' column that is rebuilt below.
    w = Window.partitionBy('inputCountry').orderBy(sf.asc('name'))
    return (ddf
            .drop_duplicates(subset=['placeId'])
            .fillna('')
            # create the string column to be matched
            .withColumn('name',
                        sf.concat_ws(' ',
                                     sf.col('name'),
                                     sf.col('formattedAddress')))
            .withColumn('name', sf.lower(sf.col('name')))
            .withColumn('name', sf.regexp_replace(sf.col('name'), REGEX, ''))
            # raw string: '\s' is not a valid Python escape sequence
            .withColumn('name', sf.trim(sf.regexp_replace(sf.col('name'), r'\s+', ' ')))
            .filter(~sf.col('name').isin(*DROP_NAMES))
            # row_number() starts at 1; shift to a zero-based index
            .withColumn('name_index', sf.row_number().over(w) - 1)
            .selectExpr('name_index', 'name', 'inputCountry as country_code', 'placeId'))

## Read and preprocess data

In [None]:
# Phase I: load the raw operators, clean and cache them, then report per
# country how many rows survive the cleaning.
phase1_opr_raw = read_opr1_csv(spark, phase_1_file)
phase1_opr = preprocess_phase1(phase1_opr_raw).persist()

# Row counts per country before and after cleaning.
count_phase_1_raw = (phase1_opr_raw
                     .groupby('OPR_COUNTRY_CODE')
                     .count()
                     .withColumnRenamed('OPR_COUNTRY_CODE', 'country_code')
                     .withColumnRenamed('count', 'count_raw'))
count_phase_1 = (phase1_opr
                 .groupby('country_code')
                 .count()
                 .withColumnRenamed('count', 'count_clean'))

# Peek at a few cleaned rows.
phase1_opr.select('name_index', 'name', 'OPR_SOURCE').show(5, truncate=False)

# Share of rows kept per country.
(count_phase_1_raw
 .join(count_phase_1, on='country_code')
 .withColumn('percentage_kept',
             100 * sf.col('count_clean') / sf.col('count_raw'))
 .show())

In [None]:
# Phase II: load the raw records, clean and cache them, then report per
# country how many rows survive the cleaning.
phase2_opr_raw = read_opr2_csv(spark, phase_2_file)
phase2_opr = preprocess_phase2(phase2_opr_raw).persist()

# Row counts per country before and after cleaning.
count_phase_2_raw = (phase2_opr_raw
                     .groupby('inputCountry')
                     .count()
                     .withColumnRenamed('inputCountry', 'country_code')
                     .withColumnRenamed('count', 'count_raw'))
count_phase_2 = (phase2_opr
                 .groupby('country_code')
                 .count()
                 .withColumnRenamed('count', 'count_clean'))

# Peek at a few cleaned rows.
phase2_opr.show(5, truncate=False)

# Share of rows kept per country.
(count_phase_2_raw
 .join(count_phase_2, on='country_code')
 .withColumn('percentage_kept',
             100 * sf.col('count_clean') / sf.col('count_raw'))
 .show())

In [None]:
# Countries to process: every country code present in the cleaned Phase I data.
country_codes = [
    row[0]
    for row in count_phase_1.select('country_code').distinct().collect()
]

print(country_codes)

In [None]:
def select_and_repartition_country(ddf, country_code):
    """Return the rows of one country, repartitioned and sorted by index.

    :param ddf: Spark DataFrame with ``country_code`` and ``name_index`` columns
    :param country_code: value of ``country_code`` to keep
    :return: the matching rows without the ``country_code`` column,
        repartitioned on ``name_index`` and sorted ascending by it
    """
    in_country = sf.col('country_code') == country_code
    country_rows = ddf.filter(in_country).drop('country_code')
    partitioned = country_rows.repartition('name_index')
    return partitioned.sort('name_index', ascending=True)


def append_matches(similarity, opr_1, opr_2, country_code):
    """Attach the matched Phase II rows to every Phase I row.

    Both joins are left joins, so Phase I rows without a match are kept.

    :param similarity: Spark DataFrame of matches; expected to provide the
        columns ``i``, ``j`` and ``SIMILARITY``
    :param opr_1: Phase I rows of one country, joined on ``name_index == i``
    :param opr_2: Phase II rows of the same country, joined on
        ``j == name_index``
    :param country_code: literal written into the ``COUNTRY_CODE`` column
    :return: Spark DataFrame with ``COUNTRY_CODE``, ``SIMILARITY``,
        ``name_phase1``, ``name_phase2``, ``OPR_SOURCE``,
        ``OPR_OPR_ORIG_INTEGRATION_ID`` and ``placeId``
    """
    # Step 1: Phase I rows with their match indices and scores.
    phase1_matches = opr_1.join(
        similarity, opr_1['name_index'] == similarity['i'], how='left')
    phase1_matches = phase1_matches.drop('name_index').selectExpr(
        'j', 'SIMILARITY', 'name as name_phase1',
        'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID')

    # Step 2: resolve the matched index 'j' to the Phase II row.
    combined = phase1_matches.join(
        opr_2, sf.col('j') == opr_2['name_index'], how='left')
    combined = combined.drop('name_index')
    combined = combined.withColumn('COUNTRY_CODE', sf.lit(country_code))

    return combined.selectExpr(
        'COUNTRY_CODE', 'SIMILARITY', 'name_phase1', 'name as name_phase2',
        'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID', 'placeId')

In [None]:
from string_matching.spark_string_matching import match_strings

In [None]:
# Match Phase I against Phase II one country at a time. Depending on
# save_output, the matches are either written to '<country_code>.csv' or
# counted, sampled and summarised in the notebook output.
for country_code in country_codes:
    print(country_code, 'START')

    ctr_opr1 = select_and_repartition_country(phase1_opr, country_code)
    ctr_opr2 = select_and_repartition_country(phase2_opr, country_code)

    similarity = match_strings(spark, ctr_opr1.select('name_index', 'name'),
                               df2=ctr_opr2.select('name_index', 'name'),
                               string_column='name', row_number_column='name_index',
                               n_top=NTOP, threshold=THRESHOLD, n_gram=2,
                               min_document_frequency=2, max_vocabulary_size=1500)

    matches = append_matches(similarity, ctr_opr1, ctr_opr2, country_code)

    if save_output:
        # coalesce(1) so the output directory holds a single part file
        (matches
         .coalesce(1)
         .write
         .csv(country_code + '.csv', header=True))
        print('File saved for', country_code)
    else:
        # Cached because the result is evaluated three times below
        # (count, sample/show, describe).
        matches.persist()
        try:
            n_matches = matches.dropna(subset=['SIMILARITY']).count()

            print('\n\nNr. Similarities:\t', n_matches)
            print('Threshold:\t', THRESHOLD)
            print('NTop:\t', NTOP)
            print('Fraction', FRACTION)
            (matches
             .select('SIMILARITY', 'name_phase1',
                     'name_phase2', 'placeId')
             .sample(False, 0.1)
             .sort('SIMILARITY', ascending=False)
             .show(50, truncate=True))

            matches.describe('SIMILARITY').show()
        finally:
            # Release the cache so it does not pile up across countries.
            matches.unpersist()

    print(country_code, 'DONE\n')