In [None]:
import os
import subprocess

from glob import glob
from os import path

Path pointing to the Cython `sparse_dot` directory.

In [None]:
# Runs the project's compile_library.sh through bash (IPython shell escape).
# Presumably this builds the egg that a later cell picks up from <cwd>/dist - TODO confirm.
# NOTE(review): the absolute, user-specific path makes this cell non-portable.
! bash /home/rodrigo/projects/unilever/ohub2/name-matching/compile_library.sh

In [None]:
# Directory of the Cython sources; the egg is later looked up in its 'dist' subfolder.
# NOTE(review): hard-coded absolute path - consider deriving it from a configurable project root.
cwd = path.join('/home/rodrigo/projects/unilever/ohub2/name-matching', 'cython')
cwd

In [None]:
# Locate the built egg inside <cwd>/dist. Candidates are sorted so the choice
# is deterministic when several eggs are present (glob order is arbitrary).
egg_candidates = sorted(glob(path.join(cwd, 'dist', '*.egg')))
if not egg_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No .egg file found in " + path.join(cwd, 'dist') +
        "; build the library before running this cell")
egg_file = egg_candidates[0]
egg_file

In [None]:
# findspark.init() is called before the pyspark import below; presumably it is
# what makes pyspark importable in this kernel - keep this order.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The commented-out
# .config lines are executor sizing options that are currently disabled.
spark = (SparkSession
         .builder
         .appName("NameMatching-OUniverse")
#          .config('spark.dynamicAllocation.enabled', False)
         .config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
#          .config('spark.executor.instances', 4)
#          .config('spark.executor.cores', 13)
#          .config('spark.executor.memory', '14g')
         .config('spark.driver.memory', '7g')
         .getOrCreate())
sc = spark.sparkContext
sc.setLogLevel("WARN")

# Register the compiled egg with the context so tasks can import it
# (chunk_dot_limit imports sparse_dot_topn inside the function body).
sc.addPyFile(egg_file)

In [None]:
# import sparse_dot_topn.sparse_dot_topn as ct # this is the cython code module

from pyspark import keyword_only

from pyspark.ml import Pipeline
from pyspark.ml import Transformer

from pyspark.ml.feature import CountVectorizer, HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

from pyspark.mllib.linalg.distributed import IndexedRow
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

from pyspark.ml.param.shared import HasInputCol
from pyspark.ml.param.shared import HasOutputCol
from pyspark.ml.param.shared import Param
from pyspark.ml.param.shared import Params

from pyspark.sql import functions as sf
from pyspark.sql.window import Window
from pyspark.sql.types import LongType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType

from pyspark.sql.types import ArrayType, IntegerType, StringType, FloatType
from pyspark.mllib.linalg import VectorUDT, Vector, Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType
from pyspark.sql.functions import array, struct

import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
from scipy.sparse import _sparsetools
from scipy.sparse.sputils import get_index_dtype
import math
from itertools import tee

In [None]:
def chunk_dot_limit(A, B, ntop, threshold=0, start_row=0):
    """Multiply a CSR chunk ``A`` by ``B`` keeping at most ``ntop`` results per row.

    The product is delegated to the Cython/C++ ``sparse_dot_topn`` routine,
    which writes its results into pre-allocated output arrays.

    Parameters
    ----------
    A : scipy.sparse CSR matrix
        Left operand (one chunk of rows); its ``indptr``/``indices``/``data``
        arrays are handed to the native routine as-is.
    B : scipy.sparse matrix
        Right operand; converted to CSR here.
    ntop : int
        Maximum number of results kept per row of ``A``.
    threshold : float, optional
        Forwarded to the native routine (presumably the minimum value for a
        result to be kept - confirm against the Cython source).
    start_row : int, optional
        Forwarded to the native routine (presumably the global index of the
        first row of ``A`` so reported row indices are absolute - confirm).

    Returns
    -------
    generator of (int, int, float)
        ``(row, column, value)`` triples for the entries the routine reported.
    """
    # Imported inside the function so the module is resolved where the
    # function actually runs rather than when the notebook cell is defined.
    import sparse_dot_topn.sparse_dot_topn as ct
    B = B.tocsr()

    M = A.shape[0]
    N = B.shape[1]

    idx_dtype = np.int32

    only_upper_triangular = 0

    # The native routine only sees raw index arrays, so nothing verifies that
    # B has a row for every column of A. When B has fewer rows (both matrices
    # are sized from the largest index actually present), pad B's row pointer
    # with empty rows so a lookup for any column index of A stays in bounds.
    b_indptr = np.asarray(B.indptr, dtype=idx_dtype)
    missing_rows = A.shape[1] - B.shape[0]
    if missing_rows > 0:
        b_indptr = np.concatenate(
            [b_indptr, np.full(missing_rows, b_indptr[-1], dtype=idx_dtype)])

    # Upper bound on the number of results: at most ntop per row of A.
    nnz_max = M * ntop

    # Output buffers, filled in place by the native routine.
    rows = np.empty(nnz_max, dtype=idx_dtype)
    cols = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    # The return value is the number of entries actually written; it is used
    # below to slice off the unused tail of the buffers.
    nnz = ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        b_indptr,
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        threshold,
        rows, cols, data, start_row, only_upper_triangular)

    # Convert numpy scalars to plain Python types for downstream consumers.
    return ((int(i), int(j), float(v)) for i, j, v in
            zip(rows[:nnz], cols[:nnz], data[:nnz]))

In [None]:
class NameVectorizer(object):
    """Character n-gram TF-IDF vectorizer for the ``name`` column of two DataFrames.

    The Spark ML pipeline reads column ``name`` and adds ``tokens``,
    ``n_grams``, ``term_frequency``, ``tfidf_vector`` and the L2-normalised
    ``name_vector``.

    Parameters
    ----------
    n_gram : int
        ``n`` passed to the ``NGram`` stage.
    min_df : int or float
        ``minDF`` passed to ``CountVectorizer``.
    vocab_size : int
        ``vocabSize`` passed to ``CountVectorizer``.
    """

    def __init__(self, n_gram, min_df, vocab_size):
        self.n_gram = n_gram
        self.min_df = min_df
        self.vocab_size = vocab_size
        self.__create_pipeline()

    def __create_pipeline(self):
        """Assemble the (unfitted) tokenise -> n-gram -> TF -> IDF -> L2 pipeline."""
        # Empty split pattern: presumably breaks the name into single
        # characters so that NGram yields character n-grams - TODO confirm.
        regexTokenizer = RegexTokenizer(inputCol="name",
                                        outputCol="tokens",
                                        pattern="")
        ngram_creator = NGram(inputCol="tokens",
                              outputCol="n_grams",
                              n=self.n_gram)
        tf_counter = CountVectorizer(inputCol='n_grams',
                                     outputCol='term_frequency',
                                     minTF=1.0,
                                     minDF=self.min_df,
                                     vocabSize=self.vocab_size,
                                     binary=False)
        idf_counter = IDF(inputCol="term_frequency",
                          outputCol="tfidf_vector")
        l2_normalizer = Normalizer(inputCol="tfidf_vector",
                                   outputCol="name_vector",
                                   p=2)

        # Keep the unfitted estimator on its own attribute so fit_transform
        # can be called more than once; ``self.pipeline`` keeps its previous
        # meaning (estimator before fitting, fitted model afterwards).
        self._estimator = Pipeline(
            stages=[regexTokenizer,
                    ngram_creator,
                    tf_counter,
                    idf_counter,
                    l2_normalizer]
        )
        self.pipeline = self._estimator

    def fit_transform(self, df1, df2):
        """Fit on the union of both frames, then transform each one separately.

        Fitting on the union gives both outputs the same vocabulary and IDF
        weights. Returns ``(transformed_df1, transformed_df2)``.
        """
        df = df1.union(df2)
        # Always fit the estimator: the previous code replaced self.pipeline
        # with the fitted model, so a second call failed for lack of ``fit``.
        self.pipeline = self._estimator.fit(df)
        return self.pipeline.transform(df1), self.pipeline.transform(df2)


def unpack_vector(sparse, min_value=0.05):
    """Return the entries of a sparse vector as ``(index, value)`` pairs.

    Parameters
    ----------
    sparse : object with ``indices`` and ``values`` sequences
        Both sequences are walked in parallel.
    min_value : float, optional
        Only entries strictly greater than this are kept (default 0.05,
        the cut-off that used to be hard-coded).

    Returns
    -------
    list of (int, float)
        A concrete list rather than a one-shot generator, so the result can
        be inspected or iterated more than once.
    """
    return [(int(index), float(value))
            for index, value in zip(sparse.indices, sparse.values)
            if value > min_value]


# NOTE(review): in the visible notebook these three schemas and this UDF are
# not referenced again; dense_to_sparse_ddf builds its own UDF from
# ngram_schema and calculate_similarity uses similarity_schema.

# Three non-nullable columns: dummy_id (long), id (string), name (string).
schema = StructType([
    StructField(name="dummy_id", dataType=LongType(), nullable=False),
    StructField(name="id", dataType=StringType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
])

# Array of (j_ngram, value) structs - the return type of the UDF below.
schema_sparse = ArrayType(
    StructType([
        StructField(name="j_ngram", dataType=IntegerType(), nullable=False),
        StructField(name="value", dataType=FloatType(), nullable=False),
    ])
)

# One row per (i, j, SIMILARITY) triple.
sim_schema = StructType([
    StructField(name="i", dataType=IntegerType(), nullable=False),
    StructField(name="j", dataType=IntegerType(), nullable=False),
    StructField(name="SIMILARITY", dataType=FloatType(), nullable=False),
])

# Spark UDF wrapping unpack_vector with the array-of-struct return type.
udf_unpack_vector = sf.udf(unpack_vector, schema_sparse)


In [None]:
def read_opr1_csv(spark, filepath):
    """Read the comma-separated, headered phase-1 operator CSV.

    Only the operator columns used by the notebook are selected: country
    code, name, region, street, house number, zip code, source and the
    original integration id.
    """
    wanted_columns = (
        'OPR_COUNTRY_CODE', 'OPR_NAME', 'OPR_REGION',
        'OPR_STREET', 'OPR_HOUSE_NUMBER', 'OPR_ZIP_CODE',
        'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID',
    )
    operators = spark.read.csv(filepath, header=True, sep=',')
    return operators.select(*wanted_columns)

def read_opr2_csv(spark, filepath):
    """Load the phase-2 data through the ``com.databricks.spark.avro`` reader.

    Despite the ``_csv`` suffix in the name, the input is read as Avro.
    Only inputCountry, name, formattedAddress and placeId are selected.
    """
    avro_reader = spark.read.format("com.databricks.spark.avro")
    places = avro_reader.load(filepath)
    return places.select('inputCountry', 'name', 'formattedAddress', 'placeId')
    

def preprocess_phase1(ddf):
    """Build the cleaned, indexed match string for phase-1 operators.

    Steps: replace nulls with '', concatenate name/region/street/house
    number/zip code into ``name`` (space separated), strip the characters
    matched by the module-level ``REGEX``, collapse whitespace runs, drop rows
    whose ``name`` is in the module-level ``DROP_NAMES``, then number the rows
    per country (0-based, ordered by ``name``) as ``name_index``.

    Returns a DataFrame with columns ``name_index``, ``name``,
    ``country_code``, ``OPR_SOURCE`` and ``OPR_OPR_ORIG_INTEGRATION_ID``.
    """
    # NOTE(review): the window is ordered by 'name' only, so rows with equal
    # names have no defined relative order - confirm this is acceptable.
    w = Window.partitionBy('OPR_COUNTRY_CODE').orderBy(sf.asc('name'))
    return (ddf
            .fillna('')
            # create the string column to be matched
            .withColumn('name',
                        sf.concat_ws(' ',
                                     sf.col('OPR_NAME'),
                                     sf.col('OPR_REGION'),
                                     sf.col('OPR_STREET'),
                                     sf.col('OPR_HOUSE_NUMBER'),
                                     sf.col('OPR_ZIP_CODE')))
            .withColumn('name', sf.regexp_replace('name', REGEX, ''))
            # raw string: '\s' is not a valid escape in a normal Python literal
            .withColumn('name', sf.trim(sf.regexp_replace('name', r'\s+', ' ')))
            .filter(~sf.col('name').isin(*DROP_NAMES))
            .withColumn('name_index', sf.row_number().over(w) - 1)
            .selectExpr('name_index', 'name', 'OPR_COUNTRY_CODE as country_code', 'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID'))

def preprocess_phase2(ddf):
    """Build the cleaned, indexed match string for phase-2 places.

    Steps: replace nulls with '', concatenate ``name`` and
    ``formattedAddress`` into ``name`` (space separated), strip the characters
    matched by the module-level ``REGEX``, collapse whitespace runs, drop rows
    whose ``name`` is in the module-level ``DROP_NAMES``, then number the rows
    per ``inputCountry`` (0-based, ordered by ``name``) as ``name_index``.

    Returns a DataFrame with columns ``name_index``, ``name``,
    ``country_code`` and ``placeId``.
    """
    # NOTE(review): the window is ordered by 'name' only, so rows with equal
    # names have no defined relative order - confirm this is acceptable.
    w = Window.partitionBy('inputCountry').orderBy(sf.asc('name'))
    return (ddf
            .fillna('')
            # create the string column to be matched
            .withColumn('name',
                        sf.concat_ws(' ',
                                     sf.col('name'),
                                     sf.col('formattedAddress')))
            .withColumn('name', sf.regexp_replace('name', REGEX, ''))
            # raw string: '\s' is not a valid escape in a normal Python literal
            .withColumn('name', sf.trim(sf.regexp_replace('name', r'\s+', ' ')))
            .filter(~sf.col('name').isin(*DROP_NAMES))
            .withColumn('name_index', sf.row_number().over(w) - 1)
            .selectExpr('name_index', 'name', 'inputCountry as country_code', 'placeId'))

In [None]:
# Input locations, relative to the directory the notebook is run from.
# phase_1_file is read by read_opr1_csv (CSV); phase_2_file is a glob of Avro
# files read by read_opr2_csv.
phase_1_file = '../../Phase_I/Phase_I_Input_OHUB_Operator_Files/20181101/OPR_20180111.csv'
phase_2_file = '../../Phase_II/Phase_II_Output/avro/*.avro'

In [None]:
# Tuning knobs for the similarity search.
# NTOP and THRESHOLD are forwarded to chunk_dot_limit as ntop / threshold.
# MATRIX_CHUNK_ROWS drives split_into_chunks: the chunk count is
# floor(rows / MATRIX_CHUNK_ROWS) (at least 1), so individual chunks can hold
# more rows than this value.
NTOP = 3
THRESHOLD = 0.7
MATRIX_CHUNK_ROWS = 750

In [None]:
# Characters removed from the match strings. They are wrapped in a regex
# character class below and passed to regexp_replace in the preprocess_* functions.
# NOTE(review): inside a character class the sequence `+-/` is read as a range
# from '+' to '/', which would also remove ',' and '.' - confirm that is
# intended, or move the '-' to the end of the class.
drop_chars = "\"\\\\!#%&()*+-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
# The braces in drop_chars are safe: it is the format argument, not the template.
REGEX = "[{}]".format(drop_chars)

In [None]:
# Rows whose cleaned match string equals one of these values exactly are
# filtered out by preprocess_phase1 / preprocess_phase2 (comparison via isin).
DROP_NAMES = ['unknown', '', ' ']

## Read and preprocess data

In [None]:
# Load the raw phase-1 operators and count rows per country before cleaning.
phase1_opr_raw = read_opr1_csv(spark, phase_1_file)
count_phase_1_raw = phase1_opr_raw.groupby('OPR_COUNTRY_CODE').count()

# Clean and index the operators. The frame is persisted because it is reused
# by the count below and by the per-country loop later in the notebook.
phase1_opr = preprocess_phase1(phase1_opr_raw)
phase1_opr.persist()
count_phase_1 = phase1_opr.groupby('country_code').count()

phase1_opr.show(5, truncate=False)

# Raw vs. cleaned row counts per country, side by side (raw count first).
count_phase_1_raw.join(count_phase_1, sf.col('OPR_COUNTRY_CODE') == sf.col('country_code')).show()

In [None]:
# Load the raw phase-2 places and count rows per country before cleaning.
phase2_opr_raw = read_opr2_csv(spark, phase_2_file)
count_phase_2_raw = phase2_opr_raw.groupby('inputCountry').count()

# Clean and index the places. The frame is persisted because it is reused by
# the count below and by the per-country loop later in the notebook.
phase2_opr = preprocess_phase2(phase2_opr_raw)
phase2_opr.persist()
count_phase_2 = phase2_opr.groupby('country_code').count()

phase2_opr.show(5, truncate=False)

# Raw vs. cleaned row counts per country, side by side (raw count first).
count_phase_2_raw.join(count_phase_2, sf.col('inputCountry') == sf.col('country_code')).show()

In [None]:
# Country codes present in the cleaned phase-1 data, collected to the driver.
# Phase 2 is not consulted here, so a code may have no phase-2 rows.
country_codes = [
    row[0]
    for row in count_phase_1.select('country_code').distinct().collect()
]

print(country_codes)

In [None]:
def select_and_repartition_country(ddf, country_code):
    """Return the rows of one country, without the country column.

    The result is repartitioned on ``name_index`` and then sorted by it in
    ascending order.
    """
    is_requested_country = sf.col('country_code') == country_code
    country_rows = ddf.filter(is_requested_country).drop('country_code')
    return (country_rows
            .repartition('name_index')
            .sort('name_index', ascending=True))

def dense_to_sparse_ddf(ddf):
    """Explode each ``name_vector`` into one row per kept vector entry.

    ``unpack_vector`` is wrapped in a UDF (return type ``ngram_schema``) and
    applied to ``name_vector``; every returned struct becomes a row with
    columns ``name_index``, ``ngram_index`` and ``value``.
    """
    to_entries = sf.udf(unpack_vector, ngram_schema)
    entries = to_entries(sf.col('name_vector'))
    exploded = ddf.withColumn('explode', sf.explode(entries))

    entry = sf.col('explode')
    with_fields = (exploded
                   .withColumn('ngram_index', entry.getItem('ngram_index'))
                   .withColumn('value', entry.getItem('value')))
    return with_fields.select('name_index', 'ngram_index', 'value')

def sparse_to_csr_matrix(ddf):
    """Collect (name_index, ngram_index, value) rows into a scipy CSR matrix.

    The frame is pulled to the driver with ``toPandas()``. The matrix has one
    row per ``name_index`` and one column per ``ngram_index``; its shape is
    derived from the largest index present in each column, and the values
    are stored as float64.

    An empty input yields an empty ``(0, 0)`` matrix instead of failing on
    the maximum of an empty column.
    """
    df = ddf.toPandas()
    if df.empty:
        return csr_matrix((0, 0), dtype=np.float64)

    # Work on converted copies rather than overwriting the frame's columns.
    name_idx = df['name_index'].values.astype(np.int32)
    ngram_idx = df['ngram_index'].values.astype(np.int32)
    values = df['value'].values.astype(np.float64)
    del df  # release the pandas copy before the matrix is built

    n_names = int(name_idx.max()) + 1
    n_ngrams = int(ngram_idx.max()) + 1
    return csr_matrix((values, (name_idx, ngram_idx)),
                      shape=(n_names, n_ngrams),
                      dtype=np.float64)

def split_into_chunks(csr_names_vs_ngrams, max_chunk_rows=None):
    """Split a matrix into consecutive row blocks.

    Parameters
    ----------
    csr_names_vs_ngrams : matrix supporting ``shape`` and row slicing
        The matrix to split.
    max_chunk_rows : int, optional
        Target rows per chunk; defaults to the module-level
        ``MATRIX_CHUNK_ROWS``. The number of chunks is
        ``floor(rows / max_chunk_rows)`` (at least 1) and the rows are spread
        evenly, so a chunk may hold more rows than this value.

    Returns
    -------
    list of (matrix, int)
        Each row block paired with the index of its first row in the full
        matrix. A matrix with zero rows yields an empty list.
    """
    if max_chunk_rows is None:
        max_chunk_rows = MATRIX_CHUNK_ROWS

    n_rows = csr_names_vs_ngrams.shape[0]
    if n_rows == 0:
        # Nothing to split; the sizing below would otherwise divide by zero.
        return []

    n_chunks = max(1, n_rows // max_chunk_rows)
    chunk_size = math.ceil(n_rows / n_chunks)
    print("Matrix chunk size is " + str(chunk_size))

    return [(csr_names_vs_ngrams[start:min(start + chunk_size, n_rows)], start)
            for start in range(0, n_rows, chunk_size)]

# Return type of the unpack_vector UDF built in dense_to_sparse_ddf:
# an array of non-nullable (ngram_index, value) structs.
ngram_schema = ArrayType(
    StructType([
        StructField(name="ngram_index", dataType=IntegerType(), nullable=False),
        StructField(name="value", dataType=FloatType(), nullable=False),
    ])
)

# Row layout of the similarity DataFrame built in calculate_similarity:
# row index i, column index j and their similarity score.
similarity_schema = StructType([
    StructField(name="i", dataType=IntegerType(), nullable=False),
    StructField(name="j", dataType=IntegerType(), nullable=False),
    StructField(name="SIMILARITY", dataType=FloatType(), nullable=False),
])

def calculate_similarity(chunks_rdd, csr_rdd_transpose):
    """Run ``chunk_dot_limit`` on every chunk and gather the results.

    ``chunks_rdd`` holds ``(chunk, first_row)`` pairs and
    ``csr_rdd_transpose`` is a broadcast whose ``.value`` is the right-hand
    matrix. The module-level ``NTOP`` and ``THRESHOLD`` are forwarded to
    ``chunk_dot_limit``. Returns a DataFrame following ``similarity_schema``.
    """
    def top_matches_for_chunk(chunk_and_offset):
        chunk = chunk_and_offset[0]
        first_row = chunk_and_offset[1]
        return chunk_dot_limit(chunk, csr_rdd_transpose.value,
                               ntop=NTOP,
                               threshold=THRESHOLD,
                               start_row=first_row)

    triples = chunks_rdd.flatMap(top_matches_for_chunk)
    return triples.toDF(similarity_schema)


def find_matches(similarity, opr_1, opr_2, country_code):
    """Attach the phase-1 and phase-2 records to every similarity pair.

    ``i`` is left-joined to ``opr_1.name_index`` and ``j`` to
    ``opr_2.name_index``. The result has columns COUNTRY_CODE, SIMILARITY,
    name_phase1, name_phase2, OPR_SOURCE, OPR_OPR_ORIG_INTEGRATION_ID and
    placeId, with COUNTRY_CODE set to the given literal.
    """
    # Phase-1 side: resolve row index i to the operator record.
    with_phase1 = (similarity
                   .join(opr_1, similarity['i'] == opr_1['name_index'], how='left')
                   .drop('name_index')
                   .selectExpr('i', 'j', 'SIMILARITY', 'name as name_phase1',
                               'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID'))

    # Phase-2 side: resolve column index j to the place record.
    with_phase2 = (with_phase1
                   .join(opr_2, similarity['j'] == opr_2['name_index'], how='left')
                   .drop('name_index'))

    labelled = with_phase2.withColumn('COUNTRY_CODE', sf.lit(country_code))
    return labelled.selectExpr('COUNTRY_CODE', 'SIMILARITY', 'name_phase1', 'name as name_phase2', 
                               'OPR_SOURCE', 'OPR_OPR_ORIG_INTEGRATION_ID', 'placeId')

In [None]:
# Per-country matching: vectorise both name sets, compute the top-N
# similarities chunk by chunk, join the records back and write one CSV
# directory per country.
for country_code in country_codes:
    print(country_code, 'START')

    ctr_opr1 = select_and_repartition_country(phase1_opr, country_code)
    ctr_opr2 = select_and_repartition_country(phase2_opr, country_code)

    # country_codes comes from phase 1 only. With no phase-2 rows the matrix
    # conversion below fails on the maximum of an empty index column and
    # would abort the remaining countries, so skip such countries instead.
    if not ctr_opr2.head(1):
        print(country_code, 'SKIPPED (no phase-2 rows)')
        continue

    vectorizer = NameVectorizer(n_gram=2, min_df=2, vocab_size=1000)
    encoded_names_1, encoded_names_2 = vectorizer.fit_transform(
        ctr_opr1.select('name_index', 'name'),
        ctr_opr2.select('name_index', 'name'))

    names_vs_ngrams_1 = dense_to_sparse_ddf(encoded_names_1)
    names_vs_ngrams_2 = dense_to_sparse_ddf(encoded_names_2)

    csr_names_vs_ngrams_1 = sparse_to_csr_matrix(names_vs_ngrams_1)
    csr_names_vs_ngrams_2 = sparse_to_csr_matrix(names_vs_ngrams_2)

    # The transposed phase-2 matrix is shared with every task via a broadcast.
    csr_rdd_2 = spark.sparkContext.broadcast(csr_names_vs_ngrams_2.transpose())
    try:
        chunks = split_into_chunks(csr_names_vs_ngrams_1)
        print("Parallelizing matrix in " + str(len(chunks)) + " chunks")
        chunks_rdd = spark.sparkContext.parallelize(chunks, numSlices=len(chunks))

        # The driver-side matrices are no longer needed once distributed.
        del csr_names_vs_ngrams_1, csr_names_vs_ngrams_2

        similarity = calculate_similarity(chunks_rdd, csr_rdd_2)

        matches = find_matches(similarity, ctr_opr1, ctr_opr2, country_code)

        # NOTE(review): the write uses the default save mode, so re-running
        # the notebook fails if '<country_code>.csv' already exists.
        matches.coalesce(1).write.csv(country_code + '.csv', header=True)
    finally:
        # Release the executor copies of this country's broadcast so they do
        # not accumulate over the loop; safe here because the write above is
        # the action that consumes it.
        csr_rdd_2.unpersist()
    print(country_code, 'DONE')