In [None]:
import os
import subprocess

from glob import glob
from os import path

In [None]:
# Directory that holds the extension's setup.py (used as the working directory
# for the build step); the bare name below echoes the path as cell output.
cwd = os.path.join(os.getcwd(), 'cython')
cwd

In [None]:
# Build the extension egg inside `cwd`. check=True turns a failed build into a
# CalledProcessError here, instead of letting later cells trip over a missing
# or stale egg file.
subprocess.run(['python', 'setup.py', 'bdist_egg'], cwd=cwd, check=True)

In [None]:
# Locate the egg produced by the bdist_egg build.
egg_candidates = glob(path.join('cython', 'dist', '*.egg'))
if not egg_candidates:
    raise FileNotFoundError(
        "No .egg found under {}; did the bdist_egg build succeed?".format(
            path.join('cython', 'dist')))

# glob() returns matches in arbitrary order and old builds may still be lying
# around, so pick the most recently written egg rather than "whatever is first".
egg_file = max(egg_candidates, key=path.getmtime)
egg_file

In [None]:
# findspark.init() is called before pyspark is imported; keep this order.
import findspark
findspark.init()

import pyspark

# Cluster sizing, written as one fluent chain.
# Original sizing note: supposing you have 4 nodes with 8 cores each.
# NOTE(review): 14 executor cores does not match the "8 cores each" note above;
# confirm against the real cluster.
conf = (
    pyspark.SparkConf()
    .setAppName('NameMatching')
    .setExecutorEnv('PYTHON_EGG_CACHE', '/tmp')
    .set('spark.dynamicAllocation.enabled', False)
    # few executors: low so that memory sharing is high
    .set('spark.executor.instances', 4)
    .set('spark.executor.cores', 14)
    .set('spark.executor.memory', '14g')
    .set('spark.driver.memory', '15g')
)

sc = pyspark.SparkContext(appName="NameMatching", conf=conf)
sqlContext = pyspark.sql.SQLContext(sc)

# Register the freshly built egg with the SparkContext.
sc.addPyFile(egg_file)

In [None]:
# Registers the built egg with the SparkContext.
# NOTE(review): the session-setup cell already makes this exact call right
# after creating `sc`, so this cell looks redundant — confirm before removing.
sc.addPyFile(egg_file)

In [None]:
# import sparse_dot_topn.sparse_dot_topn as ct # this is the cython code module

from pyspark import keyword_only

from pyspark.ml import Pipeline
from pyspark.ml import Transformer

from pyspark.ml.feature import CountVectorizer, HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

from pyspark.mllib.linalg.distributed import IndexedRow
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

from pyspark.ml.param.shared import HasInputCol
from pyspark.ml.param.shared import HasOutputCol
from pyspark.ml.param.shared import Param
from pyspark.ml.param.shared import Params

from pyspark.sql import functions as sf
from pyspark.sql.types import LongType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType

from pyspark.sql.types import ArrayType, IntegerType, StringType, FloatType
from pyspark.mllib.linalg import VectorUDT, Vector, Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType
from pyspark.sql.functions import array, struct

import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
from scipy.sparse import _sparsetools
from scipy.sparse.sputils import get_index_dtype
import math
from itertools import tee

In [None]:
def chunk_dot_limit(A, B, ntop, lower_bound=0, start_row=0):
    """Run the ``sparse_dot_topn`` kernel on a row chunk and yield its entries.

    Parameters
    ----------
    A : sparse matrix exposing ``indptr`` / ``indices`` / ``data`` / ``shape``
        The chunk of rows to multiply (assumed to already be CSR — TODO confirm
        for callers other than the matching loop).
    B : scipy sparse matrix
        Right-hand operand; converted to CSR here.
    ntop : int
        Forwarded to the kernel; output buffers are sized ``A.shape[0] * ntop``.
    lower_bound : float, optional
        Forwarded to the kernel (presumably the minimum value to keep — the
        exact semantics live in the Cython code).
    start_row : int, optional
        Offset added to every chunk-local row index, so results refer to rows
        of the full matrix the chunk was sliced from.

    Returns
    -------
    generator of (int, int, float)
        ``(row + start_row, col, value)`` for the first ``indptr[-1]`` entries
        written by the kernel, restricted to ``row + start_row < col``.

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` differ.
    """
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape
    if K1 != K2:
        # The kernel indexes B's rows with A's column indices; a mismatch would
        # read outside B's arrays instead of failing cleanly.
        raise ValueError(
            "dimension mismatch: A is {}x{} but B is {}x{}".format(M, K1, K2, N))

    # Imported lazily: the module-level import of the Cython module is commented
    # out in the imports cell, so `ct` would otherwise be undefined. Importing
    # here resolves the module in whichever process actually runs this function.
    import sparse_dot_topn.sparse_dot_topn as ct

    idx_dtype = np.int32

    # Output buffers sized for ntop entries per row of the chunk.
    nnz_max = M * ntop

    indptr = np.empty(M + 1, dtype=idx_dtype)
    rows = np.empty(nnz_max, dtype=idx_dtype)
    cols = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, rows, cols, data)

    # Only the first indptr[-1] slots of the buffers were filled by the kernel.
    max_k = indptr[-1]
    return ((int(i) + start_row, int(j), float(v))
            for i, j, v in zip(rows[:max_k], cols[:max_k], data[:max_k])
            # keep each unordered pair once and drop self-matches
            if (i + start_row) < j)

In [None]:
class NameVectorizer(object):
    """N-gram TF-IDF encoder for the ``name`` column of a Spark DataFrame.

    Builds a Spark ML ``Pipeline`` with the stages
    RegexTokenizer -> NGram -> CountVectorizer -> IDF -> Normalizer(p=2);
    the final output column is ``encoded_vector``.

    Parameters
    ----------
    n_gram : int
        Passed to ``NGram`` as ``n``.
    min_df : int or float
        Passed to ``CountVectorizer`` as ``minDF``.
    vocab_size : int
        Passed to ``CountVectorizer`` as ``vocabSize``.
    """

    def __init__(self, n_gram, min_df, vocab_size):
        self.n_gram = n_gram
        self.min_df = min_df
        self.vocab_size = vocab_size
        # Fitted pipeline model; populated by fit().
        self.model = None
        self.__create_pipeline()

    def __create_pipeline(self):
        """Assemble the (unfitted) pipeline and store it on ``self.pipeline``."""
        # Empty pattern: presumably splits the name into single characters so
        # that NGram produces character n-grams — TODO confirm.
        regexTokenizer = RegexTokenizer(inputCol="name",
                                        outputCol="tokens",
                                        pattern="")
        ngram_creator = NGram(inputCol="tokens",
                              outputCol="n_grams",
                              n=self.n_gram)
        tf_counter = CountVectorizer(inputCol='n_grams',
                                     outputCol='term_frequency',
                                     minTF=1.0,
                                     minDF=self.min_df,
                                     vocabSize=self.vocab_size,
                                     binary=False)
        idf_counter = IDF(inputCol="term_frequency",
                          outputCol="tfidf_vector")
        l2_normalizer = Normalizer(inputCol="tfidf_vector",
                                   outputCol="encoded_vector",
                                   p=2)

        self.pipeline = Pipeline(
            stages=[regexTokenizer,
                    ngram_creator,
                    tf_counter,
                    idf_counter,
                    l2_normalizer]
        )

    def fit(self, df):
        """Fit the pipeline on ``df``.

        The fitted model is returned (as before) and also remembered on
        ``self.model`` so that :meth:`transform` can use it.
        """
        self.model = self.pipeline.fit(df)
        return self.model

    def transform(self, df):
        """Encode ``df`` with the model produced by the last :meth:`fit` call.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not been called yet. (An unfitted ``Pipeline``
            is an estimator and has no ``transform`` of its own.)
        """
        model = getattr(self, 'model', None)
        if model is None:
            raise RuntimeError(
                "NameVectorizer.transform() called before fit()")
        return model.transform(df)


def zip_sparse(sparse, min_value=0.1):
    """Yield ``(index, value)`` pairs of a sparse vector above a cutoff.

    Parameters
    ----------
    sparse : object with parallel ``indices`` and ``values`` sequences
        For example a Spark ``SparseVector``.
    min_value : float, optional
        Entries are kept only when ``value > min_value`` (strictly greater).
        Defaults to 0.1, the cutoff that used to be hard-coded.

    Returns
    -------
    generator of (int, float)
        Pairs converted to plain Python ``int`` / ``float``.
    """
    return ((int(index), float(value))
            for index, value in zip(sparse.indices, sparse.values)
            if value > min_value)


# Layout of the per-country operator frame built in the matching loop:
# running row number, operator id and the name string to be matched.
schema = StructType([
    StructField(name="dummy_id", dataType=LongType(), nullable=False),
    StructField(name="id", dataType=StringType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
])

# Return type of the zip_sparse UDF: an array of (n-gram index, weight) structs.
schema_sparse = ArrayType(
    StructType([
        StructField(name="j_ngram", dataType=IntegerType(), nullable=False),
        StructField(name="value", dataType=FloatType(), nullable=False),
    ])
)

# Layout of the (i, j, similarity) triples emitted by chunk_dot_limit.
sim_schema = StructType([
    StructField(name="i", dataType=IntegerType(), nullable=False),
    StructField(name="j", dataType=IntegerType(), nullable=False),
    StructField(name="SIMILARITY", dataType=FloatType(), nullable=False),
])

udf_zip_sparse = sf.udf(zip_sparse, returnType=schema_sparse)


In [None]:
# Input and output live in the same Azure Data Lake parquet folder.
_parquet_root = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet'
input_file = _parquet_root + '/OPERATORS.parquet'
output_file = _parquet_root + '/OPERATORS_MATCHED.parquet'

In [None]:
# Passed as `ntop` to chunk_dot_limit in the matching loop.
ntop = 100
# Passed as `lower_bound` to chunk_dot_limit in the matching loop.
threshold = 0.8

In [None]:
# Characters stripped from the concatenated name string: they are wrapped in a
# regex character class below and replaced with '' via regexp_replace.
# NOTE(review): inside a character class the sequence `+-/` is a RANGE
# (U+002B..U+002F), so ',' and '.' are removed as well as '+', '-' and '/'.
# If only the three literal characters were intended, the '-' must be escaped
# or moved to the end of the class — confirm before changing, since it alters
# the matching input.
drop_chars = "\\\\!#%&()*+-/:;<=>?@\\^|~\u00A8\u00A9\u00AA\u00AC\u00AD\u00AF\u00B0\u00B1\u00B2\u00B3\u00B6\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u2013\u2014\u2022\u2026\u20AC\u2121\u2122\u2196\u2197\u247F\u250A\u2543\u2605\u2606\u3001\u3002\u300C\u300D\u300E\u300F\u3010\u3011\uFE36\uFF01\uFF06\uFF08\uFF09\uFF1A\uFF1B\uFF1F{}\u00AE\u00F7\u02F1\u02F3\u02F5\u02F6\u02F9\u02FB\u02FC\u02FD\u1BFC\u1BFD\u2260\u2264\u2DE2\u2DF2\uEC66\uEC7C\uEC7E\uED2B\uED34\uED3A\uEDAB\uEDFC\uEE3B\uEEA3\uEF61\uEFA2\uEFB0\uEFB5\uEFEA\uEFED\uFDAB\uFFB7\u007F\u24D2\u2560\u2623\u263A\u2661\u2665\u266A\u2764\uE2B1\uFF0D"
# The braces in "[{}]" are the str.format placeholder; the braces inside
# drop_chars itself are passed through as literal characters of the class.
regex = "[{}]".format(drop_chars)

In [None]:
# Exact (post-cleaning) name strings excluded from matching; rows whose `name`
# equals one of these are filtered out when `all_operators` is built.
drop_names = ['unknown', 'unknown companyad', 'not specified notspecified not specified0 00000', '是否 54534', 'privat']

In [None]:
sqlContext = pyspark.sql.SQLContext(sc)
print("Driver Memory: ", sc._conf.get('spark.driver.memory'))

# Load the operators, derive a composite `id` and the `name` string that is
# actually matched (cleansed name + city + street + zip), then normalise it.
all_operators = (
    sqlContext
    .read.parquet(input_file)
    .na.drop(subset=['NAME_CLEANSED'])
    # NOTE(review): `id` is built before fillna(''), so a null in any of the
    # three parts makes the concat null, which fillna then turns into ''.
    # Confirm this is intended.
    .withColumn('id',
                sf.concat('COUNTRY_CODE', sf.lit("~"),
                          'SOURCE', sf.lit('~'),
                          'REF_OPERATOR_ID'))
    .fillna('')
    .withColumn('name',
                sf.concat('NAME_CLEANSED', sf.lit(' '),
                          sf.col('CITY_CLEANSED'), sf.lit(' '),
                          sf.col('STREET_CLEANSED'), sf.lit(' '),
                          sf.col('ZIP_CODE_CLEANSED')))
    # strip unwanted punctuation / symbols
    .withColumn('name', sf.regexp_replace('name', regex, ''))
    # collapse whitespace runs; raw string so `\s` is not treated as an
    # (invalid) Python escape sequence
    .withColumn('name', sf.trim(sf.regexp_replace('name', r'\s+', ' ')))
    # drop placeholder names such as 'unknown'
    .filter(~sf.col('name').isin(*drop_names))
)
all_operators.persist()

In [None]:
# How often each (country, cleaned name) pair occurs, most frequent first.
name_groups = all_operators.groupby('COUNTRY_CODE', 'name')
all_opr_count = name_groups.count().sort('count', ascending=False)
all_opr_count.persist()

In [None]:
# Print the first 200 rows of the per-name counts without truncating long names.
all_opr_count.show(200, truncate=False)

In [None]:
# Number of operators per country; show the ten smallest countries.
opr_count = all_operators.groupby('COUNTRY_CODE').count()
# keyword was misspelled `ascendint`, so the requested order was never applied
opr_count.sort('count', ascending=True).show(10)

In [None]:
# Countries with more than 100 operators, collected as a plain Python list.
eligible = opr_count[opr_count['count'] > 100]
code_rows = eligible.select('COUNTRY_CODE').distinct()
country_codes = [row[0] for row in code_rows.collect()]

print(country_codes)

In [None]:
# Match operator names within one country at a time and write the matched pairs.
# NOTE(review): only 'US' is processed here; the `country_codes` list computed
# in the previous cell is not used — confirm whether that is intentional.
for country_code in ['US']:
    print("Country code:", country_code)

    # Number the country's operators 0..n-1 (dummy_id) in `id` order; the
    # dummy_id doubles as the row index of the sparse matrix built below.
    operators = (
        all_operators[sf.col('COUNTRY_CODE') == country_code]
        .repartition('id')
        .select('id', 'name')
        .sort('id', ascending=True)
        .rdd
        .zipWithIndex()
        .map(lambda x: (x[1], x[0][0], x[0][1]))
        .toDF(schema)
    )

    # Fit the n-gram TF-IDF pipeline on this country and encode every name.
    nvec = NameVectorizer(n_gram=2, min_df=2, vocab_size=1500).fit(operators)

    sparse_names = (nvec
                    .transform(operators)
                    .select(['dummy_id', 'encoded_vector']))

    # Explode each encoded vector into (dummy_id, n-gram index, weight) rows.
    match_pairs = (
        sparse_names.select('dummy_id', 'encoded_vector')
        .withColumn('explode',
                    sf.explode(udf_zip_sparse(sf.col('encoded_vector'))))
        .withColumn('j_ngram', sf.col('explode').getItem('j_ngram'))
        .withColumn('value', sf.col('explode').getItem('value'))
        .select('dummy_id', 'j_ngram', 'value')
    )

    # Collect to the driver and assemble the (operators x n-grams) CSR matrix.
    df = match_pairs.toPandas()
    if df.empty:
        # Nothing to match; the chunking arithmetic below would also divide
        # by zero on an empty matrix.
        print("No n-gram features, skipping country code:", country_code)
        continue

    df.dummy_id = df.dummy_id.astype(np.int32)
    df.j_ngram = df.j_ngram.astype(np.int32)
    df.value = df.value.astype(np.float64)

    n_matrix = csr_matrix(
        (df.value.values, (df.dummy_id.values, df.j_ngram.values)),
        shape=(df.dummy_id.max() + 1, df.j_ngram.max() + 1),
        dtype=np.float64
    )
    del df

    # The transposed matrix is shared by every task, so broadcast it once.
    m_T_bcast = sc.broadcast(n_matrix.transpose())

    # Split the rows into roughly 1000-row chunks, one Spark partition each;
    # every chunk carries its starting row so results map back to dummy_ids.
    n_rows = n_matrix.shape[0]
    n_chunks = max(1, math.floor(n_rows / 1000))
    chunk_size = math.ceil(n_rows / n_chunks)
    n_chunks = math.ceil(n_rows / chunk_size)
    m_chunks = [
        (n_matrix[(i * chunk_size): min((i + 1) * chunk_size, n_rows)],
         i * chunk_size)
        for i in range(n_chunks)
    ]
    m_distr = sc.parallelize(m_chunks, numSlices=n_chunks)

    # Lazy: nothing is computed until the write at the end of the iteration.
    similarity_rdd = m_distr.flatMap(
        lambda x: chunk_dot_limit(x[0], m_T_bcast.value,
                                  ntop=ntop,
                                  lower_bound=threshold,
                                  start_row=x[1])
    )

    similarity = similarity_rdd.toDF(sim_schema)

    # group similarities with column i being the group id:
    # for every j keep only the row with the smallest i
    # (`Window` is not imported in this notebook, so reach it via pyspark.sql)
    grouping_window = (pyspark.sql.Window
                       .partitionBy('j')
                       .orderBy(sf.asc('i')))

    grp_sim = (
        similarity
        .withColumn("rn", sf.row_number().over(grouping_window))
        .filter(sf.col("rn") == 1)
        .drop('rn')
    )

    # keep only rows whose j never appears as a group id (i) itself
    grp_similarity = grp_sim.join(
        grp_sim.select('j').subtract(grp_sim.select('i')),
        on='j', how='inner'
    )

    # Attach id/name of the source (i) and of the target (j) operator.
    # Join conditions reference columns of the frame actually being joined,
    # not of the upstream `similarity` frame.
    with_source = (
        grp_similarity
        .join(operators, grp_similarity['i'] == operators['dummy_id'],
              how='left')
        .drop('dummy_id')
        .selectExpr('i', 'j', 'id as SOURCE_ID', 'SIMILARITY',
                    'name as SOURCE_NAME')
    )

    matches = (
        with_source
        .join(operators, with_source['j'] == operators['dummy_id'],
              how='left')
        .drop('dummy_id')
        .withColumn('matched_string', sf.lit(
            'NAME_CLEANSED CITY_CLEANSED STREET_CLEANSED ZIP_CODE_CLEANSED'
        ))
        .withColumn('COUNTRY_CODE', sf.lit(country_code))
        .selectExpr('matched_string', 'COUNTRY_CODE', 'SOURCE_ID',
                    'id as TARGET_ID', 'SIMILARITY', 'SOURCE_NAME',
                    'name as TARGET_NAME')
    )

    of = os.path.join(output_file, "country_code={}".format(country_code))
    mode = 'overwrite'
    print("Writing to:", of)
    print("Mode:", mode)

    # The write is the action that actually runs the similarity job.
    (matches
     .write
     .mode(mode)
     .save(of))

    # Release the executors' copies of the broadcast only now that the job
    # using it has finished (the computation above is lazy until the write).
    m_T_bcast.unpersist()

    print("Done, country code:", country_code)