In [None]:
from pyspark import SparkContext, SparkConf
import tensorflow_hub as hub
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from graphframes import GraphFrame
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover, NGram, Normalizer, VectorAssembler, Word2Vec, Word2VecModel, PCA
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.clustering import LDA
from pyspark.mllib.linalg import Vectors, VectorUDT
# Spark configuration for a single local master; memory sizes and the SQL
# broadcast timeout are passed through as plain Spark properties.
conf = (
    SparkConf()
    .setMaster('local')
    .setAppName('local-1616888250368')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.memory', '20g')
    .set('spark.sql.broadcastTimeout', '1000')
)
# getOrCreate() rather than the bare constructor: re-running this cell in a
# live kernel would otherwise fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# 0. Load data

In [None]:
# Raw product records; header=True takes the column names from the first line.
df = spark.read.csv("data/X2.csv", header=True)
# Preview only: calling toPandas() on the full frame collects every row to the
# driver just to render a truncated table.
df.limit(10).toPandas()

# 0. Data cleaning

In [None]:
# Lower-case every column in one projection instead of one withColumn per column.
df = df.select([f.lower(f.col(c)).alias(c) for c in df.columns])
df = df.drop('ssd_capacity')

# --- brand: first word of the brand column, overridden by a known brand
# --- name when one appears in the title.
df = df.withColumn('brand', f.regexp_extract('brand', r"^(\w+)", 0))
computer_brands = ['lenovo', 'acer', 'hp', 'dell', 'asus', 'samsung', 'huawei', 'surface', 'apple']
computer_brands_pattern = '(' + '|'.join(computer_brands) + ')'
brand_in_title = f.regexp_extract('title', computer_brands_pattern, 0)
df = df.withColumn('brand', f.when(brand_in_title != '', brand_in_title).otherwise(f.col('brand')))

# --- cpu_model: keep only the family token (i3/i5/..., pentium, celeron, aN);
# --- when it is missing and the cpu_brand is intel/amd, look for the family
# --- token inside cpu_brand instead.
cpu_brands = ['intel', 'apple', 'amd', 'nvidia', 'arm']
cpu_pattern = '(' + '|'.join(cpu_brands) + ')'
cpu_model_pattern = r'(i\d|pentium|celeron|a\d)'
df = df.withColumn('cpu_model', f.regexp_extract('cpu_model', cpu_model_pattern, 0))
# regexp_extract returns '' (not NULL) when nothing matches, so "missing"
# must cover both cases; testing only for NULL skipped unmatched values.
cpu_model_missing = f.col('cpu_model').isNull() | (f.col('cpu_model') == '')
cpu_brand_is_intel_or_amd = f.regexp_extract('cpu_brand', '(intel|amd)', 0) != ''
df = df.withColumn(
    'cpu_model',
    f.when(cpu_brand_is_intel_or_amd & cpu_model_missing,
           f.regexp_extract('cpu_brand', cpu_model_pattern, 0))
     .otherwise(f.col('cpu_model'))
)

# --- cpu_brand: known vendor found in cpu_brand, otherwise look in the title.
cpu_brand_match = f.regexp_extract('cpu_brand', cpu_pattern, 0)
df = df.withColumn(
    'cpu_brand',
    f.when(cpu_brand_match != '', cpu_brand_match)
     .otherwise(f.regexp_extract('title', cpu_pattern, 0))
)

# --- weight: first number in the string. Values mentioning pounds/lbs are
# --- kept as-is; everything else is multiplied by 2.20462 (kg -> lb) and
# --- rounded to one decimal.
# The previous pattern '(\d+.?\d)' required at least two characters, so a
# single-digit weight such as "5 lbs" produced NULL, and its unescaped '.'
# matched any character.
weight_value = f.regexp_extract('weight', r'(\d+(?:\.\d+)?)', 0).cast(t.DoubleType())
weight_in_pounds = f.col('weight').contains('pounds') | f.col('weight').contains('lbs')
df = df.withColumn(
    'weight',
    f.when(weight_in_pounds, weight_value)
     .otherwise(f.round(weight_value * 2.20462, 1))
)


# 1. Blocking
Blocking is done by feeding a TF-IDF matrix to an LDA model, then extracting
keywords from each title and matching them to the learned topics.

In [None]:
"""UTILITIES"""

"""Returns the df with tokenized columns with stopwords removed"""
def tokenize(df, string_cols):
    """Return `df` with one `<col>_swRemoved` token-array column per input column.

    For every column name in `string_cols` the value is split on non-word
    characters, tokens that are not purely alphabetic or are shorter than
    two characters are discarded, and the domain stop words listed below are
    removed. NULL values are tokenized as the empty string. The intermediate
    `temp` and `<col>_tokens` columns are dropped again.

    Args:
        df: Spark DataFrame containing the columns named in `string_cols`.
        string_cols: iterable of string column names to tokenize.

    Returns:
        A DataFrame with the original columns plus `<col>_swRemoved` for each
        requested column.
    """
    stop_words = ['softwarecity', 'amazon', 'com', 'pc', 'windows', 'computers', 'computer',
                  'accessories', 'laptop', 'notebook', 'kg', 'inch', 'processor', 'memory',
                  'gb', 'ram', 'hdd', 'ssd', 'cpu', 'display', 'hz', 'ghz', 'tb', 'rpm',
                  'slot', 'slots', 'mhz', 'cache', 'ddram', 'dram', 'hd']

    # Built once: the filter does not depend on the column being processed.
    # The comprehension variable is `tok` so it cannot be confused with the
    # `t` (pyspark.sql.types) alias used for the return type.
    keep_alpha_tokens = f.udf(
        lambda tokens: [tok for tok in tokens if tok.isalpha() and len(tok) >= 2],
        t.ArrayType(t.StringType()),
    )

    output = df
    for c in string_cols:
        tokens_col = c + "_tokens"
        # Replace NULL with '' before tokenizing. (A former second coalesce
        # argument, lower(c), was NULL whenever the column itself was NULL.)
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=tokens_col, pattern="\\W")
        remover = StopWordsRemover(inputCol=tokens_col, outputCol=c + "_swRemoved", stopWords=stop_words)

        output = tokenizer.transform(output)
        output = output.withColumn(tokens_col, keep_alpha_tokens(f.col(tokens_col)))
        # Only the stop-word-filtered column survives.
        output = remover.transform(output).drop('temp', tokens_col)
    return output

def generate_blocking_keys(df, token_cols, min_freq=1, k=5, max_iter=1000, seed=42):
    """Add a `blocking_key` column derived from LDA topic keywords.

    Pipeline:
        1 - CountVectorizer -> term frequencies (`rawFeatures`)
        2 - IDF             -> `features`
        3 - L2 Normalizer   -> `tfidf`
        4 - LDA fitted on `tfidf`
    The top three terms of every topic form a keyword set; a record's
    blocking key is the alphabetically sorted list of at most three distinct
    keywords that occur among its tokens.

    Args:
        df: Spark DataFrame holding the token-array columns in `token_cols`.
        token_cols: names of array<string> columns, concatenated into
            `tokens_swRemoved`.
        min_freq: `minDocFreq` passed to IDF.
        k: number of LDA topics.
        max_iter: LDA `maxIter`.
        seed: LDA random seed; passed explicitly so repeated runs start from
            the same random state.

    Returns:
        `df` with `tokens_swRemoved`, `rawFeatures`, `features`, `tfidf` and
        `blocking_key` added. Also prints a preview of `blocking_key`.
    """
    df = df.withColumn('tokens_swRemoved', f.concat(*token_cols))

    cv_model = CountVectorizer(inputCol='tokens_swRemoved', outputCol="rawFeatures").fit(df)
    df_tf = cv_model.transform(df)

    idf_model = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=min_freq).fit(df_tf)
    df_tfidf = idf_model.transform(df_tf)

    output = Normalizer(p=2.0, inputCol='features', outputCol='tfidf').transform(df_tfidf)

    lda_model = LDA(k=k, maxIter=max_iter, featuresCol='tfidf', seed=seed).fit(output)

    # Topic keywords, e.g. topic 1 -> acer, aspire, intel. describeTopics()
    # is tiny (k rows), so the index -> word lookup happens on the driver
    # instead of through a Spark UDF.
    vocab = cv_model.vocabulary
    topic_words = frozenset(
        vocab[term_id]
        for topic_row in lda_model.describeTopics(3).collect()
        for term_id in topic_row['termIndices']
    )

    def get_key(words):
        """Return up to three sorted, distinct topic keywords found in `words`."""
        if not words:
            # NULL or empty token array -> empty key instead of a TypeError.
            return []
        return sorted(topic_words.intersection(words))[:3]

    udf_get_key = f.udf(get_key, t.ArrayType(t.StringType()))
    output = output.withColumn("blocking_key", udf_get_key(f.col("tokens_swRemoved")))
    output.select("blocking_key").show()
    return output

"""Use universal sentence encoder from tensorflow_hub"""
# Module-level cache for the sentence encoder; filled on first use.
MODEL = None


def get_model_magic():
    """Return the cached universal-sentence-encoder, loading it on first call."""
    global MODEL
    if MODEL is not None:
        return MODEL
    MODEL = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    return MODEL

@f.udf(returnType=VectorUDT())
def encode_sentence(x):
    """Spark UDF: embed one string with the shared encoder, return a dense vector."""
    # The model is called on a one-element batch; keep the only row of the result.
    batch_embeddings = get_model_magic()([x]).numpy()
    return Vectors.dense(batch_embeddings[0])

In [None]:
# Attributes used by the following cells; the first two (title, brand) are the
# ones tokenized for blocking.
columns = [
    'title', 'brand',
    'cpu_brand', 'cpu_model',
    'ram_type', 'ram_capacity', 'hdd_capacity',
    'weight',
]
blocking_df = tokenize(df, string_cols=columns[:2])

In [None]:
# `columns` comes from the cell that created `blocking_df`; it is not defined
# a second time here so the two lists cannot drift apart.
for c in columns:
    # NULL attributes are embedded as the empty string.
    blocking_df = blocking_df.withColumn(
        c + '_encoding',
        encode_sentence(f.coalesce(f.col(c), f.lit(''))),
    )

# Blocking keys are built from the stop-word-filtered title and brand tokens.
blocking_df = generate_blocking_keys(
    blocking_df,
    [c + '_swRemoved' for c in columns[:2]],
)

In [None]:
# Number of records that share each blocking key.
blocking_df.groupBy('blocking_key').count().show()

# 2. Candidate pairs generation and match likelihood

In [None]:
"""
This cell output a candidates dataframe that has
instance_ids pairs that makes sense to compare, i.e each
entity will be paired with another entity from the same block
"""
# Columns carried from blocking_df into the pair/label frames.
cols_to_keep = ['instance_id', 'title_encoding']

# One row per record: its id and its title embedding.
node = blocking_df.select(f.col('instance_id').alias('id'), 'title_encoding')
node.select('title_encoding').limit(1).show()
print(blocking_df.columns)

# Ids of every block holding more than one record, one (blocking_key, id) row
# each. Only the id and the key are projected: the embedding column is not
# needed by the aggregation.
pairs = (
    blocking_df.select('instance_id', 'blocking_key')
    .groupby('blocking_key')
    .agg(f.count('instance_id').alias('size'), f.collect_set('instance_id').alias('id'))
    .filter(f.col('size') > 1)
    .select('blocking_key', f.explode('id').alias('id'))
)
pairs.show()

left = pairs.withColumnRenamed('id', 'src')
right = pairs.withColumnRenamed('id', 'dst')
# Candidates: all pairs inside a block. src < dst keeps each unordered pair
# once and removes self-pairs.
candidates = (
    left.join(right, ['blocking_key'], 'inner')
    .filter(f.col('src') < f.col('dst'))
    .select('src', 'dst')
    .distinct()
)

In [None]:
"""
@f.udf(returnType=t.DoubleType())
def dot(x, y):
  if x is not None and y is not None:
    return float(x.dot(y))
  else:
    return 0

def null_safe_levenshtein_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .otherwise(1 - f.levenshtein(c1, c2) / f.greatest(f.length(c1), f.length(c2)))
  return output

def null_safe_num_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.col(c1) == 0) & (f.col(c2) == 0), 1)\
            .when((f.col(c1) == 0) | (f.col(c2) == 0), 0)\
            .otherwise(1 - f.abs(f.col(c1) - f.col(c2)) / f.greatest(c1, c2))
  return output

def null_safe_token_overlap(c1, c2):
  # is the overlap a significant part of the shorter string
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.size(f.array_distinct(c1)) == 0) | (f.size(f.array_distinct(c2)) == 0), 0)\
            .otherwise(f.size(f.array_intersect(c1, c2)) / f.least(f.size(f.array_distinct(c1)), f.size(f.array_distinct(c1))))
  return output

def calc_sim(df, candidates):
    metrics = []
    for c in columns[:2]:
        if '_encoding' not in c:
            candidates = candidates.withColumn(c+'_lev', null_safe_levenshtein_sim(df.filter(df.id == candidates.src).select(c),df.filter(df.id == candidates.dst).select(c)))
            metrics.append(c+'_lev')
        else:
            metrics.append(c+'_sim')
            candidates = candidates.withColumn(c+'_sim', dot(df.filter(df.id == candidates.src).select(c), df.filter(df.id == candidates.dst).select(c)))
    candidates = candidates.withColumn('tfidf_sim', dot(df.filter(df.id == candidates.src).select('tfidf'),df.filter(df.id == candidates.dst).select('tfidf')))
    candidates = candidates.withColumn('token_sim', dot(df.filter(df.id == candidates.src).select('tokens_swRemoved'), df.filter(df.id == candidates.dst).select('tokens_swRemoved')))
    candidates = candidates.withColumn('weight_sim', dot(df.filter(df.id == candidates.src).select('weight'),df.filter(df.id == candidates.dst).select('weight')))
    metrics.append('tfidf_sim')
    metrics.append('token_sim')
    metrics.append('weigth_sim')
    def sum_distance(distances):
        return sum(d for d in distances)
    udf_sum = f.udf(sum_distance, t.DoubleType())
    candidates = candidates.withColumn('sum_sim', udf_sum([f.col(c) for c in metrics]))
    udf_norm = f.udf(lambda d : d / len(metrics))
    candidates = candidates.withColumn('overall_sim', udf_norm(f.col('sum_sim'))).drop(f.col('sum_sim'))
    return candidates

distance_df = calc_sim(node, candidates)
"""

In [None]:
# Labelled pairs; the two id columns are shortened to lid / rid.
labels = (
    spark.read.csv("data/Y2.csv", header=True)
    .withColumnRenamed('left_instance_id', 'lid')
    .withColumnRenamed('right_instance_id', 'rid')
)
labels.show()
#label_df = labels.join(candidates.withColumnRenamed('src','lid').withColumnRenamed('dst','rid'), ['lid','rid'], 'inner')

# Every kept column except instance_id gets an l_ / r_ prefixed copy.
encoding_cols = cols_to_keep[1:]

# Left record: join node on lid, drop node's id, prefix its columns with l_.
label_df = labels.join(node.alias("node_1"), labels.lid == node.id, 'inner').drop('id')
for name in encoding_cols:
    label_df = label_df.withColumnRenamed(name, 'l_' + name)
print(label_df.columns)

# Right record: same steps on rid with the r_ prefix.
label_df = label_df.alias('one').join(node.alias("node_2"), label_df.rid == node.id, 'inner').drop('id')
for name in encoding_cols:
    label_df = label_df.withColumnRenamed(name, 'r_' + name)

In [None]:
def toList(row):
    """Flatten an iterable of numeric sequences into a single list of floats."""
    return [float(n) for v in row for n in v]
# Spark UDF wrapper around toList, declared as returning array<float>.
udf_toList = f.udf(toList, returnType=t.ArrayType(t.FloatType()))
# features = left title encoding followed by right title encoding, flattened.
encoding_pair = f.array('l_title_encoding', 'r_title_encoding')
label_df = label_df.withColumn('features', udf_toList(encoding_pair))