# 0. Load data

In [None]:

import tensorflow_hub as hub
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w

from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, IDF,  Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover, NGram, Normalizer, VectorAssembler, Word2Vec, Word2VecModel, PCA
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.clustering import LDA
from pyspark.mllib.linalg import Vectors, VectorUDT

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the raw product records; header=True takes the column names from the
# first row of the CSV file.
df = spark.read.csv("data/X2.csv", header=True)

# 0. Data cleaning

In [None]:
# Lowercase every column so that all regex matching below is effectively
# case-insensitive.
for c in df.columns:
    df = df.withColumn(c, f.lower(f.col(c)))

# --- brand: keep the leading word of the declared brand, but prefer a known
# brand name whenever one appears in the title.
df = df.withColumn('brand', f.regexp_extract('brand', r"^(\w+)", 0))
computer_brands = ['lenovo', 'acer', 'hp', 'dell', 'asus', 'samsung', 'huawei', 'surface', 'apple']
# One capturing alternation: "(lenovo|acer|...|apple)".
# NOTE(review): there are no word boundaries, so short names such as "hp" can
# match inside longer words -- confirm this is acceptable.
computer_brands_pattern = '(' + '|'.join(computer_brands) + ')'
brand_in_title = f.regexp_extract('title', computer_brands_pattern, 0)
df = df.withColumn(
    'brand',
    f.when(brand_in_title != '', brand_in_title).otherwise(df.brand),
)

# --- cpu_model: when the cpu_brand text mentions intel and no model is given,
# pull the model family (i3/i5/..., pentium, celeron) out of cpu_brand.
cpu_brands = ['intel', 'apple', 'amd', 'nvidia', 'arm']
cpu_pattern = '(' + '|'.join(cpu_brands) + ')'
mentions_intel = f.regexp_extract('cpu_brand', 'intel', 0) != ''
df = df.withColumn(
    'cpu_model',
    f.when(
        mentions_intel & f.isnull(df.cpu_model),
        f.regexp_extract('cpu_brand', r'(i\d|pentium|celeron)', 0),
    ).otherwise(df.cpu_model),
)

# --- cpu_brand: normalise to one of the known vendors; fall back to the title
# when the cpu_brand column itself names none of them.
# The whole pattern is a single group, so group 0 and group 1 are the same text.
cpu_brand_declared = f.regexp_extract('cpu_brand', cpu_pattern, 0)
df = df.withColumn(
    'cpu_brand',
    f.when(cpu_brand_declared != '', cpu_brand_declared)
    .otherwise(f.regexp_extract('title', cpu_pattern, 0)),
)

# --- weight: first number found in the text, expressed in pounds.
# Values labelled "pounds"/"lbs" are kept as they are; every other value is
# multiplied by 2.20462 and rounded to one decimal.
# NOTE(review): this treats every non-pound value as kilograms -- confirm no
# other units (oz, g) occur in the data.
weight_value = f.regexp_extract('weight', r'(\d+\.?\d*)', 0).cast(t.DoubleType())
is_pounds = df.weight.contains('pounds') | df.weight.contains('lbs')
df = df.withColumn(
    'weight',
    f.when(is_pounds, weight_value).otherwise(f.round(weight_value * 2.20462, 1)),
)
df.select('weight').show()

# 1. Blocking

In [None]:
"""UTILITIES"""

"""Returns the df with tokenized columns with stopwords removed"""
def tokenize(df, string_cols):
    """Tokenize string columns and remove domain-specific stop words.

    For each column ``c`` in ``string_cols`` the text is split on non-word
    characters, the first ten tokens are kept, tokens that are not purely
    alphabetic or are shorter than two characters are dropped, and the stop
    words listed below are removed. The result is stored in a new array
    column named ``c + "_swRemoved"``.

    A scratch column named ``temp`` is created and dropped for every input
    column, so an existing ``temp`` column in ``df`` will not survive.
    The intermediate and final token columns are printed with ``show()``.

    Args:
        df: Spark DataFrame containing the columns in ``string_cols``.
        string_cols: Names of the string columns to tokenize.

    Returns:
        The DataFrame with one extra ``<column>_swRemoved`` column per input
        column.
    """
    stop_words = [
        'softwarecity', 'amazon', 'com', 'pc', 'windows', 'computers',
        'computer', 'accessories', 'laptop', 'notebook', 'kg', 'inch',
        'processor', 'memory', 'gb', 'ram', 'hdd', 'ssd', 'cpu', 'display',
        'hz', 'ghz', 'tb', 'mhz', 'cache', 'ddram', 'dram', 'hd',
    ]

    # Built once instead of once per column: the filter does not depend on
    # the column being processed.
    # NOTE(review): isalpha() also discards tokens containing digits (e.g.
    # "i5"); confirm that isalnum() was not the intent.
    keep_alpha_tokens = f.udf(
        lambda tokens: [tok for tok in tokens[:10] if tok.isalpha() and len(tok) >= 2],
        t.ArrayType(t.StringType()),
    )

    output = df
    for c in string_cols:
        tokens_col = c + "_tokens"
        cleaned_col = c + "_swRemoved"

        # Replace nulls with '' so the tokenizer always receives a string.
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=tokens_col, pattern="\\W")
        remover = StopWordsRemover(inputCol=tokens_col, outputCol=cleaned_col, stopWords=stop_words)

        output = tokenizer.transform(output)
        output = output.withColumn(tokens_col, keep_alpha_tokens(f.col(tokens_col)))
        output.select(tokens_col).show()

        # Only the stop-word-free column is kept; the scratch columns go away.
        output = remover.transform(output).drop('temp', tokens_col)
        output.select(cleaned_col).show()

    return output

def generate_blocking_keys(df, token_cols, min_freq=1, num_topics=3, max_iter=500):
    """Assign every row a blocking key derived from an LDA topic model.

    Pipeline:
        1 - concatenate the token columns into ``tokens_swRemoved``
        2 - CountVectorizer -> term frequencies (``rawFeatures``)
        3 - IDF -> ``features``
        4 - L2 Normalizer -> ``tfidf``
        5 - LDA fitted on ``tfidf``; the key is the index of the topic with
            the highest weight in ``topicDistribution``

    The top two terms of every topic, the topic distributions and the
    resulting keys are printed with ``show()``.

    Args:
        df: Spark DataFrame holding the token columns.
        token_cols: Names of the columns to concatenate; they are expected to
            be token arrays such as the ``*_swRemoved`` columns.
        min_freq: ``minDocFreq`` passed to the IDF stage.
        num_topics: Number of LDA topics, i.e. number of distinct blocking keys.
        max_iter: Maximum number of LDA iterations.

    Returns:
        The transformed DataFrame with, among the intermediate columns, an
        integer ``blocking_key`` column.
    """
    df = df.withColumn('tokens_swRemoved', f.concat(*token_cols))

    cv = CountVectorizer(inputCol='tokens_swRemoved', outputCol="rawFeatures")
    cvmodel = cv.fit(df)
    df_vect = cvmodel.transform(df)

    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=min_freq)
    df_idf = idf.fit(df_vect).transform(df_vect)

    normalizer = Normalizer(p=2.0, inputCol='features', outputCol='tfidf')
    output = normalizer.transform(df_idf)

    # NOTE(review): no seed is set on LDA, so keys may differ between runs --
    # confirm whether reproducibility matters here.
    lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tfidf')
    lda_model = lda.fit(output)
    vocab = cvmodel.vocabulary

    def get_words(token_list):
        """Map vocabulary indices back to their terms."""
        return [vocab[token_id] for token_id in token_list]

    udf_to_words = f.udf(get_words, t.ArrayType(t.StringType()))

    # Diagnostic output only: the two strongest terms of each topic.
    lda_model.describeTopics(2)\
        .withColumn('topicWords', udf_to_words(f.col('termIndices')))\
        .show()

    output = lda_model.transform(output)
    output.select('topicDistribution').show()

    def get_key(topicDistr):
        """Return the index of the largest topic weight (last one on ties)."""
        best_index = 0
        best_weight = 0.0
        for index, weight in enumerate(topicDistr):
            # ">=" keeps the original tie-breaking: the later topic wins.
            if weight >= best_weight:
                best_weight = weight
                best_index = index
        return best_index

    udf_get_key = f.udf(get_key, t.IntegerType())
    output = output.withColumn("blocking_key", udf_get_key(f.col("topicDistribution")))
    output.select("blocking_key").show()
    return output

"""Use universal sentence encoder from tensorflow_hub"""
# Process-wide cache for the sentence encoder; filled on first use.
MODEL = None


def get_model_magic():
    """Return the cached sentence encoder, loading it on the first call.

    The loaded model is stored in the module-level ``MODEL`` so later calls
    in the same process return it without calling ``hub.load`` again.
    """
    global MODEL
    if MODEL is not None:
        return MODEL
    MODEL = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    return MODEL

@f.udf(returnType=VectorUDT())
def encode_sentence(x):
    """Spark UDF: embed one string with the cached sentence encoder.

    Args:
        x: The text of a single row, or None for a SQL NULL.

    Returns:
        A dense vector holding the embedding of ``x``, or None when ``x`` is
        None.
    """
    if x is None:
        # A SQL NULL arrives as None; there is nothing to embed, so propagate
        # the null instead of handing None to the encoder.
        return None
    model = get_model_magic()
    # The encoder works on batches: wrap the text in a one-element list and
    # take the single resulting row.
    emb = model([x]).numpy()[0]
    return Vectors.dense(emb)

In [None]:
# Text columns used to build the blocking keys.
columns = ['brand','cpu_brand','title']
# Adds one '<column>_swRemoved' token-array column per entry in `columns`.
blocking_df = tokenize(df, columns)

In [None]:
# Derive a 'blocking_key' for every row from the stop-word-free token columns.
blocking_df = generate_blocking_keys(blocking_df, [c+'_swRemoved' for c in columns])

In [None]:
# Print the number of rows that fall under each blocking key.
blocking_df.groupby('blocking_key').count().show()

# 2. Candidate pairs match generation

In [None]:
import numpy as np
from graphframes import GraphFrame



