# 0. Load data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover, NGram, Normalizer, VectorAssembler, Word2Vec, Word2VecModel, PCA

# Obtain the notebook's Spark entry point: getOrCreate() returns the already
# running session if there is one, otherwise it starts a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the raw CSV, using its first row as the column names.
df = (
    spark.read
    .option("header", True)
    .csv("data/X2.csv")
)
# Collect the whole table to the driver as a pandas frame so the notebook
# renders it as the cell output.
df.toPandas()

# 0. Cleaning

In [None]:
# Re-read the CSV so this cell always starts from the unmodified data.
df = spark.read.csv("data/X2.csv", header=True)

# Lower-case every column in a single projection. Calling withColumn() once
# per column in a loop stacks one projection per column and needlessly grows
# the query plan.
df = df.select([f.lower(f.col(c)).alias(c) for c in df.columns])

# The patterns below are raw strings so that "\w" and "\d" reach the regex
# engine untouched instead of being parsed as (invalid) Python escapes.

# Keep only the leading run of word characters of the brand.
df = df.withColumn('brand', f.regexp_extract('brand', r"^(\w+)", 1))
# Reduce cpu_brand to the first of the listed vendor names found in it.
df = df.withColumn('cpu_brand', f.regexp_extract('cpu_brand', r'(intel|amd|apple)', 1))
# For Intel rows keep only the "i<digit>" tag (e.g. "i5"); every other row
# keeps its cpu_model unchanged.
df = df.withColumn(
    'cpu_model',
    f.when(
        df.cpu_brand == 'intel',
        f.regexp_extract('cpu_model', r'(i\d)', 1),
    ).otherwise(df.cpu_model),
)
df.select('cpu_model').show()

# 1. Blocking

In [None]:
# Split each title into tokens on non-word characters.
tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="title",
    outputCol="title_tokens",
)
# Drop only the two listed tokens; passing stopWords replaces the remover's
# default stop-word list rather than extending it.
remover = StopWordsRemover(
    inputCol="title_tokens",
    outputCol="title_tokens_nosw",
    stopWords=["amazon", "com"],
)
# Tokenise first, then filter the resulting token column.
output = remover.transform(tokenizer.transform(df))
output.toPandas()

In [None]:
def top_kw_from_tfidf(vocab, n=3):
    """Build a Spark UDF that returns the highest-weighted terms of a vector.

    Args:
        vocab: Object whose ``.value`` maps a term index to its term string
            (used here with a broadcast of the CountVectorizer vocabulary).
        n: Maximum number of terms to return per row. Values <= 0 yield an
            empty list.

    Returns:
        A UDF that takes one vector column and returns ``array<string>``,
        ordered from highest to lowest weight. Rows whose vector is null
        produce an empty list.

    Note:
        The vector must expose ``.indices`` and ``.values`` (presumably a
        SparseVector, as produced by CountVectorizer/IDF -- confirm before
        using it on dense vectors).
    """
    @f.udf(returnType=t.ArrayType(t.StringType()))
    def _(arr):
        # A null vector would otherwise raise inside the executor and fail
        # the whole job. n <= 0 is guarded because ``[-0:]`` is the full
        # array and a negative n would slice from the wrong end.
        if arr is None or n <= 0:
            return []

        inds = arr.indices
        vals = arr.values
        # Positions (within the stored entries) of the n largest weights,
        # largest first.
        top_positions = vals.argsort()[-n:][::-1]
        # Translate stored positions -> vocabulary indices -> term strings.
        return [vocab.value[int(k)] for k in inds[top_positions]]

    return _

In [None]:
# Term-count vectors over the stop-word-filtered title tokens.
cv = CountVectorizer(
    minDF=1.0,
    inputCol="title_tokens_nosw",
    outputCol="raw_features",
)
cvModel = cv.fit(output)
featurizedData = cvModel.transform(output)

# Rescale the raw counts by inverse document frequency.
idf = IDF(
    minDocFreq=1.0,
    inputCol="raw_features",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Broadcast the fitted vocabulary so the UDF can turn vector indices back
# into term strings.
vocab = spark.sparkContext.broadcast(cvModel.vocabulary)

# Display the five highest-weighted terms of every title.
(
    rescaledData
    .withColumn('top_tokens', top_kw_from_tfidf(vocab, n=5)(f.col("features")))
    .select("top_tokens")
    .toPandas()
)

# To inspect the tokens next to their TF-IDF vectors instead:
#rescaledData.select("title_tokens_nosw", "features").toPandas()

In [None]:
# Take the first space-separated word of the brand as brand_name, then show
# how many rows fall under each brand_name.
split_col = f.split(df["brand"], " ")
df = df.withColumn("brand_name", split_col.getItem(0))
df.groupBy("brand_name").count().show()