In [1]:
from pyspark import SparkContext, SparkConf
import tensorflow_hub as hub
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from graphframes import GraphFrame
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover, NGram, Normalizer, VectorAssembler, Word2Vec, Word2VecModel, PCA

from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import VectorUDT, Vectors

# Spark configuration for this notebook: local master, explicit executor and
# driver memory, and a longer SQL broadcast timeout.
conf = SparkConf()
conf.setMaster('local')
conf.setAppName('sigmod-21')
conf.set('spark.executor.memory', '2G')
conf.set('spark.driver.memory', '4G')
conf.set('spark.sql.broadcastTimeout', '1000')

sc = SparkContext(conf=conf)
# Session handle used by every later cell for reading CSV files.
spark = SparkSession.builder.getOrCreate()

# 0. Load data

In [2]:
# Product-specification table; the first CSV row supplies the column names.
df = spark.read.csv(
    "data/X2.csv",
    header=True,
)
# Last expression of the cell: display the whole table as a pandas frame.
df.toPandas()

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title
0,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,,1.80 kg,,"""Lenovo Thinkpad X230 34352jf Tablet Pc - 12.5..."
1,www.isupplyhub.com//1256,Acer,1.6 GHz Intel Core i5-4200U. Intel Core I5,,1.6 GHz Intel Core i5-4200U,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...
2,www.isupplyhub.com//326,Acer,1.6 GHz Intel Core i5. Intel Core I5,,1.6 GHz Intel Core i5,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...
3,www.isupplyhub.com//821,HP,,,,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"""Amazon.com : 15.6"""" HP 15-f009wm Amd Dual-Cor..."
4,www.isupplyhub.com//157,Asus,1.7 GHz Core i5-3317U. Intel,,1.7 GHz Core i5-3317U,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,www.vology.com//873,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,180 GB SSD. 180 GB SSD. Lenovo ThinkPad X230 2...,180 GB SSD. 180 GB SSD,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2320 - 12.5"""" - Core i5 ..."
339,www.vology.com//823,Lenovo ThinkPad X230 2325 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2325 - 12.5"""" - Core i5 ..."
340,www.vology.com//2723,Lenovo ThinkPad X230 Tablet 3438 - 12.5 '' - C...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,"""Lenovo ThinkPad X230 Tablet 3438 - 12.5"""" - C..."
341,www.vology.com//1349,Lenovo ThinkPad X230 2324 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm. ...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2324 - 12.5"""" - Core i5 ..."


# 0. Data cleaning

In [3]:
# Lowercase every column so the regex-based extraction below does not have to
# deal with mixed case.
for c in df.columns:
    df = df.withColumn(c, f.lower(f.col(c)))

# Brand: keep the first word of the brand field, but prefer a known brand name
# found in the title when there is one.
df = df.drop('ssd_capacity')
df = df.withColumn('brand', f.regexp_extract('brand', r"^(\w+)", 0))
computer_brands = ['lenovo', 'acer', 'hp', 'dell', 'asus', 'samsung', 'huawei', 'surface', 'apple']
computer_brands_pattern = '(' + '|'.join(computer_brands) + ')'
brand_from_title = f.regexp_extract('title', computer_brands_pattern, 0)
df = df.withColumn('brand', f.when(brand_from_title != '', brand_from_title).otherwise(f.col('brand')))

# CPU model: extract the family token (i3/i5/..., pentium, celeron, a4/a6/...)
# from cpu_model, falling back to cpu_brand when cpu_model yields nothing.
cpu_brands = ['intel', 'apple', 'amd', 'nvidia', 'arm']
cpu_pattern = '(' + '|'.join(cpu_brands) + ')'
cpu_model_pattern = r'(i\d|pentium|celeron|a\d)'
df = df.withColumn('cpu_model', f.regexp_extract('cpu_model', cpu_model_pattern, 0))
# regexp_extract yields '' (not null) when a non-null value does not match, so
# both cases must count as "missing" for the fallback to trigger.
cpu_model_missing = f.isnull(f.col('cpu_model')) | (f.col('cpu_model') == '')
df = df.withColumn(
    'cpu_model',
    f.when((f.regexp_extract('cpu_brand', '(intel|amd)', 0) != '') & cpu_model_missing,
           f.regexp_extract('cpu_brand', cpu_model_pattern, 0))
     .otherwise(f.col('cpu_model')))

# CPU brand: take the vendor name from cpu_brand, otherwise look for it in the title.
df = df.withColumn(
    'cpu_brand',
    f.when(f.regexp_extract('cpu_brand', cpu_pattern, 0) != '',
           f.regexp_extract('cpu_brand', cpu_pattern, 1))
     .otherwise(f.regexp_extract('title', cpu_pattern, 0)))

# Weight: first number in the field, expressed in pounds. Values that mention
# 'pounds' or 'lbs' are kept as-is; everything else is multiplied by the
# kg -> lb factor. The pattern accepts plain integers ("4 lbs") as well as
# decimals ("1.80 kg"); the previous pattern required two digits and treated
# '.' as a wildcard, so single-digit weights were lost.
# NOTE(review): any unit other than pounds/lbs is treated as kilograms.
kg_to_lb = 2.20462
weight_value = f.regexp_extract('weight', r'(\d+(?:\.\d+)?)', 0).cast(t.DoubleType())
weight_in_pounds = f.col('weight').contains('pounds') | f.col('weight').contains('lbs')
df = df.withColumn(
    'weight',
    f.when(weight_in_pounds, weight_value)
     .otherwise(f.round(weight_value * kg_to_lb, 1)))


# 1. Blocking
Blocking is done by feeding a TF-IDF matrix to an LDA model, taking the top
keywords of each topic, and using the topic keywords found in each record's
title and brand tokens as that record's blocking key.

In [4]:
"""UTILITIES"""

"""Returns the df with tokenized columns with stopwords removed"""
def tokenize(df, string_cols):
    """Tokenize string columns and remove stop words.

    For every column ``c`` in ``string_cols`` the returned frame gains an
    array column ``c + "_swRemoved"``. Null values are treated as the empty
    string, the text is split on non-word characters, only purely alphabetic
    tokens of at least two characters are kept (so tokens containing digits
    are dropped), and the domain stop words listed below are removed.

    Args:
        df: input Spark DataFrame.
        string_cols: names of the string columns to tokenize.

    Returns:
        ``df`` with one extra ``<col>_swRemoved`` column per input column.
        The helper columns ``temp`` and ``<col>_tokens`` are dropped again.
    """
    stop_words = ['softwarecity', 'amazon', 'com', 'pc', 'windows', 'computers', 'computer',
                  'accessories', 'laptop', 'notebook', 'kg', 'inch', 'processor', 'memory',
                  'gb', 'ram', 'hdd', 'ssd', 'cpu', 'display', 'hz', 'ghz', 'tb', 'rpm',
                  'slot', 'slots', 'mhz', 'cache', 'ddram', 'dram', 'hd']

    # Built once: the filter does not depend on the column being processed.
    keep_alpha_tokens = f.udf(
        lambda tokens: [tok for tok in tokens if tok.isalpha() and len(tok) >= 2],
        t.ArrayType(t.StringType()))

    output = df
    for c in string_cols:
        tokens_col = c + "_tokens"
        # 'temp' holds the null-safe copy of the column that gets tokenized.
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=tokens_col, pattern="\\W")
        remover = StopWordsRemover(inputCol=tokens_col, outputCol=c + "_swRemoved", stopWords=stop_words)

        output = tokenizer.transform(output)
        output = output.withColumn(tokens_col, keep_alpha_tokens(f.col(tokens_col)))
        output = remover.transform(output).drop('temp', tokens_col)
    return output

def generate_blocking_keys(df, token_cols, min_freq=1, num_topics=5, max_iter=1000,
                           words_per_topic=3, max_key_len=3, seed=None):
    """Attach an LDA-keyword blocking key to every row.

    Pipeline:
        1 - CountVectorizer -> TF
        2 - IDF, followed by L2 normalisation
        3 - LDA on the normalised TF-IDF vectors

    The top ``words_per_topic`` terms of every topic form a keyword pool. A
    row's blocking key is the sorted, de-duplicated list of its tokens that
    belong to that pool, cut to at most ``max_key_len`` entries
    (e.g. ``[acer, aspire]``).

    Args:
        df: DataFrame holding the token array columns.
        token_cols: names of the array columns to concatenate into
            ``tokens_swRemoved``.
        min_freq: ``minDocFreq`` passed to IDF.
        num_topics: number of LDA topics (``k``).
        max_iter: maximum number of LDA iterations.
        words_per_topic: number of top terms taken from each topic.
        max_key_len: maximum number of keywords in a blocking key.
        seed: optional LDA seed; when None, LDA keeps its own default.

    Returns:
        ``df`` extended with ``tokens_swRemoved``, ``rawFeatures``,
        ``features``, ``tfidf`` and ``blocking_key``. The first rows of
        ``blocking_key`` are also printed via ``show()``.
    """
    df = df.withColumn('tokens_swRemoved', f.concat(*token_cols))
    cv = CountVectorizer(inputCol='tokens_swRemoved', outputCol="rawFeatures")
    cvmodel = cv.fit(df)
    df_vect = cvmodel.transform(df)

    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=min_freq)
    df_idf = idf.fit(df_vect).transform(df_vect)

    normalizer = Normalizer(p=2.0, inputCol='features', outputCol='tfidf')
    output = normalizer.transform(df_idf)

    # Only pass the seed when the caller supplied one.
    lda_kwargs = dict(k=num_topics, maxIter=max_iter, featuresCol='tfidf')
    if seed is not None:
        lda_kwargs['seed'] = seed
    lda_model = LDA(**lda_kwargs).fit(output)

    # Map the term indices of each topic back to vocabulary words on the
    # driver, i.e. topic 1 -> acer, aspire, intel.
    vocab = cvmodel.vocabulary
    topic_words = set()
    for row in lda_model.describeTopics(words_per_topic).select('termIndices').collect():
        topic_words.update(vocab[term_id] for term_id in row['termIndices'])

    def get_key(words):
        """Return the sorted topic keywords present in ``words`` (at most ``max_key_len``)."""
        return sorted(set(words) & topic_words)[:max_key_len]

    udf_get_key = f.udf(get_key, t.ArrayType(t.StringType()))
    output = output.withColumn("blocking_key", udf_get_key(f.col("tokens_swRemoved")))
    output.select("blocking_key").show()
    return output

"""Use universal sentence encoder from tensorflow_hub"""
# Module-level cache for the sentence encoder; stays None until first use.
MODEL = None
def get_model_magic():
    """Return the cached encoder, loading it from TF Hub on the first call."""
    global MODEL
    if MODEL is not None:
        return MODEL
    MODEL = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    return MODEL

@f.udf(returnType=VectorUDT())
def encode_sentence(x):
    """Embed a single sentence and return the embedding as a dense ML vector.

    The sentence is passed to the model as a one-element batch and the first
    (only) row of the result is returned.
    """
    embeddings = get_model_magic()([x]).numpy()
    return Vectors.dense(embeddings[0])

In [5]:
# Attribute columns of interest; only the first two (title, brand) are
# tokenized for blocking.
columns = [
    'title',
    'brand',
    'cpu_brand',
    'cpu_model',
    'ram_type',
    'ram_capacity',
    'hdd_capacity',
    'weight',
]
blocking_df = tokenize(df, columns[:2])

In [6]:
# Generate blocking keys from the stop-word-free title and brand tokens.
# Per-column sentence encodings are disabled; kept here for reference:
# for c in columns:
#     blocking_df = blocking_df.withColumn(c+'_encoding', encode_sentence(f.coalesce(f.col(c), f.lit(''))))
blocking_df = generate_blocking_keys(
    blocking_df,
    [name + '_swRemoved' for name in columns[:2]],
)

# Sentence-embed the title; a null title is encoded as the empty string.
blocking_df = blocking_df.withColumn(
    'title_encoding',
    encode_sentence(f.coalesce(f.col('title'), f.lit(''))),
)

+--------------------+
|        blocking_key|
+--------------------+
|  [lenovo, thinkpad]|
|      [acer, aspire]|
|[acer, aspire, dr...|
|                [hp]|
|                  []|
|[drive, lenovo, t...|
|      [acer, aspire]|
|      [acer, aspire]|
|[drive, lenovo, t...|
|      [acer, aspire]|
|    [dell, inspiron]|
|[dell, drive, ins...|
|    [dell, inspiron]|
|      [acer, aspire]|
|      [acer, aspire]|
|      [acer, aspire]|
|  [lenovo, thinkpad]|
|    [dell, inspiron]|
|      [acer, aspire]|
|      [acer, aspire]|
+--------------------+
only showing top 20 rows



In [7]:
blocking_df.groupby('blocking_key').count().show()

+--------------------+-----+
|        blocking_key|count|
+--------------------+-----+
|[acer, aspire, home]|   22|
|[dell, drive, ins...|    4|
|                  []|    2|
|[acer, aspire, dr...|    7|
|[downgrade, elite...|    2|
|[elitebook, hp, pro]|    2|
|      [acer, aspire]|   32|
|[downgrade, lenov...|   28|
|[home, lenovo, th...|    3|
|[drive, lenovo, t...|    2|
|[elitebook, folio...|    8|
|[lenovo, pro, thi...|  120|
|         [drive, hp]|    1|
|            [lenovo]|    1|
|                [hp]|    7|
|    [dell, inspiron]|    5|
|[carbon, lenovo, ...|   52|
|  [lenovo, thinkpad]|    7|
|     [elitebook, hp]|   17|
|[carbon, downgrad...|   21|
+--------------------+-----+



# 2. Candidate pairs generation and match likelihood

In [8]:
"""
This cell output a candidates dataframe that has
instance_ids pairs that makes sense to compare, i.e each
entity will be paired with another entity from the same block
"""
#cols_to_keep = [c+'_encoding' for c in columns]
#for c in columns:
#    cols_to_keep.append(c)
#cols_to_keep.append('tokens_swRemoved')
#cols_to_keep.append('tfidf')
#cols_to_keep.append('instance_id')
cols_to_keep = ['instance_id', 'title_encoding']

# Node table: one row per entity, keyed by `id`, carrying the title embedding.
node = blocking_df.select(f.col('instance_id').alias('id'), 'title_encoding')
node.select('title_encoding').limit(1).show()
print(blocking_df.columns)

# One row per (blocking_key, member id), restricted to blocks with more than
# one member.
pairs = (
    blocking_df
    .select(*cols_to_keep, 'blocking_key')
    .groupby('blocking_key')
    .agg(
        f.count('instance_id').alias('size'),
        f.collect_set('instance_id').alias('id'),
    )
    .filter(f.col('size') > 1)
    .select('blocking_key', f.explode('id').alias('id'))
)
pairs.show()

# Self-join inside each block; src < dst keeps each unordered pair once and
# drops self-pairs.
left = pairs.withColumnRenamed('id', 'src')
right = pairs.withColumnRenamed('id', 'dst')
candidates = (
    left.join(right, ['blocking_key'], 'inner')
    .filter(f.col('src') < f.col('dst'))
    .select('src', 'dst')
    .distinct()
)

+--------------------+
|      title_encoding|
+--------------------+
|[0.01956569217145...|
+--------------------+

['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type', 'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency', 'hdd_capacity', 'weight', 'dimensions', 'title', 'title_swRemoved', 'brand_swRemoved', 'tokens_swRemoved', 'rawFeatures', 'features', 'tfidf', 'blocking_key', 'title_encoding']
+--------------------+--------------------+
|        blocking_key|                  id|
+--------------------+--------------------+
|[acer, aspire, home]|       buy.net//1992|
|[acer, aspire, home]|www.flexshopper.c...|
|[acer, aspire, home]|       buy.net//2012|
|[acer, aspire, home]|        buy.net//634|
|[acer, aspire, home]|www.flexshopper.c...|
|[acer, aspire, home]|www.flexshopper.c...|
|[acer, aspire, home]|www.flexshopper.c...|
|[acer, aspire, home]|www.flexshopper.c...|
|[acer, aspire, home]|www.amazon.com//1664|
|[acer, aspire, home]|www.amazon.com//1081|
|[acer, as

In [9]:
"""
@f.udf(returnType=t.DoubleType())
def dot(x, y):
  if x is not None and y is not None:
    return float(x.dot(y))
  else:
    return 0

def null_safe_levenshtein_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .otherwise(1 - f.levenshtein(c1, c2) / f.greatest(f.length(c1), f.length(c2)))
  return output

def null_safe_num_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.col(c1) == 0) & (f.col(c2) == 0), 1)\
            .when((f.col(c1) == 0) | (f.col(c2) == 0), 0)\
            .otherwise(1 - f.abs(f.col(c1) - f.col(c2)) / f.greatest(c1, c2))
  return output

def null_safe_token_overlap(c1, c2):
  # is the overlap a significant part of the shorter string
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.size(f.array_distinct(c1)) == 0) | (f.size(f.array_distinct(c2)) == 0), 0)\
            .otherwise(f.size(f.array_intersect(c1, c2)) / f.least(f.size(f.array_distinct(c1)), f.size(f.array_distinct(c1))))
  return output

def calc_sim(df, candidates):
    metrics = []
    for c in columns[:2]:
        if '_encoding' not in c:
            candidates = candidates.withColumn(c+'_lev', null_safe_levenshtein_sim(df.filter(df.id == candidates.src).select(c),df.filter(df.id == candidates.dst).select(c)))
            metrics.append(c+'_lev')
        else:
            metrics.append(c+'_sim')
            candidates = candidates.withColumn(c+'_sim', dot(df.filter(df.id == candidates.src).select(c), df.filter(df.id == candidates.dst).select(c)))
    candidates = candidates.withColumn('tfidf_sim', dot(df.filter(df.id == candidates.src).select('tfidf'),df.filter(df.id == candidates.dst).select('tfidf')))
    candidates = candidates.withColumn('token_sim', dot(df.filter(df.id == candidates.src).select('tokens_swRemoved'), df.filter(df.id == candidates.dst).select('tokens_swRemoved')))
    candidates = candidates.withColumn('weight_sim', dot(df.filter(df.id == candidates.src).select('weight'),df.filter(df.id == candidates.dst).select('weight')))
    metrics.append('tfidf_sim')
    metrics.append('token_sim')
    metrics.append('weigth_sim')
    def sum_distance(distances):
        return sum(d for d in distances)
    udf_sum = f.udf(sum_distance, t.DoubleType())
    candidates = candidates.withColumn('sum_sim', udf_sum([f.col(c) for c in metrics]))
    udf_norm = f.udf(lambda d : d / len(metrics))
    candidates = candidates.withColumn('overall_sim', udf_norm(f.col('sum_sim'))).drop(f.col('sum_sim'))
    return candidates

distance_df = calc_sim(node, candidates)
"""

"\n@f.udf(returnType=t.DoubleType())\ndef dot(x, y):\n  if x is not None and y is not None:\n    return float(x.dot(y))\n  else:\n    return 0\n\ndef null_safe_levenshtein_sim(c1, c2):\n  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)            .otherwise(1 - f.levenshtein(c1, c2) / f.greatest(f.length(c1), f.length(c2)))\n  return output\n\ndef null_safe_num_sim(c1, c2):\n  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)            .when((f.col(c1) == 0) & (f.col(c2) == 0), 1)            .when((f.col(c1) == 0) | (f.col(c2) == 0), 0)            .otherwise(1 - f.abs(f.col(c1) - f.col(c2)) / f.greatest(c1, c2))\n  return output\n\ndef null_safe_token_overlap(c1, c2):\n  # is the overlap a significant part of the shorter string\n  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)            .when((f.size(f.array_distinct(c1)) == 0) | (f.size(f.array_distinct(c2)) == 0), 0)            .otherwise(f.size(f.array_intersect(c1, c2)) / f.least(f.size(f.array_

In [15]:
import csv

# Adjacency map built from the positive labels: instance id -> set containing
# the id itself plus every id it is labelled as matching.
dictionary = {}
def countNum(row):
    """Record the positive pair (row.lid, row.rid) in `dictionary`, in both directions."""
    if row.lid in dictionary:
        dictionary[row.lid].add(row.rid)
    else:
        dictionary[row.lid] = set([row.rid, row.lid])

    if row.rid in dictionary:
        dictionary[row.rid].add(row.lid)
    else:
        dictionary[row.rid] = set([row.lid, row.rid])


labels = spark.read.csv("data/Y2.csv", header=True).withColumnRenamed('left_instance_id', 'lid').withColumnRenamed('right_instance_id', 'rid')
filter_label = labels.filter(labels.label == '1')
collected_filtered = filter_label.collect()
for row in collected_filtered:
    countNum(row)

# Write the pairs implied by each id's neighbourhood as extra positive labels.
# The file handle must NOT be called `f`: that name is the notebook-wide alias
# for pyspark.sql.functions, and rebinding it breaks every later cell that
# calls f.udf / f.col / f.array. The `with` block also closes the file if
# writing fails; newline='' is what the csv module expects for writer files.
with open('./data/tmp.csv', 'w+', newline='') as tmp_file:
    csv_writer = csv.writer(tmp_file)
    csv_writer.writerow(['lid', 'rid', 'label'])
    for key, elem_set in dictionary.items():
        # Emit every pair inside this neighbourhood, consuming the set as we go.
        while len(elem_set) > 0:
            first_elem = elem_set.pop()
            for second_elem in elem_set:
                if first_elem != second_elem:
                    csv_writer.writerow([first_elem, second_elem, '1'])
                    if key == first_elem:
                        # dictionary[first_elem] is the set being iterated; do not mutate it.
                        continue
                    # Remove the pair from the other id's set so it is less
                    # likely to be written again when that id is processed.
                    if first_elem in dictionary and second_elem in dictionary[first_elem]:
                        dictionary[first_elem].remove(second_elem)

updated_labels = spark.read.csv("data/tmp.csv", header=True)
# NOTE(review): union() keeps duplicates, and the generated file also contains
# pairs already present in Y2.csv -- confirm whether the label set should be
# de-duplicated before it is split into train/test.
labels = labels.union(updated_labels)
print(labels.count())




+--------------------+--------------------+-----+
|                 lid|                 rid|label|
+--------------------+--------------------+-----+
|www.flexshopper.c...|www.amazon.com//1389|    1|
| www.amazon.com//291|www.amazon.com//1081|    1|
|        buy.net//634|www.amazon.com//1014|    1|
|www.amazon.com//2395|        buy.net//393|    1|
|www.flexshopper.c...|        buy.net//634|    1|
|www.amazon.com//1313| www.amazon.com//291|    1|
|www.amazon.com//1313|www.amazon.com//1014|    1|
|www.amazon.com//1081|www.amazon.com//2395|    1|
|www.amazon.com//1081|www.amazon.com//1389|    1|
| www.amazon.com//291|www.flexshopper.c...|    1|
|www.amazon.com//1014|www.amazon.com//1389|    1|
| www.amazon.com//291|        buy.net//393|    1|
|www.amazon.com//2395|www.amazon.com//1389|    1|
|www.amazon.com//1313|www.amazon.com//1081|    1|
|www.amazon.com//1081|www.amazon.com//2226|    1|
|        buy.net//634|www.flexshopper.c...|    1|
|www.amazon.com//1014|www.amazon.com//2226|    1|


In [27]:

# Earlier variant (disabled): restrict the labelled pairs to the blocking candidates.
#label_df = labels.join(candidates.withColumnRenamed('src','lid').withColumnRenamed('dst','rid'), ['lid','rid'], 'inner')
# Attach the left entity's columns: inner-join on lid == node.id, drop the join
# key, then prefix the kept node columns (cols_to_keep[1:]) with 'l_'.
label_df = labels.join(node.alias("node_1"), labels.lid == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    label_df = label_df.withColumnRenamed(c, 'l_'+c)

# Same for the right entity, prefixing with 'r_'. The left-side rename above has
# to happen first so the node column names do not collide in this second join.
label_df = label_df.alias('one').join(node.alias("node_2"), label_df.rid == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    label_df = label_df.withColumnRenamed(c, 'r_'+c)
# Sanity check of the resulting schema.
print(label_df.columns)

['lid', 'rid', 'label', 'l_title_encoding', 'r_title_encoding']


In [28]:
def toList(row):
    """Concatenate the components of every vector in `row` into one dense vector.

    Each component is converted to float; the vectors are appended in the
    order they appear in `row`.
    """
    flattened = [float(component) for vector in row for component in vector]
    return Vectors.dense(flattened)

udf_toList = f.udf(toList, VectorUDT())
# Build the classifier input: left and right title embeddings concatenated into
# one 'features' vector, the two source columns dropped, label cast to integer.
label_df = (
    label_df
    .withColumn('features', udf_toList(f.array('l_title_encoding', 'r_title_encoding')))
    .drop('l_title_encoding', 'r_title_encoding')
    .withColumn('label', f.col('label').cast(t.IntegerType()))
)

# 3. Machine learning: match classification

In [43]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [44]:
# Linear SVM on the 'features' vector; regParam is tuned with 10-fold
# cross-validation scored by a default BinaryClassificationEvaluator.
model = LinearSVC(featuresCol='features', labelCol='label', maxIter=500)
param_grid = (
    ParamGridBuilder()
    .addGrid(model.regParam, [0.5, 0.4, 0.3, 0.2, 0.1, 0.02, 0.01])
    .build()
)
# Despite its name, `tvs` is a CrossValidator; the name is kept because the
# fitting cell refers to it.
tvs = CrossValidator(
    estimator=model,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=10,
    seed=42,  # fixed so the fold assignment is repeatable between runs
)

In [45]:
# 80/20 train/test split with a fixed seed so the split is repeatable
# between runs.
training_set, test_set = label_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Grid search / hyper-parameter tuning: fits the cross-validator (`tvs`) on the
# training split; `estimator` is the fitted result used for prediction below.
estimator = tvs.fit(training_set)


In [40]:
# Score the held-out pairs and keep only the identifiers, the true label and
# the predicted label.
prediction = (
    estimator
    .transform(test_set)
    .select('lid', 'rid', 'label', 'prediction')
)

In [42]:
# Share of test pairs whose predicted label (cast to integer) equals the true label.
is_correct = f.col('label') == f.col('prediction').cast(t.IntegerType())
accuracy = prediction.filter(is_correct).count() / prediction.count()
print("Accuracy: ", accuracy)

+--------------------+--------------------+-----+----------+
|                 lid|                 rid|label|prediction|
+--------------------+--------------------+-----+----------+
|       buy.net//1960|www.vology.com//4272|    1|       0.0|
|        buy.net//243|       buy.net//2109|    1|       0.0|
|        buy.net//370| www.vology.com//668|    1|       0.0|
|        buy.net//393|www.amazon.com//1389|    1|       0.0|
|        buy.net//634|www.amazon.com//1389|    1|       0.0|
|         buy.net//93|       buy.net//1759|    1|       0.0|
|www.amazon.com//1014|www.amazon.com//2226|    1|       0.0|
|www.amazon.com//1081|www.amazon.com//1014|    1|       0.0|
|www.amazon.com//1081|www.flexshopper.c...|    1|       0.0|
|www.amazon.com//1652|www.isupplyhub.co...|    1|       0.0|
|www.amazon.com//1671|www.isupplyhub.co...|    1|       0.0|
|www.amazon.com//1780|www.amazon.com//1836|    1|       0.0|
|www.amazon.com//1780| www.amazon.com//753|    1|       0.0|
|www.amazon.com//1835|ww