In [1]:
!pip install pyspark graphframes



In [2]:
!export PYSPARK_SUBMIT_ARGS='--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12'

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import tensorflow_hub as hub
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from graphframes import GraphFrame
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover, NGram, Normalizer, VectorAssembler, Word2Vec, Word2VecModel, PCA
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import VectorUDT, Vectors

In [4]:
spark = (
    SparkSession.builder
    .config('spark.executor.memory', '4g')
    .config('spark.app.name', 'Spark Updated Conf')
    .config('spark.executor.cores', '2')
    .config('spark.cores.max', '2')
    .config('spark.driver.memory','8g')
    .getOrCreate()
)

# 0. Load data

In [5]:
df = spark.read.csv("./data/X2.csv", header=True)
df.toPandas()

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title
0,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,,1.80 kg,,"""Lenovo Thinkpad X230 34352jf Tablet Pc - 12.5..."
1,www.isupplyhub.com//1256,Acer,1.6 GHz Intel Core i5-4200U. Intel Core I5,,1.6 GHz Intel Core i5-4200U,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...
2,www.isupplyhub.com//326,Acer,1.6 GHz Intel Core i5. Intel Core I5,,1.6 GHz Intel Core i5,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...
3,www.isupplyhub.com//821,HP,,,,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"""Amazon.com : 15.6"""" HP 15-f009wm Amd Dual-Cor..."
4,www.isupplyhub.com//157,Asus,1.7 GHz Core i5-3317U. Intel,,1.7 GHz Core i5-3317U,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,www.vology.com//873,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,180 GB SSD. 180 GB SSD. Lenovo ThinkPad X230 2...,180 GB SSD. 180 GB SSD,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2320 - 12.5"""" - Core i5 ..."
339,www.vology.com//823,Lenovo ThinkPad X230 2325 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2325 - 12.5"""" - Core i5 ..."
340,www.vology.com//2723,Lenovo ThinkPad X230 Tablet 3438 - 12.5 '' - C...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,"""Lenovo ThinkPad X230 Tablet 3438 - 12.5"""" - C..."
341,www.vology.com//1349,Lenovo ThinkPad X230 2324 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm. ...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"""Lenovo ThinkPad X230 2324 - 12.5"""" - Core i5 ..."


# 0. Data cleaning

In [6]:
# Set everything to lowercase
for c in df.columns:
    df = df.withColumn(c, f.lower(f.col(c)))

df = df.drop('ssd_capacity')
df = df.drop('dimensions')
# Extract brand or infer from title
df = df.withColumn('brand', f.regexp_extract('brand', "^(\w+)", 0))
computer_brands = ['lenovo', 'acer', 'hp', 'dell', 'asus', 'samsung', 'huawei', 'surface', 'apple']
computer_brands_pattern = '({})'.format('|'.join(computer_brands))
df = df.withColumn('brand', f.when( f.regexp_extract('title', computer_brands_pattern, 0)!='', f.regexp_extract('title', computer_brands_pattern, 0))\
                   .otherwise(df.brand))
#exctract cpu_brand and infer type if intel
cpu_brands = ['intel', 'apple', 'amd', 'nvidia', 'arm']
cpu_pattern = '({})'.format('|'.join(cpu_brands))
df = df.withColumn('cpu_model',f.regexp_extract('cpu_model', '(i\d|pentium|celeron|a\d)', 0))
df = df.withColumn('cpu_model', f.when( (f.regexp_extract('cpu_brand','(intel|amd)', 0 )!='') & f.isnull(df.cpu_model) ,\
                                        f.regexp_extract('cpu_brand', '(i\d|pentium|celeron|a\d)', 0))\
                   .otherwise(df.cpu_model))
df = df.withColumn('cpu_brand', f.when(f.regexp_extract('cpu_brand', cpu_pattern, 0) != '', f.regexp_extract('cpu_brand', cpu_pattern, 1))\
                                       .otherwise(f.regexp_extract('title', cpu_pattern, 0)))
#convert weight from pounds to kilos
df = df.withColumn('weight', f.when(df.weight.contains('pounds') | df.weight.contains('lbs'),
                                    (f.regexp_extract('weight', '(\d+.?\d)', 0).cast(t.DoubleType()))).otherwise(
                                    f.round(f.regexp_extract('weight', '(\d+.?\d)', 0).cast(t.DoubleType())*2.20462,1)
                        )
                    )


In [7]:
#from more_itertools import intersperse

def merge_columns(df, column_names, output):
    df = df.withColumn(output, f.concat_ws(" ", *column_names))
    return df.drop(*column_names)

ddf = df.drop("ram_frequency")
#merge ram columns and cpu columns into one for ram and one for cpu
ddf = merge_columns(ddf, ["cpu_brand", "cpu_model", "cpu_frequency", "cpu_type"], "cpu")
ddf = merge_columns(ddf, ["ram_capacity", "ram_type"], "ram")
ddf.columns

['instance_id', 'brand', 'hdd_capacity', 'weight', 'title', 'cpu', 'ram']

# 1. Blocking
Blocking will be done feeding a TF-IDF matrix to an LDA model and extracting
keywords from the title matching them to topics.

In [12]:
@f.udf(returnType=t.ArrayType(t.StringType()))
def filter_alnum(arr):
    return [t for t in arr if t.isalnum() and len(t) > 2]

"""Returns the df with tokenized columns with stopwords removed"""
def tokenize(df, string_cols):
    output = df
    stopW = ['softwarecity', 'amazon', 'com', 'pc', 'windows', 'computers', 'computer', 'accessories', 'laptop', 'notebook', 'kg', 'inch', 'processor', 'memory','gb', 'ram', 'hdd', 'ssd', 'cpu', 'display', 'hz', 'ghz', 'tb','rpm', 'slot', 'slots', 'mhz', 'cache', 'ram', 'ddram', 'dram', 'hd']
    for c in string_cols:
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lower(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=c+"_rawtokens", pattern = "\\W")
        remover = StopWordsRemover(inputCol=c+"_rawtokens", outputCol=c+"_tokens", stopWords=stopW)

        output = tokenizer.transform(output)
        output = remover.transform(output).drop(c+"_rawtokens")
        output = output.withColumn(c+'_tokens', f.array_distinct(filter_alnum(f.col(c+"_tokens"))))
    # output has c+tokens columns
    return output.drop("temp")

def generate_blocking_keys(df, token_cols, min_freq=1):
    """Pipeline:
            1 - CountVectorizer -> TF
            2 - IDF
            3 - LDA
    """
    df = df.withColumn('tokens', f.array_distinct(f.concat(*token_cols)))\
        .drop('title_tokens')\
        .drop('cpu_tokens')\
        .drop('ram_tokens')\
        .drop('brand_tokens')
    cv = CountVectorizer(inputCol='tokens', outputCol="raw_features")
    cvmodel = cv.fit(df)
    df_vect = cvmodel.transform(df)

    idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=min_freq)
    idfModel = idf.fit(df_vect)
    df_idf= idfModel.transform(df_vect)

    normalizer = Normalizer(p=2.0, inputCol='features', outputCol='tfidf')
    output = normalizer.transform(df_idf)
    output = output.drop('raw_features').drop('features')
    lda = LDA(k=5, maxIter=1000, featuresCol='tfidf')
    lda_model = lda.fit(output)
    vocab = cvmodel.vocabulary
    #returns words for each topic term
    @f.udf(returnType=t.ArrayType(t.StringType()))
    def get_words(token_list):
        return [vocab[token_id] for token_id in token_list]

    #create list of topic keywords
    # i.e topic 1 -> acer, anspire, intel
    topics = lda_model.describeTopics(3).withColumn('topicWords', get_words(f.col('termIndices'))).collect()
    list_of_topics = []
    for r in topics:
        topicW = r.__getitem__('topicWords')
        for w in topicW:
            list_of_topics.append(w)

    #returns list of 3 'hashtags' i.e keywords for topic
    #from tokens: title, brand, cpu_brand
    @f.udf(returnType=t.ArrayType(t.StringType()))
    def get_key(words):
        l = [w for w in words if w in list_of_topics]
        l = list(set(l))
        l.sort()
        return l[:3]
    output = output.withColumn("blocking_key", get_key(f.col("tokens")))
    return output

"""Use universal sentence encoder from tensorflow_hub"""
MODEL = None
def get_model_magic():
  global MODEL
  if MODEL is None:
      MODEL = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
  return MODEL

@f.udf(returnType=VectorUDT())
def encode_sentence(x):
  model = get_model_magic()
  emb = model([x]).numpy()[0]
  return Vectors.dense(emb)

In [16]:
columns = ['title', 'brand', 'cpu', 'ram']
#Generate Blocking Keys

blocking_df = tokenize(ddf, columns)
blocking_df = generate_blocking_keys(blocking_df, [c+'_tokens' for c in columns])
blocking_df.limit(1).show()

+--------------------+------+------------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|         instance_id| brand|hdd_capacity|weight|               title|                 cpu|                 ram|brand_tokens|              tokens|               tfidf|        blocking_key|
+--------------------+------+------------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|www.softwarecity....|lenovo|      320 gb|   4.0|"lenovo thinkpad ...|intel i5 2.60 ghz...|ddr3 sdram. ddr3-...|    [lenovo]|[lenovo, thinkpad...|(350,[0,1,2,3,6,7...|[12800, 3320m, co...|
+--------------------+------+------------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+



In [21]:
#Add encoding
cols_to_encode = columns
cols_to_encode.append('hdd_capacity')
cols_to_encode.append('weight')
blocking_df = blocking_df.withColumn('to_encode', f.concat(*cols_to_encode))
blocking_df = blocking_df.withColumn('encoding', encode_sentence(f.coalesce(f.col('to_encode'), f.lit(''))))

blocking_df.limit(1).show()

+--------------------+------+------------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         instance_id| brand|hdd_capacity|weight|               title|                 cpu|                 ram|brand_tokens|              tokens|               tfidf|        blocking_key|           to_encode|            encoding|
+--------------------+------+------------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|www.softwarecity....|lenovo|      320 gb|   4.0|"lenovo thinkpad ...|intel i5 2.60 ghz...|ddr3 sdram. ddr3-...|    [lenovo]|[lenovo, thinkpad...|(350,[0,1,2,3,6,7...|[12800, 3320m, co...|"lenovo thinkpad ...|[0.02892581000924...|
+--------------------+------+------------+------+--------------------+------

In [22]:
blocking_df.groupby('blocking_key').count().show()

+--------------------+-----+
|        blocking_key|count|
+--------------------+-----+
|[12800, 3320m, co...|    1|
|  [4005u, 571, acer]|    1|
|      [12800, 3320m]|    3|
|[acer, aspire, pr...|   21|
|                  []|   57|
|[12800, 3320m, x230]|  104|
|   [011, 4005u, 571]|    1|
|      [acer, aspire]|   31|
|             [12800]|    6|
|       [convertible]|    4|
|       [12800, x230]|   26|
|[integrated, prov...|   71|
|       [3320m, x230]|    4|
| [571, acer, aspire]|    7|
|              [x230]|    1|
|           [premium]|    3|
|[12800, 3320m, mu...|    1|
|[3320m, convertib...|    1|
+--------------------+-----+



# 2. Candidate pairs generation and match likelihood

In [25]:
"""
This cell output a candidates dataframe that has
instance_ids pairs that makes sense to compare, i.e each
entity will be paired with another entity from the same block
"""

cols_to_keep = ["instance_id", "encoding"]
# Filter blocks to only keep ones bigger than one
pairs = (
    blocking_df
    .select(*cols_to_keep, 'blocking_key')
    .groupby('blocking_key').agg(f.count('instance_id').alias('size'), f.collect_set('instance_id').alias('id'))\
    .filter(f.col('size') > 1).select('blocking_key',f.explode('id').alias('id'))
)

left = pairs.withColumnRenamed('id', 'src')
right = pairs.withColumnRenamed('id', 'dst')
#candidates based on matching of blocking_key (i.e inside the block)
candidates = left.join(right, ['blocking_key'], 'inner')\
    .filter(f.col('src') < f.col('dst'))\
    .select('src', 'dst').distinct()
node = blocking_df.select(f.col('instance_id').alias('id'), 'encoding').drop('instance_id')

In [None]:
"""
@f.udf(returnType=t.DoubleType())
def dot(x, y):
  if x is not None and y is not None:
    return float(x.dot(y))
  else:
    return 0

def null_safe_levenshtein_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .otherwise(1 - f.levenshtein(c1, c2) / f.greatest(f.length(c1), f.length(c2)))
  return output

def null_safe_num_sim(c1, c2):
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.col(c1) == 0) & (f.col(c2) == 0), 1)\
            .when((f.col(c1) == 0) | (f.col(c2) == 0), 0)\
            .otherwise(1 - f.abs(f.col(c1) - f.col(c2)) / f.greatest(c1, c2))
  return output

def null_safe_token_overlap(c1, c2):
  # is the overlap a significant part of the shorter string
  output = f.when(f.col(c1).isNull() | f.col(c2).isNull(), 0)\
            .when((f.size(f.array_distinct(c1)) == 0) | (f.size(f.array_distinct(c2)) == 0), 0)\
            .otherwise(f.size(f.array_intersect(c1, c2)) / f.least(f.size(f.array_distinct(c1)), f.size(f.array_distinct(c1))))
  return output

def calc_sim(df, candidates):
    metrics = []
    for c in columns[:2]:
        if '_encoding' not in c:
            candidates = candidates.withColumn(c+'_lev', null_safe_levenshtein_sim(df.filter(df.id == candidates.src).select(c),df.filter(df.id == candidates.dst).select(c)))
            metrics.append(c+'_lev')
        else:
            metrics.append(c+'_sim')
            candidates = candidates.withColumn(c+'_sim', dot(df.filter(df.id == candidates.src).select(c), df.filter(df.id == candidates.dst).select(c)))
    candidates = candidates.withColumn('tfidf_sim', dot(df.filter(df.id == candidates.src).select('tfidf'),df.filter(df.id == candidates.dst).select('tfidf')))
    candidates = candidates.withColumn('token_sim', dot(df.filter(df.id == candidates.src).select('tokens_swRemoved'), df.filter(df.id == candidates.dst).select('tokens_swRemoved')))
    candidates = candidates.withColumn('weight_sim', dot(df.filter(df.id == candidates.src).select('weight'),df.filter(df.id == candidates.dst).select('weight')))
    metrics.append('tfidf_sim')
    metrics.append('token_sim')
    metrics.append('weigth_sim')
    def sum_distance(distances):
        return sum(d for d in distances)
    udf_sum = f.udf(sum_distance, t.DoubleType())
    candidates = candidates.withColumn('sum_sim', udf_sum([f.col(c) for c in metrics]))
    udf_norm = f.udf(lambda d : d / len(metrics))
    candidates = candidates.withColumn('overall_sim', udf_norm(f.col('sum_sim'))).drop(f.col('sum_sim'))
    return candidates

distance_df = calc_sim(node, candidates)
"""

In [27]:
"""
Read label.csv and expand it trough transitivity
"""
labels = (
    spark.read.csv("./data/Y2.csv", header=True)
    .withColumnRenamed('left_instance_id', 'lid')
    .withColumnRenamed('right_instance_id', 'rid')
)

In [84]:
"""
Reuse this cell to join a <left_id,right_id> with node to extract features
"""
#label_df = labels.join(candidates.withColumnRenamed('src','lid').withColumnRenamed('dst','rid'), ['lid','rid'], 'inner')
label_df = labels.join(node.alias("node_1"), labels.lid == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    label_df = label_df.withColumnRenamed(c, 'l_'+c)

label_df = label_df.alias('one').join(node.alias("node_2"), label_df.rid == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    label_df = label_df.withColumnRenamed(c, 'r_'+c)
print(label_df.columns)

matching_pairs = candidates.join(node.alias("node_1"), candidates.src == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    matching_pairs = matching_pairs.withColumnRenamed(c, 'l_'+c)

matching_pairs = matching_pairs.alias('one').join(node.alias("node_2"), matching_pairs.dst == node.id, 'inner').drop('id')
for c in cols_to_keep[1:]:
    matching_pairs = matching_pairs.withColumnRenamed(c, 'r_'+c)

matching_pairs.columns

['lid', 'rid', 'label', 'l_encoding', 'r_encoding']


['src', 'dst', 'l_encoding', 'r_encoding']

In [85]:
@f.udf(returnType=VectorUDT())
def toList(row):
    l = []
    for v in row:
        for n in v:
            l.append(float(n))
    return Vectors.dense(l)

label_df = label_df.withColumn('features', toList(f.array('l_encoding', 'r_encoding')))\
    .drop('l_encoding', 'r_encoding')
label_df = label_df.withColumn('label', f.col('label').cast(t.IntegerType()))


matching_pairs = matching_pairs.\
    withColumn('features', toList(f.array('l_encoding', 'r_encoding')))\
    .drop('l_encoding', 'r_encoding')

matching_pairs = matching_pairs.withColumnRenamed('src', 'lid').withColumnRenamed('dst','rid')

# 3. Machine Learning Magic Bitch

In [73]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [86]:
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights',maxIter=100)
param_grid = ParamGridBuilder().addGrid(model.regParam, [0.5, 0.4, 0.3, 0.2, 0.1]).build()
cvs = CrossValidator(estimator=model,
                           estimatorParamMaps=param_grid,
                           evaluator=BinaryClassificationEvaluator(),#(rawPredictionCol='prediction', labelCol='label'),\
                           numFolds=4)

In [87]:
#Set weights
label_df = label_df.withColumn('weights', f.when(f.col('label')==0, 0.05).otherwise(1.0))
training_set, test_set = label_df.randomSplit([0.8, 0.2])

In [88]:
#grid_search, hyperpar tuning...
estimator = cvs.fit(training_set)

In [89]:
prediction = estimator.transform(test_set).select('lid','rid','label','prediction')

In [90]:
prediction.groupby("prediction").count().toPandas()

Unnamed: 0,prediction,count
0,0.0,11635
1,1.0,119


In [91]:
estimator.save("model.model")

In [92]:
!zip -r model.zip model.model

updating: model.model/ (stored 0%)
updating: model.model/bestModel/ (stored 0%)
updating: model.model/bestModel/data/ (stored 0%)
updating: model.model/bestModel/data/._SUCCESS.crc (stored 0%)
updating: model.model/bestModel/data/_SUCCESS (stored 0%)
updating: model.model/bestModel/metadata/ (stored 0%)
updating: model.model/bestModel/metadata/.part-00000.crc (stored 0%)
updating: model.model/bestModel/metadata/part-00000 (deflated 45%)
updating: model.model/bestModel/metadata/._SUCCESS.crc (stored 0%)
updating: model.model/bestModel/metadata/_SUCCESS (stored 0%)
updating: model.model/estimator/ (stored 0%)
updating: model.model/estimator/metadata/ (stored 0%)
updating: model.model/estimator/metadata/.part-00000.crc (stored 0%)
updating: model.model/estimator/metadata/part-00000 (deflated 45%)
updating: model.model/estimator/metadata/._SUCCESS.crc (stored 0%)
updating: model.model/estimator/metadata/_SUCCESS (stored 0%)
updating: model.model/evaluator/ (stored 0%)
upda

In [93]:
accuracy = prediction.filter(f.col('label')==f.col('prediction').cast(t.IntegerType())).count() / prediction.count()
print("Accuracy: ", accuracy)
p = prediction.filter("label==1 AND prediction==1").count() / prediction.filter('prediction==1').count()
r = prediction.filter("label==1 AND prediction==1").count() / prediction.filter(f.col('label')==1).count()
f1 = 2*p*r/(p+r)
print("F1 score: ", f1)

Accuracy:  0.9611196188531563
F1 score:  0.13282732447817838
