In [14]:
import pyspark as ps
from pyspark.sql.types import *
from pyspark.ml.clustering import LDA, KMeans
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [2]:
spark = (ps.sql.SparkSession.builder
        .master("local[3]")
        .appName("capstone")
        .getOrCreate()
        )
sc = spark.sparkContext

In [7]:
import pandas as pd
df = pd.read_csv('s3a://capstone-3/data/products_art_only.csv')

In [8]:
df.drop('Unnamed: 0', axis=1,inplace=True)
spark_df = spark.createDataFrame(df)

In [9]:
spark_df.printSchema()

root
 |-- vendor_variant_id: long (nullable = true)
 |-- vendor_id: long (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- vendor_name: string (nullable = true)
 |-- taxonomy_name: string (nullable = true)
 |-- taxonomy_id: double (nullable = true)
 |-- weblink: string (nullable = true)
 |-- color: string (nullable = true)
 |-- material: string (nullable = true)
 |-- pattern: string (nullable = true)
 |-- is_returnable: boolean (nullable = true)
 |-- ship_surcharge: double (nullable = true)
 |-- is_assembly_required: boolean (nullable = true)
 |-- is_feed: long (nullable = true)
 |-- commission_tier: string (nullable = true)
 |-- inventory_type: string (nullable = true)
 |-- division: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- sale_price: double (nullable = true)
 |-- combo: string (nullable = true)



In [10]:
spark_df.select('product_title','combo').show()

+--------------------+--------------------+
|       product_title|               combo|
+--------------------+--------------------+
|Framed Canvas Pri...|Framed Canvas Pri...|
|Framed Print, Abs...|Framed Print, Abs...|
|Janel Foo Glasswo...|Janel Foo Glasswo...|
|The Arts Capsule ...|The Arts Capsule ...|
|Native Maps - Aus...|Native Maps - Aus...|
|Framed Print, Tur...|Framed Print, Tur...|
|Sarah Campbell Wa...|Sarah Campbell Wa...|
|Framed Print, Ope...|Framed Print, Ope...|
|Framed Print, Mid...|Framed Print, Mid...|
|The Arts Capsule ...|The Arts Capsule ...|
|Canvas Print - Ab...|Canvas Print - Ab...|
|Ashley Mary Art L...|Ashley Mary Art L...|
|The Arts Capsule ...|The Arts Capsule ...|
|Ashley Mary Balan...|Ashley Mary Balan...|
|The Arts Capsule ...|The Arts Capsule ...|
|The Arts Capsule ...|The Arts Capsule ...|
|Felt Wall Art, Bl...|Felt Wall Art, Bl...|
|Minted for west e...|Minted for west e...|
|Erik Barthels Pri...|Erik Barthels Pri...|
|The Arts Capsule ...|The Arts C

In [15]:
def tfidf_pipeline():
    tokenizer = Tokenizer(inputCol="combo", outputCol="words")
    hashingTF = HashingTF(inputCol='words', outputCol="rawFeatures", numFeatures=20)
    idf = IDF(inputCol='rawFeatures', outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
    return pipeline

In [16]:
pipeline = tfidf_pipeline()
features_df = pipeline.fit(spark_df).transform(spark_df)

In [27]:
def kmeans_rec(dataset):
    kmeans = KMeans(k=10)
    model = kmeans.fit(dataset)
    result = model.transform(dataset)
    return result

#     labels = result.select('prediction')
#     #find label for specified item
#     #find other items with same label
#     #return n items

def get_centers(model):   # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)


In [28]:
result = kmeans_rec(features_df)

In [29]:
result.show()

+-----------------+---------+--------------------+--------------------+-----------+--------------------+-----------+--------------------+-----+--------------------+-------+-------------+--------------+--------------------+-------+---------------+--------------+--------+--------+-----+----------+--------------------+--------------------+--------------------+--------------------+----------+
|vendor_variant_id|vendor_id|       product_title| product_description|vendor_name|       taxonomy_name|taxonomy_id|             weblink|color|            material|pattern|is_returnable|ship_surcharge|is_assembly_required|is_feed|commission_tier|inventory_type|division|category|price|sale_price|               combo|               words|         rawFeatures|            features|prediction|
+-----------------+---------+--------------------+--------------------+-----------+--------------------+-----------+--------------------+-----+--------------------+-------+-------------+--------------+---------------

In [31]:
item_index = 100
item_id = df['vendor_variant_id'].iloc[item_index]
print(item_id)

7432702


In [35]:
from pyspark.sql.functions import col
item_cluster_label = result.filter(col('vendor_variant_id') == str(item_id)).select('prediction')

In [37]:
item_cluster_label.show()

+----------+
|prediction|
+----------+
|         0|
+----------+



In [41]:
item_cluster_label.collect()[0][0]

0

In [42]:
item_cluster_label = result.filter(col('vendor_variant_id') == str(item_id)).select('prediction').collect()[0][0]

In [43]:
item_cluster_label

0

In [44]:
cluster_members = result.filter(col('prediction') == item_cluster_label)

In [45]:
cluster_members.show()

+-----------------+---------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+-----+--------------------+-------+-------------+--------------+--------------------+-------+---------------+--------------+--------+--------+------+----------+--------------------+--------------------+--------------------+--------------------+----------+
|vendor_variant_id|vendor_id|       product_title| product_description|     vendor_name|       taxonomy_name|taxonomy_id|             weblink|color|            material|pattern|is_returnable|ship_surcharge|is_assembly_required|is_feed|commission_tier|inventory_type|division|category| price|sale_price|               combo|               words|         rawFeatures|            features|prediction|
+-----------------+---------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+-----+--------------------+-------+-------------+-------------

In [49]:
n = 5
cluster_members.select('product_title','weblink').show(n)

+--------------------+--------------------+
|       product_title|             weblink|
+--------------------+--------------------+
|Native Maps - Aus...|https://www.weste...|
|Sarah Campbell Wa...|https://www.weste...|
|Framed Canvas Pri...|https://www.weste...|
|       Wake, I print|https://www.weste...|
|       Surf - Canvas|https://www.weste...|
+--------------------+--------------------+
only showing top 5 rows



In [50]:
result.filter(col('vendor_variant_id') == str(item_id)).select('product_title','weblink').show()

+--------------------+--------------------+
|       product_title|             weblink|
+--------------------+--------------------+
|Antique Aviary Gi...|http://www.ballar...|
+--------------------+--------------------+



In [51]:
spark.stop()