In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("recommendation_system").getOrCreate()
spark

In [4]:
import gzip
import json
from pyspark.sql.types import *


def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

        
def getMetaData(path):
    data = []
    data_schema =  [
                       StructField("asin", StringType(), True),
                       StructField("title", StringType(), True),
                       StructField("brand", StringType(), True),
                       StructField("category", ArrayType(StringType(), True), True),
                       StructField("main_category", StringType(), True),
                       StructField("image", ArrayType(StringType(), True), True)
                   ]
    final_schema = StructType(fields=data_schema)
    for d in parse(path):
        review = {}
        review['asin'] = d['asin']
        review['title'] = d['title']
        review['brand'] = d['brand']
        review['category'] = d['category']
#         print(d['category'])
        review['main_category'] = next(reversed(d['category']), None) if len(d['category'])!= 0 else ''
        review['image'] = d['image']
        data.append(review)
#   print(df)
    return spark.createDataFrame(data, schema=final_schema)

product_data = getMetaData('./data/meta_Appliances.json.gz')
product_data = product_data.dropDuplicates(['asin'])
product_data.limit(1).toPandas()
# product_data.printSchema()

Unnamed: 0,asin,title,brand,category,main_category,image
0,B000BEZV7M,Extech RH401 Triple Display Hygro Thermometer ...,Extech,"[Appliances, Parts & Accessories, Humidifier P...",Humidity Meters,[https://images-na.ssl-images-amazon.com/image...


In [5]:
product_data.groupBy("main_category").count().orderBy(col('count').desc()).show(100)

+--------------------+-----+
|       main_category|count|
+--------------------+-----+
| Parts & Accessories| 4513|
|Refrigerator Part...| 3733|
|Washer Parts & Ac...| 2270|
|Dishwasher Parts ...| 1790|
|Range Parts & Acc...| 1710|
|       Water Filters| 1572|
|   Replacement Parts| 1556|
|Cooktop Parts & A...| 1171|
|         Range Hoods|  951|
|Humidifier Parts ...|  887|
|                    |  805|
|       Refrigerators|  722|
|Oven Parts & Acce...|  645|
|          Ice Makers|  453|
|            Cooktops|  436|
| Freestanding Ranges|  412|
|               Knobs|  406|
|Freezer Parts & A...|  360|
|Built-In Dishwashers|  357|
|         Accessories|  341|
|             Washers|  302|
|                Bins|  273|
|              Dryers|  253|
|               Vents|  243|
|Dryer Parts & Acc...|  235|
|              Motors|  224|
|             Filters|  213|
|     Humidity Meters|  185|
|   Replacement Wicks|  177|
|Refrigerators, Fr...|  175|
|Ranges, Ovens & C...|  169|
|Range Hood Pa

In [6]:
from elasticsearch import Elasticsearch

# test your ES instance is running
es = Elasticsearch()
es.info(pretty=True)

{'name': 'PRINHYLTPDL1275',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'EzFBPnY5SauC7OJNNutBtA',
 'version': {'number': '7.9.0',
  'build_flavor': 'default',
  'build_type': 'deb',
  'build_hash': 'a479a2a7fce0389512d6a9361301708b92dff667',
  'build_date': '2020-08-11T21:36:48.204330Z',
  'build_snapshot': False,
  'lucene_version': '8.6.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [7]:
es.indices.delete(index="products")
VECTOR_DIM = 25

product_mapping = {
    # this mapping definition sets up the metadata fields for the products
    "mappings": {
        "properties": {
            "asin": {
                "type": "keyword"
            },
            "title": {
                "type": "keyword"
            },
            "image": {
                "type": "keyword"
            },
            "brand": {
                "type": "keyword"
            },
            "category": {
                "type": "keyword"
            },
            "main_category": {
                "type": "keyword"
            },
            # the following fields define our model factor vectors and metadata
            "model_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_DIM
            },
            "model_version": {
                "type": "keyword"
            },
            "model_timestamp": {
                "type": "date"
            }          
        }
    }
}

res_products = es.indices.create(index="products", body=product_mapping)

print("Created indices:")
print(res_products)

Created indices:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'products'}


In [10]:
 es.count(index="products")['count']

30239

In [9]:
product_data.write.format("es").option("es.mapping.id", "asin").save("products")
num_products_df = product_data.count()
num_products_es = es.count(index="products")['count']
# check load went ok
print("Product DF count: {}".format(num_products_df))
print("ES index count: {}".format(num_products_es))

Product DF count: 30239
ES index count: 30239


In [15]:
es.search(index="products", q="main_category:Refrigerators", size=3)

{'took': 55,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 722, 'relation': 'eq'},
  'max_score': 3.7342029,
  'hits': [{'_index': 'products',
    '_type': '_doc',
    '_id': 'B002ACEF9S',
    '_score': 3.7342029,
    '_source': {'asin': 'B002ACEF9S',
     'title': 'Whirlpool W4TXNWFWQ 14.4 Cu. Ft. White Top Freezer Refrigerator',
     'brand': 'Whirlpool',
     'category': ['Appliances',
      'Refrigerators, Freezers & Ice Makers',
      'Refrigerators'],
     'main_category': 'Refrigerators',
     'image': []}},
   {'_index': 'products',
    '_type': '_doc',
    '_id': 'B0074WFW28',
    '_score': 3.7342029,
    '_source': {'asin': 'B0074WFW28',
     'title': 'GE GTH17DBDWW 16.5 Cu. Ft. White Top Freezer Refrigerator - Energy Star',
     'brand': 'GE',
     'category': ['Appliances',
      'Refrigerators, Freezers & Ice Makers',
      'Refrigerators'],
     'main_category': 'Refrigerators',
     'image': ['http

In [33]:
def getRatingData(path):
    data = []
    data_schema = [
               StructField("asin", StringType(), True),
               StructField("reviewerId", StringType(), True),
               StructField("rating", FloatType(), True)]
    final_schema = StructType(fields=data_schema)
    for d in parse(path):
        review = {}
        review['asin'] = d['asin']
        review['reviewerId'] = d['reviewerID']
        review['rating'] = d['overall']
        data.append(review)
#   print(df)
    return spark.createDataFrame(data, schema=final_schema)

df_rating= getRatingData('./data/Appliances_all.json.gz')
df_rating.limit(3).toPandas()

Unnamed: 0,asin,reviewerId,rating
0,1118461304,A3NHUQ33CFH3VM,5.0
1,1118461304,A3SK6VNBQDNBJE,5.0
2,1118461304,A3SOFHUR27FO3K,5.0


In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(df_rating.columns)-set(['rating'])) ]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_rating).transform(df_rating)
transformed.show()

+----------+--------------+------+----------------+----------+
|      asin|    reviewerId|rating|reviewerId_index|asin_index|
+----------+--------------+------+----------------+----------+
|1118461304|A3NHUQ33CFH3VM|   5.0|           119.0|    2250.0|
|1118461304|A3SK6VNBQDNBJE|   5.0|        154267.0|    2250.0|
|1118461304|A3SOFHUR27FO3K|   5.0|         93964.0|    2250.0|
|1118461304|A1HOG1PYCAE157|   5.0|        217964.0|    2250.0|
|1118461304|A26JGAM6GZMM4V|   5.0|         79756.0|    2250.0|
|1118461304|A17K8WANMYHTX2|   5.0|         82127.0|    2250.0|
|1118461304|A13IW3A6W43U0G|   5.0|        410633.0|    2250.0|
|1118461304|A1ECEGG1MP7J8J|   5.0|        417266.0|    2250.0|
|1118461304|A2D5X9G9S3A7RN|   5.0|        414321.0|    2250.0|
|1118461304| AP2F86JFRQ205|   5.0|         93636.0|    2250.0|
|1118461304|A3VF3A5A3O04E1|   4.0|        252906.0|    2250.0|
|1118461304|A14DW5UMQ1M96O|   5.0|        395199.0|    2250.0|
|1118461304|A2V7UVKOFG57IW|   4.0|        255448.0|    

In [19]:
als=ALS(maxIter=5,regParam=0.09,rank=25,userCol="reviewerId_index",itemCol="asin_index",ratingCol="rating",coldStartStrategy="drop",nonnegative=True)
model=als.fit(transformed)

In [11]:
model.itemFactors.count()

30252

In [83]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
predictions=model.transform(transformed)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=0.03298011012132724
+----------+--------------+------+----------------+----------+----------+
|      asin|    reviewerId|rating|reviewerId_index|asin_index|prediction|
+----------+--------------+------+----------------+----------+----------+
|B0053F7TQA| AJPRN1TD1A0SD|   2.0|            40.0|      31.0| 2.0062504|
|B0053F7TQA|A1AJW9DILZFTQI|   5.0|            39.0|      31.0|  4.890765|
|B0053F7TQA|A1LN48DHHCKLR3|   5.0|            24.0|      31.0|   4.97714|
|B0053F7TQA|A3J8IC20SGBS2O|   5.0|            32.0|      31.0|  4.888028|
|B000XTJRRA|A1T1YSCDW0PD25|   4.0|            23.0|      34.0| 3.9580224|
|B000XTJRRA|A1T1YSCDW0PD25|   4.0|            23.0|      34.0| 3.9580224|
|B00AHR3IG4|A1Y4UNHRP312HS|   5.0|            18.0|      28.0|  4.985533|
|B00AHR3IG4|A1Y4UNHRP312HS|   5.0|            18.0|      28.0|  4.985533|
|B00AHR3IG4|A329823SXZ8IBE|   5.0|            36.0|      28.0|  4.993231|
|B00AHR3IG4|A329823SXZ8IBE|   5.0|            36.0|      28.0|  4.993231|
|B00DM8JA7Q| 

In [20]:
from pyspark.sql.functions import lit, current_timestamp, unix_timestamp
ver = model.uid
ts = unix_timestamp(current_timestamp())
product_vectors = model.itemFactors.select("id",\
                                         col("features").alias("model_factor"),\
                                         lit(ver).alias("model_version"),\
                                         ts.alias("model_timestamp"))
product_vectors.show(2)

+---+--------------------+----------------+---------------+
| id|        model_factor|   model_version|model_timestamp|
+---+--------------------+----------------+---------------+
|  0|[0.15479389, 0.48...|ALS_34b5e7b4e040|     1599109562|
| 10|[0.59706867, 0.56...|ALS_34b5e7b4e040|     1599109562|
+---+--------------------+----------------+---------------+
only showing top 2 rows



In [21]:
asin_index_meta = [
    f.metadata for f in transformed.schema.fields if f.name == "asin_index"]
asin_index_labels = asin_index_meta[0]["ml_attr"]["vals"]

from pyspark.ml.feature import IndexToString

reviewerId_converter = IndexToString(inputCol="id", outputCol="asin",   labels=asin_index_labels)
PredictedLabels = reviewerId_converter.transform(product_vectors)
PredictedLabels = PredictedLabels.drop('id')
PredictedLabels.show(10)

+--------------------+----------------+---------------+----------+
|        model_factor|   model_version|model_timestamp|      asin|
+--------------------+----------------+---------------+----------+
|[0.15479389, 0.48...|ALS_34b5e7b4e040|     1599109571|B000AST3AK|
|[0.59706867, 0.56...|ALS_34b5e7b4e040|     1599109571|B0006GVNOA|
|[0.11962461, 0.14...|ALS_34b5e7b4e040|     1599109571|B01CTNA1VI|
|[0.32039857, 0.04...|ALS_34b5e7b4e040|     1599109571|B00126NABC|
|[0.047667623, 0.0...|ALS_34b5e7b4e040|     1599109571|B00UB441HS|
|[0.1308046, 0.191...|ALS_34b5e7b4e040|     1599109571|B0042U16YI|
|[0.12145985, 0.13...|ALS_34b5e7b4e040|     1599109571|B00W0W8LMK|
|[0.2897823, 0.301...|ALS_34b5e7b4e040|     1599109571|B004XLDE5A|
|[0.021742053, 0.1...|ALS_34b5e7b4e040|     1599109571|B00W0WXHCO|
|[0.7827199, 0.146...|ALS_34b5e7b4e040|     1599109571|B00NIZ0DV0|
+--------------------+----------------+---------------+----------+
only showing top 10 rows



In [131]:
PredictedLabels.count()

30252

In [23]:
PredictedLabels.write.format("es") \
    .option("es.mapping.id", "asin") \
    .option("es.write.operation", "upsert") \
    .save("products", mode="append")

In [62]:
def vector_query(query_vec, category,vector_field, cosine=False):
    """
    Construct an Elasticsearch script score query using `dense_vector` fields
    
    The script score query takes as parameters the query vector (as a Python list)
    
    Parameters
    ----------
    query_vec : list
        The query vector
    vector_field : str
        The field name in the document against which to score `query_vec`
    q : str, optional
        Query string for the search query (default: '*' to search across all documents)
    cosine : bool, optional
        Whether to compute cosine similarity. If `False` then the dot product is computed (default: False)
     
    Note: Elasticsearch cannot rank negative scores. Therefore, in the case of the dot product, a sigmoid transform
    is applied. In the case of cosine similarity, 1.0 is added to the score. In both cases, documents with no 
    factor vectors are ignored by applying a 0.0 score.
    
    The query vector passed in will be the user factor vector (if generating recommended items for a user)
    or product factor vector (if generating similar items for a given item)
    """
    
    if cosine:
        score_fn = "doc['{v}'].size() == 0 ? 0 : cosineSimilarity(params.vector, '{v}') + 1.0"
    else:
        score_fn = "doc['{v}'].size() == 0 ? 0 : sigmoid(1, Math.E, -dotProduct(params.vector, '{v}'))"
       
    score_fn = score_fn.format(v=vector_field, fn=score_fn)
    
    return {
    "query": {
        "script_score": {
            "query" : { 
                "bool" : {
                      "filter" : {
                            "term" : {
                              "main_category" : category
                            }
                        }
                }
            },
            "script": {
                "source": score_fn,
                "params": {
                    "vector": query_vec
                }
            }
        }
    }
}


def get_similar(the_id, num=10, index="products", vector_field='model_factor'):
    """
    Given a item id, execute the recommendation script score query to find similar items,
    ranked by cosine similarity. We return the `num` most similar, excluding the item itself.
    """
    response = es.get(index=index, id=the_id)
    src = response['_source']
    if vector_field in src:
        query_vec = src[vector_field]
        category = src['main_category']
        q = vector_query(query_vec, category,vector_field, cosine=True)
#         print(q)
        results = es.search(index=index, body=q)
        hits = results['hits']['hits']
        return src,hits[1:num+1]

def display_similar(the_id, num=10, es_index="products"):
    """
    Display query product, together with similar product and similarity scores, in a table
    """
    product, recs = get_similar(the_id, num, es_index)
       
    display(HTML("<h2>Get similar products for:</h2>"))
    display(HTML("<h4>%s (ASIN - %s)</h4>" % (product['title'], product['asin'])))
    display(HTML("<br>"))
    display(HTML("<h2>People who liked this product also liked these:</h2>"))
    sim_html = "<table border=0>"
    i = 0
    pd_data = []
    for rec in recs:
        r_score = rec['_score']
        r_title = rec['_source']['title']
        r = {}
        r['asin'] = rec['_source']['asin']
        r['title'] = r_title
        r['score'] = r_score
        pd_data.append(r)
        r_im_url = next(iter(rec['_source']['image']), '')
        sim_html += "<tr><td><h5>%s</h5><img src=%s width=150></img></td><td><h5>%2.3f</h5></td></tr>" % (r_title, r_im_url, r_score)
        i += 1
    sim_html += "</table>"
    pd.set_option('display.max_colwidth', -1) 
    pd_df = pd.DataFrame (pd_data)
    display(HTML(pd_df.to_html()))
    display(HTML(sim_html))


In [63]:
from IPython.display import Image, HTML, display
data = display_similar('B001A5HT94', num=5)



Unnamed: 0,asin,title,score
0,B00VVH5PRE,"Samsung RS25J500DSR 36&quot; Freestanding Side by Side Refrigerator with 24.52 cu. ft. Capacity,",1.773148
1,B004Y264RI,KitchenAid KBFS25EWMS Architect Series II 24.8 cu. ft. French Door Refrigerator - Stainless Steel,1.772508
2,B002ACEF9S,Whirlpool W4TXNWFWQ 14.4 Cu. Ft. White Top Freezer Refrigerator,1.749504
3,B00K7LDYWS,Oster 3.25 CF Refrigerator-Black-OSDR325B1,1.727583
4,B00NI9T5ME,Electrolux EI32AR80QS 18.6 Cu. Ft. Stainless Steel Freezerless Refrigerator,1.71689


0,1
"Samsung RS25J500DSR 36"" Freestanding Side by Side Refrigerator with 24.52 cu. ft. Capacity,",1.773
KitchenAid KBFS25EWMS Architect Series II 24.8 cu. ft. French Door Refrigerator - Stainless Steel,1.773
Whirlpool W4TXNWFWQ 14.4 Cu. Ft. White Top Freezer Refrigerator,1.75
Oster 3.25 CF Refrigerator-Black-OSDR325B1,1.728
Electrolux EI32AR80QS 18.6 Cu. Ft. Stainless Steel Freezerless Refrigerator,1.717
