# Setting up Retrotech Products + Signals

In [11]:
# Notebook setup: make the repository's `aips` helper package importable,
# bring in the display utilities, and start the Spark session used by later cells.
import sys
sys.path.append('..')  # must run before the `aips` import below
# NOTE(review): later cells use `requests`, `solr_url`, `create_collection`,
# `upsert_text_field` and `render_search_results` without importing them --
# presumably they are provided by this star import; confirm.
from aips import *
import os
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("aips-ch4-signals-boosting")
    .getOrCreate()
)

## Create Products and Signals Collections

In [2]:
# Create the two collections that the datasets are indexed into.
# Both names are kept as notebook-level variables because later cells read them.
products_collection = "products"
signals_collection = "signals"

create_collection(products_collection)
# Make these fields explicitly text fields for later searching.
for text_field in ("name", "longDescription", "manufacturer"):
    upsert_text_field(products_collection, text_field)

create_collection(signals_collection)

Wiping 'products' collection
Creating products' collection
Status: Success
Adding 'name' field to collection
Status: Success
Adding 'longDescription' field to collection
Status: Success
Adding 'manufacturer' field to collection
Status: Success
Wiping 'signals' collection
Creating signals' collection
Status: Success


In [3]:
# Get datasets: fetch the retrotech repository and unpack its product and signal archives.
# Clone only when no 'retrotech' directory exists yet, so a re-run skips the clone.
![ ! -d 'retrotech' ] && git clone https://github.com/ai-powered-search/retrotech.git
# Bring an existing checkout up to date.
! cd retrotech && git pull
# Extract both archives inside the checkout (the recorded output lists products.csv and signals.csv).
! cd retrotech && tar -xvf products.tgz && tar -xvf signals.tgz

Cloning into 'retrotech'...
remote: Enumerating objects: 19, done.[K
remote: Total 19 (delta 0), reused 0 (delta 0), pack-reused 19[K
Unpacking objects: 100% (19/19), done.
Already up to date.
products.csv
signals.csv


In [4]:
def index_csv(csv_path, collection, label, zkhost="aips-zk"):
    """Load a CSV file with Spark and write it to a Solr collection.

    Args:
        csv_path: Path of the CSV file; the first row is read as the header
            and column types are inferred.
        collection: Name of the target Solr collection.
        label: Human-readable dataset name used in the progress messages.
        zkhost: ZooKeeper host passed to the Solr writer.

    Returns:
        The Spark DataFrame that was read from `csv_path`.
    """
    print("Loading " + label + "...")
    update_opts = {
        "zkhost": zkhost,
        "collection": collection,
        "gen_uniq_key": "true",
        "commit_within": "5000",
    }
    # "csv" is Spark's built-in CSV source; "com.databricks.spark.csv" is the
    # legacy name of the same reader.
    csv_df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csv_path)
    )
    csv_df.write.format("solr").options(**update_opts).mode("overwrite").save()
    print(label + " Schema: ")
    csv_df.printSchema()
    return csv_df


# Index products, then signals; each dataset keeps its own DataFrame name.
products_df = index_csv("retrotech/products.csv", products_collection, "Products")
signals_df = index_csv("retrotech/signals.csv", signals_collection, "Signals")

Loading Products...
Products Schema: 
root
 |-- upc: string (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- longDescription: string (nullable = true)

Loading Signals...
Signals Schema: 
root
 |-- query_id: string (nullable = true)
 |-- user: string (nullable = true)
 |-- type: string (nullable = true)
 |-- target: string (nullable = true)
 |-- signal_time: timestamp (nullable = true)



## Create Signals Boosts (Signals Aggregation)

In [5]:
# Build the signals-boosting collection: for every (query, clicked document)
# pair found in the raw signals, store how many clicks it received as `boost`.
signals_boosting_collection="signals_aggregation"
create_collection(signals_boosting_collection)

signals_opts={"zkhost": "aips-zk", "collection": signals_collection}
signals_boosting_opts={"zkhost": "aips-zk", "collection": signals_boosting_collection, "gen_uniq_key": "true", "commit_within": "5000"}

# Read the indexed signals back from Solr and expose them to Spark SQL as "signals".
indexed_signals_df = spark.read.format("solr").options(**signals_opts).load()
# createOrReplaceTempView replaces the deprecated registerTempTable and is
# safe to re-run because it overwrites an existing view of the same name.
indexed_signals_df.createOrReplaceTempView("signals")

print("Aggregating Signals to Create Signals Boosts...")

# Self-join of the signals on query_id: `c` rows are clicks, `q` rows are the
# queries they belong to. The WHERE filter on q.type discards rows without a
# matching query, so the left join behaves like an inner join here.
signals_aggregation_query = """
select q.target as query, c.target as doc, count(c.target) as boost
  from signals c left join signals q on c.query_id = q.query_id
  where c.type = 'click' AND q.type = 'query'
  group by query, doc
  order by boost desc
"""

spark.sql(signals_aggregation_query).write.format("solr").options(**signals_boosting_opts).mode("overwrite").save()
print("Signals Aggregation Completed!")

Wiping 'signals_aggregation' collection
Creating signals_aggregation' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Search without Signals Boosts

In [22]:
# Baseline keyword search against the products collection, with no signals boosting.
query = "ipad"
collection = "products"

# edismax search over the text fields; ties on score are broken by upc.
edismax_params = {
    "qf": "name manufacturer longDescription",
    "defType": "edismax",
    "indent": "true",
    "sort": "score desc, upc asc",
}
request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 5,
    "params": edismax_params,
}

select_url = solr_url + collection + "/select"
select_response = requests.post(select_url, json=request)
search_results = select_response.json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))

## Search with Signals Boosts Applied

In [20]:
# Search with signals boosting: first look up the most-clicked documents for
# this query in the aggregation collection, then use them to boost the
# matching products in the main search.
query = "ipad"

# Step 1: fetch the top boosted documents recorded for the query.
signals_boosts_query = {
    "query": query,
    "fields": ["doc", "boost"],
    "limit": 10,
    "params": {
      "defType": "edismax",
      "qf": "query",
      "sort": "boost desc"
    }
}

signals_boosts = requests.post(solr_url + signals_boosting_collection + "/select", json=signals_boosts_query).json()["response"]["docs"]
print("Boost Documents: \n")
print(signals_boosts)

# Step 2: turn the boost documents into a query string of the form
# "<doc>"^<boost> "<doc>"^<boost> ...
product_boosts = " ".join(
    '"' + entry['doc'] + '"^' + str(entry['boost'])
    for entry in signals_boosts
)

print("\nBoost Query: \n" + product_boosts)

# Step 3: run the product search, multiplying each score by
# 1 + (the document's score for the boost query evaluated against upc).
collection = "products"
search_params = {
    "qf": "name manufacturer longDescription",
    "defType": "edismax",
    "indent": "true",
    "sort": "score desc, upc asc",
}
if product_boosts:
    # Only send the boost when there is something to boost with.
    # NOTE(review): how Solr treats an empty $signals_boosting is not verified
    # here, so the parameters are left out in that case.
    search_params["boost"] = "sum(1,query({! df=upc v=$signals_boosting}))"
    search_params["signals_boosting"] = product_boosts

request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 5,
    "params": search_params
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '885909457588', 'boost': 966}, {'doc': '885909457595', 'boost': 205}, {'doc': '885909471812', 'boost': 202}, {'doc': '886111287055', 'boost': 109}, {'doc': '843404073153', 'boost': 73}, {'doc': '635753493559', 'boost': 62}, {'doc': '885909457601', 'boost': 62}, {'doc': '885909472376', 'boost': 61}, {'doc': '610839379408', 'boost': 29}, {'doc': '884962753071', 'boost': 28}]

Boost Query: 
"885909457588"^966 "885909457595"^205 "885909471812"^202 "886111287055"^109 "843404073153"^73 "635753493559"^62 "885909457601"^62 "885909472376"^61 "610839379408"^29 "884962753071"^28
