**Note: This notebook is an adaptation of the lab 5 & 6 notebooks, so variables and the structure of the code may seem similar**

* Please run all of this code below on **Colab notebooks**

# Inspection of the dataset

In [1]:
import pandas as pd
# Full preprocessed dataset - case-folding, stop word removal, puncuation removal, and lemmatisation 
# applied to fields: product_title and product_description 
df = pd.read_csv("home_depot_trainset_preprocessed.csv", 
                 usecols = ["id", "product_uid", "product_title", 
                            "product_description", "search_term", "relevance"])
df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,simpson strongtie 12gauge angle,angle bracket,3.0,angle make joint stronger also provide consist...
1,3,100001,simpson strongtie 12gauge angle,l bracket,2.5,angle make joint stronger also provide consist...
2,9,100002,behr premium textured deckover 1gal sc141 tugb...,deck over,3.0,behr premium textured deckover innovative soli...
3,16,100005,delta vero 1handle shower faucet trim kit chro...,rain shower head,2.33,update bathroom delta vero singlehandle shower...
4,17,100005,delta vero 1handle shower faucet trim kit chro...,shower only faucet,2.67,update bathroom delta vero singlehandle shower...


In [2]:
len(df['search_term'].unique())

11795

In [3]:
len(df['product_uid'].unique())

54667

In [4]:
len(df)

74067

In [5]:
# dedup dataset - keep distinct product_uid (document ids)
df2 = pd.read_csv("home_depot_trainset_preprocessed_dedup.csv", 
                 usecols = ["id", "product_uid", "product_title", 
                            "product_description", "search_term", "relevance"])
df2.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,simpson strongtie 12gauge angle,angle bracket,3.0,angle make joint stronger also provide consist...
1,9,100002,behr premium textured deckover 1gal sc141 tugb...,deck over,3.0,behr premium textured deckover innovative soli...
2,16,100005,delta vero 1handle shower faucet trim kit chro...,rain shower head,2.33,update bathroom delta vero singlehandle shower...
3,18,100006,whirlpool 19 cu ft range convection microwave ...,convection otr,3.0,achieving delicious result almost effortless w...
4,23,100007,lithonia lighting quantum 2light black led eme...,emergency light,2.67,quantum adjustable 2light led black emergency ...


In [6]:
print(len(df), len(df2))

74067 54667


In [7]:
len(df2['search_term'].unique())

11092

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74067 entries, 0 to 74066
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   74067 non-null  int64  
 1   product_uid          74067 non-null  int64  
 2   product_title        74067 non-null  object 
 3   search_term          74067 non-null  object 
 4   relevance            74067 non-null  float64
 5   product_description  74067 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 3.4+ MB


In [9]:
df2.shape

(54667, 6)

In [10]:
#drop duplicate products that are there because of the relevance data
df2 = df2.drop_duplicates(subset='product_uid')

In [11]:
#we now have 10824 unique products
df2.shape

(54667, 6)

# Get ready to feed into the model

Make sure to run each cell separately and wait for each cell to finish running in order for ElasticSearch to work properly

In [12]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q

In [13]:
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz

In [14]:
!chown -R daemon:daemon elasticsearch-7.0.0

In [15]:
! pip install elasticsearch -q

In [16]:
# import elasticsearch
from elasticsearch import Elasticsearch
# start es server
import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon need 0 not 1
                 )

In [17]:
# instantiating ES
def test_ES(es):
  """
  This function checks to see if we have successfully started an 
  ES instance and imported its python (version) library

  parameters:
    (es): takes in a instance of elasticsearch
  
  returns:
    boolean value of whether the instantiation is working or not
  """
  
  return es.ping()  # got True

# start and testing es 
es = Elasticsearch()
if test_ES(es):
  print('ES instance is working, now run the cells below')
else:
  print('ES instance is not working and re-run the cells above again')

ES instance is working, now run the cells below


In [18]:
# getting our dataset ready to put into the retrieval model
def ready2feed(df2):
  """
  This function separates the features of each product into separate lists
  and then combines them in a tuple 

  parameters:
    (df): pandas dataframe that is used to get our dataset in an appropriate 
          format
  
  returns:
    (list) : list containing tuples of each product where each tuple contains
             the features of the product
  """

  id, product_uid = [id for id in df2.id], [uid for uid in df2.product_uid]

  product_title, product_description = [title for title in df2.product_title], \
   [description for description in df2.product_description]

  search_term, relevance = [s_t for s_t in df2.search_term], \
  [rel for rel in df2.relevance]

  return list(zip(id, product_uid, 
                  product_title, product_description, 
                  search_term, relevance))

# testing to see if our function works
corpus = ready2feed(df2) # feed the dataset into the function
# check to see if the first product is in the right format as we intended
corpus[0]

(2,
 100001,
 'simpson strongtie 12gauge angle',
 'angle make joint stronger also provide consistent straight corner simpson strongtie offer wide variety angle various size thickness handle lightduty job project structural connection needed bent skewed match project outdoor project moisture present use zmax zinccoated connector provide extra resistance corrosion look z end model numberversatile connector various 90 connection home repair projectsstronger angled nailing screw fastening alonehelp ensure joint consistently straight strongdimensions 3 x 3 x 112 inmade 12gauge steelgalvanized extra corrosion resistanceinstall 10d common nail 9 x 112 strongdrive sd screw',
 'angle bracket',
 3.0)

In [19]:
print(len(corpus))

54667


# Models

In [20]:
# do not run this cell if this is your "first run" of the notebook
# uncomment this when you want to use another retrieval model 
# and then comment it again after running this cell
#es.indices.delete(index_name)

#### bm25_default

In [21]:
# mappings are used to define what kind of structure your data has. here explicit mapping is used: 
# https://www.elastic.co/guide/en/elasticsearch/reference/current/explicit-mapping.html

# The mapping is used when creating the index through the request body:

bm25_default = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'bm25_similarity': {
                    'type': 'BM25',
                    'b': '0.75',
                    'k1': '1.2'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'id': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_uid': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_title': {'type': 'text', 'similarity': 'bm25_similarity'},
              'product_description': {'type': 'text', 'similarity': 'bm25_similarity'},
              'search_term': {'type': 'text', 'similarity': 'bm25_similarity'},
              'relevance': {'type': 'float', 'similarity': 'bm25_similarity'}
          }
    }
}

#### bm25_high

In [22]:
# high values
# b = 1 corresponds to fully scaling the term weight by the document length
# full doc length normalization
bm25_high = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'bm25_similarity': {
                    'type': 'BM25',
                    'b': '1.0',
                    'k1': '2.0'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'id': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_uid': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_title': {'type': 'text', 'similarity': 'bm25_similarity'},
              'product_description': {'type': 'text', 'similarity': 'bm25_similarity'},
              'search_term': {'type': 'text', 'similarity': 'bm25_similarity'},
              'relevance': {'type': 'float', 'similarity': 'bm25_similarity'}
          }
    }
}

#### bm25_low

In [23]:
# low values
# b = 0 corresponds to no doc length normalization.
bm25_low = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'bm25_similarity': {
                    'type': 'BM25',
                    'b': '0.0',
                    'k1': '1.0'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'id': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_uid': {'type': 'integer', 'similarity': 'bm25_similarity'},
              'product_title': {'type': 'text', 'similarity': 'bm25_similarity'},
              'product_description': {'type': 'text', 'similarity': 'bm25_similarity'},
              'search_term': {'type': 'text', 'similarity': 'bm25_similarity'},
              'relevance': {'type': 'float', 'similarity': 'bm25_similarity'}
          }
    }
}

#### dfr

In [24]:
# DFR
dfr = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'dfr_similarity': {
                    'type': 'DFR',
                    'basic_model': 'g',
                    'after_effect': 'l',
                    'normalization': 'h2',
                    'normalization.h2.c':'3.0'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'id': {'type': 'integer', 'similarity': 'dfr_similarity'},
              'product_uid': {'type': 'integer', 'similarity': 'dfr_similarity'},
              'product_title': {'type': 'text', 'similarity': 'dfr_similarity'},
              'product_description': {'type': 'text', 'similarity': 'dfr_similarity'},
              'search_term': {'type': 'text', 'similarity': 'dfr_similarity'},
              'relevance': {'type': 'float', 'similarity': 'dfr_similarity'}
          }
    }
}

# After

In [25]:
# run this and wait before running next cell
index_name = 'test-index'
try:
  es.indices.get(index_name)
  print('index {} already exists'.format(index_name))
except:
  print('creating index {}'.format(index_name))
  es.indices.create(index_name, body=bm25_default) # change body arguement to different retrieval models - default as seen here is bm25_default

creating index test-index


In [26]:
# now what we want to do is put some data in the index, i.e. index it: 
for id, product_uid, product_title, product_description, search_term, relevance in corpus:
  # these have the same as the mappings in the above cell
  doc_body = {
      'id': id,'product_uid': product_uid,
      'product_title': product_title,'product_description': product_description,
      'search_term': search_term, 'relevance': relevance
  }
  es.index(index_name, doc_body)

In [27]:
# Now let's have a look at our index: got 50 which was supposed to be 25 as ran the cell above twice so duplicated docs
print('we have made and index called {} with {} documents'.format(index_name, es.cat.count(index=index_name,h=['count'])))

we have made and index called test-index with 54667
 documents


In [28]:
def index_info(index_name):
  #docs.count - n.o docs
  #docs.deleted - n.o docs deleted
  count, deleted, shards, =  es.cat.indices(index=index_name, h=['docs.count', 'docs.deleted', 'pri'])[:-1].split(' ')
  print(
      """
      #### INDEX INFO #####
      index_name = {}
      doc_count = {}
      shard_count = {}
      deleted_doc_count = {}
      """.format(index_name, count, shards, deleted)
  )

# User Interface demo

Use the search on the right hand side to test given search queries. 
<br>
**product_description** is the text field searched over.

In [29]:
def search(index_name, query_body):
  # return only doc_id and rank
  results = es.search(index=index_name, body=query_body, explain=False)
  plain_results = [(x['_source']['product_uid'], x['_source']['product_title'], x['_source']['relevance'], x['_score']) for x in results['hits']['hits']]
  return results, plain_results

In [30]:
#@title Product Search - type a search query
query = 'rain shower head'  #@param {type:"string"}
query_body = {
      'query':{
          'query_string': {
              'query': query, 
              'default_field': 'product_description'
          }
      }
  }

results, plain_results = search(index_name, query_body)
for product_id, title, relevance, score in plain_results:
    print(product_id, title, relevance, score)


136785 1spray 16 raincan square ceiling mount rain ultra thin showerhead stainless steel 2.33 17.157784
174206 speakman 10 extension arm 2.33 16.365002
180868 1spray 8 filtered showerhead satin nickel led light 2.0 15.841835
147214 waterpik kent 7spray 6 showerhead chrome 2.67 15.726058
124591 vigo 6jet shower panel system rain shower head handshower stainless steel 2.33 15.169331
155361 kohler watertile rain 1spray 9875 overhead showerhead panel oilrubbed bronze 2.67 15.083951
165792 kohler watertile rain 1spray 9875 overhead showerhead brushed bronze 2.67 15.083951
173237 kohler watertile rain 1spray 9875 overhead showerhead polished chrome 1.67 15.083951
174388 grohe euphoria 1function handshower showerhead combo kit starlight chrome 2.33 14.466092
120585 vigo 6jet shower panel system stainless steel 2.33 14.293229


# Evaluation

In [31]:
# Retrieve distinct queries - dedup dataframe
queries = []
for i in df2['search_term'].unique():
  queries.append(i.lower())
print(len(queries))

11092


In [32]:
# Create test list of queries
import string
queries_select = queries[:200]
queries_select = [''.join(c for c in s if c not in string.punctuation) for s in queries_select]
print(queries_select)

['angle bracket', 'deck over', 'rain shower head', 'convection otr', 'emergency light', 'mdf 34', 'steele stake', 'briggs and stratton lawn mower', 'hampton bay chestnut pull up shade', 'disposer', 'grill gazebo', 'door guards', '1x1 rail decorative wood', 'lawn sprkinler', 'platform for washers', 'concrete  masonry cleaner  etcher', 'belgium block pavers', 'insulation roll', '6ft h bamboo fencing', 'chalk paint', '8 4616809045 9', '6 teir shelving', 'mortar tools', '12 boltless bracket', 'husky tool bag', 'impact driver drill battery powered', 'pellet stove', 'american standard bone round toliet', 'grayson', '6 stell', 'bolt 12 in by 12', 'elastomeric roof coating', 'outdoor dining', 'cushions outdoorlounge', 'wiremesh', 'front doors', 'kingsley moen kitchen faucet', 'roof melter', '6 kraft faced insulation', 'battery lanterns', 'planters pots', '5x5 post', 'bazz lighting', 'tiles 1212', '10  rough toilet bowl', '60 heater gallon gas water', 'shark vacuum', 'under cabinet led', 'doors

In [33]:
# Get the declared retrieval model's scores for each product uid
from statistics import mean
output = []
for i in queries_select: # iterate through list of queries created above

  query_body = {
      'query':{
          'query_string': {
              'query': i, # i in queries list
              'default_field': 'product_description'
          }
      }
  }
  # index_name - index your searching
  results = es.search(index=index_name, body=query_body)['hits']['hits']
  result_list = []
  for hit in results:
    # BM25 is the default at the bottom of the output
    # results is a list of dicts and accessing certain keys in the dicts 
    # i.e. source which is a key with a dict as a value so accessible that dict with a key title, and acesse the score
    x = (i, hit['_source']['product_uid'], hit['_score'])
    result_list.append(x)
  output.extend(result_list)

In [34]:
print(len(output))
print(output)

1826
[('angle bracket', 165217, 13.302946), ('angle bracket', 165126, 12.88057), ('angle bracket', 196666, 12.88057), ('angle bracket', 120048, 12.704327), ('angle bracket', 198519, 12.657677), ('angle bracket', 102651, 12.26751), ('angle bracket', 101571, 12.118238), ('angle bracket', 185641, 11.963935), ('angle bracket', 102005, 11.483875), ('angle bracket', 200720, 11.2125025), ('deck over', 159065, 10.211855), ('deck over', 138042, 7.9596496), ('deck over', 192588, 6.6416717), ('deck over', 180318, 6.6280003), ('deck over', 182306, 6.6280003), ('deck over', 126085, 6.6127906), ('deck over', 130898, 6.6127906), ('deck over', 168842, 6.5983515), ('deck over', 128206, 6.557383), ('deck over', 112952, 6.44625), ('rain shower head', 136785, 17.157784), ('rain shower head', 174206, 16.365002), ('rain shower head', 180868, 15.841835), ('rain shower head', 147214, 15.726058), ('rain shower head', 124591, 15.169331), ('rain shower head', 155361, 15.083951), ('rain shower head', 165792, 15.0

In [35]:
import numpy as np
#turn output into dataframe with each query as a column, row  doc ids, and values as scores
_= pd.DataFrame(output, columns=['Query', 'Doc_ID', 'Score'])
print(len(_['Doc_ID'].unique()))
score_df = _.pivot_table(index ='Doc_ID', columns= 'Query', values = 'Score')
score_df = score_df.replace(np.nan, 0)
score_df

1728


Query,1 black self tapping screws,10 rough toilet bowl,10 window sping rod,10000 btu portable ac,12 boltless bracket,12 x 5 black pipe nipple,14 wonderboard,18 dishwasher,1x1 rail decorative wood,2 paint brush pack,2 panel door,20v dewalt kombo,23 stud,24 bathroom vanities,24 stainless gas range,28 snow thower,2x4 board,3 blue masking tape,3 way,30 x 60 molded one piece acrylic shower stall,3way electrical sockets,4 lights bulbs,48 beadboard paneling,4x4 deck post,4x6,5 gal buckets,5gallon roof patch,5x5 post,6 kraft faced insulation,6 stell,6 teir shelving,60 heater gallon gas water,60w bulb,6ft h bamboo fencing,6in by 6 inlumder,6x6 p sand tile,8 4616809045 9,80 x 36 solid wood,ac window unit,ajustable ladder feet,...,molding trim,mortar tools,mosaic tiles,omnifilter,outdoor dining,pantry rack,patio furniture covers,pellet stove,pine straw,plants moses in a cradle,plastic tubing,platform for washers,plexiglas 18 x 24,ply 12,pruning saw,rain shower head,respirator,roller,roof melter,rug doctor carpet cleaner,satin nickel pull,screen frame,shark cleaner,shark vacuum,sheet metal,sheetrock,shelf track,steele stake,tent,tiles 1212,treated fence posts,tree pruner,under cabinet led,under cabinet lighting,washer dryer sets,water heater blanket,water trap moisture filter,werner ladder,window insulation kit,wiremesh
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
100010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.984669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
100016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
100019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
100045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
100049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
206140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.408565,0.0
206243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
206429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [36]:
# output from df without duplicates removed for a search term 
df.loc[df['search_term'] == 'rain shower head']

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
3,16,100005,delta vero 1handle shower faucet trim kit chro...,rain shower head,2.33,update bathroom delta vero singlehandle shower...
583,1870,100331,pulse showerspas kauai ii brushed nickel showe...,rain shower head,2.67,kauai rain shower system brilliantly simple de...
1124,3517,100616,moen halo 3spray 9 rainshower showerhead chrome,rain shower head,3.0,customize bathing experience moen halo 3spray ...
2120,6541,101126,glacier bay 1spray 8 square showerhead chrome,rain shower head,3.0,glacier bay 1spray 8 square showerhead chrome ...
2477,7615,101287,grohe powerampsoul cosmopolitan 4spray 712 sho...,rain shower head,3.0,grohe powersoul cosmopolitan shower created fu...
4652,14566,102542,kohler forte singlefunction 1spray katalyst sh...,rain shower head,3.0,forte singlefunction showerhead brings innovat...
4849,15205,102654,moen ignite 5spray 9 rainshower showerhead chrome,rain shower head,3.0,subtle elegant detail ignite collection create...
5325,16674,102941,delta 1spray 8 overhead raincan shower head ch...,rain shower head,2.67,create spalike sanctuary bathroom delta arzo 1...


In [37]:
# create qrels list of list from df without duplicates removed
qrels = []
for q in queries_select:  
  df_ = df.loc[df['search_term'] == q]
  product_uid = df_['product_uid'].values[:]
  search_term = df_['search_term'].values[:]
  relevance = df_['relevance'].values[:]
  qrel = []
  for i in range(len(product_uid)):
    qr = [search_term[i], product_uid[i], relevance[i]]
    qrel.append(qr)
  qrels.extend(qrel)
print('number of qrels:', len(qrels))
print(qrels)

number of qrels: 1380
[['angle bracket', 100001, 3.0], ['angle bracket', 100664, 2.67], ['angle bracket', 100672, 2.33], ['angle bracket', 100739, 3.0], ['angle bracket', 101036, 1.33], ['angle bracket', 101370, 3.0], ['angle bracket', 101440, 2.33], ['angle bracket', 101534, 2.33], ['angle bracket', 102239, 2.67], ['angle bracket', 103250, 3.0], ['angle bracket', 103262, 3.0], ['deck over', 100002, 3.0], ['deck over', 100092, 1.67], ['deck over', 101629, 3.0], ['deck over', 101795, 2.67], ['deck over', 103015, 1.67], ['deck over', 103057, 2.33], ['deck over', 103356, 2.67], ['deck over', 103384, 2.33], ['rain shower head', 100005, 2.33], ['rain shower head', 100331, 2.67], ['rain shower head', 100616, 3.0], ['rain shower head', 101126, 3.0], ['rain shower head', 101287, 3.0], ['rain shower head', 102542, 3.0], ['rain shower head', 102654, 3.0], ['rain shower head', 102941, 2.67], ['convection otr', 100006, 3.0], ['convection otr', 101635, 2.67], ['convection otr', 105261, 2.67], ['con

In [38]:
# Delete search terms which did not yield results from query list and enumerate query list to give numbered query id dictionary
# left as is - approximately 185 qrels 
queries = []
for i in queries_select:
  if i in score_df.columns:
    queries.append(i)
queries = dict(enumerate(queries))
print(queries)

{0: 'angle bracket', 1: 'deck over', 2: 'rain shower head', 3: 'convection otr', 4: 'emergency light', 5: 'mdf 34', 6: 'steele stake', 7: 'briggs and stratton lawn mower', 8: 'hampton bay chestnut pull up shade', 9: 'disposer', 10: 'grill gazebo', 11: 'door guards', 12: '1x1 rail decorative wood', 13: 'lawn sprkinler', 14: 'platform for washers', 15: 'concrete  masonry cleaner  etcher', 16: 'belgium block pavers', 17: 'insulation roll', 18: '6ft h bamboo fencing', 19: 'chalk paint', 20: '8 4616809045 9', 21: '6 teir shelving', 22: 'mortar tools', 23: '12 boltless bracket', 24: 'husky tool bag', 25: 'impact driver drill battery powered', 26: 'pellet stove', 27: 'american standard bone round toliet', 28: 'grayson', 29: '6 stell', 30: 'bolt 12 in by 12', 31: 'elastomeric roof coating', 32: 'outdoor dining', 33: 'wiremesh', 34: 'front doors', 35: 'kingsley moen kitchen faucet', 36: 'roof melter', 37: '6 kraft faced insulation', 38: 'battery lanterns', 39: '5x5 post', 40: 'bazz lighting', 4

In [39]:
# Use above dictionary to convert search term to query id
for i in qrels:
  for key, value in queries.items():
    if value == i[0]:
      i[0] = key
print(qrels)

[[0, 100001, 3.0], [0, 100664, 2.67], [0, 100672, 2.33], [0, 100739, 3.0], [0, 101036, 1.33], [0, 101370, 3.0], [0, 101440, 2.33], [0, 101534, 2.33], [0, 102239, 2.67], [0, 103250, 3.0], [0, 103262, 3.0], [1, 100002, 3.0], [1, 100092, 1.67], [1, 101629, 3.0], [1, 101795, 2.67], [1, 103015, 1.67], [1, 103057, 2.33], [1, 103356, 2.67], [1, 103384, 2.33], [2, 100005, 2.33], [2, 100331, 2.67], [2, 100616, 3.0], [2, 101126, 3.0], [2, 101287, 3.0], [2, 102542, 3.0], [2, 102654, 3.0], [2, 102941, 2.67], [3, 100006, 3.0], [3, 101635, 2.67], [3, 105261, 2.67], [3, 118541, 2.67], [3, 153486, 3.0], [3, 153568, 3.0], [3, 159017, 3.0], [4, 100007, 2.67], [4, 100058, 3.0], [4, 101502, 2.33], [4, 101836, 2.0], [4, 101894, 3.0], [4, 102044, 3.0], [4, 102098, 2.67], [4, 102418, 3.0], [4, 102599, 2.67], [6, 100010, 2.67], [6, 100790, 2.0], [6, 102221, 2.33], [6, 153601, 2.67], [6, 175540, 2.67], [6, 179690, 3.0], [6, 184373, 3.0], [7, 100011, 3.0], [7, 105641, 2.67], [7, 110755, 3.0], [8, 100012, 2.67],

In [40]:
# Qrels with binary relevance score
qrels_= qrels.copy()
for i in qrels_:
  if i[2] >= 2.0:
    i[2] = 1
  else:
    i[2] = 0
print(qrels_)

[[0, 100001, 1], [0, 100664, 1], [0, 100672, 1], [0, 100739, 1], [0, 101036, 0], [0, 101370, 1], [0, 101440, 1], [0, 101534, 1], [0, 102239, 1], [0, 103250, 1], [0, 103262, 1], [1, 100002, 1], [1, 100092, 0], [1, 101629, 1], [1, 101795, 1], [1, 103015, 0], [1, 103057, 1], [1, 103356, 1], [1, 103384, 1], [2, 100005, 1], [2, 100331, 1], [2, 100616, 1], [2, 101126, 1], [2, 101287, 1], [2, 102542, 1], [2, 102654, 1], [2, 102941, 1], [3, 100006, 1], [3, 101635, 1], [3, 105261, 1], [3, 118541, 1], [3, 153486, 1], [3, 153568, 1], [3, 159017, 1], [4, 100007, 1], [4, 100058, 1], [4, 101502, 1], [4, 101836, 1], [4, 101894, 1], [4, 102044, 1], [4, 102098, 1], [4, 102418, 1], [4, 102599, 1], [6, 100010, 1], [6, 100790, 1], [6, 102221, 1], [6, 153601, 1], [6, 175540, 1], [6, 179690, 1], [6, 184373, 1], [7, 100011, 1], [7, 105641, 1], [7, 110755, 1], [8, 100012, 1], [8, 202245, 1], [9, 100013, 1], [9, 107084, 0], [9, 107177, 1], [9, 111749, 0], [9, 119721, 1], [9, 123157, 1], [9, 142327, 0], [9, 149

In [41]:
def retrieve_ranking(query, score_df):
  q_terms_only = score_df[query]
  return sorted(zip(score_df.index.values,q_terms_only.values), key = lambda tup:tup[1], reverse=True)

In [42]:
# To retrieve and calculate accuracy metrics for each query lets loop over them
k = 10

precision_scores = []
for query_id, query in queries.items():
  doc_ranking = retrieve_ranking(query, score_df)
  retrieved = [doc[0] for doc in doc_ranking[:k]] # take only the document id, rather than score
  qrels_query = [qrel for qrel in qrels_ if qrel[0] == query_id]
  relevant_doc_ids = [qrel[1] for qrel in qrels_query if qrel[-1] == 1]
  non_relevant_doc_ids = [qrel[1] for qrel in qrels_query if qrel[-1] == 0]
  TP = len(set(retrieved) & set(relevant_doc_ids))
  FP = len(set(retrieved) & set(non_relevant_doc_ids))
  FN = len(set(relevant_doc_ids) - set(retrieved))
  if TP or FP != 0:
    precision = (TP) / (TP + FP)
  else:
    precision = 0
  if TP or FN != 0:
    recall = (TP) / (TP + FN)
  else:
    recall = 0
  if precision or recall != 0:
    f1 = 2 * precision * recall / (precision + recall)
  else:
    f1 = 0
  precision_scores.append(precision)

  print('For the query:', query, 'TP: ', TP, 'FP: ', FP, 'FN: ', FN, ' precision:', precision, ',recall:', recall, 'f1:', f1)

For the query: angle bracket TP:  0 FP:  0 FN:  10  precision: 0 ,recall: 0.0 f1: 0
For the query: deck over TP:  0 FP:  0 FN:  6  precision: 0 ,recall: 0.0 f1: 0
For the query: rain shower head TP:  0 FP:  0 FN:  8  precision: 0 ,recall: 0.0 f1: 0
For the query: convection otr TP:  0 FP:  0 FN:  7  precision: 0 ,recall: 0.0 f1: 0
For the query: emergency light TP:  0 FP:  0 FN:  9  precision: 0 ,recall: 0.0 f1: 0
For the query: mdf 34 TP:  0 FP:  0 FN:  0  precision: 0 ,recall: 0 f1: 0
For the query: steele stake TP:  2 FP:  0 FN:  5  precision: 1.0 ,recall: 0.2857142857142857 f1: 0.4444444444444445
For the query: briggs and stratton lawn mower TP:  1 FP:  0 FN:  2  precision: 1.0 ,recall: 0.3333333333333333 f1: 0.5
For the query: hampton bay chestnut pull up shade TP:  0 FP:  0 FN:  2  precision: 0 ,recall: 0.0 f1: 0
For the query: disposer TP:  2 FP:  2 FN:  4  precision: 0.5 ,recall: 0.3333333333333333 f1: 0.4
For the query: grill gazebo TP:  4 FP:  0 FN:  0  precision: 1.0 ,recall

In [43]:
# Calculate mean average precision from precision scores for every query for the chosen model
sum_pre = sum(precision_scores)
map = sum_pre/len(queries_select)
print('mean average precision: ',map)

mean average precision:  0.36011904761904767
