In [1]:
import os
import pandas as pd
import re
import json
import time
import requests
from requests.auth import HTTPBasicAuth
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

## Home Depot use case

In [2]:
# custom ES wrapper
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, path_to_ca_certificates):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username
        @param password: the Elasticsearch password
        @param path_to_ca_certificates: path to the CA certificate file, passed
               to requests as `verify` on every call
        """
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        #self.header={'Content-Type': '--data-binary application/x-ndjson'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.path_to_ca_certificates = path_to_ca_certificates
        # dispatch table from the lower-cased method name to the requests function
        self.methods_mapping = {'get': requests.get,
                                'put': requests.put,
                                'post': requests.post,
                                'delete': requests.delete}

    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete' (case-insensitive)
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        @return the response object returned by requests
        @raise KeyError: if `method` is not one of the supported methods
        """
        http_method = self.methods_mapping[method.lower()]
        # Authenticate with the credentials given to the constructor rather than
        # with module-level globals; requests treats a (user, password) tuple
        # as HTTP basic authentication.
        r = http_method(f'{self.endpoint}/{handle}',
                        auth=(self.username, self.password),
                        headers=self.header, json=json,
                        verify=self.path_to_ca_certificates)
        return r

In [3]:
# !!! CUSTOMIZE THIS SECTION WITH YOUR CREDENTIALS !!!

# credentials and global variables
# Connection settings can be overridden through environment variables so that
# real credentials do not have to be saved in the notebook; the fallbacks are
# the tutorial defaults.
USER = os.environ.get('ES_USER', 'elastic')
PWD = os.environ.get('ES_PASSWORD', 'asdqweasd')
index_name = 'homedepot'
ES_ENDPOINT = os.environ.get('ES_ENDPOINT', 'https://localhost:9200')
path_to_ca_certificates = os.environ.get('ES_CA_CERTS', '../certs/ca/ca.crt')
# path_to_ca_certificates = '/PATH/TO/CERT/elasticsearch-8.5.3/config/certs/http_ca.crt'
# shared HTTP helper used by the cells below
e = Elastic(ES_ENDPOINT, USER, PWD, path_to_ca_certificates)

In [12]:
import os
# Invoke the RankLib jar with placeholder parameters; the recorded output below
# shows the command-line usage text followed by the value returned by os.system.
os.system('java -jar RankLib-2.18.jar …params…')

Usage: java -jar RankLib.jar <Params>
Params:
  [+] Training (+ tuning and evaluation)
	-train <file>		Training data
	-ranker <type>		Specify which ranking algorithm to use
				0: MART (gradient boosted regression tree)
				1: RankNet
				2: RankBoost
				3: AdaRank
				4: Coordinate Ascent
				6: LambdaMART
				7: ListNet
				8: Random Forests
				9: Linear regression (L2 regularization)
	[ -feature <file> ]	Feature description file: list features to be considered by the learner, each on a separate line
				If not specified, all features will be used.
	[ -metric2t <metric> ]	Metric to optimize on the training data.  Supported: MAP, NDCG@k, DCG@k, P@k, RR@k, ERR@k (default=ERR@10)
	[ -gmax <label> ]	Highest judged relevance label. It affects the calculation of ERR (default=4, i.e. 5-point scale {0,1,2,3,4})
	[ -qrel <file> ]	TREC-style relevance judgment file. It only affects MAP and NDCG (default=unspecified)
	[ -silent ]		Do not print progress messages (which are printed by default)


0

### Step 1. Data preparation, index population

In [4]:
# relevance judgements: one row per (query, product) pair
df_queries = pd.read_csv('queries.csv')
# product catalogue that gets indexed into Elasticsearch
df_products = pd.read_csv('products.csv')

In [5]:
# preview the first judgement rows
df_queries.head()

Unnamed: 0,qid,product_uid,relevance,product_title,search_term
0,0,100490,3.0,Milwaukee Metal Hole Saw Kit (15-Piece),$ hole saw
1,0,165925,3.0,Milwaukee 6-3/8 in. Recessed Light Hole Saw,$ hole saw
2,0,125403,2.33,Ryobi Hole Saw Set (6-Piece),$ hole saw
3,0,121588,2.33,PRO-SERIES 19-Piece Hole Saw Set with Case,$ hole saw
4,0,197110,1.33,Milwaukee 2 in. to 7 in. Dia. Adjustable Hole ...,$ hole saw


In [6]:
#transform dataframe into json format
# Move product_uid into the index only once, so that re-running this cell does
# not fail with a KeyError after the column has already become the index.
if df_products.index.name != 'product_uid':
    df_products = df_products.set_index('product_uid')
# one dict per product (the index is not part of the records) ...
docs = df_products.to_dict(orient='records')
# ... and the matching product ids, in the same order
doc_ids = df_products.index
print(doc_ids[0])
print(docs[0])

100001
{'product_description': 'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws', 'product_title': 'Simpson Strong-Tie 12-Gauge Angle'}


In [7]:
# create an index
# Index definition: two text fields scored with an explicitly configured BM25 similarity.
create_index_json={
  "mappings" : {
      "properties" : {
        "product_description" : {
          "type" : "text"
        },
        "product_title" : {
          "type" : "text"
        }
      }
  },
  "settings": {
    "number_of_shards": 2, 
    # NOTE(review): the endpoint configured in this notebook is a single localhost
    # node; presumably the 2 replicas cannot be allocated there - confirm this is intended.
    "number_of_replicas": 2,
    "index" : {
        "similarity" : {
          "default" : {
            "type" : "BM25", "b": 0.75, "k1": 1.2
          }
        }
    },
    "analysis": {
      "analyzer": {
        # NOTE(review): this analyzer is defined but no field in "mappings" references it.
        "std_english": {"type": "standard", "stopwords": "_english_" }
      }
    }
  }
}

# PUT <index_name> creates the index; the JSON answer is displayed as the cell output
e.curl('put', index_name, json=create_index_json).json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'homedepot'}

In [10]:
# bulk indexing
es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, http_auth=(USER, PWD))
# one index action per product, using the product id as the document id
actions = [
  {
    "_index": index_name,
    "_id": doc_id,
    "_source": doc
  }
  for doc_id, doc in zip(doc_ids, docs)
]
# deactivate refresh in preparation of data indexing
r = e.curl('put', f'{index_name}/_settings', {'index' : {'refresh_interval' : -1}})
try:
    # send actions in bulk (the API takes care of chunking them optimally)
    bulk(es, actions)
finally:
    # re-activate refresh even when indexing fails, so the index is never
    # left with refresh disabled
    r = e.curl('put', f'{index_name}/_settings', {'index' : {'refresh_interval' : '3s'}})

In [13]:
#see if all products have been indexed
# (the returned count should match the number of product rows that were sent)
e.curl('get', f'{index_name}/_count').json()

{'count': 124428,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}}

In [14]:
#check one example document
# fetch a single product by its id to inspect the stored _source
e.curl('get', f'{index_name}/_doc/100001').json()

{'_index': 'homedepot',
 '_id': '100001',
 '_version': 2,
 '_seq_no': 56481,
 '_primary_term': 1,
 'found': True,
 '_source': {'product_description': 'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws',

### Learning to rank

#### Step 2. Create the feature store

In [30]:
# initialize the default LTR feature store
e.curl('put', '_ltr').json()
# to reset everything, run:
# e.curl('delete', '_ltr').json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': '.ltrstore'}

#### Step 3. Create a feature set

In [31]:
def _match_feature(feature_name, field):
    """Build one feature: a mustache-templated `match` query of `keywords` against `field`."""
    return {
        "name": feature_name,
        "params": ["keywords"],
        "template_language": "mustache",
        "template": {"match": {field: "{{keywords}}"}},
    }

# Feature set with one textual-match feature per indexed field; the "validation"
# section supplies sample parameters and the index used to check the features.
create_feature_set_json = {
    "featureset": {
        "features": [
            _match_feature("f_product_title", "product_title"),
            _match_feature("f_product_description", "product_description"),
        ]
    },
    "validation": {
        "params": {"keywords": "circular saw"},
        "index": "homedepot",
    },
}

e.curl('post', '_ltr/_featureset/depot_textual_features', create_feature_set_json).json()

{'_index': '.ltrstore',
 '_id': 'featureset-depot_textual_features',
 '_version': 1,
 'result': 'created',
 'forced_refresh': True,
 '_shards': {'total': 1, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [32]:
# see the created feature list
# (lists the feature sets currently stored in the feature store)
e.curl('get', '_ltr/_featureset').json()

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.0,
  'hits': [{'_index': '.ltrstore',
    '_id': 'featureset-depot_textual_features',
    '_score': 0.0,
    '_source': {'name': 'depot_textual_features',
     'type': 'featureset',
     'featureset': {'name': 'depot_textual_features',
      'features': [{'name': 'f_product_title',
        'params': ['keywords'],
        'template_language': 'mustache',
        'template': {'match': {'product_title': '{{keywords}}'}}},
       {'name': 'f_product_description',
        'params': ['keywords'],
        'template_language': 'mustache',
        'template': {'match': {'product_description': '{{keywords}}'}}}]}}}]}}

#### Step 4. Query the index to get features for query-doc pairs 

In [33]:
# reformat the judgement set
df_judgements = df_queries.set_index(['qid'])
judgements = df_judgements.to_dict(orient='records')
qid_list = df_judgements.index

# this will be a dictionary-like object structured as follows:
# (qid,query)->docid->{'relevance':relevance}
l = {}

for qid, info in zip(qid_list, judgements):
    # setdefault creates the per-query dict the first time a (qid, query) key is
    # seen, without a bare try/except that could hide unrelated errors
    judged_docs = l.setdefault((qid, info['search_term']), {})
    judged_docs[info['product_uid']] = {'relevance': info['relevance']}
print(len(l))
l[(0, '$ hole saw')]

11795


{100490: {'relevance': 3.0},
 165925: {'relevance': 3.0},
 125403: {'relevance': 2.33},
 121588: {'relevance': 2.33},
 197110: {'relevance': 1.33},
 111472: {'relevance': 2.0},
 100712: {'relevance': 3.0},
 133315: {'relevance': 2.0}}

In [34]:
# Search template: the terms filter restricts the hits to the judged documents
# of one query, and the sltr filter logs the feature values of the feature set
# for every hit (returned under fields._ltrlog[0].log_entry).
q_json = {
    "query": {
        "bool": {
            "filter": [
                {
                    "terms": {
                        "_id": None
                    }
                },
                {
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "depot_textual_features",
                        "params": {
                            "keywords": None
                        }
                    }
                }
            ]
        }
    },
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    },
    "size":10000
}

# loops over all queries (takes several minutes). can you do the same in bulk?
for (qid, query), judged_docs in l.items():
    #set the list of product ids and the query in the query template
    q_json["query"]["bool"]["filter"][0]["terms"]["_id"] = list(judged_docs)
    q_json["query"]["bool"]["filter"][1]["sltr"]["params"]["keywords"] = query
    #send request to ES to get feature values for the given query,product pairs
    response = e.curl('post', f'{index_name}/_search', q_json)
    # surface HTTP errors here instead of an opaque KeyError on 'hits' below
    response.raise_for_status()
    res = response.json()
    #parse response
    for hit in res['hits']['hits']: #for each hit
        docid = int(hit['_id']) # get the document id
        for field in hit['fields']['_ltrlog'][0]['log_entry']: # fields with feature vals
            # when no value is reported, no query-text match has been found: store 0
            judged_docs[docid][field['name']] = field.get('value', 0)
print(f'Example query: id={qid}, terms={query}')
print('Corresponding documents with their relevance and features:')
print(l[(qid,query)])

Example query: id=11794, terms=zwave switch
Corresponding documents with their relevance and features:
{116711: {'relevance': 3.0, 'f_product_title': 4.940411, 'f_product_description': 0}, 135547: {'relevance': 2.67, 'f_product_title': 4.795978, 'f_product_description': 5.5609503}, 142033: {'relevance': 3.0, 'f_product_title': 4.890269, 'f_product_description': 3.7596512}, 141628: {'relevance': 3.0, 'f_product_title': 0, 'f_product_description': 0}, 134888: {'relevance': 2.0, 'f_product_title': 0, 'f_product_description': 3.467341}, 113534: {'relevance': 2.33, 'f_product_title': 0, 'f_product_description': 0}, 152640: {'relevance': 2.67, 'f_product_title': 0, 'f_product_description': 0}}


In [35]:
def to_ranklib_format(d, feats):
    """
    Helper function to create a Ranklib-formatted dataframe
    e.g., 4   qid:1   1:9.510193  2:10.7808075
    @param d: a dictionary object formatted as:
           (qid, query) -> docid -> {'relevance':relevance_score, feature_name:feature_value}
    @param feats: the names of the features, in the intended order; the i-th
           name (1-based) becomes feature index i in the output
    @return a dataframe formatted according to Ranklib format, with one row per
            (query, document) pair and columns ['relevance', 'qid'] + feats
    """
    feats = list(feats)
    rows = []
    for (qid, _query), judged_docs in d.items():
        for doc_info in judged_docs.values():
            row = [doc_info['relevance'], f'qid:{qid}']
            # A feature that was never logged for this document is written as 0
            # rather than silently skipped, so every row keeps one value per
            # feature and the columns stay aligned.
            row.extend(f'{feat_idx}:{doc_info.get(feat, 0)}'
                       for feat_idx, feat in enumerate(feats, start=1))
            rows.append(row)
    header = ['relevance', 'qid'] + feats
    return pd.DataFrame(rows, columns=header)

# the order of the names defines the feature indices in the RankLib file
feat_names = ['f_product_title','f_product_description']
df_ranklib = to_ranklib_format(l, feat_names)
# write the rows tab-separated, without header line or index column
df_ranklib.to_csv('homedepot_ranklib.tsv', header=False, index=False, sep='\t')
print(len(df_ranklib))
df_ranklib.head()

74067


Unnamed: 0,relevance,qid,f_product_title,f_product_description
0,3.0,qid:0,1:11.129553,2:10.917992
1,3.0,qid:0,1:10.309599,2:12.055068
2,2.33,qid:0,1:11.590466,2:5.591045
3,2.33,qid:0,1:10.215725,2:10.783504
4,1.33,qid:0,1:4.8012733,2:4.6392903


#### Step 5. Train a LTR model

Check out the RankLib documentation: https://sourceforge.net/p/lemur/wiki/browse_pages/

Rankers:

0: MART (gradient boosted regression tree)

1: RankNet

2: RankBoost

3: AdaRank

4: Coordinate Ascent

6: LambdaMART

7: ListNet

8: Random Forests

In [36]:
feature_file = 'homedepot_ranklib.tsv'
model_file = 'homedepot_ndgc_model.txt'
test_file = 'homedepot_testfile.tsv'
# ranker 6 = LambdaMART, optimised for NDCG@5 on the training data
cmd = f'java -jar RankLib-2.18.jar -train {feature_file} -ranker 6 -metric2t NDCG@5 -save {model_file}'
print('Running RankLib')
exit_status = os.system(cmd)
# stop here when the command fails, so that later cells do not upload a
# missing or stale model file
if exit_status != 0:
    raise RuntimeError(f'RankLib command failed with exit status {exit_status}: {cmd}')
print('Finished')

Running RankLib

Discard orig. features
Training data:	homedepot_ranklib.tsv
Feature vector representation: Dense.
Ranking method:	LambdaMART
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@5
Test metric:	NDCG@5
Feature normalization: No
Model file: homedepot_ndgc_model.txt

[+] LambdaMART's Parameters:
No. of trees: 1000
No. of leaves: 10
No. of threshold candidates: 256
Min leaf support: 1
Learning rate: 0.1
Stop early: 100 rounds without performance gain on validation data

Reading feature file [homedepot_ranklib.tsv]... [Done.]            
(11795 ranked lists, 74067 entries read)
Initializing... [Done]
---------------------------------
Training starts...
---------------------------------
#iter   | NDCG@5-T  | NDCG@5-V  | 
---------------------------------
1       | 0.8639    | 
2       | 0.8638    | 
3       | 0.8647    | 
4       | 0.8646    | 
5       | 0.8645    | 
6       | 0.8646    | 
7       | 0.8645    | 
8       | 0.8647    | 
9       |

#### Step 6. Upload the model to ES

In [37]:
# read the model definition that was saved by the training step
with open(model_file, 'rt') as model_fh:
    model_specification = model_fh.read()

# payload: a named model whose definition is the RankLib model text
ranklib_model = {"type": "model/ranklib", "definition": model_specification}
model_json = {"model": {"name": "ndcg_model", "model": ranklib_model}}

# create the model under the feature set its features come from
e.curl('post', '_ltr/_featureset/depot_textual_features/_createmodel', model_json).json()

{'_index': '.ltrstore',
 '_id': 'model-ndcg_model',
 '_version': 1,
 'result': 'created',
 'forced_refresh': True,
 '_shards': {'total': 1, 'successful': 1, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

#### Step 7. Use model to rerank query results

In [38]:
#standard query, results ranked with BM25
query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"product_title": "circular saw"}},
                {"match": {"product_description": "circular saw"}}
            ]
        }
    }
}

res = dict(e.curl('post', 'homedepot/_search', query).json())
docids_ranked_lambdamart = [hit['_id'] for hit in res['hits']['hits']]

In [42]:
# same query, results re-ranked with LTR 
query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"product_title": "circular saw"}},
                {"match": {"product_description": "circular saw"}}
            ]
        }
    },
    "rescore": {
        "window_size": 50,
        "query": {
            "rescore_query": {
                "sltr": {
                    "params": {
                        "keywords": "circular saw"
                    },
                    "model": "ndcg_model"
                }
            }
        }
    }
}

res = dict(e.curl('post', 'homedepot/_search', query).json())
# print(res)
docids_ranked_bm25 = [hit['_id'] for hit in res['hits']['hits']]

In [43]:
print('Standard BM25 ranking:')
print(docids_ranked_bm25)
print('LambdaMART re-ranking:')
print(docids_ranked_lambdamart)

Standard BM25 ranking:
['168789', '147893', '188157', '173997', '132370', '168199', '110984', '150220', '144745', '160195']
LambdaMART re-ranking:
['168789', '147893', '173997', '188157', '132370', '110984', '168199', '160195', '144745', '150220']
