In [None]:
import json
import os
import re
import subprocess
import time

import pandas as pd
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from requests.auth import HTTPBasicAuth

## Home Depot use case

In [None]:
# custom ES wrapper
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, path_to_ca_certificates):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username
        @param password: the Elasticsearch password
        @param path_to_ca_certificates: value passed as `verify` on every
               request (path to the CA certificate used to check the server)
        """
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.path_to_ca_certificates = path_to_ca_certificates
        # Dispatch table: lower-case HTTP verb -> requests function.
        self.methods_mapping = {'get': requests.get,
                                'put': requests.put,
                                'post': requests.post,
                                'delete': requests.delete}

    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete' (case-insensitive)
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        @return the response object returned by the underlying requests call
        @raise KeyError: if `method` is not one of the supported verbs
        """
        http_method = self.methods_mapping[method.lower()]
        # Authenticate with the credentials given to the constructor rather
        # than with module-level globals, so the object is self-contained.
        return http_method(f'{self.endpoint}/{handle}',
                           auth=HTTPBasicAuth(self.username, self.password),
                           headers=self.header, json=json,
                           verify=self.path_to_ca_certificates)

In [None]:
# !!! CUSTOMIZE THIS SECTION WITH YOUR CREDENTIALS !!!
# Connection values are read from environment variables when they are set, so
# real secrets never need to be saved inside the notebook. The literals below
# are only fallbacks used when the corresponding variable is absent.

# credentials and global variables
USER = os.environ.get('ES_USER', 'elastic')
PWD = os.environ.get('ES_PWD', 'YOUR_PWD_HERE')
index_name = 'homedepot'
ES_ENDPOINT = os.environ.get('ES_ENDPOINT', 'https://localhost:9200')
path_to_ca_certificates = os.environ.get(
    'ES_CA_CERTS',
    '/PATH/TO/CERT/elasticsearch-8.5.3/config/certs/http_ca.crt')
# Shared HTTP wrapper used by the cells that talk to Elasticsearch.
e = Elastic(ES_ENDPOINT, USER, PWD, path_to_ca_certificates)

### Step 1. Data preparation, index population

In [None]:
# Load the two input tables from the working directory: the query file
# (later used as the relevance judgement set) and the product catalogue.
df_queries, df_products = (
    pd.read_csv(csv_name) for csv_name in ('queries.csv', 'products.csv')
)

In [None]:
# Preview the first five rows of the query table.
df_queries.head(n=5)

In [None]:
# transform dataframe into json format
# The re-indexed frame gets its own name instead of overwriting `df_products`:
# once 'product_uid' has become the index it is no longer a column, so
# re-running an overwriting version of this cell would raise a KeyError.
products_by_uid = df_products.set_index('product_uid')
# One dict per product (the document body) ...
docs = products_by_uid.to_dict(orient='records')
# ... and the matching product_uid values, in the same order (document ids).
doc_ids = products_by_uid.index
print(doc_ids[0])
print(docs[0])

In [None]:
# create an index
# The request body is assembled from named parts; the resulting JSON is the
# same as a single nested literal.

# Both product fields are indexed as full-text.
index_mappings = {
    "properties": {
        "product_description": {"type": "text"},
        "product_title": {"type": "text"},
    }
}

# Default similarity for the index: BM25 with explicit b / k1 parameters.
bm25_similarity = {"default": {"type": "BM25", "b": 0.75, "k1": 1.2}}

# NOTE: "std_english" is declared here but no field in `index_mappings`
# references it via an "analyzer" entry.
custom_analyzers = {
    "std_english": {"type": "standard", "stopwords": "_english_"}
}

index_settings = {
    "number_of_shards": 2,
    "number_of_replicas": 2,
    "index": {"similarity": bm25_similarity},
    "analysis": {"analyzer": custom_analyzers},
}

create_index_json = {"mappings": index_mappings, "settings": index_settings}

e.curl('put', index_name, json=create_index_json).json()

In [None]:
# bulk indexing
es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))
# One bulk action per product: the product_uid is the document id and the
# remaining columns form the document body.
actions = [
    {"_index": index_name, "_id": doc_id, "_source": doc}
    for doc_id, doc in zip(doc_ids, docs)
]
# deactivate refresh in preparation of data indexing
r = e.curl('put', f'{index_name}/_settings', {'index' : {'refresh_interval' : -1}})
try:
    # send actions in bulk (the API takes care of chunking them optimally)
    bulk(es, actions)
finally:
    # re-activate refresh even when indexing fails, otherwise the index would
    # be left with refresh disabled
    r = e.curl('put', f'{index_name}/_settings', {'index' : {'refresh_interval' : '3s'}})
# Force one refresh so the freshly indexed documents are visible to the
# count / search requests issued right after this cell.
e.curl('post', f'{index_name}/_refresh')

In [None]:
# Check whether all products have been indexed: request the document count
# of the index and display the decoded JSON answer.
count_response = e.curl('get', f'{index_name}/_count')
count_response.json()

In [None]:
# Fetch one example document by id and display it.
example_doc_id = 100001
example_doc_response = e.curl('get', f'{index_name}/_doc/{example_doc_id}')
example_doc_response.json()

### Learning to rank

#### Step 2. Create the feature store

In [None]:
# Initialise the LTR feature store (PUT _ltr) and display the answer.
# To reset everything, run instead:
#   e.curl('delete', '_ltr').json()
feature_store_response = e.curl('put', '_ltr')
feature_store_response.json()

#### Step 3. Create a feature set

In [None]:
# Feature set with two features. Each feature is a mustache-templated `match`
# query that receives the user's search terms through the "keywords" parameter.
create_feature_set_json = {
    "featureset": {
        "features": [
            {
                # match of the keywords against the product title
                "name": "f_product_title",
                "params": ["keywords"],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "product_title": "{{keywords}}"
                    }
                }
            },
            {
                # match of the keywords against the product description
                "name": "f_product_description",
                "params": ["keywords"],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "product_description": "{{keywords}}"
                    }
                }
            }
        ]
    },
    # Validation section: sample parameters and the index to validate against.
    "validation": {
        "params": {
            "keywords": "circular saw"
        },
        # use the configured index rather than a second hard-coded copy of its name
        "index": index_name
    }
}

e.curl('post', '_ltr/_featureset/depot_textual_features', create_feature_set_json).json()

In [None]:
# List the feature sets currently stored and display the decoded answer.
featureset_response = e.curl('get', '_ltr/_featureset')
featureset_response.json()

#### Step 4. Query the index to get features for query-doc pairs 

In [None]:
# reformat the judgement set
df_judgements = df_queries.set_index(['qid'])
judgements = df_judgements.to_dict(orient='records')
qid_list = df_judgements.index

# this will be a dictionary-like object structured as follows:
# (qid,query)->docid->{'relevance':relevance}
l = {}

for qid, info in zip(qid_list, judgements):
    # setdefault creates the per-query dict on first sight of the key; this
    # replaces a bare try/except that would also have hidden unrelated errors
    docs_for_query = l.setdefault((qid, info['search_term']), {})
    docs_for_query[info['product_uid']] = {'relevance': info['relevance']}
print(len(l))
# show the judged documents of one example query
l[(0, '$ hole saw')]

In [None]:
# Query template used to log feature values: the first filter restricts the
# search to a list of document ids, the second (`sltr`) evaluates the feature
# set for the given keywords, and `ext.ltr_log` asks for the values to be
# returned with every hit. The two None placeholders are filled per query.
q_json = {
    "query": {
        "bool": {
            "filter": [
                {
                    "terms": {
                        "_id": None
                    }
                },
                {
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "depot_textual_features",
                        "params": {
                            "keywords": None
                        }
                    }
                }
            ]
        }
    },
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    },
    "size":10000
}

# Direct handles on the two template slots that change on every iteration.
id_terms = q_json["query"]["bool"]["filter"][0]["terms"]
sltr_params = q_json["query"]["bool"]["filter"][1]["sltr"]["params"]

# loops over all queries (takes several minutes). can you do the same in bulk?
for (qid, query), judged_docs in l.items():
    #set the list of product ids and the query in the query template
    id_terms["_id"] = list(judged_docs)
    sltr_params["keywords"] = query
    #send request to ES to get feature values for the given query,product pairs
    res = e.curl('post', f'{index_name}/_search', q_json).json()
    # Surface the error body instead of failing later with an opaque KeyError
    # on 'hits'.
    if 'error' in res:
        raise RuntimeError(
            f'Feature logging failed for qid={qid}, query={query!r}: {res["error"]}')
    #parse response
    for hit in res['hits']['hits']: #for each hit
        docid = int(hit['_id']) # get the document id
        fields = hit['fields']['_ltrlog'][0]['log_entry'] # get the fields with feature vals
        for field in fields:
            # when no value is reported, no query-text match has been found: use 0
            judged_docs[docid][field['name']] = field.get('value', 0)

# Show the last processed query as an example (nothing to show if `l` is empty).
if l:
    print(f'Example query: id={qid}, terms={query}')
    print('Corresponding documents with their relevance and features:')
    print(l[(qid,query)])

In [None]:
def to_ranklib_format(d, feats):
    """
    Helper function to create a Ranklib-formatted dataframe
    e.g., 4   qid:1   1:9.510193  2:10.7808075
    @param d: a dictionary object formatted as:
           (qid, query) -> docid -> {'relevance':relevance_score, feature_name:feature_value}
    @param feats: the names of the features, in the intended order; the
           position in this sequence (starting at 1) becomes the feature index
    @return a dataframe with one row per (query, document) pair and the columns
            ['relevance', 'qid'] + feats, formatted according to Ranklib format
    """
    rows = []
    for (qid, _query), docs_for_query in d.items():
        for doc_info in docs_for_query.values():
            row = [doc_info['relevance'], f'qid:{qid}']
            # A feature that was never logged for this document is written as
            # 0, so every row has the same number of columns instead of the
            # feature being silently dropped.
            row.extend(
                f'{feat_idx}:{doc_info.get(feat, 0)}'
                for feat_idx, feat in enumerate(feats, start=1)
            )
            rows.append(row)
    header = ['relevance', 'qid'] + list(feats)
    return pd.DataFrame(rows, columns=header)

# Feature order determines the feature indices written to the file.
feat_names = ['f_product_title', 'f_product_description']
df_ranklib = to_ranklib_format(l, feat_names)
# Write the training file: tab-separated, without header row or index column.
df_ranklib.to_csv('homedepot_ranklib.tsv', sep='\t', index=False, header=False)
print(df_ranklib.shape[0])
df_ranklib.head(n=5)

#### Step 5. Train a LTR model

See the RankLib documentation for the available options: https://sourceforge.net/p/lemur/wiki/browse_pages/

Available rankers (values for RankLib's `-ranker` option; the training cell below uses 6, LambdaMART):

0: MART (gradient boosted regression tree)

1: RankNet

2: RankBoost

3: AdaRank

4: Coordinate Ascent

6: LambdaMART

7: ListNet

8: Random Forests

In [None]:
feature_file = 'homedepot_ranklib.tsv'
model_file = 'homedepot_ndgc_model.txt'
# Command as an argument list (no shell involved, so no quoting issues):
# train ranker 6 on the feature file, optimise NDCG@5, save the model.
cmd = [
    'java', '-jar', 'RankLib-2.18.jar',
    '-train', feature_file,
    '-ranker', '6',
    '-metric2t', 'NDCG@5',
    '-save', model_file,
]
print('Running RankLib')
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# training run is not reported as 'Finished'.
subprocess.run(cmd, check=True)
print('Finished')

#### Step 6. Upload the model to ES

In [None]:
# Read the model file written by the training step as text.
with open(model_file, 'rt') as model_fh:
    model_specification = model_fh.read()

# Wrap the model text in the payload of the _createmodel endpoint: the model
# is stored under the name "ndcg_model" and declared as a ranklib model.
ranklib_model = {
    "type": "model/ranklib",
    "definition": model_specification,
}
model_json = {"model": {"name": "ndcg_model", "model": ranklib_model}}

e.curl('post', '_ltr/_featureset/depot_textual_features/_createmodel', model_json).json()

#### Step 7. Use model to rerank query results

In [None]:
#standard query, results ranked with BM25
query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"product_title": "circular saw"}},
                {"match": {"product_description": "circular saw"}}
            ]
        }
    }
}

res = dict(e.curl('post', 'homedepot/_search', query).json())
docids_ranked_lambdamart = [hit['_id'] for hit in res['hits']['hits']]

In [None]:
# same query, results re-ranked with LTR 
query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"product_title": "circular saw"}},
                {"match": {"product_description": "circular saw"}}
            ]
        }
    },
    "rescore": {
        "window_size": 50,
        "query": {
            "rescore_query": {
                "sltr": {
                    "params": {
                        "keywords": "circular saw"
                    },
                    "model": "ndcg_model"
                }
            }
        }
    }
}

res = dict(e.curl('post', 'homedepot/_search', query).json())
#print(res)
docids_ranked_bm25 = [hit['_id'] for hit in res['hits']['hits']]

In [None]:
print('Standard BM25 ranking:')
print(docids_ranked_bm25)
print('LambdaMART re-ranking:')
print(docids_ranked_lambdamart)