## Using ElasticSearch to index the news dataset and implementing a Pseudo-Relevance Feedback based on Rocchio's Algorithm

In [1]:
import pandas as pd
import json
import requests
import numpy as np
from requests.auth import HTTPBasicAuth
from elasticsearch import Elasticsearch
import elasticsearch
from elasticsearch.helpers import bulk
import math

print(elasticsearch.__version__)

(8, 12, 0)


In [None]:
# !!! CUSTOMIZE THIS SECTION WITH YOUR CREDENTIALS !!!

USER = 'elastic'
PWD = 'lke-UQaMH*fci8A3xjo+'
index_name = 'news'
ES_ENDPOINT = 'https://localhost:9200'

path_to_ca_certificates = 'C:/Users/user/pathtothefolder/elasticsearch-8.12.0/config/certs/http_ca.crt'

### Read data

In [3]:
df_dup = pd.read_csv('../data/news/news.csv', index_col = False)
df_dup

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


### Data Processing

In [4]:
duplicates = df_dup.duplicated().sum()
print("Number of duplicate rows:", duplicates)

Number of duplicate rows: 13


In [5]:
df = df_dup.drop_duplicates()

In [6]:
missing_values = df.isnull().sum()
print(missing_values)

link                     0
headline                 6
category                 0
short_description    19712
authors              37405
date                     0
dtype: int64


In [7]:
# Replace various representations of empty values with None
df.replace(['', ' ', '-', np.nan], None, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(['', ' ', '-', np.nan], None, inplace=True)


In [8]:
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [8]:
# Creating a new column id to index by that
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df.index.name = "id"
df = df.set_index('id')
#df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'index': 'id'}, inplace=True)


In [9]:
# transform the dataframe into json format
docs = df.to_dict(orient='records')
doc_ids = df.index
print(doc_ids[1])
print(docs[1])

1
{'link': 'https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe', 'headline': 'American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video', 'category': 'U.S. NEWS', 'short_description': "He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.", 'authors': 'Mary Papenfuss', 'date': '2022-09-23'}


### Indexing with python cURL (Requests)

In [10]:
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, path_to_ca_certificates):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username 
        @param password: the Elasticsearch password
        """
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        #self.header={'Content-Type': '--data-binary application/x-ndjson'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.path_to_ca_certificates = path_to_ca_certificates
        self.methods_mapping = {'get': requests.get, 
                                'put':requests.put, 
                                'post':requests.post, 
                                'delete':requests.delete}
        
    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete'
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        """
        http_method = self.methods_mapping[method.lower()]
        r = http_method(f'{self.endpoint}/{handle}', auth=HTTPBasicAuth(USER, PWD), 
                        headers=self.header, json=json,
                        verify = self.path_to_ca_certificates)
        return r

In [11]:
e = Elastic(ES_ENDPOINT, USER, PWD, path_to_ca_certificates)

In [12]:
# delete the index
r = e.curl('delete', 'news')
r.json()

{'acknowledged': True}

In [13]:
create_index_json={
# Point #4 By putting the type text to beer_name and review_text we allow for full-text query
    # while for beer_style we put keyword to omake it useful for filtering creating a DocValue
  "mappings" : {
      "properties" : {
        "link" : {
          "type" : "text" 
        },
        "headline" : {
          "type" : "text"
        },
        "category" : {
          "type" : "keyword"
        },
        "short_description" : {
          "type" : "text"
        },
        "authors" : {
          "type" : "text"
        },
        "date" : {
          "type" : "text"
        }
      }
  },
  "settings": {
    "refresh_interval": "-1", 
    "number_of_shards": 1, 
    "number_of_replicas": 0, 
    "index.max_result_window": 20000,
    "index" : {
        "similarity" : {
          "default" : {
            "type" : "BM25", "b": 0.75, "k1": 1.2 
          }
        }
    },
    "analysis": {
      "analyzer": {
        "std_english": {"type": "standard", "stopwords": "_english_" }
      }
    }
  }
}

# create an index
r = e.curl('put', index_name, json=create_index_json)
r.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'news'}

In [14]:
# get the index details and settings
r = e.curl('get', index_name)
r.json()

{'news': {'aliases': {},
  'mappings': {'properties': {'authors': {'type': 'text'},
    'category': {'type': 'keyword'},
    'date': {'type': 'text'},
    'headline': {'type': 'text'},
    'link': {'type': 'text'},
    'short_description': {'type': 'text'}}},
  'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'refresh_interval': '-1',
    'number_of_shards': '1',
    'provided_name': 'news',
    'similarity': {'default': {'type': 'BM25', 'b': '0.75', 'k1': '1.2'}},
    'max_result_window': '20000',
    'creation_date': '1716652012861',
    'analysis': {'analyzer': {'std_english': {'type': 'standard',
       'stopwords': '_english_'}}},
    'number_of_replicas': '0',
    'uuid': 'YUlVIEcETAWLjSA_SdqJ-w',
    'version': {'created': '8500008'}}}}}

In [15]:
# bulk indexing (via official API)

#connect to the local elasticsearch node and authenticate
es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))

actions = [
  {
    "_index": index_name,
    "_id": doc_id,
    "_source": doc
  }
  for doc_id, doc in list(zip(doc_ids, docs))
]

# send actions in bulk (the API takes care of chunking them optimally)
bulk(es, actions)

(209514, [])

In [16]:
# reset the refresh interval to 2 seconds
r = e.curl('put', 'news/_settings', {'index' : {'refresh_interval' : '2s'}})
r.json()

{'acknowledged': True}

In [17]:
# reset the refresh interval to -1 seconds
r = e.curl('put', 'news/_settings', {'index' : {'refresh_interval' : '-1'}})
r.json()

{'acknowledged': True}

In [18]:
# Checking
r = e.curl('get', f'news/_doc/{doc_ids[42]}')
r.json()

{'_index': 'news',
 '_id': '42',
 '_version': 1,
 '_seq_no': 42,
 '_primary_term': 1,
 'found': True,
 '_source': {'link': 'https://www.huffpost.com/entry/bc-us-water-woes-mississippi-racism_n_6324b8c1e4b046aa023f398e',
  'headline': 'Racism Seen As Root Of Water Crisis In Mississippi Capital',
  'category': 'U.S. NEWS',
  'short_description': "A boil-water advisory has been lifted for Mississippi's capital, and the state will stop handing out free bottled water on Saturday.",
  'authors': 'DREW COSTLEY, AP',
  'date': '2022-09-16'}}

### Relevance Feedback

#### Based on Rocchio's Algorithm

In [19]:
def retrieve_top_k_documents(initial_query, K):
    """
    Retrieve the top K relevant documents based on the initial query.
    """
    # Execute the search query
    r1 = e.curl('get', f'news/_search', {
        "query": {
            "match": {
                "short_description": {
                    "query": initial_query                
    }}}})
    response = r1.json()
    hits = response.get('hits', {}).get('hits', [])
    top_k_documents = hits[:K]
    
    return top_k_documents

In [20]:
def get_term_vectors_for_document(doc_id):
    """
    Retrieve the term vectors for given document
    """       
    doc_idd = doc_id['_id']

    r = e.curl('get', f'news/_termvectors/{doc_idd}', {
        "fields": ["short_description"],
        "term_statistics": True,
        "field_statistics": True
    })
        
    term_vectors = r.json().get('term_vectors', {})
    #print(term_vectors) # debugging o.o
    doc_count = term_vectors.get('short_description', {}).get('field_statistics', {}).get('doc_count')    
    #print(doc_count)
    #print("Processing document:", doc_id)
    return term_vectors, doc_count

In [21]:
def calculate_top_m_terms(top_k_documents, m):
    """
    Extract terms and calculate TF-IDF for each term on the top K documents 
    """   
    terms_tfidf_top_k = {}
    num_processed_top_k = 0
    for doc_id in top_k_documents:
        num_processed_top_k += 1
        term_vectors, total_documents = get_term_vectors_for_document(doc_id)
        for field, field_info in term_vectors.items():
            for term, term_info in field_info.get('terms', {}).items():
                tf = 1 + math.log(term_info.get('term_freq', 0))
                df = term_info.get('doc_freq', 0)
                idf = math.log(total_documents / df)
                tfidf = tf * idf
                if term not in terms_tfidf_top_k:
                    terms_tfidf_top_k[term] = tfidf
                else:
                    terms_tfidf_top_k[term] += tfidf
        
    top_m_terms = dict(sorted(terms_tfidf_top_k.items(), key=lambda item: item[1], reverse=True)[:m])

    return top_m_terms

In [22]:
def extended_query(initial_query, top_m_terms):
    """
    Submit extended query and retrieve results.
    """
    
    initial_terms = set(initial_query.split())
    extended_terms = set(top_m_terms.keys())
    combined_terms = extended_terms.union(initial_terms)
    
    extended_query = " ".join(combined_terms)
    
    # Execute the search query
    r = e.curl('get', f'news/_search', {
        "query": {
            "match": {
                "short_description": {
                    "query": extended_query
    }}}})

    
    extended_results = r.json()

    print("Extended query short_desription results:")
    for hit in extended_results.get('hits', {}).get('hits', []):
        print(hit['_source'].get('short_description', 'No description available'))

    return extended_query

In [27]:
def pseudo_relevance_feedback(initial_query, K, m):
    top_k_documents = retrieve_top_k_documents(initial_query, K)
    top_m = calculate_top_m_terms(top_k_documents, m)
    ex_query = extended_query(initial_query, top_m)
    
    return ex_query

## Example

In [28]:
initial_query = "plane crash"
K = 3
m = 5
expanded_q = pseudo_relevance_feedback(initial_query,K,m)
print("\nExpanded query:",expanded_q)

Extended query short_desription results:
The plane crash killed all 176 people on board, mostly Iranians and Iranian-Canadians.
Iran released a new preliminary report on the crash of the Ukraine International Airways plane, which killed 176 people.
Ukrainian investigators are also assisting in the probe of the disaster that killed 176 people.
Four officers were killed in the crash.
They know what to do during a plane crash. "A number of crash studies focusing on both survivors and staged experiments have
Authorities say the crash killed at least 27 and injured 50.
The family sustained serious injuries in the crash, which killed one person.
A commuter train crash in New Jersey killed one and injured 108 Thursday.
The crash killed 71 people last month, including most of Brazil's Chapecoense soccer team.
This week's Germanwings Airbus crash -- and recent reports that the co-pilot may have crashed the plane deliberately -- have

Expanded query: iranians crash killed plane 176


### Non Expanded Query

In [30]:
""" 
execute full-text query for "plane crash" in short_description field
"""
r1 = e.curl('get', f'news/_search',{
 "query": {
 "match": {
 "short_description": {
 "query": initial_query
}}}})

r1.json()

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 392, 'relation': 'eq'},
  'max_score': 16.316483,
  'hits': [{'_index': 'news',
    '_id': '133956',
    '_score': 16.316483,
    '_source': {'link': 'https://www.huffingtonpost.com/entry/master-traveler-tips_us_5b9dfa8ee4b03a1dcc8fcb0e',
     'headline': '16 Things Master Travelers Do Differently',
     'category': 'TRAVEL',
     'short_description': 'They know what to do during a plane crash. "A number of crash studies focusing on both survivors and staged experiments have',
     'authors': 'Suzy Strutner',
     'date': '2014-04-15'}},
   {'_index': 'news',
    '_id': '5462',
    '_score': 16.089506,
    '_source': {'link': 'https://www.huffpost.com/entry/iran-protests-plane-shot-down_n_5e1b1a07c5b6640ec3d5df2f',
     'headline': 'Iranians Defy Police, Protest Over Ukranian Plane Shootdown',
     'category': 'WORLD NEWS',
     'short_description': 'The p

### Expanded Query

In [29]:
r1 = e.curl('get', f'news/_search',{
 "query": {
 "match": {
 "short_description": {
 "query": expanded_q
}}}})

r1.json()

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1152, 'relation': 'eq'},
  'max_score': 45.38703,
  'hits': [{'_index': 'news',
    '_id': '5462',
    '_score': 45.38703,
    '_source': {'link': 'https://www.huffpost.com/entry/iran-protests-plane-shot-down_n_5e1b1a07c5b6640ec3d5df2f',
     'headline': 'Iranians Defy Police, Protest Over Ukranian Plane Shootdown',
     'category': 'WORLD NEWS',
     'short_description': 'The plane crash killed all 176 people on board, mostly Iranians and Iranian-Canadians.',
     'authors': 'Joseph Krauss and Jon Gambrell, AP',
     'date': '2020-01-12'}},
   {'_index': 'news',
    '_id': '5412',
    '_score': 31.197323,
    '_source': {'link': 'https://www.huffpost.com/entry/iran-acknowledges-2-russian-made-missiles-targeted-ukraine-jet_n_5e26dd98c5b6321176197a88',
     'headline': 'Iran Acknowledges 2 Russian-Made Missiles Targeted Ukraine Jet',
     'category': 'WORLD

As we can see from the expanded query, it makes the queries and results more focused in one topic but it is limited and dependent on the terms that are extracted from the top documents that are assumed as relevant. So if we don't have relevant results in the top k the expanded query might do bad as well.

### Another example with more printing statements :)

In [78]:
second_query = "best lasagna"
K = 3  
top_k_documents = retrieve_top_k_documents(second_query, K)

for i, doc in enumerate(top_k_documents, 1):
    print(f"Document {i}: {doc['_source']['short_description']}")

Document 1: If you're someone who watches what you eat, you probably don't get down with a lot of lasagna. Lasagna is typically one big
Document 2: A trio lasagna pan is a real game-changer
Document 3: Ultra-comforting, and packed with layers of meat sauce, cheese, and tender noodles, Olive Garden's Lasagna Classico is one


In [79]:
topm= calculate_top_m_terms(top_k_documents, 8)
topm

{'lasagna': 35.708256782092484,
 'classico': 12.153694552434894,
 "garden's": 12.153694552434894,
 'noodles': 9.445644351332685,
 'watches': 8.821490042259692,
 'changer': 8.821490042259692,
 'trio': 8.516108392708508,
 'olive': 8.093251541888476}

In [80]:
expand_q = pseudo_relevance_feedback(second_query,K,5)
print("\nExpanded query:",expand_q)

Extended query short_desription results:
Ultra-comforting, and packed with layers of meat sauce, cheese, and tender noodles, Olive Garden's Lasagna Classico is one
If you're someone who watches what you eat, you probably don't get down with a lot of lasagna. Lasagna is typically one big
Dry noodles everywhere!
*watches them on repeat*
Do you know who made your noodles? 🍜
A trio lasagna pan is a real game-changer
A fried noodles recipe that's perfect for a quick meal.
"The Clinton campaign watches 'Morning Joe' every morning."
You'll never look at Kim Kardashian eating noodles the same again.
Chef Bill Kim makes smoky ribs, fried ramen noodles and more.

Expanded query: garden's best noodles watches lasagna classico


In [81]:
r1 = e.curl('get', f'news/_search',{
 "query": {
 "match": {
 "short_description": {
 "query": second_query
}}}})

r1.json()

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3498, 'relation': 'eq'},
  'max_score': 13.047355,
  'hits': [{'_index': 'news',
    '_id': '188452',
    '_score': 13.047355,
    '_source': {'link': 'https://www.huffingtonpost.com/entry/healthy-lasagna-recipes_us_5b9c4250e4b03a1dcc7d69a9',
     'headline': 'Healthy Lasagna Recipes (PHOTOS)',
     'category': 'FOOD & DRINK',
     'short_description': "If you're someone who watches what you eat, you probably don't get down with a lot of lasagna. Lasagna is typically one big",
     'authors': None,
     'date': '2012-09-11'}},
   {'_index': 'news',
    '_id': '2064',
    '_score': 12.6796465,
    '_source': {'link': 'https://www.huffpost.com/entry/kitchen-products-that-dont-cost-a-lot_l_6130ea72e4b0eab0ad955e2d',
     'headline': '29 Inexpensive Kitchen Products That Will Be Used All The Time',
     'category': 'FOOD & DRINK',
     'short_description': 'A 

In [82]:
r1 = e.curl('get', f'news/_search',{
 "query": {
 "match": {
 "short_description": {
 "query": expand_q
}}}})

r1.json()

{'took': 11,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3539, 'relation': 'eq'},
  'max_score': 44.943886,
  'hits': [{'_index': 'news',
    '_id': '162723',
    '_score': 44.943886,
    '_source': {'link': 'https://www.huffingtonpost.com/entry/olive-garden-lasagna_us_5b9d3fe8e4b03a1dcc85f0ff',
     'headline': 'Get The Dish: Olive Garden Lasagna Classico',
     'category': 'FOOD & DRINK',
     'short_description': "Ultra-comforting, and packed with layers of meat sauce, cheese, and tender noodles, Olive Garden's Lasagna Classico is one",
     'authors': 'PopSugar Food, PopSugar Food',
     'date': '2013-06-12'}},
   {'_index': 'news',
    '_id': '188452',
    '_score': 21.6675,
    '_source': {'link': 'https://www.huffingtonpost.com/entry/healthy-lasagna-recipes_us_5b9c4250e4b03a1dcc7d69a9',
     'headline': 'Healthy Lasagna Recipes (PHOTOS)',
     'category': 'FOOD & DRINK',
     'short_description': "If you