In [22]:
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer
from datetime import datetime
import json
import re


In [23]:
INDEX_NAME = "smart_docs"
es = Elasticsearch("http://localhost:9200")

In [None]:
mapping = {
    "settings": {
        "index": {
            "max_ngram_diff": 5
        },
        "analysis": {
            "char_filter": {
                "html_strip": {"type": "html_strip"}
            },
            "filter": {
                "length_filter": {"type": "length", "min": 3}
            },
            "tokenizer": {
                "autocomplete_infix_tokenizer": {
                    "type": "ngram",
                    "min_gram": 3,
                    "max_gram": 8,
                    "token_chars": ["letter", "digit"]
                }
            },
            "analyzer": {
                "autocomplete_infix": {
                    "type": "custom",
                    "tokenizer": "autocomplete_infix_tokenizer",
                    "filter": ["lowercase"]
                },
                "autocomplete_infix_search": {
                    "type": "custom",
                    "tokenizer": "lowercase"
                },
                "content_analyzer": {
                    "type": "custom",
                    "char_filter": ["html_strip"],
                    "tokenizer": "standard",
                    "filter": ["lowercase", "stop", "length_filter", "porter_stem"]
                }
            }
        }
    },

    "mappings": {
        "properties": {
            # title for autocomplete
            "title": {
                "type": "text",
                "analyzer": "autocomplete_infix",
                "search_analyzer": "autocomplete_infix_search",
                "fields": {
                    "keyword": {"type": "keyword"}
                }
            },
            
            # semantic search vector
            "title_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            
            # content with proper analysis
            "content": {
                "type": "text",
                "analyzer": "content_analyzer",
                "fields": {
                    "keyword": {"type": "keyword"}
                }
            },
            
            # authors as nested objects
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {"type": "text"},
                    "last_name": {"type": "text"},
                    "email": {"type": "keyword"}
                }
            },
            
            # proper date field
            "date": {
                "type": "date",
                "format": "dd-MMM-yyyy HH:mm:ss.SS||strict_date_optional_time||epoch_millis"
            },
            
            # main geopoint for the document
            "geopoint": {
                "type": "geo_point"
            },
            
            # temporal expressions extracted
            "temporalExpressions": {
                "type": "keyword"
            },
            
            # georeferences extracted
            "georeferences": {
                "type": "keyword"
            },
            
            # original places from reuters data
            "places": {
                "type": "keyword"
            },
            
            # additional geopoints for multiple locations
            "geopoints": {
                "type": "nested",
                "properties": {
                    "place": {"type": "keyword"},
                    "location": {"type": "geo_point"}
                }
            },
            
            # metadata fields
            "dateline": {"type": "text"},
            "topics": {"type": "keyword"},
            "people": {"type": "keyword"},
            "orgs": {"type": "keyword"},
            "exchanges": {"type": "keyword"},
            "companies": {"type": "keyword"}
        }
    }
}

In [25]:
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
es.indices.create(index=INDEX_NAME, body=mapping)

model = SentenceTransformer('all-MiniLM-L6-v2')     

In [30]:
file_path = r"C:\Users\asus\Desktop\NewsRetrival\smart-news-retrieval-\output\all_reuters_parsed.json"

with open(file_path, "r", encoding="utf-8") as f:
    documents = json.load(f)
actions = []

for doc in documents:
    title = doc.get("title") or ""  # never None
    content = doc.get("content") or ""
    author_raw = doc.get("author_raw") or ""
    dateline_raw = doc.get("dateline_raw") or ""
    date_raw = doc.get("date_raw") or ""

    places = doc.get("places") or []
    temporal = doc.get("temporalExpressions") or []
    georefs = doc.get("georeferences") or []

    geopoints = []
    for g in doc.get("geopoints") or []:
        if g:
            lat = g.get("lat")
            lon = g.get("lon")
            place = g.get("place")
            if lat is not None and lon is not None:
                geopoints.append({
                    "place": place,
                    "location": {"lat": lat, "lon": lon}
                })

    title_vector = model.encode(title).tolist()  # safe now


    es_doc = {
        "_index": INDEX_NAME,
        "_source": {
            "title": title,
            "title_vector": title_vector,
            "content": content,
            "author_raw": author_raw,
            "dateline_raw": dateline_raw,
            "date_raw": date_raw,
            "places": places,
            "temporalExpressions": temporal,
            "georeferences": georefs,
            "geopoints": geopoints
        }
    }
    actions.append(es_doc)


In [None]:

helpers.bulk(es, actions)
print(f"Indexed {len(actions)} documents successfully.")

def hybrid_autocomplete(query, top_k=10):

    lexical_query = {
        "multi_match": {
            "query": query,
            "fields": ["title^3", "content"],
            "type": "bool_prefix"
        }
    }

    query_vector = model.encode(query).tolist()

    semantic_query = {
        "knn": {
            "field": "title_vector",
            "query_vector": query_vector,
            "k": top_k,
            "num_candidates": 50
        }
    }

    response = es.search(
        index=INDEX_NAME,
        body={
            "size": top_k,
            "query": {
                "bool": {
                    "should": [
                        lexical_query,
                        {"constant_score": {"filter": semantic_query}}
                    ]
                }
            }
        }
    )

    return [hit["_source"]["title"] for hit in response["hits"]["hits"]]

# Test
print(hybrid_autocomplete("index removal"))


Indexed 21578 documents successfully.
['PLACER <PLC> TO INCREASE STAKE IN EQUITY SILVER', 'REUTERS LAUNCHES TWO NEW SERVICES IN EUROPE', 'FRENCH STOCK BILL TO BE PRESENTED BEFORE JUNE', 'INDUSTRIAL EQUITY TO MAKE PROPOSAL TO CALMAT<CZM>', 'LONDON OPTIONS MARKET SEES CONTINUED HIGH GROWTH', "MERRILL LYNCH <MER> DISPUTES MOODY'S DOWNGRADE", 'SAFEGUARD SCIENTIFIC <SFE> IN EQUITY DEAL', 'CORADIAN <CDIN.O> STAKE ACQUIRED BY SAGE', 'CALMAT <CZM> SUES INDUSTRIAL EQUITY', 'PACIFIC STOCK EXCHANGE RESUMES SOME OPTIONS']
