In [24]:
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer
from datetime import datetime
import json
import re

In [25]:
# Target index name, shared by the index-creation, bulk-load and search cells.
INDEX_NAME = "smart_docs"

# Address of the Elasticsearch node this notebook talks to.
ES_HOST = "http://localhost:9200"
es = Elasticsearch(ES_HOST)

In [26]:
# Index definition, assembled from small named pieces and combined into
# `mapping` at the bottom. `mapping` is the request body used to create the index.

# Index-time tokenizer for titles: emits every 3- to 5-character n-gram made of
# letters and digits, which is what enables infix ("contains") matching.
_infix_tokenizer = {
    "type": "ngram",
    "min_gram": 3,
    "max_gram": 5,
    "token_chars": ["letter", "digit"],
}

_analyzers = {
    # Applied to `title` when documents are indexed.
    "autocomplete_infix": {
        "type": "custom",
        "tokenizer": "autocomplete_infix_tokenizer",
        "filter": ["lowercase"],
    },
    # Applied to the user's query against `title`; no n-gramming at search time.
    # NOTE(review): the built-in `lowercase` tokenizer splits on non-letters, so
    # digits typed by the user are presumably dropped even though they are
    # indexed -- confirm this is intended.
    "autocomplete_infix_search": {
        "type": "custom",
        "tokenizer": "lowercase",
    },
    # Applied to `content`: strip HTML, lowercase, remove stop words, drop
    # tokens shorter than 3 characters, then stem.
    "content_analyzer": {
        "type": "custom",
        "char_filter": ["html_strip"],
        "tokenizer": "standard",
        "filter": ["lowercase", "stop", "length_filter", "porter_stem"],
    },
    # Declared but not referenced by any field in this mapping.
    "keyword_analyzer": {
        "type": "keyword",
    },
}

_analysis = {
    "char_filter": {"html_strip": {"type": "html_strip"}},
    "filter": {"length_filter": {"type": "length", "min": 3}},
    "tokenizer": {"autocomplete_infix_tokenizer": _infix_tokenizer},
    "analyzer": _analyzers,
}

# Fields with bespoke configuration.
_properties = {
    "title": {
        "type": "text",
        "analyzer": "autocomplete_infix",
        "search_analyzer": "autocomplete_infix_search",
    },
    # Embedding of the title; `dims` must equal the length of the stored vectors.
    "title_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine",
    },
    "content": {"type": "text", "analyzer": "content_analyzer"},
    "dateline": {"type": "text"},
    # Accepts either the custom day-month-year layout or ISO-8601 style values.
    "date": {
        "type": "date",
        "format": "dd-MMM-yyyy HH:mm:ss.SS||strict_date_optional_time",
    },
}

# Tag-like fields are all stored as exact-match keywords.
for _tag_field in ("places", "topics", "people", "orgs", "exchanges", "companies"):
    _properties[_tag_field] = {"type": "keyword"}

mapping = {
    "settings": {
        # Upper bound on (max_gram - min_gram) permitted for n-gram tokenizers.
        "index": {"max_ngram_diff": 5},
        "analysis": _analysis,
    },
    "mappings": {"properties": _properties},
}


In [27]:
# Rebuild the index from scratch on every run: drop any existing copy first so
# the settings and mapping defined above always take effect.
index_already_exists = es.indices.exists(index=INDEX_NAME)
if index_already_exists:
    es.indices.delete(index=INDEX_NAME)
es.indices.create(index=INDEX_NAME, body=mapping)

# Sentence-embedding model used for both document titles and search queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# (Abandoned draft removed.) This cell held an unfinished copy of the loader:
# it stopped inside an unclosed dict literal, so it could not run, and it read
# from a misspelled file name. The complete loader is the cell that follows,
# which reads `reuters_documents.json`.

In [30]:
def _parse_reuters_date(date_str):
    """Convert a raw document timestamp into a ``datetime``.

    The expected layout is ``%d-%b-%Y %H:%M:%S.%f`` (day, abbreviated month
    name, year, then time with fractional seconds).

    Returns ``None`` when the value is missing, is not a string, or does not
    match that layout, so that one bad date never aborts the whole load.
    """
    try:
        return datetime.strptime(date_str, '%d-%b-%Y %H:%M:%S.%f')
    except (ValueError, TypeError):
        # ValueError: wrong layout (including ""); TypeError: None / non-string.
        return None


# Read the whole corpus; the encoding is explicit so the result does not
# depend on the platform's default locale.
with open('reuters_documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# `or ""` covers both a missing "title" key and an explicit null, so the
# embedding step below never sees a non-string value.
titles = [doc.get("title") or "" for doc in documents]

# Embed all titles in a single call so the model can batch them internally,
# instead of invoking it once per document. Skipped for an empty corpus.
title_vectors = model.encode(titles) if titles else []

# One bulk "index" action per document, in the shape helpers.bulk expects.
actions = [
    {
        "_index": INDEX_NAME,
        "_source": {
            "title": title,
            "title_vector": title_vector.tolist(),
            "content": doc.get("body", ""),
            "dateline": doc.get("dateline", ""),
            "date": _parse_reuters_date(doc.get("date", "")),
            "places": doc.get("places", []),
            "topics": doc.get("topics", []),
            "people": doc.get("people", []),
            "orgs": doc.get("orgs", []),
            "exchanges": doc.get("exchanges", []),
            "companies": doc.get("companies", []),
        }
    }
    for doc, title, title_vector in zip(documents, titles, title_vectors)
]
    

In [31]:

# Ship every prepared action to Elasticsearch through the bulk helper, then
# report how many actions were submitted.
helpers.bulk(client=es, actions=actions)
print(f"Indexed {len(actions)} documents successfully.")


def hybrid_autocomplete(query, top_k=10, num_candidates=50):
    """Suggest document titles for a partially typed query.

    Combines two clauses in one ``bool``/``should`` search against
    ``INDEX_NAME``:

    * a lexical ``multi_match`` of type ``bool_prefix`` over ``title``
      (boosted x3) and ``content``;
    * a kNN clause over ``title_vector`` using the embedding of ``query``.

    Args:
        query: Text typed so far. ``None``, empty or whitespace-only input
            returns an empty list without contacting Elasticsearch.
        top_k: Maximum number of titles to return; also used as the kNN ``k``.
            Values below 1 return an empty list.
        num_candidates: Lower bound for the kNN ``num_candidates`` setting.
            It is raised to ``top_k`` when smaller, so a large ``top_k`` does
            not produce a request with fewer candidates than neighbours.

    Returns:
        A list of title strings in the order Elasticsearch ranked the hits.
        Hits without a non-empty title are skipped.
    """
    # Nothing meaningful to suggest for blank input or a non-positive limit.
    if not query or not query.strip() or top_k < 1:
        return []

    lexical_query = {
        "multi_match": {
            "query": query,
            "fields": ["title^3", "content"],
            "type": "bool_prefix"
        }
    }

    query_vector = model.encode(query).tolist()

    semantic_query = {
        "knn": {
            "field": "title_vector",
            "query_vector": query_vector,
            "k": top_k,
            # Must never be smaller than k.
            "num_candidates": max(num_candidates, top_k)
        }
    }

    response = es.search(
        index=INDEX_NAME,
        body={
            "size": top_k,
            # Only the title is read below; skip the rest of each document
            # (notably the stored embedding) in the response.
            "_source": ["title"],
            "query": {
                "bool": {
                    "should": [
                        lexical_query,
                        # NOTE(review): constant_score gives every kNN hit the
                        # same score, so vector similarity does not influence
                        # ranking among them -- confirm this is intended.
                        {"constant_score": {"filter": semantic_query}}
                    ]
                }
            }
        }
    )

    titles = (hit.get("_source", {}).get("title") for hit in response["hits"]["hits"])
    return [title for title in titles if title]


# Smoke test: run one sample query through the hybrid search and show the titles.
sample_suggestions = hybrid_autocomplete("equity options")
print(sample_suggestions)

Indexed 21578 documents successfully.
['PLACER <PLC> TO INCREASE STAKE IN EQUITY SILVER', 'REUTERS LAUNCHES TWO NEW SERVICES IN EUROPE', 'FRENCH STOCK BILL TO BE PRESENTED BEFORE JUNE', 'INDUSTRIAL EQUITY TO MAKE PROPOSAL TO CALMAT<CZM>', 'LONDON OPTIONS MARKET SEES CONTINUED HIGH GROWTH', "MERRILL LYNCH <MER> DISPUTES MOODY'S DOWNGRADE", 'SAFEGUARD SCIENTIFIC <SFE> IN EQUITY DEAL', 'CORADIAN <CDIN.O> STAKE ACQUIRED BY SAGE', 'CALMAT <CZM> SUES INDUSTRIAL EQUITY', 'PACIFIC STOCK EXCHANGE RESUMES SOME OPTIONS']
