In [7]:
# Import necessary libraries
from elasticsearch import Elasticsearch
import logging
from elasticsearch.exceptions import NotFoundError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ElasticsearchHook:
    def __init__(self, hosts=None):
        self.es = Elasticsearch(hosts or ["http://localhost:9200"])
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initialized ElasticsearchHook with hosts: %s", hosts)

    def remove_fields(self, index, fields):
        """Remove specified fields from documents in the given index."""
        # Construct the script to remove a specific field
        script = {
            "source": "ctx._source.remove(params.field)",
            "params": {"field": fields}
        }
        
        # Query to select documents that contain the 'nlp_processed' field
        query = {
            "bool": {
                "must": [
                    {
                        "exists": {
                            "field": "nlp_processed"
                        }
                    }
                ]
            }
        }
        
        # Body for the update_by_query request
        body = {
            "script": script,
            "query": query
        }
        
        try:
            response = self.es.update_by_query(index=index, body=body)
            self.logger.info(f"Removed fields {fields} from index {index}. Response: {response}")
            return response
        except Exception as e:
            self.logger.error(
                "Failed to remove fields from index: %s. Error: %s",
                index,
                e,
            )
            raise

# Initialize ElasticsearchHook
es_hook = ElasticsearchHook()

# Define fields to remove
fields_to_remove = ["nlp_processed", "sentiment", "entities"]

# Remove fields from the index
index_name = "rss_feeds"
for field in fields_to_remove:
    try:
        es_hook.remove_fields(index=index_name, fields=field)
    except NotFoundError:
        logger.warning(f"Index {index_name} not found. Skipping removal of field {field}.")
    except Exception as e:
        logger.error(f"Error removing field {field} from index {index_name}: {e}")


INFO:__main__:Initialized ElasticsearchHook with hosts: None
INFO:elastic_transport.transport:POST http://localhost:9200/rss_feeds/_update_by_query [status:200 duration:0.783s]
  response = self.es.update_by_query(index=index, body=body)
INFO:__main__:Removed fields nlp_processed from index rss_feeds. Response: {'took': 780, 'timed_out': False, 'total': 2824, 'updated': 2824, 'deleted': 0, 'batches': 3, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []}
INFO:elastic_transport.transport:POST http://localhost:9200/rss_feeds/_update_by_query [status:409 duration:0.074s]
ERROR:__main__:Failed to remove fields from index: rss_feeds. Error: ConflictError(409, "{'took': 68, 'timed_out': False, 'total': 2824, 'updated': 0, 'deleted': 0, 'batches': 1, 'version_conflicts': 1000, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 