In [1]:
import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [1]:
# Import necessary libraries
import logging
import os
import spacy
from hooks.elasticsearch_hook import ElasticsearchHook
from spacytextblob.spacytextblob import SpacyTextBlob # noqa: F401
from spacy.language import Language

# Define custom component to filter entities
@Language.component("filter_entities")
def filter_entities(doc):
    """Pipeline component that drops entities whose label is not of interest.

    Replaces ``doc.ents`` with only those entities whose ``label_`` appears in
    the allow-list below and returns the same ``doc`` object.
    """
    # "DATE" is deliberately absent from the allow-list: it was removed due to noise.
    allowed_labels = frozenset(
        (
            "PERSON",
            "ORG",
            "GPE",
            "LOC",
            "EVENT",
            "MONEY",
            "PRODUCT",
            "WORK_OF_ART",
        )
    )

    kept_spans = []
    for span in doc.ents:
        if span.label_ in allowed_labels:
            kept_spans.append(span)

    doc.ents = kept_spans
    return doc

# Load the spaCy pipeline named "en_core_web_md". Nothing is installed here;
# this cell only loads the model.
nlp = spacy.load("en_core_web_md")

# Add SpacyTextBlob (the source of the doc._.blob sentiment read further down)
# and the custom entity filter, positioned directly after it in the pipeline.
nlp.add_pipe("spacytextblob")
nlp.add_pipe("filter_entities", after="spacytextblob")

# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set ES_HOST to a local Elasticsearch URL; this unconditionally overwrites any
# value already present in the environment.
# NOTE(review): presumably ElasticsearchHook reads ES_HOST — confirm in the hook.
os.environ['ES_HOST'] = 'http://localhost:9200'

def process_articles_with_nlp():
    """Enrich new articles in the ``rss_feeds`` index with NLP results.

    Fetches articles through ``ElasticsearchHook.get_new_articles``, runs each
    article's ``_source["summary"]`` through the module-level ``nlp`` pipeline
    and writes back, via ``ElasticsearchHook.update_article``, a document with:

    * ``nlp_processed``: always ``True``
    * ``entities``: ``{"text", "label"}`` dicts for the entities left on the doc
    * ``sentiment``: the spacytextblob polarity score
    * ``vector``: the document vector as a plain list

    Errors are logged (with traceback) rather than raised: a retrieval failure
    ends the run early, while a failure on a single article is logged and the
    loop moves on to the next article.

    Returns:
        None
    """
    es = ElasticsearchHook()
    try:
        articles = es.get_new_articles(index="rss_feeds")
        logger.info("Retrieved %d new articles from Elasticsearch", len(articles))
    except Exception as e:
        # logger.exception keeps the traceback, which a bare str(e) loses.
        logger.exception("Error retrieving articles from Elasticsearch: %s", e)
        return

    for article in articles:
        try:
            doc = nlp(article["_source"]["summary"])
            entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
            # Using spacytextblob for sentiment analysis
            sentiment = doc._.blob.sentiment.polarity
            # Convert vector to list for JSON serialization
            vector = doc.vector.tolist()

            enriched_data = {
                "nlp_processed": True,
                "entities": entities,
                "sentiment": sentiment,
                "vector": vector,
            }

            es.update_article(index="rss_feeds", id=article["_id"], body=enriched_data)

            # Keep INFO output compact: the full payload contains the entire
            # document vector, so it is only emitted at DEBUG level.
            logger.info(
                "Enriched data: %d entities, sentiment=%s, vector_dim=%d",
                len(entities),
                sentiment,
                len(vector),
            )
            logger.debug("Enriched data: %s", enriched_data)
            logger.info(
                "Successfully processed and updated article: %s", article["_id"]
            )
        except Exception as e:
            # Best-effort per article: record the failure with its traceback
            # and continue with the remaining articles.
            logger.exception(
                "Error processing article ID: %s, Error: %s", article["_id"], e
            )

# Execute the function: running this cell performs one enrichment pass over the
# "rss_feeds" index; progress and errors are reported through the logger.
process_articles_with_nlp()

2024-08-30 19:33:08,334 - INFO - Initialized ElasticsearchHook with hosts: ['http://localhost:9200']
2024-08-30 19:33:08,440 - INFO - POST http://localhost:9200/rss_feeds/_search [status:200 duration:0.005s]
  result = self.es.search(index=index, body=query, size=size)
2024-08-30 19:33:08,452 - INFO - Retrieved 0 new articles from Elasticsearch
2024-08-30 19:33:08,807 - INFO - Initialized ElasticsearchHook with hosts: ['http://localhost:9200']
2024-08-30 19:33:08,813 - INFO - POST http://localhost:9200/rss_feeds/_search [status:400 duration:0.005s]
2024-08-30 19:33:08,813 - ERROR - Failed to perform vector search in index: rss_feeds. Error: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')