In [1]:
import elasticsearch_dsl as dsl
import spacy
from elasticsearch_dsl import Document, Text, Keyword, Date, Boolean, Float, Nested, DenseVector
from typing import List, Dict, Any

class Article(Document):
    """RSS feed article document stored in the ``rss_feeds`` index.

    Besides the raw feed fields (title, link, published, summary, source) the
    document carries NLP-derived fields that ``clean()`` fills in: a vector
    embedding, named entities, a sentiment score and a processed flag.
    """

    title: str = Text()
    link: str = Keyword()
    published: str = Date()
    summary: str = Text()
    source: str = Keyword()
    embedding: List[float] = DenseVector()  # set by clean() from the spaCy doc vector
    nlp_processed: bool = Boolean()  # set to True by clean() once enrichment ran
    entities: List[Dict[str, Any]] = Nested()  # one {"text", "label"} dict per spaCy entity
    sentiment: float = Float()  # polarity read from doc._.blob in clean()

    class Index:
        name = "rss_feeds"

    def clean(self):
        """Fill the NLP-derived fields when the article has no embedding yet.

        Does nothing if an embedding is already present, or if there is no
        summary text to analyse.

        Requirements that are NOT satisfied by this class itself:
        - a module-level ``nlp`` callable must exist when this method runs
          (in this notebook it is created in a later cell);
        - ``doc._.blob`` must be available, i.e. the pipeline behind ``nlp``
          needs the spacytextblob component -- NOTE(review): the ``nlp``
          loaded in this notebook does not add it; confirm before relying
          on the sentiment field.

        NOTE(review): presumably invoked by elasticsearch_dsl as part of
        validation before saving -- confirm for the library version in use.
        """
        if self.embedding:
            return

        text = self.summary
        if not text:
            # Feed entries without a summary cannot be analysed; leave the
            # NLP fields unset instead of failing inside the pipeline.
            return

        doc = nlp(text)
        self.entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
        self.sentiment = doc._.blob.sentiment.polarity  # Using spacytextblob for sentiment analysis
        self.nlp_processed = True
        self.embedding = doc.vector.tolist()

# Connect elasticsearch_dsl to the local Elasticsearch node; the call is left as
# the cell's last expression so the created client is shown as the cell output.
ES_HOSTS = ["http://localhost:9200"]
dsl.connections.create_connection(hosts=ES_HOSTS)



<Elasticsearch(['http://localhost:9200'])>

In [9]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Sentiment model, named explicitly so the pipeline is built once and does not
# fall back to an unpinned default model.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, device=-1)  # device=-1 ensures CPU usage

# Embedding model for the kNN query vector, spaCy model for the similarity check
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')  # Ensure CPU usage
nlp = spacy.load("en_core_web_md")

# Free-text query (named query_text so the builtin input() is not shadowed)
query_text = "Data careers"

# Sentiment of the query text. truncation=True lets the tokenizer cap the input
# at the model's maximum length; slicing the string would cut characters, not tokens.
sentiment_result = sentiment_pipeline(query_text, truncation=True)
if sentiment_result:
    top_label = sentiment_result[0]
    # 'score' is the confidence in the predicted label, so negate it for a
    # NEGATIVE prediction to obtain a signed polarity value.
    sentiment = -top_label['score'] if top_label['label'] == "NEGATIVE" else top_label['score']
else:
    sentiment = 0.0

# Use SentenceTransformers for the query embedding
vector = embedding_model.encode(query_text).tolist()

# Approximate nearest-neighbour search on the stored article embeddings
s = Article.search(index="rss_feeds")
s = s.knn(field="embedding", k=5, num_candidates=10, query_vector=vector)
response = s.execute()

# The query doc does not change between hits, so it is parsed once up front.
query_doc = nlp(query_text)
for hit in response:
    # Second opinion on relevance: spaCy vector similarity between the query
    # and each returned summary.
    hit_doc = nlp(hit.summary)
    similarity = query_doc.similarity(hit_doc)
    print(similarity, hit.title)
print(len(response.hits))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


0.3392058203934381 Lauren Lior-Liechtenstein's journey from the arts to medicine
0.4280747579199786 MMDC, Concentrix prop up working students
-0.01880078985642257 Piolo gagawin ang buhay nina Magalong at Marcos; handang i-give up ang lahat para sa reunion nila ni Juday
0.36797330752791274 Spotlight: the UAP's 17th Likha Awardee
0.41667625766460326 Leading the way to develop multi-skilled Filipinos
5


In [None]:
# Full-text lookup: match the query string against the article summaries and
# list the titles of the returned hits.
query = "Carlos Yulo"

summary_match = dsl.query.Match(summary=query)
s = Article.search(index="rss_feeds").query(summary_match)
response = s.execute()

for hit in response:
    print(hit.title)
    print("\n")

MORE FOR CALOY


‘Laruang de baterya’: Carlos Yulo amuses with ‘Maybe This Time’ dance trend take


‘Fake’: Nueva Ecija governor disowns viral congratulatory post for Carlos Yulo


Yulo gets house and lot from Century Properties


Caloy may payo sa kapatid


Castañeda receives recognition in Cebu


WATCH: Yulo recalls struggles


Carlos Yulo open to join 'Batang Quiapo'


Yulo inspirasyon ng mga Kabataan


Carlos Yulo ka-look alike Andres Santos meets Olympic champ, GF Chloe San Jose


