In [40]:
from eventregistry import EventRegistry, TopicPage, SourceInfoFlags, ReturnInfo
from dotenv import load_dotenv
import os
import pandas as pd
import json 
from pydantic import BaseModel
from typing import List, Optional, Union


# Load variables from a local .env file (if present) into the environment,
# read the Event Registry API key from there, and build the API client
# that the TopicPage further down is created with.
load_dotenv()
EVENT_REGISTRY_API_KEY = os.environ.get("EVENT_REGISTRY_API_KEY")
event_registry = EventRegistry(apiKey=EVENT_REGISTRY_API_KEY)

## Initialize topic

In [2]:
# URI of the topic page that is loaded via loadTopicPageFromER(uri) below
uri = "be2534a5-e15e-42d8-b939-dcba8269a85c"
# Presumably a human-readable label for that topic (not read anywhere else in the notebook).
# It is kept under its own name because `topic` is bound to the TopicPage object in the
# next cell; using one name for both a str and a TopicPage would make the notebook's
# state depend on which cell happened to run last.
topic_name = "AI-ML"

In [19]:
# Source details requested alongside each article: title, description and ranking.
source_info = SourceInfoFlags(title=True, description=True, ranking=True)
return_info = ReturnInfo(sourceInfo=source_info)

# Create the topic page on top of the API client and load its definition by URI.
# This only initializes the topic page; articles are not fetched yet.
topic = TopicPage(event_registry)
topic.loadTopicPageFromER(uri)


## Fetch articles for a page

In [25]:
# This is where the API call is actually made.
# Note: the tokens used are not proportional to the number of articles returned.
# It seems that every request costs 1 token, regardless of its size,
# so it makes sense to fetch as many articles per page as possible in one go -> max = 100
topic_page = topic.getArticles(
    page=1,
    count=100,
    returnInfo=return_info,
)

In [34]:
# The articles fetched for this page (100 of them) live under 'articles' -> 'results'.
articles = topic_page['articles']['results']

# Every item in the list is a dictionary with the same keys,
# so the list converts directly into a pandas DataFrame.
df = pd.DataFrame(data=articles)
df.head(n=2)

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,title,body,source,authors,image,eventUri,sentiment,wgt,relevance
0,b-8185228332,eng,False,2024-06-19,11:40:44,2024-06-19T11:40:44Z,2024-06-19T11:39:56Z,blog,0,https://www.analyticsvidhya.com/blog/2024/06/l...,Guide to LLM Observability and Evaluations for...,"In the fast-evolving world of AI, it's crucial...","{'uri': 'analyticsvidhya.com', 'dataType': 'bl...",[],,,0.254902,160,160
1,b-8185132568,eng,False,2024-06-19,10:39:56,2024-06-19T10:39:56Z,2024-06-19T10:38:45Z,blog,0,https://www.analyticsvidhya.com/blog/2024/06/a...,Building an Agentic Workflow with CrewAI and Groq,"""AI Agentic workflow will drive massive progre...","{'uri': 'analyticsvidhya.com', 'dataType': 'bl...",[],https://cdn.analyticsvidhya.com/wp-content/upl...,,0.294118,137,137


In [41]:
# list the column names of the articles DataFrame (one column per article key)
df.columns

Index(['uri', 'lang', 'isDuplicate', 'date', 'time', 'dateTime', 'dateTimePub',
       'dataType', 'sim', 'url', 'title', 'body', 'source', 'authors', 'image',
       'eventUri', 'sentiment', 'wgt', 'relevance'],
      dtype='object')

## JSON validation with Pydantic

In [52]:
# a pydantic model does 2 things for us here:
# 1. validate the data: only the fields declared on the model are validated
# 2. filter the data: undeclared fields are ignored, so the validated object
#    (and its .dict()) only carries the declared fields
class ArticleModel(BaseModel):
    """Subset of a fetched article record that this notebook validates and keeps."""

    class Config:
        # "ignore" is already the default; spelled out explicitly for clarity
        extra = "ignore"

    uri: Union[str, int]
    dateTime: str
    dateTimePub: str
    url: str
    title: str
    body: str

In [54]:
# validate the first fetched article and print only the fields the model declares
raw_article = articles[0]
article = ArticleModel.validate(raw_article)
print(article.dict())

{'uri': 'b-8185228332', 'dateTime': '2024-06-19T11:40:44Z', 'dateTimePub': '2024-06-19T11:39:56Z', 'url': 'https://www.analyticsvidhya.com/blog/2024/06/llm-observability-and-evaluations/', 'title': 'Guide to LLM Observability and Evaluations for RAG Application', 'body': 'In the fast-evolving world of AI, it\'s crucial to keep track of your API costs, especially when building LLM-based applications such as Retrieval-Augmented Generation (RAG) pipelines in production. Experimenting with different LLMs to get the best results often involves making numerous API requests to the server, each request incurring a cost. Understanding and tracking where every dollar is spent is vital to managing these expenses effectively.\n\nIn this article, we will implement LLM observability with RAG using just 10-12 lines of code. Observability helps us monitor key metrics such as latency, the number of tokens, prompts, and the cost per request.\n\nThis article was published as a part of the Data Science Bl

## Saving to JSON files

In [36]:
# Alternatively, save each article as its own JSON file.
# This is useful for keeping the articles for later use, or for loading them into a database.
# Only the first 5 articles are written here as a trial.

# open(..., "w") does not create missing directories, so make sure the target
# folder exists; exist_ok=True keeps the cell safe to re-run.
os.makedirs("./raw_articles", exist_ok=True)

for i, article in enumerate(articles[:5]):
    # explicit encoding so the files do not depend on the platform default
    with open(f"./raw_articles/article_{i}.json", "w", encoding="utf-8") as f:
        json.dump(article, f)

## Additional Info

In [23]:
# metadata of the topic page
# display() is required here: a notebook cell only auto-renders its LAST expression,
# so a bare expression in the middle of the cell is evaluated and silently discarded.
display(topic_page['topicPage'])

# metadata of the returned articles: current page, number of pages, number of articles,
# and the articles themselves for the page.
# Iterating through all the pages is possible, but not necessary for demonstration purposes.
topic_page['articles']

{'page': 1,
 'pages': 6,
 'totalResults': 521,
 'results': [{'uri': 'b-8185228332',
   'lang': 'eng',
   'isDuplicate': False,
   'date': '2024-06-19',
   'time': '11:40:44',
   'dateTime': '2024-06-19T11:40:44Z',
   'dateTimePub': '2024-06-19T11:39:56Z',
   'dataType': 'blog',
   'sim': 0,
   'url': 'https://www.analyticsvidhya.com/blog/2024/06/llm-observability-and-evaluations/',
   'title': 'Guide to LLM Observability and Evaluations for RAG Application',
   'body': 'In the fast-evolving world of AI, it\'s crucial to keep track of your API costs, especially when building LLM-based applications such as Retrieval-Augmented Generation (RAG) pipelines in production. Experimenting with different LLMs to get the best results often involves making numerous API requests to the server, each request incurring a cost. Understanding and tracking where every dollar is spent is vital to managing these expenses effectively.\n\nIn this article, we will implement LLM observability with RAG using jus

In [16]:
# number of articles on the fetched page: 100, matching the `count` requested in getArticles
# (reads `topic_page`, the response fetched above, so the cell works on a fresh kernel)
len(topic_page['articles']['results'])

100

In [18]:
# inspect the attributes stored on the TopicPage instance
# (reads `topic`, the TopicPage created above, so the cell works on a fresh kernel)
vars(topic)

{'eventRegistry': <eventregistry.EventRegistry.EventRegistry at 0x1fff95b2a10>,
 'topicPage': {'autoAddArticles': True,
  'articleHasDuplicate': 'keepAll',
  'articleHasEvent': 'keepAll',
  'articleIsDuplicate': 'skipDuplicates',
  'maxDaysBack': 1,
  'articleTreshWgt': 0,
  'eventTreshWgt': 0,
  'concepts': [{'uri': 'http://en.wikipedia.org/wiki/Artificial_intelligence',
    'label': 'Artificial intelligence (Machine intelligence)',
    'type': 'wiki',
    'wgt': 30,
    'excluded': False},
   {'uri': 'http://en.wikipedia.org/wiki/Machine_learning',
    'label': 'Machine learning',
    'type': 'wiki',
    'wgt': 30,
    'excluded': False},
   {'uri': 'http://en.wikipedia.org/wiki/Technology',
    'label': 'Technology',
    'type': 'wiki',
    'wgt': 30,
    'excluded': False},
   {'uri': 'http://en.wikipedia.org/wiki/Deep_learning',
    'label': 'Deep learning',
    'type': 'wiki',
    'wgt': 30,
    'excluded': False},
   {'uri': 'http://en.wikipedia.org/wiki/Research',
    'label': 