In [None]:
import pandas as pd
import dagster as dg
from datetime import datetime, timedelta, timezone
import os
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
from eventregistry import EventRegistry, QueryArticlesIter, QueryItems

load_dotenv()

True

In [2]:
er = EventRegistry(apiKey=os.getenv("NEWSAPI_KEY"))

In [4]:
q = QueryArticlesIter(
	categoryUri=QueryItems.AND(["dmoz/Computers/Artificial_Intelligence", "dmoz/Business/Marketing_and_Advertising"]),
	lang="eng",
	dateStart=(datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")
)
articles = []
for article in q.execQuery(er, sortBy="date", maxItems=100):
	articles.append(article)

In [5]:
df = pd.DataFrame(articles)
df

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,title,body,source,authors,image,eventUri,sentiment,wgt,relevance
0,2025-12-924368101,eng,False,2025-12-16,09:58:35,2025-12-16T09:58:35Z,2025-12-16T09:33:00Z,news,0.000000,https://www.finextra.com/pressarticle/108334/s...,Smartstream relocates North American HQ to Man...,This content is provided by an external author...,"{'uri': 'finextra.com', 'dataType': 'news', 't...","[{'uri': 'editorial_team@finextra.com', 'name'...",https://www.finextra.com/about/finextra-logo.png,,0.317647,503575115,4
1,2025-12-924269998,eng,False,2025-12-16,08:08:00,2025-12-16T08:08:00Z,2025-12-16T07:59:00Z,news,0.000000,https://economictimes.indiatimes.com/news/comp...,Propagate India elevates Parikshit Bhattachary...,Propagate India has appointed Parikshit Bhatta...,"{'uri': 'economictimes.indiatimes.com', 'dataT...",[],"https://img.etimg.com/thumb/msid-125999158,wid...",,0.388235,503568480,3
2,2025-12-924264245,eng,False,2025-12-16,08:05:37,2025-12-16T08:05:37Z,2025-12-16T07:43:04Z,news,0.000000,https://www.chosun.com/english/industry-en/202...,Rebellions Challenges NVIDIA in AI Inference M...,"South Korean AI Semiconductor Startup, Valued ...","{'uri': 'chosun.com', 'dataType': 'news', 'tit...",[],https://www.chosun.com/resizer/v2/MNSDAOBWHAYT...,,0.121569,503568337,5
3,2025-12-924229348,eng,False,2025-12-16,07:24:49,2025-12-16T07:24:49Z,2025-12-16T06:55:08Z,news,0.000000,https://roushada13.medium.com/when-brands-grow...,When Brands Grow a Personality: Inside the Ris...,"The AIsh -- 16 december, 2025\n\nFor over a de...","{'uri': 'roushada13.medium.com', 'dataType': '...",[],https://miro.medium.com/v2/resize:fit:800/1*t4...,,0.231373,503565889,6
4,2025-12-924131287,eng,False,2025-12-16,05:12:13,2025-12-16T05:12:13Z,2025-12-16T05:00:11Z,news,0.000000,https://uk.themedialeader.com/the-brief-tuesda...,The Brief - Tuesday 16 December - More scams f...,⌛ The US government appears poised to extend t...,"{'uri': 'uk.themedialeader.com', 'dataType': '...",[{'uri': 'ellie_hammonds@uk.themedialeader.com...,https://uk.themedialeader.com/wp-content/uploa...,,-0.137255,503557933,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2025-12-921088160,eng,False,2025-12-12,14:56:28,2025-12-12T14:56:28Z,2025-12-12T12:18:28Z,news,0.000000,https://www.mysmartprice.com/gear/appliances/a...,Samsung Marks 30 Years in India With a New Inn...,Samsung used its 30-year milestone in India to...,"{'uri': 'mysmartprice.com', 'dataType': 'news'...",[],https://www.mysmartprice.com/wp-content/upload...,,0.411765,503247388,4
96,2025-12-921008233,eng,False,2025-12-12,13:36:55,2025-12-12T13:36:55Z,2025-12-12T13:33:45Z,news,0.780392,https://www.investing.com/news/analyst-ratings...,Piper Sandler raises Broadcom stock price targ...,Investing.com - Piper Sandler has raised its p...,"{'uri': 'investing.com', 'dataType': 'news', '...",[],https://i-invdn-com.investing.com/news/news_si...,eng-11225511,0.231373,503242615,3
97,2025-12-920993415,eng,False,2025-12-12,13:17:27,2025-12-12T13:17:27Z,2025-12-12T13:00:00Z,news,0.000000,https://searchengineland.com/black-friday-llms...,What Black Friday reveals about how LLMs under...,"Analysis of 10,000 LLM responses reveals the d...","{'uri': 'searchengineland.com', 'dataType': 'n...",[],https://searchengineland.com/wp-content/seload...,,0.050980,503241447,4
98,2025-12-920947399,eng,False,2025-12-12,12:38:00,2025-12-12T12:38:00Z,2025-12-12T12:27:25Z,news,0.764706,https://www.investing.com/news/analyst-ratings...,Broadcom stock price target maintained at $525...,Investing.com - Cantor Fitzgerald maintained i...,"{'uri': 'investing.com', 'dataType': 'news', '...",[],https://i-invdn-com.investing.com/news/interna...,eng-11225511,0.176471,503239080,2


In [6]:
# Add metadata columns
df["fetched_at"] = datetime.now()

# Map eventregistry fields to consistent schema
if "uri" in df.columns:
	df["article_id"] = df["uri"]
if "url" not in df.columns and "uri" in df.columns:
	df["url"] = df["uri"]
if "dateTime" in df.columns:
	df["publishedAt"] = pd.to_datetime(df["dateTime"])
elif "date" in df.columns:
	df["publishedAt"] = pd.to_datetime(df["date"])

# Handle source information
if "source" in df.columns:
	df["source_name"] = df["source"].apply(
		lambda x: x.get("title")
		if isinstance(x, dict)
		else str(x)
		if x
		else None
	)
	df["source_uri"] = df["source"].apply(
		lambda x: x.get("uri") if isinstance(x, dict) else None
	)

In [7]:
df

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,...,image,eventUri,sentiment,wgt,relevance,fetched_at,article_id,publishedAt,source_name,source_uri
0,2025-12-924368101,eng,False,2025-12-16,09:58:35,2025-12-16T09:58:35Z,2025-12-16T09:33:00Z,news,0.000000,https://www.finextra.com/pressarticle/108334/s...,...,https://www.finextra.com/about/finextra-logo.png,,0.317647,503575115,4,2025-12-16 12:46:17.614966,2025-12-924368101,2025-12-16 09:58:35+00:00,Finextra Research,finextra.com
1,2025-12-924269998,eng,False,2025-12-16,08:08:00,2025-12-16T08:08:00Z,2025-12-16T07:59:00Z,news,0.000000,https://economictimes.indiatimes.com/news/comp...,...,"https://img.etimg.com/thumb/msid-125999158,wid...",,0.388235,503568480,3,2025-12-16 12:46:17.614966,2025-12-924269998,2025-12-16 08:08:00+00:00,Economic Times,economictimes.indiatimes.com
2,2025-12-924264245,eng,False,2025-12-16,08:05:37,2025-12-16T08:05:37Z,2025-12-16T07:43:04Z,news,0.000000,https://www.chosun.com/english/industry-en/202...,...,https://www.chosun.com/resizer/v2/MNSDAOBWHAYT...,,0.121569,503568337,5,2025-12-16 12:46:17.614966,2025-12-924264245,2025-12-16 08:05:37+00:00,Chosun.com,chosun.com
3,2025-12-924229348,eng,False,2025-12-16,07:24:49,2025-12-16T07:24:49Z,2025-12-16T06:55:08Z,news,0.000000,https://roushada13.medium.com/when-brands-grow...,...,https://miro.medium.com/v2/resize:fit:800/1*t4...,,0.231373,503565889,6,2025-12-16 12:46:17.614966,2025-12-924229348,2025-12-16 07:24:49+00:00,Medium,roushada13.medium.com
4,2025-12-924131287,eng,False,2025-12-16,05:12:13,2025-12-16T05:12:13Z,2025-12-16T05:00:11Z,news,0.000000,https://uk.themedialeader.com/the-brief-tuesda...,...,https://uk.themedialeader.com/wp-content/uploa...,,-0.137255,503557933,2,2025-12-16 12:46:17.614966,2025-12-924131287,2025-12-16 05:12:13+00:00,"The Media Leader - News analysis, opinion and ...",uk.themedialeader.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2025-12-921088160,eng,False,2025-12-12,14:56:28,2025-12-12T14:56:28Z,2025-12-12T12:18:28Z,news,0.000000,https://www.mysmartprice.com/gear/appliances/a...,...,https://www.mysmartprice.com/wp-content/upload...,,0.411765,503247388,4,2025-12-16 12:46:17.614966,2025-12-921088160,2025-12-12 14:56:28+00:00,MySmartPrice.com,mysmartprice.com
96,2025-12-921008233,eng,False,2025-12-12,13:36:55,2025-12-12T13:36:55Z,2025-12-12T13:33:45Z,news,0.780392,https://www.investing.com/news/analyst-ratings...,...,https://i-invdn-com.investing.com/news/news_si...,eng-11225511,0.231373,503242615,3,2025-12-16 12:46:17.614966,2025-12-921008233,2025-12-12 13:36:55+00:00,Investing.com,investing.com
97,2025-12-920993415,eng,False,2025-12-12,13:17:27,2025-12-12T13:17:27Z,2025-12-12T13:00:00Z,news,0.000000,https://searchengineland.com/black-friday-llms...,...,https://searchengineland.com/wp-content/seload...,,0.050980,503241447,4,2025-12-16 12:46:17.614966,2025-12-920993415,2025-12-12 13:17:27+00:00,Search Engine Land,searchengineland.com
98,2025-12-920947399,eng,False,2025-12-12,12:38:00,2025-12-12T12:38:00Z,2025-12-12T12:27:25Z,news,0.764706,https://www.investing.com/news/analyst-ratings...,...,https://i-invdn-com.investing.com/news/interna...,eng-11225511,0.176471,503239080,2,2025-12-16 12:46:17.614966,2025-12-920947399,2025-12-12 12:38:00+00:00,Investing.com,investing.com


In [None]:

{
	"num_articles": len(df),
	"last_fetch_timestamp": dg.MetadataValue.timestamp(datetime.now(timezone.utc)),
	"preview": dg.MetadataValue.md(
		df[["title", "source_name", "publishedAt"]].head(5).to_markdown()
		if not df.empty
		and all(
			col in df.columns
			for col in ["title", "source_name", "publishedAt"]
		)
		else df.head(5).to_markdown()
		if not df.empty
		else "No articles"
	),
}

CheckError: Failure condition: Datetime values provided to MetadataValue.timestamp must have timezones, but 2025-12-16T12:46:52.676054 does not