In [4]:
import feedparser
import json
import os

def verify_rss_json(json_file):
    """Validate one news-outlet RSS config file and report its task IDs.

    The parsed JSON must contain a ``name`` key and an ``rss_feeds`` key
    holding a list. Every feed entry must carry a label under ``feed_name``
    or ``name`` and an address under ``url`` or ``link``. For each feed a
    task ID of the form ``fetch_<outlet>_<feed>`` is printed, where both
    parts are lower-cased and have spaces replaced by underscores.

    Args:
        json_file: Path of the JSON config file to check.

    Returns:
        list[str]: The task IDs derived from the file, in feed order.

    Raises:
        AssertionError: If a required key is missing or ``rss_feeds`` is
            not a list.
    """

    def _require(condition, message):
        # Raise explicitly instead of using `assert` so the checks still run
        # when Python is started with -O, which strips assert statements.
        if not condition:
            raise AssertionError(message)

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(json_file, "r", encoding="utf-8") as f:
        feeds = json.load(f)

    # Top-level structure.
    _require("name" in feeds, f"Missing 'name' in {json_file}")
    _require("rss_feeds" in feeds, f"Missing 'rss_feeds' in {json_file}")
    _require(isinstance(feeds["rss_feeds"], list), f"'rss_feeds' should be a list in {json_file}")

    outlet_name = feeds["name"]
    task_ids = []

    for feed in feeds["rss_feeds"]:
        # Each feed may label itself with either key.
        _require(
            "feed_name" in feed or "name" in feed,
            f"Missing 'feed_name' or 'name' in one of the feeds in {json_file}",
        )
        # The feed address is only checked for presence; it is not fetched
        # or parsed here.
        _require(
            "url" in feed or "link" in feed,
            f"Missing 'url' or 'link' in one of the feeds in {json_file}",
        )

        # "feed_name" takes precedence over "name" when both are present.
        feed_name = feed.get("feed_name", feed.get("name", "default_feed")).lower().replace(" ", "_")

        task_id = f'fetch_{outlet_name.lower().replace(" ", "_")}_{feed_name}'
        print(f"Task ID: {task_id}")
        task_ids.append(task_id)

    return task_ids

# Directory holding the news-outlet JSON configs, resolved relative to the
# current working directory.
config_dir = os.path.join(os.getcwd(), "..", "airflow", "config", "news_outlets")

# Verify every JSON file in the config directory. os.listdir() returns entries
# in arbitrary order, so sort them to keep the printed report stable across
# runs and platforms.
for file_name in sorted(os.listdir(config_dir)):
    if file_name.endswith(".json"):
        file_path = os.path.join(config_dir, file_name)
        print(f"Verifying JSON file: {file_path}")
        verify_rss_json(file_path)


Verifying JSON file: /home/luke/projects/clio/notebooks/../airflow/config/news_outlets/business_world.json
Task ID: fetch_business_world_feed
Verifying JSON file: /home/luke/projects/clio/notebooks/../airflow/config/news_outlets/gma.json
Task ID: fetch_gma_public_affairs
Task ID: fetch_gma_photo
Task ID: fetch_gma_video
Task ID: fetch_gma_newstv
Task ID: fetch_gma_serbisyo_publiko
Task ID: fetch_gma_walang_pasok
Task ID: fetch_gma_transportation
Task ID: fetch_gma_missing_persons
Task ID: fetch_gma_news
Task ID: fetch_gma_world
Task ID: fetch_gma_metro
Task ID: fetch_gma_nation
Task ID: fetch_gma_regions
Task ID: fetch_gma_special_reports
Task ID: fetch_gma_ulat_filipino
Task ID: fetch_gma_hashtag
Task ID: fetch_gma_community_bulletin_board
Task ID: fetch_gma_money
Task ID: fetch_gma_personal_finance
Task ID: fetch_gma_companies
Task ID: fetch_gma_economy
Task ID: fetch_gma_motoring
Task ID: fetch_gma_sports
Task ID: fetch_gma_basketball
Task ID: fetch_gma_boxing
Task ID: fetch_gma_foo