In [None]:
from time import sleep
from json import dumps
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

# Kafka settings: destination topic and the broker used for bootstrapping
topic_name = 'vnexpress'
kafka_server = 'localhost:9092'

# Each record value is dumped to JSON text and then encoded as UTF-8 bytes
producer = KafkaProducer(
    value_serializer=lambda record: dumps(record).encode('utf-8'),
    bootstrap_servers=[kafka_server],
)

# RSS feed crawled by this script
rss_url = 'https://vnexpress.net/rss/thoi-su.rss'

#Function to fetch the full content and author of the article from the given URL
def fetch_article_content(url, timeout=10):
    """
    Fetch the full content and author of the article from the given URL.

    Args:
        url: Address of the article page. A missing or empty value means
            there is nothing to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(content, author)`` tuple. ``content`` is the text of every
        ``<p>`` inside the first ``<article>`` element, joined by newlines.
        ``author`` is the stripped text of the last paragraph when it passes
        the name heuristic below, otherwise ``None``. ``(None, None)`` is
        returned when the URL is missing, the page has no ``<article>``
        element or no paragraphs, or any error occurs while fetching/parsing.
    """
    if not url:
        # Nothing to request; avoid a doomed HTTP call and a noisy error.
        return None, None

    try:
        # Without a timeout a stalled server would block the crawler forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        # The article body is looked up in the first <article> element
        content_div = soup.find('article')
        if content_div is None:
            return None, None

        paragraphs = content_div.find_all('p')
        if not paragraphs:
            # No text to extract; previously this case only "worked" because
            # calling .split() on a None author raised and was swallowed below.
            return None, None

        content = "\n".join(p.text for p in paragraphs)

        # Heuristic: the last paragraph is treated as the author's name
        # unless it is a single word or carries an "Image:"/"Author:" label.
        author = paragraphs[-1].text.strip()
        if len(author.split()) < 2 or "Image:" in author or "Author:" in author:
            author = None  # Not a valid author name

        return content, author
    except Exception as e:
        # Best-effort: a failing article must not stop the rest of the feed.
        print(f"Error fetching article content: {e}")
        return None, None

#Function to fetch RSS feed data and send articles to Kafka
def fetch_rss_data(rss_url, timeout=10):
    """
    Fetch RSS feed data and send articles to Kafka.

    For every ``<item>`` in the feed, the title, link, publication date,
    description and enclosure URL are collected, the full article text and
    author are fetched from the item's link, and the combined record is sent
    to the Kafka topic ``topic_name`` through the module-level ``producer``.

    Args:
        rss_url: Address of the RSS feed to download.
        timeout: Seconds to wait for the feed's HTTP response before giving up.

    Returns:
        None. Any error is printed; it stops processing of the current feed
        pass but is not propagated to the caller.
    """
    try:
        # Without a timeout a stalled server would block the crawl loop forever.
        response = requests.get(rss_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.content, 'lxml-xml')  # Use 'lxml-xml' parser

        for item in soup.find_all('item'):
            title = item.title.text if item.title else None
            link = item.link.text if item.link else None
            pub_date = item.pubDate.text if item.pubDate else None
            description = item.description.text if item.description else None

            # .get() keeps an <enclosure> without a url attribute from raising
            # KeyError and aborting the remaining items of the feed.
            enclosure = item.enclosure
            image = enclosure.get('url') if enclosure is not None else None

            # Only fetch the article page when the item actually has a link
            if link:
                content, author = fetch_article_content(link)
            else:
                content, author = None, None

            data = {
                'title': title,
                'link': link,
                'pub_date': pub_date,
                'description': description,
                'image': image,
                'content': content,  # Full article text
                'author': author,  # Author of the article
            }

            # Send data to Kafka
            producer.send(topic_name, value=data)
            print(f"Sent: {data}")
            sleep(1)  # Pause between items to space out the article requests
    except Exception as e:
        print(f"Error fetching RSS feed: {e}")

# Fetch and stream data
if __name__ == "__main__":
    # Crawl the feed in an endless loop until interrupted with Ctrl+C.
    try:
        while True:
            fetch_rss_data(rss_url)
            sleep(300)  # Crawl every 5 minutes
    except KeyboardInterrupt:
        print("break")  # Print 'break' when Ctrl+C is pressed
    finally:
        # producer.send() is asynchronous: records may still be queued in the
        # client. Flush and close (with bounded waits) so they are delivered
        # and the producer's resources are released before the process exits.
        try:
            producer.flush(timeout=10)
            producer.close(timeout=10)
        except Exception as e:
            print(f"Error closing Kafka producer: {e}")
        print("Stopped fetching RSS data.")


Sent: {'title': 'Tăng cường an toàn bay sau tai nạn máy bay Jeju Air', 'link': 'https://vnexpress.net/tang-cuong-an-toan-bay-sau-tai-nan-may-bay-jeju-air-4833794.html', 'pub_date': 'Mon, 30 Dec 2024 21:08:09 +0700', 'description': '<a href="https://vnexpress.net/tang-cuong-an-toan-bay-sau-tai-nan-may-bay-jeju-air-4833794.html"><img src="https://i1-vnexpress.vnecdn.net/2024/12/30/sai-gon-cach-ly-xa-hoi-17-1735-5749-3075-1735562047.jpg?w=1200&h=0&q=100&dpr=1&fit=crop&s=9H2LYhuGV4LEgGsQGNKQxA"></a></br>Các cơ quan chức năng cần tăng cường kiểm tra kỹ thuật bảo dưỡng máy bay, phòng ngừa nguy cơ uy hiếp an toàn với máy bay, Cục trưởng Hàng không Việt Nam yêu cầu.', 'image': 'https://i1-vnexpress.vnecdn.net/2024/12/30/sai-gon-cach-ly-xa-hoi-17-1735-5749-3075-1735562047.jpg?w=1200&h=0&q=100&dpr=1&fit=crop&s=9H2LYhuGV4LEgGsQGNKQxA', 'content': 'Chỉ đạo của Cục trưởng Hàng không Việt Nam Đinh Việt Thắng đưa ra sau vụ tai nạn máy bay B737-800 của Jeju Air ngày 29/12 làm 179 người chết và vụ