In [8]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [6]:
import requests
import sqlite3
from lxml import etree

def parse_xml_feed(url, timeout=10):
    """Download an XML feed and return its ``<item>`` elements as dicts.

    Args:
        url: Address of the feed to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a host that
            never answers.

    Returns:
        A list with one ``{'title', 'link', 'description'}`` dict per
        ``<item>`` element; a value is ``None`` when the item has no such
        child element. An empty list is returned (and a message printed)
        when the download or the parse fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad status codes
        root = etree.fromstring(response.content)
        return [
            {
                'title': item.findtext('title'),
                'link': item.findtext('link'),
                'description': item.findtext('description'),
            }
            for item in root.xpath('//item')
        ]
    except Exception as e:
        # Deliberately broad: one unreachable or malformed feed must not
        # abort processing of the remaining feeds.
        print(f"An error occurred while fetching XML from {url}: {e}")
        return []

def parse_rss_feed(url, timeout=10):
    """Download an RSS feed and return its ``<item>`` elements as dicts.

    Args:
        url: Address of the feed to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a host that
            never answers.

    Returns:
        A list with one ``{'title', 'summary', 'link'}`` dict per ``<item>``
        element, where ``summary`` holds the text of the item's
        ``<description>`` child. A value is ``None`` when the item has no
        such child element. An empty list is returned (and a message
        printed) when the download or the parse fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad status codes
        root = etree.fromstring(response.content)
        return [
            {
                'title': item.findtext('title'),
                'summary': item.findtext('description'),
                'link': item.findtext('link'),
            }
            for item in root.xpath('//item')
        ]
    except Exception as e:
        # Deliberately broad: one unreachable or malformed feed must not
        # abort processing of the remaining feeds.
        print(f"An error occurred while parsing RSS feed from {url}: {e}")
        return []

def categorize_article(article):
    """Assign an article to the first category whose keyword it mentions.

    Args:
        article: Mapping that may contain ``'title'`` and ``'summary'``
            strings. Either key may be absent or map to ``None``.

    Returns:
        ``'terrorism'``, ``'positive'`` or ``'disasters'`` for the first
        category (in that order) with a keyword occurring in the lower-cased
        title or summary, otherwise ``'Others'``.

    Note:
        Matching is a plain substring test, so a keyword also matches inside
        a longer word (for example ``'riot'`` inside ``'patriot'``).
    """
    keywords = {
        'terrorism': ['terrorism', 'terrorist', 'attack', 'protest', 'political unrest', 'riot'],
        'positive': ['uplifting', 'inspiring', 'hopeful', 'positive', 'joyful'],
        'disasters': ['natural disaster', 'earthquake', 'flood', 'hurricane', 'tsunami'],
    }

    # A missing field is treated as empty text instead of disabling the whole
    # check, so an article with only a title (or only a summary) is still
    # categorised from the field it does have.
    title_lower = (article.get('title') or '').lower()
    summary_lower = (article.get('summary') or '').lower()

    for category, words in keywords.items():
        if any(word in title_lower or word in summary_lower for word in words):
            return category

    return 'Others'

def save_to_database(articles, db_path='news_articles.db'):
    """Categorise the given articles and append them to the SQLite database.

    The ``articles`` table is created on first use. Every call appends rows;
    nothing prevents the same article from being stored more than once.

    Args:
        articles: Iterable of dicts with ``'title'``, ``'summary'`` and
            ``'link'`` keys.
        db_path: Path of the SQLite file to write to.

    Raises:
        KeyError: If an article lacks one of the required keys.
    """
    # Build all rows up front so a malformed article fails before the
    # database is opened.
    rows = [
        (article['title'], article['summary'], article['link'], categorize_article(article))
        for article in articles
    ]

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if a statement raises; it does not close the connection.
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS articles
                 (id INTEGER PRIMARY KEY, title TEXT, summary TEXT, link TEXT, category TEXT)''')
            conn.executemany(
                "INSERT INTO articles (title, summary, link, category) VALUES (?, ?, ?, ?)",
                rows,
            )
    finally:
        # Always release the file handle, even when an insert fails.
        conn.close()

# Feed URLs polled by the ingestion loop below; each one is handed to
# parse_rss_feed().
rss_feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    # NOTE(review): this host failed DNS resolution ("getaddrinfo failed") in
    # the recorded run of this cell - confirm whether the feed still exists.
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

if __name__ == "__main__":
    # Collect the entries of every configured feed into one flat list. A feed
    # that cannot be downloaded or parsed contributes nothing, because
    # parse_rss_feed() returns an empty list on error.
    all_articles = []
    for feed_url in rss_feeds:
        articles = parse_rss_feed(feed_url)
        all_articles.extend(articles)

    # Persist everything in one call. save_to_database() only ever inserts,
    # so re-running this cell stores the same articles again.
    save_to_database(all_articles)
    print("Articles saved to database.")


An error occurred while parsing RSS feed from http://feeds.reuters.com/reuters/businessNews: HTTPConnectionPool(host='feeds.reuters.com', port=80): Max retries exceeded with url: /reuters/businessNews (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002498C3A2380>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Articles saved to database.


In [7]:
import sqlite3

# Function to print articles from the database
def print_articles_from_database(db_path='news_articles.db'):
    """Print title, summary, link and category of every stored article.

    Args:
        db_path: Path of the SQLite file to read from.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Name the columns explicitly: with SELECT * the integer ``id``
        # column comes first and every printed label would be off by one.
        articles = conn.execute(
            "SELECT title, summary, link, category FROM articles"
        ).fetchall()
    finally:
        # Release the file handle even if the query fails.
        conn.close()

    print("Articles in the database:")
    for title, summary, link, category in articles:
        print("Title:", title)
        print("Summary:", summary)
        print("Link:", link)
        print("Category:", category)
        print()

# Dump the stored rows so the contents of news_articles.db can be inspected.
print_articles_from_database()


Articles in the database:
Title: Widening Mideast Crisis: Embattled U.N. Agency for Gaza Says Its Funding Will Run Out Within Weeks
Summary: More than a dozen countries have suspended support for UNRWA since Israel accused some of the agency’s workers of participating in the Oct. 7 terror attacks or in their aftermath.
Link: https://www.nytimes.com/live/2024/01/29/world/israel-gaza-jordan-news
Category: terrorism

Title: For Europe and NATO, a Russian Invasion Is No Longer Unthinkable
Summary: Amid crumbling U.S. support for Ukraine and Donald Trump’s rising candidacy, European nations and NATO are making plans to take on Russia by themselves.
Link: https://www.nytimes.com/2024/01/29/us/politics/europe-nato-russia-trump.html
Category: Others

Title: Where Is Hamas Getting Weapons? Increasingly, From Israel.
Summary: The very weapons that Israeli forces have used to enforce a blockade of Gaza are now being used against them.
Link: https://www.nytimes.com/2024/01/28/world/middleeast/isra