*Importing Libraries*

In [39]:
import hashlib
import logging
import os
import re
from datetime import datetime

import feedparser
import pandas as pd
import spacy
from celery import Celery
from sqlalchemy import create_engine, Column, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, declarative_base

In [54]:
def fetch_feed(feed_url):
    """Parse the feed located at ``feed_url`` with feedparser.

    Parameters
    ----------
    feed_url : the feed location, passed straight to ``feedparser.parse``.

    Returns
    -------
    The object returned by ``feedparser.parse``; callers read its
    ``entries`` attribute.
    """
    parsed_feed = feedparser.parse(feed_url)
    return parsed_feed

def extract_article_info(entry):
    """Build the article record for one parsed feed entry.

    Parameters
    ----------
    entry : object exposing the entry fields as attributes. ``link`` is
        required (it is the source of the de-duplication hash); ``title``,
        ``summary`` and ``published_parsed`` / ``updated_parsed`` are
        optional because not every feed provides them.

    Returns
    -------
    dict with the keys ``title``, ``content``, ``publication_date``,
    ``source_url`` and ``hash``. ``content`` and ``title`` fall back to an
    empty string and ``publication_date`` to ``None`` when the feed omits
    them.

    Raises
    ------
    AttributeError
        If the entry has no ``link``.
    """
    # Prefer the publication time; fall back to the last-updated time. Either
    # may be absent or None, in which case no date is recorded.
    parsed_time = (
        getattr(entry, 'published_parsed', None)
        or getattr(entry, 'updated_parsed', None)
    )
    publication_date = datetime(*parsed_time[:6]) if parsed_time else None

    link = entry.link
    return {
        'title': getattr(entry, 'title', '') or '',
        'content': getattr(entry, 'summary', '') or '',
        'publication_date': publication_date,
        'source_url': link,
        # MD5 of the link is used purely as a stable identifier for
        # duplicate handling, not for anything security-sensitive.
        'hash': hashlib.md5(link.encode()).hexdigest(),
    }

In [56]:
# Connection string for the articles database. It is read from the
# DATABASE_URL environment variable so real credentials do not have to live in
# the notebook; the literal is only the local-development fallback used when
# the variable is unset.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://jay:1234@localhost/news')

# Declarative base class that the ORM models below inherit from.
Base = declarative_base()

class Article(Base):
    """ORM model for one stored news article (table ``articles``)."""

    __tablename__ = 'articles'
    # Primary key; extract_article_info fills it with the MD5 hex digest of the entry link.
    hash = Column(String, primary_key=True)
    # Entry title as reported by the feed.
    title = Column(String)
    # Entry summary text taken from the feed.
    content = Column(Text)
    # Publication timestamp built from the feed's parsed time tuple.
    publication_date = Column(DateTime)
    # Link of the entry in the source feed.
    source_url = Column(String)
    # Category label assigned by process_article.
    category = Column(String)

# Engine for the database named by DATABASE_URL.
engine = create_engine(DATABASE_URL, future=True)  # Use future=True for SQLAlchemy 2.0 compatibility
# Create the tables declared on Base; this runs every time the cell is executed.
Base.metadata.create_all(engine)
# Session factory bound to the engine above.
Session = sessionmaker(bind=engine, future=True)  # Use future=True for SQLAlchemy 2.0 compatibility
# Single module-level session shared by store_article and the export/print helpers.
session = Session()

def store_article(article):
    """Insert ``article`` into the ``articles`` table unless it is already stored.

    Parameters
    ----------
    article : dict whose keys match the ``Article`` columns; ``article['hash']``
        identifies the row.

    Returns
    -------
    None. An article whose hash is already present is silently skipped.

    Raises
    ------
    Exception
        Whatever the commit raised, re-raised after the session has been
        rolled back.
    """
    already_stored = session.query(Article).filter_by(hash=article['hash']).first()
    if already_stored:
        return

    session.add(Article(**article))
    try:
        session.commit()
    except Exception:
        # The session is module-level and shared: without a rollback a single
        # failed commit (e.g. a duplicate inserted concurrently between the
        # check above and this commit) would leave it unusable for every
        # later call.
        session.rollback()
        raise

In [57]:
# Celery application; its broker is the Redis instance at localhost:6379.
app = Celery('Automatic-News-Scrapper', broker='redis://localhost:6379')

# spaCy "en_core_web_sm" pipeline, loaded once when this cell runs.
nlp = spacy.load("en_core_web_sm")

# Keywords per category, checked in this order: the first category with a
# match wins. Articles matching nothing are labelled 'Others'.
_CATEGORY_KEYWORDS = {
    'Terrorism / protest / political unrest / riot': ['terrorism', 'protest', 'political unrest', 'riot'],
    'Positive/Uplifting': ['positive', 'uplifting', 'inspiring'],
    'Natural Disasters': ['natural disaster', 'earthquake', 'flood', 'hurricane'],
}

# One compiled pattern per category, built once instead of on every task run.
# The leading \b requires a keyword to start at a word boundary, so 'riot' no
# longer matches inside 'patriot', while inflected forms such as 'riots' or
# 'protesters' still match.
_CATEGORY_PATTERNS = {
    category: re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')')
    for category, keywords in _CATEGORY_KEYWORDS.items()
}


def _categorize(text):
    """Return the first category whose keywords occur in ``text``, else 'Others'."""
    lowered = text.lower()
    for category, pattern in _CATEGORY_PATTERNS.items():
        if pattern.search(lowered):
            return category
    return 'Others'


@app.task
def process_article(article):
    """Celery task: assign a category to ``article`` and store it.

    Parameters
    ----------
    article : dict as produced by ``extract_article_info``. Its ``content``
        text is scanned for category keywords; a missing or empty content
        yields the category 'Others'.

    Returns
    -------
    None. ``article['category']`` is set in place and the article is handed
    to ``store_article``.
    """
    # Only the raw text is needed for keyword matching, so the text is
    # scanned directly rather than being run through the spaCy pipeline just
    # to read the same string back.
    article['category'] = _categorize(article.get('content') or '')
    store_article(article)

In [58]:
def enqueue_article(article):
    """Queue ``article`` for background processing.

    Sends the article dict to the ``process_article`` Celery task via
    ``.delay``; the task's result handle is discarded and nothing is returned.
    """
    process_article.delay(article)

# Logging setup: configure the root logger at INFO level and create the
# logger for this module/notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fetch and Process Feeds
def fetch_and_process_feed(feed_url):
    """Fetch one feed and enqueue each of its entries for processing.

    Failures are logged, never raised, so one bad feed does not stop the
    caller's loop over the remaining feeds:

    * a feed that cannot be fetched is logged and skipped;
    * a feed that feedparser flags as malformed (``bozo``) is logged as a
      warning, and whatever entries were recovered are still processed;
    * an entry that cannot be converted is logged and skipped, and the rest
      of the feed is still processed;
    * an enqueue failure (typically the broker being unreachable) is logged
      and ends processing of this feed, since later entries would fail the
      same way.

    Parameters
    ----------
    feed_url : URL of the feed to fetch.

    Returns
    -------
    None.
    """
    try:
        feed = fetch_feed(feed_url)
    except Exception as e:
        logger.error(f"Error processing feed {feed_url}: {e}")
        return

    # feedparser reports download/parse problems through the bozo flag rather
    # than by raising, so they would otherwise pass unnoticed.
    if getattr(feed, 'bozo', False):
        logger.warning(
            f"Feed {feed_url} could not be parsed cleanly: "
            f"{getattr(feed, 'bozo_exception', None)}"
        )

    for entry in feed.entries:
        try:
            article = extract_article_info(entry)
        except Exception as e:
            logger.warning(f"Skipping malformed entry in feed {feed_url}: {e}")
            continue

        try:
            enqueue_article(article)
        except Exception as e:
            logger.error(f"Error enqueueing article from feed {feed_url}: {e}")
            return


In [59]:
# RSS/Atom feeds to fetch, one URL per line.
feed_urls = [
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml',
]

In [60]:
# Fetch every configured feed and enqueue its entries; failures are logged
# inside fetch_and_process_feed rather than raised, so the loop always
# visits all feeds.
for url in feed_urls:
    fetch_and_process_feed(url)

# Export Articles to CSV
def export_articles_to_csv(path='articles.csv'):
    """Write every stored article to a CSV file.

    Parameters
    ----------
    path : destination file; defaults to ``'articles.csv'`` in the working
        directory.

    Returns
    -------
    None. The file has the columns ``title``, ``content``,
    ``publication_date``, ``source_url`` and ``category`` (the ``hash``
    column is not exported) and no index column.
    """
    columns = ['title', 'content', 'publication_date', 'source_url', 'category']
    records = [
        {name: getattr(article, name) for name in columns}
        for article in session.query(Article).all()
    ]
    # Passing the columns explicitly keeps the header row (and the column
    # order) even when the table is empty.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(path, index=False)

# Call export function after processing
# NOTE(review): the feed loop above only enqueues Celery tasks via .delay();
# unless a worker has already stored them, this export may not yet contain
# the articles that were just fetched — confirm the intended ordering.
export_articles_to_csv()

ERROR:__main__:Error processing feed http://qz.com/feed: Connection closed by server.
ERROR:__main__:Error processing feed http://feeds.foxnews.com/foxnews/politics: Connection closed by server.
ERROR:__main__:Error processing feed http://feeds.feedburner.com/NewshourWorld: Connection closed by server.
ERROR:__main__:Error processing feed https://feeds.bbci.co.uk/news/world/asia/india/rss.xml: Connection closed by server.


In [53]:
def print_articles_from_db():
    """Print one summary line (title, URL, category) per stored article."""
    for row in session.query(Article).all():
        print(f"Title: {row.title}, URL: {row.source_url}, Category: {row.category}")

# Show what is currently stored in the articles table.
print_articles_from_db()