In [38]:
import warnings
from urllib.parse import urlparse

import feedparser
import pandas as pd
from newspaper import Article

In [28]:
def get_domain_name(url: str) -> str:
    """Return the network-location (``netloc``) component of ``url``.

    Args:
        url: The URL string to inspect.

    Returns:
        The ``netloc`` part as produced by ``urllib.parse.urlparse`` (for
        example ``'feeds.bbci.co.uk'``). A string without a scheme or a
        leading ``//`` has no netloc, so an empty string is returned.
    """
    return urlparse(url).netloc

In [42]:
def parse_rss_and_extract_info(rss_urls):
    """Download every article linked from the given RSS feeds into a DataFrame.

    Each feed is parsed with ``feedparser``; every entry's link is fetched and
    parsed with ``newspaper.Article``.

    Args:
        rss_urls: Iterable of RSS feed URLs.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``author``,
        ``publish_date`` and ``content``, one row per article that could be
        downloaded and parsed. The columns are present even when no article
        was collected. ``author`` holds only the first listed author (cut at
        the first comma), or ``'Unknown'`` when none was found.

    Entries without a link, and articles whose download or parse step raises,
    are skipped; the latter emit a warning naming the URL.
    """
    columns = ['title', 'author', 'publish_date', 'content']
    articles_info = []

    for rss_url in rss_urls:
        feed = feedparser.parse(rss_url)

        for entry in feed.entries:
            # Not every feed entry carries a link; attribute access would
            # raise for those, so look it up defensively.
            article_url = entry.get('link')
            if not article_url:
                continue

            article = Article(article_url)
            try:
                article.download()
                article.parse()
            except Exception as exc:
                # One unreachable or unparsable page must not abort the whole
                # scrape; report it and move on to the next entry.
                warnings.warn(f"Skipping {article_url}: {exc}")
                continue

            # Keep only the first author, cut at the first comma (same result
            # as joining all authors with ', ' and splitting on ',').
            if article.authors:
                author = article.authors[0].split(',')[0]
            else:
                author = 'Unknown'

            articles_info.append({
                'title': article.title,
                'author': author,
                'publish_date': article.publish_date,
                'content': article.text,
            })

    # Passing the column list keeps the schema stable when nothing was
    # collected, instead of producing a column-less frame.
    return pd.DataFrame(articles_info, columns=columns)

In [43]:
def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` in CSV format, omitting the index column.

    Args:
        df: The ``pandas.DataFrame`` to persist.
        filename: Destination handed straight to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=filename, index=False)

In [44]:
# RSS feeds to scrape; each entry of each feed is downloaded and parsed.
# NOTE(review): the feeds.reuters.com endpoint may no longer be served —
# confirm it still returns entries.
rss_urls = [
    'https://indianexpress.com/feed/',
    'https://feeds.bbci.co.uk/news/world/rss.xml',
    'https://feeds.reuters.com/reuters/topNews',
    'https://www.theguardian.com/world/rss'
]

# Output path is relative to the current working directory.
OUTPUT_CSV = 'articles.csv'

articles_df = parse_rss_and_extract_info(rss_urls=rss_urls)
save_to_csv(df=articles_df, filename=OUTPUT_CSV)