# Personalized News Aggregator

###### End Goal: Develop a news aggregator that scrapes articles from multiple sources, categorizes them, and provides access via a REST API and a simple front-end interface.

#### IMPORTING REQUIRED LIBRARIES

In [1]:
# Import necessary libraries
from bs4 import BeautifulSoup  # For parsing HTML content
import requests  # For making HTTP requests to fetch web pages
from datetime import datetime  # For working with date and time
import pandas as pd  # For data manipulation and storage
import spacy  # For NLP tasks like tokenization and lemmatization
import re  # For regular expressions used in text preprocessing

## Part 1: News Scraping
### Objective:
Scrape news articles from multiple sources (any 2 news sources e.g., BBC, CNN, Times of India, etc.) and collect the following data:

- **Title**: The article's headline.
- **Summary**: A brief overview or the first few sentences.
- **Publication Date**: The date the article was published.
- **Source**: The news outlet's name.
- **URL**: Link to the article.

In [2]:
# URL of the news listing page to scrape
url = 'https://timesofindia.indiatimes.com/news'

# Fetch the listing page. Without a timeout, requests can wait indefinitely on
# an unresponsive server, so cap the wait explicitly.
response = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of parsing an error page as if it were
# the news listing.
response.raise_for_status()

# Parse the HTML content with BeautifulSoup.
# NOTE(review): 'html' names a parser feature rather than a specific parser, so
# the parser actually used presumably depends on what is installed — consider
# pinning 'html.parser' or 'lxml' for reproducible results.
soup = BeautifulSoup(response.text, 'html')

In [None]:
# Locate the <ul> that holds the article list. The class name is site-generated
# and has to be updated whenever the page markup changes.
news = soup.find('ul', attrs={'class': 'HytnJ'})

In [None]:
# Collect every <li> element inside the listing section; each one is treated
# as a single article entry by the extraction step that follows.
# NOTE(review): `news` is None when the 'HytnJ' list is not found, in which
# case this line raises AttributeError — confirm the class name still matches
# the live page.
articles = news.find_all('li')

In [None]:
# Display how many <li> entries were found in the listing section
len(articles)

In [None]:
# Extract the title, summary and link of every listed article. Each selector is
# looked up once per article, and a missing element yields None so the three
# lists always have one entry per item in `articles`.
titles = []
summary = []
urls = []
for article in articles:
    title_tag = article.find('p', class_='CRKrj')
    summary_tag = article.find('p', class_='W4Hjm')
    link_tag = article.find('a', class_='VeCXM')

    titles.append(title_tag.text if title_tag is not None else None)
    summary.append(summary_tag.text if summary_tag is not None else None)
    # .get() returns None when the anchor carries no href attribute
    urls.append(link_tag.get('href') if link_tag is not None else None)

In [None]:
def date_time(date: str) -> datetime:
    """Convert an article timestamp string into a naive ``datetime``.

    Accepts text such as ``"Updated: Sep 10, 2024, 14:30 IST"``. An optional
    leading ``"Updated:"`` / ``"Update:"`` label and an optional trailing
    ``"IST"`` marker are removed before parsing.

    Args:
        date: Raw timestamp text taken from the article page.

    Returns:
        The parsed date and time, without timezone information.

    Raises:
        ValueError: If the remaining text does not match
            ``"%b %d, %Y, %H:%M"``.
    """
    text = date.strip()

    # str.strip("Update:") removes any of those *characters* from both ends,
    # not the literal label, and strip("IST") would also eat the "S" of a
    # leading "Sep". Remove the label and the zone marker as whole affixes.
    for label in ("Updated:", "Update:"):
        if text.startswith(label):
            text = text[len(label):].strip()
            break
    if text.endswith("IST"):
        text = text[:-len("IST")]

    return datetime.strptime(text.strip(), "%b %d, %Y, %H:%M")

In [None]:
def _scrape_article_details(article_url):
    """Fetch one article page and return ``(published, outlet)``.

    Either element is None when it cannot be determined: the URL is missing,
    the request fails, the expected markup is absent, or the date text cannot
    be parsed.
    """
    if not article_url:
        return None, None

    try:
        page = requests.get(article_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable article should not abort the whole scrape
        print(f"Skipping {article_url}: {exc}")
        return None, None

    article_soup = BeautifulSoup(page.text, "html")

    # Publication date: first <span> inside the 'xf8Pm' container
    published = None
    date_box = article_soup.find("div", class_='xf8Pm')
    date_span = date_box.find('span') if date_box is not None else None
    if date_span is not None:
        try:
            published = date_time(date_span.text)
        except ValueError:
            published = None  # unrecognised date format

    # Source name: first link inside the 'kgcOp' container
    outlet = None
    source_box = article_soup.find("div", class_='kgcOp')
    source_link = source_box.find('a') if source_box is not None else None
    if source_link is not None:
        outlet = source_link.text

    return published, outlet


date = []
source = []
# Visit every article URL and record its publication date and source. Exactly
# one entry is appended to each list per URL (None when unavailable) so both
# stay the same length as `urls` when the columns are combined later. The loop
# uses its own variable name so the listing-page `url` is not overwritten.
for article_url in urls:
    published, outlet = _scrape_article_details(article_url)
    date.append(published)
    source.append(outlet)

In [None]:
# Assemble the scraped columns into one table, one row per article
data = {
    'Title': titles,
    'Summary': summary,
    'URL': urls,
    'Source': source,
    'Date': date,
}
df = pd.DataFrame.from_dict(data)  # default orient: dict keys become column names

In [None]:
# Display the scraped articles table as the cell output
df

In [None]:
# Save the scraped articles to a CSV file without the row index.
# The same path is written again after the Category column is added, so this
# first copy is overwritten later in the notebook.
df.to_csv('news_articles.csv', index=False)

## Part 2: Content Categorization

### Objective:
Use NLP to categorize articles into topics (e.g., politics, technology, sports).


In [None]:
# Load the pre-trained spaCy pipeline once; preprocess_text() reuses this
# module-level `nlp` object for every article.
# NOTE(review): presumably requires the 'en_core_web_sm' model to be installed
# separately — confirm in the environment setup.
nlp = spacy.load('en_core_web_sm')

In [None]:
def preprocess_text(text):
    """Normalize *text* for keyword matching.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and whitespace-collapsed, and the text is then run
    through the module-level spaCy pipeline ``nlp``. Stop words are dropped
    and the remaining tokens are replaced by their lemmas.

    Args:
        text: Raw article text (a string).

    Returns:
        A single space-separated string of lemmas.
    """
    # Inside a character class only a leading '^' negates; the original
    # pattern '[^a-z^A-Z]' therefore kept literal '^' characters in the text.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    filtered_text = ' '.join(letters_only.split())
    doc = nlp(filtered_text)
    return " ".join(token.lemma_ for token in doc if not token.is_stop)

In [None]:
# Dictionary mapping each category label to the keywords that trigger it.
# categorize_article() tests every keyword with a substring check against the
# output of preprocess_text(), and an article receives every category that has
# at least one hit.
# NOTE(review): preprocess_text() replaces non-letter characters with spaces,
# so the hyphenated keyword "in-depth" cannot occur in the processed text as
# written. Keywords are also compared un-lemmatized, so inflected entries such
# as "died" or "sports" presumably miss lemmatized text — confirm.
# "government" and "policy" are listed under both "Political journalism" and
# "Government", so a hit on either word assigns both categories.
category_keywords = {
    "World news": ["world", "international", "global", "abroad"],
    "Editorial": ["opinion", "editorial", "column", "commentary"],
    "Obituaries": ["obituary", "death", "died", "passed away"],
    "Business": ["business", "economy", "finance", "market", "stock"],
    "Lifestyle journalism": ["lifestyle", "fashion", "travel", "leisure"],
    "Weather": ["weather", "forecast", "temperature", "climate"],
    "Business journalism": ["corporate", "startup", "trade", "merger"],
    "Science journalism": ["science", "research", "technology", "innovation"],
    "Crime news": ["crime", "murder", "theft", "assault", "police"],
    "Political journalism": ["politics", "government", "election", "policy"],
    "Government": ["government", "policy", "administration", "regulation"],
    "Local news": ["local", "community", "neighborhood", "city", "town"],
    "Press release": ["press release", "announcement", "statement"],
    "Feature": ["feature", "profile", "in-depth", "special report"],
    "Health education": ["health", "wellness", "medicine", "fitness"],
    "Sports": ["sports", "game", "match", "tournament", "athlete"],
    "Letters to the editor": ["letter", "reader", "response", "feedback"]
}

In [None]:
def categorize_article(text):
    """Return the categories whose keywords occur in *text*.

    The text is normalized with ``preprocess_text`` and every entry of
    ``category_keywords`` is checked against the result. All categories with
    at least one keyword hit are reported, in dictionary order.

    Args:
        text: Article text to classify. Empty or non-string values (for
            example None or a float NaN standing in for a missing summary)
            are treated as having no content.

    Returns:
        A comma-separated string of matching category names, or
        ``"Uncategorized"`` when there is no text or no keyword matches.
    """
    # A non-string missing value such as NaN is truthy, so a plain
    # `if not text` would let it through and crash inside re.sub().
    if not isinstance(text, str) or not text:
        return "Uncategorized"

    processed_text = preprocess_text(text)  # Preprocess the article summary

    # Check which categories have at least one keyword in the processed text.
    # NOTE(review): `keyword in processed_text` is a substring test, so e.g.
    # "city" also hits "capacity" — consider whole-word matching if these
    # false positives matter.
    matched_categories = [
        category
        for category, keywords in category_keywords.items()
        if any(keyword in processed_text for keyword in keywords)
    ]

    # Return the matched categories or "Uncategorized" if no matches
    return ", ".join(matched_categories) if matched_categories else "Uncategorized"

In [None]:
# Classify each article from its Summary text and store the result in a new
# 'Category' column (a comma-separated list of labels, or "Uncategorized").
df['Category'] = df['Summary'].apply(categorize_article)

In [None]:
# Display the table again, now including the Category column
df

In [None]:
# Save the categorized articles to CSV without the row index. This overwrites
# the 'news_articles.csv' written earlier, before the Category column existed.
df.to_csv('news_articles.csv', index=False)