Using Jupyter and a cloud scheduler to collect AI news from news sites via the News API. The collected articles are then emailed to me weekly as a CSV file.

In [17]:
# Import the required libraries

from datetime import datetime, timedelta
import requests
import pandas as pd
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
import os

In [8]:
#configuration
# Secrets are read from environment variables so they are never stored in the
# notebook or its version history. Set NEWSAPI_KEY, EMAIL_SENDER and
# EMAIL_PASSWORD (and optionally EMAIL_RECEIVER) in the scheduler/kernel
# environment before running. Any credentials that were previously hardcoded
# here should be treated as leaked and rotated.
API_KEY = os.environ.get("NEWSAPI_KEY", "")
EMAIL_SENDER = os.environ.get("EMAIL_SENDER", "")
EMAIL_PASSWORD = os.environ.get("EMAIL_PASSWORD", "")
# Defaults to the sender, i.e. the report is mailed to yourself.
EMAIL_RECEIVER = os.environ.get("EMAIL_RECEIVER", EMAIL_SENDER)
# File that accumulates every article collected so far (also used for dedup).
CSV_FILE = "ai_data_news.csv"

In [9]:
#Fetch news from news API
def fetch_news():
    """Fetch English-language articles from the last 7 days from NewsAPI.

    Queries the ``/v2/everything`` endpoint, sorted by publication time.

    Returns:
        list: The ``articles`` list from the JSON response. An empty list is
        returned when the request fails, the body is not valid JSON, or the
        response carries no articles.
    """
    url = "https://newsapi.org/v2/everything"
    from_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
    # NOTE(review): the multi-word terms below are unquoted; NewsAPI documents
    # double quotes for exact-phrase matching -- confirm this query matches
    # what is intended.
    params = {
        "q": "artificial intelligence OR data science OR data analytics OR data jobs OR machine learning OR LLM OR large language models",
        "from": from_date,
        "sortBy": "publishedAt",
        "language": "en",
    }
    # The key travels in a header rather than the query string: request
    # exceptions embed the full URL in their message, and that message is
    # printed below (and saved with the notebook output).
    headers = {"X-Api-Key": API_KEY}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()  # Raise exception for bad status codes
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions where the
        # decode error is not a RequestException subclass.
        print(f"Error fetching news: {e}")
        return []

    if not isinstance(data, dict):
        return []
    # "articles" may be absent or null; always hand callers a list.
    return data.get("articles") or []

In [10]:
#Load existing links for deduplication
def load_existing_links():
    """Return the set of article links already stored in CSV_FILE.

    An empty set is returned when the file does not exist or when reading it
    fails for any reason (the error is printed, not raised).
    """
    if not os.path.exists(CSV_FILE):
        return set()

    try:
        stored = pd.read_csv(CSV_FILE)
        known_links = stored["Link"].dropna().tolist()
        return set(known_links)
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return set()

In [14]:
#Saving the news article to the csv file
def save_to_csv(articles, existing_links):
    """Append articles that are not yet stored to CSV_FILE.

    Args:
        articles: Iterable of article dicts as returned by the news API
            (keys read: ``url``, ``title``, ``publishedAt``, ``source``,
            ``description``).
        existing_links: Collection of links already present in the CSV; an
            article whose ``url`` is in it is skipped. It is not modified.

    Returns:
        int: Number of rows appended (0 when nothing new was found).

    Fields that are missing, null or empty are stored as ``"N/A"``. A URL
    that occurs more than once in ``articles`` is written only once.
    """
    if not articles:
        return 0

    scraped_date = datetime.now().strftime('%Y-%m-%d')
    # Work on a copy so links seen in this batch are deduplicated without
    # mutating the caller's collection.
    seen_links = set(existing_links or ())
    rows = []

    for article in articles:
        link = article.get("url") or "N/A"
        if link in seen_links:
            continue
        # The "N/A" placeholder is not a real identity, so articles without
        # a URL are never deduplicated against each other.
        if link != "N/A":
            seen_links.add(link)

        # `.get(key, default)` only covers a missing key; the API may also
        # send an explicit null, so fall back with `or` as well.
        published = article.get("publishedAt") or "N/A"
        source = article.get("source") or {}
        description = article.get("description") or "N/A"

        rows.append({
            "Scraped Date": scraped_date,
            "Title": article.get("title") or "N/A",
            "Link": link,
            "Pub Date": str(published).split("T")[0],
            "Source": source.get("name") or "N/A",
            "Description": description[:500]  # Truncate for brevity
        })

    if rows:
        # Write the header for a new file and also for an existing but empty
        # one, otherwise the file could never be read back by column name.
        write_header = not os.path.exists(CSV_FILE) or os.path.getsize(CSV_FILE) == 0
        pd.DataFrame(rows).to_csv(
            CSV_FILE, mode='a', index=False, header=write_header, encoding='utf-8'
        )
    return len(rows)

In [18]:
#Send CSV via email
def send_email(num_articles):
    """Email CSV_FILE as an attachment from EMAIL_SENDER to EMAIL_RECEIVER.

    Args:
        num_articles: Number of newly saved articles; used only in the
            message body and the confirmation printout.

    Returns:
        None. Prints a notice and returns early when CSV_FILE does not exist.
        Failures while sending are printed rather than raised.
    """
    if not os.path.exists(CSV_FILE):
        print("No CSV file to send.")
        return

    msg = MIMEMultipart()
    msg['From'] = EMAIL_SENDER
    msg['To'] = EMAIL_RECEIVER
    msg['Subject'] = "Weekly AI/Data News Summary CSV"
    body = f"Attached is the CSV with {num_articles} new articles on AI, data science, analytics, and jobs from the past week."
    msg.attach(MIMEText(body, 'plain'))

    with open(CSV_FILE, 'rb') as f:
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(f.read())
    encoders.encode_base64(part)
    # Passing filename as a header parameter lets the email package quote and
    # encode it correctly instead of interpolating it raw into the header.
    part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(CSV_FILE))
    msg.attach(part)

    try:
        # The context manager closes the connection even when starttls/login/
        # sendmail raises; the timeout stops a scheduled run hanging forever.
        with smtplib.SMTP('smtp.gmail.com', 587, timeout=30) as server:  # Adjust for your email provider
            server.starttls()
            server.login(EMAIL_SENDER, EMAIL_PASSWORD)
            server.sendmail(EMAIL_SENDER, EMAIL_RECEIVER, msg.as_string())
        print(f"Email sent with {num_articles} articles.")
    except Exception as e:
        # Deliberately best-effort: report the failure without raising.
        print(f"Error sending email: {e}")

In [19]:
def main():
    """Run one collection cycle: fetch, deduplicate and store, then email.

    Stops with a printed notice when nothing was fetched, or when none of the
    fetched articles were new.
    """
    fetched = fetch_news()
    if not fetched:
        print("No new AI/data news found this week.")
        return

    saved_count = save_to_csv(fetched, load_existing_links())
    if saved_count <= 0:
        print("No new articles to save or send.")
        return

    send_email(saved_count)

# Run the pipeline only when this code is executed directly (as a notebook
# cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()

No new articles to save or send.
