# Yahoo! News Scraper
Scrape news articles from Yahoo! News that match a given search term.

In [1]:
import csv
from datetime import datetime
from random import random
from time import sleep
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup


# Request headers sent with every page request made by main().
# The values imitate a desktop browser (Chromium-based Edge user agent,
# Google as referer) -- presumably so the server answers like it would for
# ordinary browser traffic.
# NOTE(review): 'accept-encoding' advertises 'br'; decoding Brotli responses
# depends on a Brotli package being installed next to requests/urllib3 --
# confirm for the target environment.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}    

def get_url(search):
    """Generate a url based on the search term.

    Args:
        search: The search term as a user would type it, e.g. ``'iphone 12'``.

    Returns:
        The url of the first Yahoo! News result page for that term.
    """
    template = 'https://news.search.yahoo.com/search?p={}'
    # Percent-encode the term: spaces and reserved characters such as '&',
    # '#' or '+' would otherwise end or alter the query value.
    return template.format(quote_plus(str(search)))

def _tag_text(tag):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    return tag.text if tag is not None else ''


def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: Parsed html element of a single search result. Only its
            ``find(name, css_class)`` method is used.

    Returns:
        A tuple ``(headline, source, date_posted, today, description, link)``
        where ``today`` is the date of the request formatted ``YYYY-MM-DD``.
        Source, date and description are ``''`` when the card lacks them and
        ``link`` is ``None`` when the headline has no link. Returns ``None``
        when the card has no headline at all, so the caller can skip it.
    """
    title = card.find('h4', 's-title')
    if title is None:
        # A card without a headline is not a usable article; report that to
        # the caller instead of failing on the missing tag.
        return None

    headline = title.text.strip()
    source = _tag_text(card.find('span', 's-source'))
    # the posting time is prefixed with a '·' separator in the page markup
    date_posted = _tag_text(card.find('span', 's-time')).replace('·', '').strip()
    today = datetime.today().strftime('%Y-%m-%d')
    description = _tag_text(card.find('p', 's-desc')).strip()
    anchor = title.a
    link = anchor.get('href') if anchor is not None else None

    return (headline, source, date_posted, today, description, link)

def main(search, output_path='results.csv', timeout=10):
    """Run the main program routine.

    Request every result page for the search term, collect the articles
    found on them and save the articles to a csv file.

    Args:
        search: The search term, passed on to ``get_url``.
        output_path: Path of the csv file to write. Defaults to
            ``'results.csv'``.
        timeout: Seconds to wait for the server on each request, so a
            stalled connection cannot block the scraper forever.

    Returns:
        The list of articles as returned by ``get_article``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` when a page cannot be fetched (including a
            timeout).
    """
    url = get_url(search)
    articles = []
    visited = set()

    while True:
        visited.add(url)
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # extract articles from page
        for card in soup.find_all('div', 'NewsArticle'):
            article = get_article(card)
            if article:
                articles.append(article)

        # find the next page; no link (or a link without href) means last page
        next_link = soup.find('a', 'next')
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break

        # resolve relative links against the page they were found on
        url = urljoin(url, next_href)
        if url in visited:
            # the "next" link points back to a page already scraped;
            # following it would loop forever
            break

        # add random delay to prevent getting blocked from server
        sleep(random())

    # save article data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline', 'Source', 'DatePosted', 'RequestDate', 'Description', 'Link'])
        writer.writerows(articles)

    return articles

In [2]:
# run the main program: scrape the result pages for the term 'iphone'
# (main also saves the collected articles to a csv file)
articles = main('iphone')

In [4]:
# show the first 4 records; each record is the tuple built by get_article:
# (headline, source, date posted, request date, description, link)
articles[0:4]

[("Apple's iPhone 12 announcement may be coming soon, introducing big changes like a new design, 5G,...",
  'Business Insider via Yahoo News',
  '2 days ago',
  '2020-09-05',
  "Apple's new iPhones are expected to launch a few weeks later than usual this year because of the...",
  'https://r.search.yahoo.com/_ylt=AwrC1CnFLVRf0l0AOyDQtDMD;_ylu=X3oDMTByOHZyb21tBGNvbG8DYmYxBHBvcwMxBHZ0aWQDBHNlYwNzcg--/RV=2/RE=1599381061/RO=10/RU=https%3a%2f%2fnews.yahoo.com%2fnext-years-iphone-12-starting-143900059.html/RK=2/RS=p79ct1_M2lOlPrb0AwdZBW5s7To-'),
 ('The best Apple iPhone deals for September 2020',
  'Digital Trends via Yahoo Finance',
  '27 minutes ago',
  '2020-09-05',
  'More than 10 years after ushering in the smartphone era, the Apple iPhone is still the single most famous device on the market — even if Android makers...',
  'https://r.search.yahoo.com/_ylt=AwrC1CnFLVRf0l0APSDQtDMD;_ylu=X3oDMTBybGY3bmpvBGNvbG8DYmYxBHBvcwMyBHZ0aWQDBHNlYwNzcg--/RV=2/RE=1599381061/RO=10/RU=https%3a%2f%2ffina