# Yahoo! News Scraper

In [None]:
import csv
from time import sleep
from datetime import datetime
from random import random
import requests
from bs4 import BeautifulSoup

In [None]:
# Browser-like request headers sent with every search request.
# NOTE: 'br' (Brotli) is deliberately not advertised in accept-encoding:
# requests only decodes Brotli when an optional brotli package is installed,
# otherwise response.text would contain undecoded compressed bytes.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

In [None]:
# Search url pattern; '{}' is the placeholder for the search term.
template = 'https://news.search.yahoo.com/search?p={}'

In [None]:
def get_url(search):
    """Generate a Yahoo! News search url for the given search term.

    The term is url-encoded so that spaces and reserved characters such as
    '&', '#' or '=' stay part of the query instead of altering the url.

    Args:
        search: the search term; non-string values are converted with str().

    Returns:
        The full search url as a string.
    """
    # Imported locally so the function keeps working when copied on its own.
    from urllib.parse import quote_plus

    template = 'https://news.search.yahoo.com/search?p={}'
    return template.format(quote_plus(str(search)))

In [None]:
# Build the search url for the example term 'iphone'.
url = get_url('iphone')

In [None]:
# Fetch the search results page; the timeout keeps the cell from hanging
# indefinitely if the server never responds.
response = requests.get(url, headers=headers, timeout=10)

In [None]:
# Parse the returned html using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

## Get the collection of article cards

In [None]:
# Collect every <div> with the CSS class 'NewsArticle'; each one is treated
# as a single article card below.
cards = soup.find_all('div', 'NewsArticle')

In [None]:
# Number of article cards found on the page.
len(cards)

## Create a prototype model for a single record

In [None]:
# Use the first card as the sample record for prototyping the extraction.
card = cards[0]

In [None]:
# Headline: whitespace-stripped text of the card's <h4 class="s-title">.
headline = card.find('h4', 's-title').text.strip()

In [None]:
# Article link: href of the anchor inside the headline element.
link = card.find('h4', 's-title').a.get('href')

In [None]:
# Source: text of the card's <span class="s-source"> (not stripped).
source = card.find('span', 's-source').text

In [None]:
# Posted time: text of <span class="s-time"> with the '·' separator and
# surrounding whitespace removed.
date_posted = card.find('span', 's-time').text.replace('·', '').strip()

In [None]:
# Today's date (local time) formatted as YYYY-MM-DD.
date_requested = datetime.today().strftime('%Y-%m-%d')

In [None]:
# Description: whitespace-stripped text of the card's <p class="s-desc">.
description = card.find('p', 's-desc').text.strip()

## Generalize the model in a function

In [None]:
def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: parsed html element for one search result; it must offer a
            BeautifulSoup-style ``find(name, css_class)`` method.

    Returns:
        A ``(headline, source, date_posted, description, link)`` tuple.

    Raises:
        AttributeError: if one of the expected elements is missing from
            the card.
    """
    # Look the headline element up once; it provides both title and link.
    title_tag = card.find('h4', 's-title')
    headline = title_tag.text.strip()
    source = card.find('span', 's-source').text
    # The time text carries a '·' separator that is not part of the value.
    date_posted = card.find('span', 's-time').text.replace('·', '').strip()
    description = card.find('p', 's-desc').text.strip()
    link = title_tag.a.get('href')

    return (headline, source, date_posted, description, link)

In [None]:
# Accumulates one extracted article tuple per card on the current page.
articles = []

for card in cards:
    # Extract the fields from this card and keep the resulting record.
    article = get_article(card)
    articles.append(article)

## Get the next page

In [None]:
# The href of the anchor with CSS class 'next' is the url of the following
# results page.
url = soup.find('a', 'next').get('href')

## Bringing it all together

In [None]:
import csv
from time import sleep
from random import random
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with every search request.
# NOTE: 'br' (Brotli) is deliberately not advertised in accept-encoding:
# requests only decodes Brotli when an optional brotli package is installed,
# otherwise response.text would contain undecoded compressed bytes.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

def get_url(search):
    """Generate a Yahoo! News search url for the given search term.

    The term is url-encoded so that spaces and reserved characters such as
    '&', '#' or '=' stay part of the query instead of altering the url.

    Args:
        search: the search term; non-string values are converted with str().

    Returns:
        The full search url as a string.
    """
    # Imported locally so the function keeps working when copied on its own.
    from urllib.parse import quote_plus

    template = 'https://news.search.yahoo.com/search?p={}'
    return template.format(quote_plus(str(search)))

def _tag_text(card, name, css_class):
    """Return the text of the first matching tag in card, or '' if absent."""
    tag = card.find(name, css_class)
    return tag.text if tag is not None else ''


def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: parsed html element for one search result; it must offer a
            BeautifulSoup-style ``find(name, css_class)`` method.

    Returns:
        A ``(headline, source, date_posted, description, link)`` tuple, or
        ``None`` when the card has no headline element. Any other missing
        element yields an empty string (``None`` for a missing link) so a
        single incomplete card does not abort the whole scrape.
    """
    # A card without a headline is not an article; let the caller skip it.
    title_tag = card.find('h4', 's-title')
    if title_tag is None:
        return None
    headline = title_tag.text.strip()

    source = _tag_text(card, 'span', 's-source')
    # The time text carries a '·' separator that is not part of the value.
    date_posted = _tag_text(card, 'span', 's-time').replace('·', '').strip()
    description = _tag_text(card, 'p', 's-desc').strip()

    # The link lives on the anchor nested inside the headline element.
    anchor = title_tag.a
    link = anchor.get('href') if anchor is not None else None

    return (headline, source, date_posted, description, link)

def main(search, output_path='results.csv', timeout=10):
    """Run the main program routine.

    Fetches the search results for ``search`` page by page, extracts every
    article card and writes all collected articles to a csv file.

    Args:
        search: term to search Yahoo! News for.
        output_path: csv file to write; defaults to 'results.csv'.
        timeout: seconds to wait for each response before requests raises
            instead of hanging indefinitely.
    """
    url = get_url(search)
    articles = []
    visited = set()

    # Stop when there is no next page, or when the next link points at a
    # page that was already fetched (which would otherwise loop forever).
    while url and url not in visited:
        visited.add(url)

        # add random delay to prevent getting blocked from server
        sleep(random())
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # extract articles from page; get_article returns None for cards
        # without a headline, which are skipped
        for card in soup.find_all('div', 'NewsArticle'):
            article = get_article(card)
            if article:
                articles.append(article)

        # find the next page; a missing link or href ends the pagination
        next_link = soup.find('a', 'next')
        url = next_link.get('href') if next_link is not None else None

    # save article data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline', 'Source', 'DatePosted', 'Description', 'Link'])
        writer.writerows(articles)

In [None]:
# run main program: scrape the search results for the term 'iphone'
main('iphone')