# Yahoo! News Scraper

In [1]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests

#### Set up the URL template and request headers

In [2]:
template = 'https://news.search.yahoo.com/search?p={}'

In [3]:
url = template.format('iphone 12 leaked')

In [4]:
# Browser-like request headers passed to requests.get() for the search page.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    # Adjacent string literals are concatenated into one user-agent value.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    ),
}

#### Get the collection of articles

In [5]:
# Without a timeout requests waits indefinitely on a stalled connection, and
# without raise_for_status() an HTTP error page would be parsed as results.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

In [6]:
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
cards = soup.find_all('div', 'NewsArticle')

In [8]:
len(cards)

10

#### Create a prototype model for extracting article data

In [9]:
card = cards[5]

In [10]:
headline = card.find('h4', 's-title').text

In [11]:
source = card.find("span", 's-source').text

In [12]:
posted = card.find('span', 's-time').text.replace('·', '').strip()

In [13]:
description = card.find('p', 's-desc').text.strip()

The URL needs a little more cleaning: the link on the card is a Yahoo! redirect, and the original article URL is embedded, percent-encoded, in its `RU=` segment.

In [14]:
raw_link = card.find('a').get('href')
raw_link

'https://r.search.yahoo.com/_ylt=AwrC0CP4OXVfWXEAegHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzYEdnRpZAMEc2VjA3Ny/RV=2/RE=1601546872/RO=10/RU=https%3a%2f%2fwww.ibtimes.com%2fapple-rumors-iphone-12-may-not-come-earpods-box-report-says-3054323/RK=2/RS=LzK2Rj4LT8D9wvTORmd4NlpBF34-'

In [15]:
unquoted_link = requests.utils.unquote(raw_link)
unquoted_link

'https://r.search.yahoo.com/_ylt=AwrC0CP4OXVfWXEAegHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzYEdnRpZAMEc2VjA3Ny/RV=2/RE=1601546872/RO=10/RU=https://www.ibtimes.com/apple-rumors-iphone-12-may-not-come-earpods-box-report-says-3054323/RK=2/RS=LzK2Rj4LT8D9wvTORmd4NlpBF34-'

In [16]:
pattern = re.compile(r'RU=(.+)\/RK')
clean_link = re.search(pattern, unquoted_link).group(1)
clean_link

'https://www.ibtimes.com/apple-rumors-iphone-12-may-not-come-earpods-box-report-says-3054323'

#### Generalize the model

In [17]:
def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: The parsed element for one search result (a ``div`` with class
            ``NewsArticle``); only its ``find`` method is used.

    Returns:
        tuple: ``(headline, source, posted, description, clean_link)``.
        A text field whose element is missing from the card is returned as
        ``''`` instead of raising ``AttributeError``. ``clean_link`` is the
        text between ``RU=`` and the last ``/RK`` of the unquoted href; when
        the href has no such segment it is returned unchanged, and when the
        card has no link at all ``clean_link`` is ``''``.
    """
    # Standard-library unquote, so the block does not depend on the
    # re-export in requests.utils.
    from urllib.parse import unquote

    def text_of(name, css_class):
        # find() yields None for an absent element; fall back to ''.
        node = card.find(name, css_class)
        return node.text if node is not None else ''

    headline = text_of('h4', 's-title')
    source = text_of("span", 's-source')
    # The timestamp carries a leading middle dot separator; drop it.
    posted = text_of('span', 's-time').replace('·', '').strip()
    description = text_of('p', 's-desc').strip()

    anchor = card.find('a')
    raw_link = anchor.get('href') if anchor is not None else None
    raw_link = raw_link or ''

    # Greedy match: take everything up to the LAST "/RK" so a target URL that
    # itself contains "/RK" is not truncated.
    match = re.search(r'RU=(.+)\/RK', unquote(raw_link))
    clean_link = match.group(1) if match else raw_link

    article = (headline, source, posted, description, clean_link)
    return article

In [18]:
# Collect each article once; the cleaned link (last tuple item) is the key.
articles = []
links = set()

for card in cards:
    article = get_article(card)
    link = article[-1]
    if link in links:
        continue  # already collected
    links.add(link)
    articles.append(article)

In [19]:
articles[:5]

[('iPhone 12 and iPhone 12 Pro: Everything we know about Apple’s new phones',
  'Digital Trends via Yahoo Finance',
  '11 hours ago',
  'This year, it’s going to be the latter — considering Apple’s September 15 hardware event has come...',
  'https://finance.yahoo.com/news/iphone-12-coming-fall-everything-185615475.html'),
 ('iPhone 12 Mini, iPhone 12 And iPhone 12 Pro Storage Configurations Leaked',
  'International Business Times',
  '14 hours ago',
  "Rumors about Apple's upcoming iPhone 12 series handsets abound as the Cupertino tech giant has yet...",
  'https://www.ibtimes.com/iphone-12-mini-iphone-12-iphone-12-pro-storage-configurations-leaked-3054361'),
 ('The first iPhone 12 models to launch next month just leaked',
  'BGR',
  '2 days ago',
  'The first iPhone 12 versions to launch in stores in October might be the cheapest models, according...',
  'https://bgr.com/2020/09/29/iphone-12-release-date-models-apple-event-october-13th-rumor/'),
 ('Apple Accident Confirms New iPhone

#### Get the next page

In [20]:
# find() returns None when the page has no "next" anchor (the last page),
# so guard before reading the href instead of raising AttributeError.
next_link = soup.find('a', 'next')
url = next_link.get('href') if next_link is not None else None
url

'https://news.search.yahoo.com/search;_ylt=AwrC0CP4OXVfWXEAhgHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzEEdnRpZAMEc2VjA3BhZ2luYXRpb24-?p=iphone+12+leaked&b=11&pz=10&bct=0&xargs=0'

#### Bringing it all together

In [21]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests

# Browser-like request headers passed to requests.get() by get_the_news().
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    # Adjacent string literals are concatenated into one user-agent value.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    ),
}

def get_article(card):
    """Extract article information from the raw html.

    Args:
        card: The parsed element for one search result (a ``div`` with class
            ``NewsArticle``); only its ``find`` method is used.

    Returns:
        tuple: ``(headline, source, posted, description, clean_link)``.
        A text field whose element is missing from the card is returned as
        ``''`` instead of raising ``AttributeError``. ``clean_link`` is the
        text between ``RU=`` and the last ``/RK`` of the unquoted href; when
        the href has no such segment it is returned unchanged, and when the
        card has no link at all ``clean_link`` is ``''``.
    """
    # Standard-library unquote, so the block does not depend on the
    # re-export in requests.utils.
    from urllib.parse import unquote

    def text_of(name, css_class):
        # find() yields None for an absent element; fall back to ''.
        node = card.find(name, css_class)
        return node.text if node is not None else ''

    headline = text_of('h4', 's-title')
    source = text_of("span", 's-source')
    # The timestamp carries a leading middle dot separator; drop it.
    posted = text_of('span', 's-time').replace('·', '').strip()
    description = text_of('p', 's-desc').strip()

    anchor = card.find('a')
    raw_link = anchor.get('href') if anchor is not None else None
    raw_link = raw_link or ''

    # Greedy match: take everything up to the LAST "/RK" so a target URL that
    # itself contains "/RK" is not truncated.
    match = re.search(r'RU=(.+)\/RK', unquote(raw_link))
    clean_link = match.group(1) if match else raw_link

    article = (headline, source, posted, description, clean_link)
    return article

def get_the_news(search, output_path='results.csv', max_pages=None):
    """Run the main program: scrape Yahoo! News search results into a CSV.

    Requests the search page for ``search``, extracts every ``NewsArticle``
    card with ``get_article``, follows the "next" link until there is none,
    and writes the de-duplicated articles to ``output_path``.

    Args:
        search (str): The search phrase; it is URL-encoded before use.
        output_path (str): CSV file to write. Defaults to ``'results.csv'``.
        max_pages (int | None): Stop after this many result pages; ``None``
            (the default) follows "next" links until they run out.

    Returns:
        list: The article tuples ``(headline, source, posted, description,
        link)`` in the order they were found, one per distinct link.
    """
    from urllib.parse import quote_plus

    template = 'https://news.search.yahoo.com/search?p={}'
    # Encode the term so characters such as '&', '#' or '+' stay inside the
    # ``p`` parameter instead of altering the query string.
    url = template.format(quote_plus(search))
    articles = []
    links = set()
    visited = set()  # page URLs already fetched, to stop on a repeating "next" link
    pages = 0

    while url and url not in visited:
        if max_pages is not None and pages >= max_pages:
            break
        visited.add(url)

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as error:
            # Stop paginating but still save what was collected so far.
            print('Stopping at {}: {}'.format(url, error))
            break
        pages += 1

        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'NewsArticle')

        # extract articles from page
        for card in cards:
            try:
                article = get_article(card)
            except AttributeError:
                # The card lacks an element the extractor needs; skip it
                # rather than abort the whole run.
                continue
            link = article[-1]
            if link and link not in links:
                links.add(link)
                articles.append(article)

        # find the next page
        next_link = soup.find('a', 'next')
        if next_link is None:
            break
        url = next_link.get('href')
        sleep(1)  # pause between page requests

    # save article data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline', 'Source', 'Posted', 'Description', 'Link'])
        writer.writerows(articles)

    return articles

In [22]:
articles = get_the_news('iphone 12 leaked')

In [23]:
articles[:5]

[('iPhone 12 and iPhone 12 Pro: Everything we know about Apple’s new phones',
  'Digital Trends via Yahoo Finance',
  '11 hours ago',
  'This year, it’s going to be the latter — considering Apple’s September 15 hardware event has come...',
  'https://finance.yahoo.com/news/iphone-12-coming-fall-everything-185615475.html'),
 ('iPhone 12 Mini, iPhone 12 And iPhone 12 Pro Storage Configurations Leaked',
  'International Business Times',
  '14 hours ago',
  "Rumors about Apple's upcoming iPhone 12 series handsets abound as the Cupertino tech giant has yet...",
  'https://www.ibtimes.com/iphone-12-mini-iphone-12-iphone-12-pro-storage-configurations-leaked-3054361'),
 ('The first iPhone 12 models to launch next month just leaked',
  'BGR',
  '2 days ago',
  'The first iPhone 12 versions to launch in stores in October might be the cheapest models, according...',
  'https://bgr.com/2020/09/29/iphone-12-release-date-models-apple-event-october-13th-rumor/'),
 ('Apple Accident Confirms New iPhone