In [1]:
import time
import requests
from urllib.parse import urlparse
import sys
import json
import lxml.html
import csv
from utils import make_request, parse_html, make_link_absolute, page_grab

In [53]:
def get_urls(url, articles=None, videos=None):
    """
    Collect the article and video links listed on one topic page.

    Parameters:
        * url:       a URL to a page of articles
        * articles:  optional set that article URLs are added to; a new
                     set is created when omitted
        * videos:    optional set that video URLs are added to; a new
                     set is created when omitted

    Returns:
        A tuple (articles, videos) of sets of absolute URLs. Sets passed
        in by the caller are updated in place and returned.
    """
    # Create fresh sets per call. A mutable default (``articles=set()``) is
    # built once at definition time and would keep accumulating results from
    # earlier, unrelated calls.
    if articles is None:
        articles = set()
    if videos is None:
        videos = set()
    buckets = {"article": articles, "video": videos}

    # NOTE(review): page_grab is assumed to return a parsed lxml element
    # (it is used via .cssselect) -- confirm against utils.
    page = page_grab(url)

    for promo in page.cssselect("div.ssrcss-53phst-Promo.ett16tt0"):
        # The promo's "type" attribute tells article/video apart.
        content_type = promo.get("type")
        bucket = buckets.get(content_type)
        if bucket is None:
            print("other", content_type)
            continue

        # Find the link; skip promos that carry none instead of raising
        # IndexError on the [0] lookups.
        stacks = promo.cssselect("div.ssrcss-1f3bvyz-Stack.e1y4nx260")
        anchors = stacks[0].cssselect("a") if stacks else []
        href = anchors[0].get("href") if anchors else None
        if not href:
            continue

        bucket.add(make_link_absolute(href, "https://www.bbc.com"))

    return articles, videos


get_urls("https://www.bbc.com/news/topics/cwnpxwzd269t?page=41")


def recurse_bbc(url, articles=None, videos=None, max_page=42):
    '''
    Takes an initial url and runs get_urls on that page and on every
    following page up to max_page, gathering all articles and videos
    into two sets.

    Parameters:
        * url:       a topic URL ending in "page=<number>"
        * articles:  optional iterable of article URLs to start from
                     (not modified)
        * videos:    optional iterable of video URLs to start from
                     (not modified)
        * max_page:  last page number to fetch (inclusive)

    Returns:
        A tuple (articles, videos) of sets of URLs.

    Raises:
        ValueError: if url does not end in "page=<number>".
    '''
    # Work on copies so the caller's collections are left untouched.
    collected_articles = set(articles) if articles is not None else set()
    collected_videos = set(videos) if videos is not None else set()

    marker = "page="
    base, found, tail = url.rpartition(marker)
    try:
        if not found:
            raise ValueError(tail)
        first_page = int(tail)
    except ValueError:
        raise ValueError(
            "url must end with 'page=<number>', got: " + repr(url)
        )

    # The starting page is always fetched, even if it is beyond max_page.
    get_urls(url, collected_articles, collected_videos)

    # Walk the remaining pages iteratively. Every page adds to the same two
    # sets, so nothing is lost between pages.
    for page in range(first_page + 1, max_page + 1):
        get_urls(base + marker + str(page), collected_articles, collected_videos)

    return collected_articles, collected_videos

Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=41
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=41
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=42


({'https://www.bbc.com/news/live/world-us-canada-50399361',
  'https://www.bbc.com/news/world-us-canada-34006916',
  'https://www.bbc.com/news/world-us-canada-50203276',
  'https://www.bbc.com/news/world-us-canada-50214895',
  'https://www.bbc.com/news/world-us-canada-50272171',
  'https://www.bbc.com/news/world-us-canada-50305467',
  'https://www.bbc.com/news/world-us-canada-50323190',
  'https://www.bbc.com/news/world-us-canada-50323607',
  'https://www.bbc.com/news/world-us-canada-50336197',
  'https://www.bbc.com/news/world-us-canada-50338937',
  'https://www.bbc.com/news/world-us-canada-50352801',
  'https://www.bbc.com/news/world-us-canada-50355758',
  'https://www.bbc.com/news/world-us-canada-50395008',
  'https://www.bbc.com/news/world-us-canada-50399230',
  'https://www.bbc.com/news/world-us-canada-50408246',
  'https://www.bbc.com/news/world-us-canada-50417641',
  'https://www.bbc.com/news/world-us-canada-50423763'},
 {'https://www.bbc.com/news/magazine-50352589',
  'https://

In [55]:
# save as csv
def create_csv(articles, videos, filename):
    '''
    Writes article and video urls side by side into a csv.

    Parameters:
        * articles:  collection of article urls
        * videos:    collection of video urls
        * filename:  path of the csv file to create (overwritten if present)

    The file gets a header row ("Articles", "Video") followed by one row
    per index; the shorter column is padded with empty strings. Sets are
    written in sorted order so the output is the same on every run; other
    iterables keep their own order.
    '''
    def as_column(urls):
        # Set iteration order is arbitrary, so sort sets for a
        # reproducible file.
        if isinstance(urls, (set, frozenset)):
            return sorted(urls)
        return list(urls)

    # Materialise each column once rather than rebuilding a list for every row.
    article_col = as_column(articles)
    video_col = as_column(videos)
    row_count = max(len(article_col), len(video_col))

    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Articles", "Video"])
        writer.writerows(
            [
                article_col[i] if i < len(article_col) else "",
                video_col[i] if i < len(video_col) else "",
            ]
            for i in range(row_count)
        )

In [56]:
# Collect article and video URLs starting from page 1 of the topic listing,
# then write both collections to bbc_articles.csv.
articles, videos = recurse_bbc("https://www.bbc.com/news/topics/cwnpxwzd269t?page=1")
create_csv(articles, videos, "bbc_articles.csv")

Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=1
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=2
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=3
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=4
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=5
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=6
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=7
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=8
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=9
other audio
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=10
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=11
other audio
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=12
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=13
other audio
Fetching https://www.bbc.com/news/topics/cwnpxwzd269t?page=14
other audio
Fetching https://www.bbc.com/news