# Web Scraping: The Guardian

Builds a dataset of the headlines listed in the financial section of The Guardian, ranging from 2007 to the time of writing (June 2020).

The dataset contains, for each article: headline, publication date, publication time, and link to the article.

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

In [127]:
def get_news_theguardian(page):
    """Scrape one cover page of The Guardian's financial section.

    Downloads the listing page for ``page``, then downloads every linked
    article to read its ``article:published_time`` meta tag.

    Parameters
    ----------
    page : int or str
        Page number appended to the listing URL as ``?page=<page>``.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``'Article Title'``,
        ``'Article Date'``, ``'Article Time'`` and ``'Article Link'``.
        The frame is empty (but has these columns) when the page lists
        no usable headlines.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or exceeds the timeout.
    """
    columns = ['Article Title', 'Article Date', 'Article Time', 'Article Link']
    # Without a timeout a single stalled connection would block forever.
    request_timeout = 30

    # url definition
    url = "https://www.theguardian.com/theguardian/mainsection/financial3"
    url = url + "?page=" + str(page)

    # Request the cover page and parse it
    response = requests.get(url, timeout=request_timeout)
    soup = BeautifulSoup(response.content, 'html5lib')

    # News identification: every headline is an <h3 class="fc-item__title">
    headlines = soup.find_all('h3', class_='fc-item__title')

    # Each article is stored as one tuple so the four columns can never
    # get out of step with each other.
    rows = []
    for headline in headlines:
        anchor = headline.find('a')
        if anchor is None or not anchor.get('href'):
            # A headline without a link cannot be followed; skip it.
            continue

        link = anchor['href']

        # We need to ignore "live" pages since they are not articles
        if "live" in link:
            continue

        title = anchor.get_text()

        # Fetch the article itself to read its publication timestamp
        article = requests.get(link, timeout=request_timeout)
        soup_article = BeautifulSoup(article.content, 'html5lib')
        meta = soup_article.find('meta', property="article:published_time")
        published = meta.get('content') if meta is not None else None

        if published:
            # Presumably an ISO 8601 timestamp ("YYYY-MM-DDTHH:MM:SS...");
            # the first 10 characters are taken as the date and
            # characters 11-18 as the time.
            article_date = published[:10]
            article_time = published[11:19]
        else:
            # Keep the headline even when the article has no timestamp.
            article_date = None
            article_time = None

        rows.append((title, article_date, article_time, link))

    # Build the frame once, after the loop, so that an empty page still
    # returns a (column-complete) DataFrame instead of failing.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Scrape every cover page and combine the results into one dataframe.
# DataFrame.append returns a new frame rather than modifying in place (and
# was removed in pandas 2.0), so the per-page frames are collected in a
# list and concatenated once at the end.

# theguardian has 1900 pages
totalpages = 1900

# NOTE(review): pages are assumed to be numbered 1..totalpages, so the loop
# starts at 1 and includes the last page - confirm against the site.
page_frames = [get_news_theguardian(page) for page in range(1, totalpages + 1)]

# ignore_index gives the combined frame a single running row index.
final_df = pd.concat(page_frames, ignore_index=True)

In [None]:
# Preview the first rows of the combined dataframe
final_df.head()