In [95]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [96]:
def the_guardian_crawler(url, data, class_name, timeout=10):
  """
  Collect headlines and their section tags from one Guardian page.

  The page is loaded with the module-level ``driver`` (a Selenium WebDriver
  that must exist before this function is called). Every element carrying
  ``class_name`` is inspected: its ``aria-label`` is taken as the headline
  and the first path segment of its ``href`` as the tag.

  Note: ``class_name`` is generated by the site's CSS build and is subject
  to change, so it must be re-checked whenever the crawl returns nothing.

  Args:
    url: Address of the page to crawl.
    data: List that the ``{"headline": ..., "tag": ...}`` records are
      appended to. It is mutated in place.
    class_name: CSS class of the headline link elements.
    timeout: Seconds to wait for the first headline link to appear.

  Returns:
    The same ``data`` list, extended with the records found on this page.

  Raises:
    Whatever ``WebDriverWait.until`` raises when no element with
    ``class_name`` shows up within ``timeout`` seconds.
  """
  # Local import keeps this cell self-contained; urllib is standard library.
  from urllib.parse import urlparse

  driver.get(url)

  # Wait until the headline links have loaded
  WebDriverWait(driver, timeout).until(
      EC.presence_of_element_located((By.CLASS_NAME, class_name))
  )

  for element in driver.find_elements(By.CLASS_NAME, class_name):
    headline = element.get_attribute("aria-label")
    href = element.get_attribute("href")

    # Links without a label or a target carry no usable headline.
    if not (headline and href):
      continue

    # Use only the URL path so query strings / fragments never leak into the
    # tag, and ignore empty segments so a bare domain maps to "unknown"
    # rather than an empty string.
    segments = [part for part in urlparse(href).path.split("/") if part]
    tag = segments[0] if segments else "unknown"

    data.append({
      "headline": headline,
      "tag": tag
    })

  return data

In [97]:
# --- Crawl configuration -----------------------------------------------------
BASE_URL = "https://www.theguardian.com"
PAGE_NUMBERS = range(1, 20)  # listing pages 1-19 of every paginated section

# Auto-generated CSS class of the headline links; changes when the site's
# styles are rebuilt, so update it here if the crawl stops finding links.
class_name = "dcr-2yd10d"

# Section front pages, each crawled once.
url_int = f"{BASE_URL}/international"
url_uk_lifestyle = f"{BASE_URL}/uk/lifeandstyle"
url_us_lifestyle = f"{BASE_URL}/us/lifeandstyle"
url_uk_sport = f"{BASE_URL}/uk/sport"
url_us_sport = f"{BASE_URL}/us/sport"
url_food = f"{BASE_URL}/food"
url_family = f"{BASE_URL}/lifeandstyle/family"
url_uk_tech = f"{BASE_URL}/uk/technology"
url_uk_environment = f"{BASE_URL}/uk/environment"
url_games = f"{BASE_URL}/games"
url_uk_film = f"{BASE_URL}/uk/film"
url_fashion = f"{BASE_URL}/fashion"
url_uk_travel = f"{BASE_URL}/uk/travel"
url_art = f"{BASE_URL}/artanddesign"

front_page_urls = [
    url_int, url_uk_lifestyle, url_us_lifestyle, url_uk_sport, url_us_sport,
    url_food, url_family, url_uk_tech, url_uk_environment, url_games,
    url_uk_film, url_fashion, url_uk_travel, url_art,
]


def _section_pages(path):
    """Return the listing URLs of `path` for every page in PAGE_NUMBERS."""
    return [f"{BASE_URL}/{path}?page={page}" for page in PAGE_NUMBERS]


# Paginated section listings, one URL per page number.
url_middle_east = _section_pages("world/middleeast")
url_america = _section_pages("world/americas")
url_africa = _section_pages("world/africa")
url_unequality = _section_pages("inequality")
url_cricket = _section_pages("sport/cricket")
url_f1 = _section_pages("sport/formulaone")
url_cycling = _section_pages("sport/cycling")
url_men = _section_pages("lifeandstyle/men")
url_women = _section_pages("lifeandstyle/women")
url_opinion_video = _section_pages("type/video+tone/comment")

paginated_urls = [
    url_middle_east, url_america, url_africa, url_unequality, url_cricket,
    url_f1, url_cycling, url_men, url_women, url_opinion_video,
]

# --- Crawl -------------------------------------------------------------------
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Bound before the crawl starts so that records collected before a failure
# stay available: the crawler appends to this very list in place.
DATA = []
try:
    for url in front_page_urls:
        DATA = the_guardian_crawler(url, DATA, class_name)

    # zip(*...) walks the sections page by page (page 1 of every section, then
    # page 2, ...), which is the order the later de-duplication depends on.
    for same_page_urls in zip(*paginated_urls):
        for url in same_page_urls:
            DATA = the_guardian_crawler(url, DATA, class_name)
finally:
    # Always shut the browser down, even when a page fails to load.
    driver.quit()


In [98]:
# Number of raw headline/tag records collected by the crawl (duplicates included).
len(DATA)

4382

In [117]:
# Naming the columns explicitly means an empty crawl still yields a frame with a
# "headline" column, so the de-duplication below cannot fail with a KeyError.
# drop_duplicates keeps the first occurrence of each headline, so a headline
# seen on several pages keeps the tag of the page crawled first.
df = (
    pd.DataFrame(DATA, columns=["headline", "tag"])
    .drop_duplicates(subset=["headline"])
)

In [118]:
# Row count of the frame at this point in the notebook.
len(df)

4143

In [119]:
# Distinct raw tag values present in the frame.
df['tag'].unique()

<StringArray>
[                                     'world',
                              'commentisfree',
                                      'sport',
                                      'books',
                               'lifeandstyle',
                                    'uk-news',
                                    'us-news',
                                       'film',
                         'global-development',
                                 'technology',
                                    'society',
                               'artanddesign',
                                   'football',
                                'environment',
                                   'politics',
                                      'money',
                                      'music',
                                      'games',
                                    'culture',
                               'tv-and-radio',
                                      'stage',

In [120]:
# Collapse the fine-grained Guardian tags into a handful of broad categories
# for classification and analysis. Each entry below pairs one category with
# the raw tags that belong to it; the comprehension flattens the pairs into a
# tag -> category lookup.

category_map = {
    raw_tag: category
    for category, raw_tags in [
        # Politics
        ("Politics", [
            "politics",
            "law",
            "commentisfree",
            "public-leaders-network",
        ]),
        # World (includes Environment + Other + Society)
        ("World", [
            "world",
            "us-news",
            "uk-news",
            "australia-news",
            "global-development",
            "global",
            "news",
            "society",
            "community",
            "education",
            "inequality",
            "cities",
            "theobserver",
            "environment",
            "info",
            "theguardian",
            "membership",
        ]),
        # Sports
        ("Sports", [
            "sport",
            "football",
            "guardian_sport",
        ]),
        # Business (includes Technology)
        ("Business", [
            "business",
            "money",
            "small-business-network",
            "technology",
            "science",
        ]),
        # Entertainment (includes Media)
        ("Entertainment", [
            "film",
            "music",
            "tv-and-radio",
            "stage",
            "games",
            "culture",
            "artanddesign",
            "books",
            "crosswords",
            "media",
        ]),
        # Lifestyle
        ("Lifestyle", [
            "lifeandstyle",
            "fashion",
            "food",
            "wellness",
            "travel",
        ]),
        # Promotional / Campaigns -> World
        ("World", [
            "priceless-experiences-with-mastercard",
            "thefilter",
            "thefilter-us",
            "a-golden-state-of-mind",
            "the-grid",
            "scotland-for-the-head-heart-and-the-spirit",
            "out-of-the-ordinary",
        ]),
    ]
    for raw_tag in raw_tags
}

In [121]:
# Translate every raw tag into its broad category in one pass; tags that are
# absent from category_map come back as NaN and are labelled "Other".

df['Topic'] = df['tag'].map(category_map).fillna("Other")

In [122]:
# Number of headlines per Topic value.
print(df["Topic"].value_counts())

Topic
World            1461
Sports           1150
Politics          676
Lifestyle         440
Entertainment     318
Business           98
Name: count, dtype: int64


In [123]:
# Write the frame to guardian_headlines.csv, leaving out the index column.
df.to_csv("guardian_headlines.csv", index=False)

In [124]:
# Row count of the frame that was written to guardian_headlines.csv.
len(df)

4143