In [2]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time
import random

# Headers attached to every outgoing HTTP request made in this notebook
headers = {
    # Browser-style agent string sent in place of the requests default
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # Ask the server to keep the connection open
    'Connection': 'keep-alive',
}

In [6]:
# Article whose page is fetched here and dissected in the following cells
link = 'https://www.rappler.com/philippines/harry-roque-released-after-detention-house-representatives-august-2024/'
# Always pass a timeout: without one, requests may wait forever on an unresponsive server
r = requests.get(link, headers=headers, timeout=30)
# Last expression of the cell: displays the response object (status code)
r

<Response [200]>

In [7]:
# Inspect the content (preview only: showing the whole page body would flood the notebook output)
r.content[:500]



In [8]:
# Parse the raw response bytes into a BeautifulSoup tree using the 'html.parser' backend
soup = bs(markup=r.content, features='html.parser')
soup

<!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<script>(function(c){c.add('has-js');c.remove('no-js')})(document.documentElement.classList)</script>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<script>window.dataLayer = window.dataLayer || []; window.dataLayer.push( {"type":"post","subtype":"post","context":{"is_front_page":false,"is_singular":true,"is_archive":false,"is_home":false,"is_search":false,"is_404":false,"is_post_type_archive":false,"is_tax":false,"is_article":true},"user":{"role":[]},"blog":{"url":"https:\/\/www.rappler.com","id":1},"network":{"url":"https:\/\/www.rappler.com","id":1},"post":{"ID":2760147,"slug":"harry-roque-released-after-detention-house-representatives-august-2024","published":"2024-08-23 13:37:02","modified":"2024-08-23 13:37:07","comments":0,"template":"","thumbnail":"https:\/\/www.rappl

In [9]:
# The <title> element itself, HTML tags included
title_tag = soup.title
# .text keeps only the text inside the tags; .strip() removes surrounding whitespace
title = title_tag.text.strip()
# End the cell with the value so the notebook actually displays it
title

In [11]:
# Extract date published: first <time> element that has the 'published' class
# and carries a datetime attribute (True means "attribute must be present")
date_published = soup.find(name='time', attrs={'class': 'published', 'datetime': True})
# An HTML attribute is read by indexing the tag with the attribute name
date = date_published['datetime']
date_published

<time class="entry-date published post__timeago" datetime="2024-08-23T21:37:02+08:00">Aug 23, 2024 9:37 PM PHT</time>

In [13]:
# Extract article content: the <div> holding the HTML of the article body
article_content = soup.find(name='div', attrs={'class': 'post-single__content entry-content'})

# All paragraph (<p>) elements inside the article body, as a list
tagged_lines = article_content.find_all('p')

# Drop the HTML tags of each paragraph and end every paragraph with a newline
text = ''.join(paragraph.text + '\n' for paragraph in tagged_lines)

# The article text as one big string
print(text)

MANILA, Philippines – Former presidential spokesperson Harry Roque walked free on Friday night, August 23, after spending one day in the detention cell of the House of Representatives. 
House Secretary General Reginald Velasco confirmed with Rappler that Roque had been released, quoting House Sergeant-at-Arms Napoleon Taas.
Roque was detained on Thursday evening as punishment for the contempt citation slapped by lawmakers against him.
Members of the so-called quad committee investigating the proliferation of offshore gambling in the country sanctioned Roque for lying about his unavailability on August 16, the committee’s first day of inquiry.
Roque wrote to the committee on August 13, asking that he be excused during the August 16 congressional hearing, citing a court commitment in Manila on the same day.
Kabayan Representative Ron Salo — Roque’s godson-turned-political nemesis — bared during Thursday’s hearing that Roque had no scheduled court appearance on August 16, a Friday, based 

In [28]:
# Collect the scraped fields in column order: Link, Title, Date Published, Article Content
details = [link, title, date, text]

# One-row dataframe (row label 0) built directly from the collected fields
rappler = pd.DataFrame(
    [details],
    columns=['link', 'title', 'date_published', 'text'],
)

# Save to local machine
rappler.to_csv('rappler-articles.csv')

In [24]:
# Compile everything into a function for ease of reuse
def extract_rappler_article_data(link, timeout=30):
  """Download one article page and return its link, title, date and text.

  Parameters
  ----------
  link : str
      URL of the article page to fetch.
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).

  Returns
  -------
  list
      ``[link, title, date, text]``. ``title`` is the stripped text of the
      page's ``<title>``; ``date`` is the ``datetime`` attribute of the
      article's ``<time>`` element; ``text`` is every ``<p>`` of the content
      container, each followed by a newline.

  Raises
  ------
  requests.RequestException
      If the request fails, times out, or the server answers with an error
      status.
  ValueError
      If the page has no title, no ``<time datetime=...>`` element, or no
      article content container.
  """
  # Without a timeout a stalled server would hang the whole scraping loop
  r = requests.get(link, headers=headers, timeout=timeout)
  # Fail loudly on 4xx/5xx instead of parsing an error page as if it were an article
  r.raise_for_status()

  soup = bs(r.content, 'html.parser')

  if soup.title is None:
    raise ValueError('No <title> element found in ' + link)
  title = soup.title.text.strip()

  # Prefer the element marked as the publication time; fall back to the first
  # <time> with a datetime attribute so pages lacking that class still work
  time_tag = soup.find('time', {'class' : 'published', 'datetime' : True})
  if time_tag is None:
    time_tag = soup.find('time', {'datetime' : True})
  if time_tag is None:
    raise ValueError('No <time datetime=...> element found in ' + link)
  date = time_tag['datetime']

  content = soup.find('div', {'class' : 'post-single__content entry-content'})
  if content is None:
    raise ValueError('No article content container found in ' + link)
  # One paragraph per line, tags removed
  text = ''.join(paragraph.text + '\n' for paragraph in content.find_all('p'))

  return [link, title, date, text]

In [5]:
# Crawl the paginated listing, scrape every article linked from it, and save the result.
mother_url = 'https://www.rappler.com/environment/disasters/page/'
max_pages = 5  # stop after this many listing pages
page = 1
# One [link, title, date, text] list per scraped article; the dataframe is
# built once at the end instead of being grown row by row
records = []

while True:
  # Form the listing page URL from the page number
  page_url = mother_url + str(page)
  print('Working on ' + page_url)

  # Avoid being blocked
  time.sleep(random.randint(1,5))

  # Get the list of articles within each page
  try:
    page_r = requests.get(page_url, headers=headers, timeout=30)
  except requests.RequestException as error:
    # Stop crawling but keep (and save) what has been collected so far
    print('Could not fetch ' + page_url + ': ' + repr(error))
    break
  page_soup = bs(page_r.content, 'html.parser')

  # Get article container; a page without it is treated as having no articles
  article_container = page_soup.find('div', {'id' : 'primary'})
  if article_container is None:
    article_previews = []
  else:
    article_previews = article_container.find_all('article', {'class' : True})

  # If there are no articles found, end
  if len(article_previews) < 1:
    print('Extraction finished.')
    break

  # For each article preview, invoke `extract_rappler_article_data` on its link
  for article in article_previews:
    article_title = article.find('h3')

    # If no title, skip
    if article_title is None: continue

    # If the title carries no link, skip
    anchor = article_title.find('a')
    if anchor is None or not anchor.get('href'): continue
    article_link = anchor['href']

    # Skip articles that fail to download or parse, but say so instead of
    # hiding the failure; `Exception` leaves Ctrl-C (KeyboardInterrupt) working
    try:
      records.append(extract_rappler_article_data(article_link))
    except Exception as error:
      print('Skipped ' + article_link + ': ' + repr(error))

  # Check whether you have enough articles
  if page >= max_pages: break

  # Go to next page
  page += 1

corpus = pd.DataFrame(records, columns=['link', 'title', 'date_published', 'text'])
corpus.to_csv('environment-articles.csv')

Working on https://www.rappler.com/environment/disasters/page/1
Working on https://www.rappler.com/environment/disasters/page/2
Working on https://www.rappler.com/environment/disasters/page/3
Working on https://www.rappler.com/environment/disasters/page/4
Working on https://www.rappler.com/environment/disasters/page/5


In [31]:
# Scrape every article linked from the topic page, plus a hand-picked list, and save the result.
page_url = 'https://www.rappler.com/topic/bataan-oil-spill/'

page_r = requests.get(page_url, headers=headers, timeout=30)
page_soup = bs(page_r.content, 'html.parser')

# Get article container; if the page has none, continue with the hand-picked list only
article_container = page_soup.find('div', {'id' : 'primary'})
if article_container is None:
  print('No article container found on ' + page_url)
  article_previews = []
else:
  article_previews = article_container.find_all('article', {'class' : True})

# Collect the article links shown on the topic page
topic_links = []
for article_id, article in enumerate(article_previews):
  # The first preview is looked up by <h3>, every other preview by <h2>
  article_title = article.find('h3' if article_id == 0 else 'h2')

  # If no title, skip
  if article_title is None: continue

  # If the title carries no link, skip
  anchor = article_title.find('a')
  if anchor is None or not anchor.get('href'): continue
  topic_links.append(anchor['href'])

# Additional articles to include, listed by hand
article_list = [
  'https://www.rappler.com/philippines/coast-guard-confirms-minimal-oil-leak-mt-terranova/', 
  'https://www.rappler.com/philippines/owner-mt-terranova-taps-harbor-star-salvage-tanker-clean-oil-spill/', 
  'https://www.rappler.com/philippines/coast-guard-says-siphoning-mt-terranova-oil-start-after-leaking-valves-sealed-oil-spill-bataan/', 
  'https://www.rappler.com/philippines/salvage-operations-mtkr-jason-bradley-commence-soon/', 
  'https://www.rappler.com/philippines/coast-guard-oil-recovery-operations-mv-mirola-bataan/', 
  'https://www.rappler.com/philippines/bataan-oil-spill-cost-damage-fisheries-department-agriculture-august-7-2024/', 
  'https://www.rappler.com/philippines/was-mt-terranova-other-troubled-ships-bataan-involved-oil-smuggling/', 
]

# Scrape topic-page links first, then the hand-picked ones. Failures are
# reported and skipped; `Exception` leaves Ctrl-C (KeyboardInterrupt) working.
records = []
for article_link in topic_links + article_list:
  try:
    records.append(extract_rappler_article_data(article_link))
  except Exception as error:
    print('Skipped ' + article_link + ': ' + repr(error))

# Build the dataframe once from all collected rows
corpus = pd.DataFrame(records, columns=['link', 'title', 'date_published', 'text'])
corpus.to_csv('oilspill-rappler.csv')

In [32]:
# Preview the first rows only rather than dumping the whole dataframe
corpus.head(10)

Unnamed: 0,link,title,date_published,text
0,https://www.rappler.com/philippines/senator-cy...,Villar urges faster case buildup in Bataan oil...,2024-09-17T20:01:32+08:00,"MANILA, Philippines – Senator Cynthia Villar o..."
1,https://www.rappler.com/philippines/luzon/bata...,Bataan coastal community still feels the pain ...,2024-09-17T14:15:46+08:00,"BATAAN, Philippines – Residents of Gatchalian ..."
2,https://www.rappler.com/philippines/mt-terrano...,MT Terranova siphoning operations end with 97%...,2024-09-13T10:17:31+08:00,"MANILA, Philippines – The oil siphoning operat..."
3,https://www.rappler.com/philippines/luzon/marc...,"A month into Bataan oil spill, Marcos says Cav...",2024-08-29T18:01:02+08:00,"MANILA, Philippines – President Ferdinand Marc..."
4,https://www.rappler.com/philippines/liters-oil...,"Over 160,000 liters of oil siphoned from MT Te...",2024-08-23T16:35:30+08:00,"MANILA, Philippines – A total of 161,612 liter..."
5,https://www.rappler.com/philippines/remulla-sa...,Red flags in Bataan? Remulla says billions los...,2024-08-15T11:44:39+08:00,"MANILA, Philippines – With allegations of oil ..."
6,https://www.rappler.com/philippines/hectares-m...,DENR: Over 500 hectares of mangroves in Manila...,2024-08-14T17:17:39+08:00,"MANILA, Philippines – Over 500 hectares of man..."
7,https://www.rappler.com/philippines/fish-manil...,Fish in Manila Bay safe to eat except those fr...,2024-08-13T21:13:37+08:00,"MANILA, Philippines – Fish and shellfish in Ba..."
8,https://www.rappler.com/philippines/oil-remova...,Removal of oil from MT Terranova spill begins,2024-08-13T20:45:12+08:00,"MANILA, Philippines – Harbor Star, the company..."
9,https://www.rappler.com/business/things-to-kno...,FAST FACTS: Things to know about MT Terranova ...,2024-08-10T18:00:00+08:00,"MANILA, Philippines – Here are some informatio..."
