# Data Collection

In [4]:
%pip install -r requirements.txt

# web scraping
from bs4 import BeautifulSoup
from datetime import datetime, timezone

# (asynchronous) requests
import requests
import asyncio
import aiohttp

# data processing
import pandas

# start data.csv from scratch: truncate it and leave only the CSV header row
with open('data.csv', mode='w') as file:
    file.writelines(['label,title\n'])

Note: you may need to restart the kernel to use updated packages.


## Rappler: Positive

In [5]:
# Walk the paginated class-suspensions topic and turn every headline into one
# `positive,"<title>"` CSV row, appended to data.csv and echoed to the cell.
RAPPLER_POSITIVE = 'https://www.rappler.com/topic/class-suspensions/page'
page_no = 1

articles = []
while True:
    try:
        # timeout so a stalled connection cannot hang the cell forever
        response = requests.get(f'{RAPPLER_POSITIVE}/{page_no}', timeout=30)
        # an error status (e.g. past the last page) is not a listing page
        response.raise_for_status()
        page = response.text
        soup = BeautifulSoup(page, 'html.parser')
        main = soup.body.main
        headlines = main.select('article')

        for headline in headlines:
            content = headline.find('div', class_='archive-article__content')
            header = content.find(['h2', 'h3'])
            anchor = header.a
            title = anchor.text.strip()

            # str.replace returns a new string, so the result must be kept;
            # doubling embedded quotes keeps the quoted CSV field well-formed
            title = title.replace('"', '""')
            articles.append(f'positive,"{title}"')
    except (requests.RequestException, AttributeError):
        # request failed, or the page lacks the expected markup (a missing
        # element surfaces as attribute access on None): treat as the end
        break

    # a page that parses but lists no articles would otherwise loop forever
    if not headlines:
        break
    page_no += 1

plaintext = '\n'.join(articles)
with open('data.csv', 'a') as file:
    if articles:
        # terminate the last row so whatever is appended next starts on its
        # own line instead of being glued onto this one
        file.write(plaintext + '\n')

print(plaintext)


positive,"[Walang Pasok] Class suspensions, Monday, March 24, 2025"
positive,"[Walang Pasok] Class suspensions, Friday, March 7, 2025"
positive,"[Walang Pasok] Class suspensions, Tuesday, March 4, 2025"
positive,"[Walang Pasok] Class suspensions, Monday, March 3, 2025"
positive,"[Walang Pasok] Class suspensions, EDSA anniversary, February 25, 2025"
positive,"‘Walang pasok’ in disaster-prone Philippines causes ‘significant learning losses’"
positive,"Gov’t work, classes suspended in Manila, Pasay, Davao City on January 13 for Iglesia Ni Cristo rally"
positive,"DepEd revises class and work suspension guidelines for public schools"
positive,"[Walang Pasok] Class suspensions, Monday, December 2, 2024"
positive,"[Walang Pasok] Class suspensions, Tuesday, November 19, 2024"
positive,"[Walang Pasok] Class suspensions, Monday, November 18, 2024"
positive,"[Walang Pasok] Class suspensions, Friday, November 15, 2024"
positive,"[Walang Pasok] Class suspensions, Thursday, November 14, 2024"
positi

## Rappler: Negative

In [6]:
# listing that supplies the 'negative' rows, and how many of its pages to read
RAPPLER_NEGATIVE = 'https://www.rappler.com/latest/page'
MAX_PAGES = 200

# rows accumulated by fetch_and_parse
articles = []
# page numbers 1 through MAX_PAGES inclusive
page_nos = range(1, 1 + MAX_PAGES)

async def fetch_and_parse(session, page_no):
    """Scrape one page of the RAPPLER_NEGATIVE listing into ``articles``.

    Args:
        session: Session object whose ``get(url)`` is used as an async
            context manager (an ``aiohttp.ClientSession`` in this notebook).
        page_no: Page number appended to ``RAPPLER_NEGATIVE``.

    Returns:
        None. One ``negative,"<title>"`` row per headline is appended to the
        module-level ``articles`` list.

    A page that cannot be fetched or parsed is skipped with a printed note
    (best effort), so one bad page does not abort the other downloads.
    """
    url = f'{RAPPLER_NEGATIVE}/{page_no}'
    try:
        async with session.get(url) as response:
            # an error status is not a listing page; skip it via the handler
            response.raise_for_status()
            page = await response.text()

        soup = BeautifulSoup(page, 'html.parser')
        main = soup.body.main
        for headline in main.select('article'):
            content = headline.find('div', class_='archive-article__content')
            header = content.find(['h2', 'h3'])
            anchor = header.a
            title = anchor.text.strip()

            # str.replace returns a new string, so the result must be kept;
            # doubling embedded quotes keeps the quoted CSV field well-formed
            title = title.replace('"', '""')
            articles.append(f'negative,"{title}"')
    except Exception as error:
        # Exception rather than a bare except: task cancellation and
        # KeyboardInterrupt must still propagate
        print(f'skipped {url}: {error!r}')

async def main():
    """Run fetch_and_parse for every page in page_nos over one shared session."""
    async with aiohttp.ClientSession() as http:
        # one coroutine per page number, awaited together
        pending = (fetch_and_parse(http, number) for number in page_nos)
        await asyncio.gather(*pending)

await main()

plaintext = '\n'.join(articles)
with open('data.csv', 'a') as file:
    file.write(plaintext)