<a href="https://colab.research.google.com/github/iHakawaTi/web-scraping-project/blob/main/web_scraping_bs4_praw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# --- Part 1: Web Scraping with BeautifulSoup ---

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Site root; story links taken from listing pages are joined onto this.
base_domain = 'https://www.careopinion.org.uk'
# Listing-page URL template; '{}' is filled with the page number via str.format.
base_url = base_domain + '/opinions/?page={}'

def scrape_page(url, timeout=30, delay=1):
    """Scrape one listing page and every story page it links to.

    Each anchor with class ``font-c-1 tooltip`` on the listing page is treated
    as one story. The story page is fetched and its body text, publication
    date, related service and summary are extracted.

    Args:
        url: Full URL of the listing page to scrape.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause after each story request, to stay polite to
            the server.

    Returns:
        A tuple ``(titles, dates, opinions, services, summaries)`` of five
        lists. The lists always have the same length; index ``i`` of each
        list describes the same story. Fields that cannot be found hold a
        placeholder string.

    Raises:
        requests.HTTPError: If the listing page or a story page answers with
            an error status (via ``raise_for_status``).
        requests.exceptions.RequestException: On connection failure or when
            ``timeout`` is exceeded.
    """
    # Local import: urljoin is only needed inside this function.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    opinions_link = soup.find_all("a", class_='font-c-1 tooltip')

    story_titles = []
    story_dates = []
    story_opinions = []
    story_services = []
    story_summaries = []

    for link in opinions_link:
        opinion_title = link.get_text(strip=True).replace('"', '')
        story_titles.append(opinion_title)

        href = link.get('href')
        if not href:
            # No story page to visit: still append placeholders so the five
            # lists stay the same length and can be zipped into a DataFrame.
            story_opinions.append('Story text does not exist.')
            story_dates.append('Date does not exist.')
            story_services.append('Related Service does not exist.')
            story_summaries.append('No summary')
            continue

        # urljoin copes with absolute hrefs and with hrefs lacking a leading
        # slash, both of which plain string concatenation would mangle.
        full_url = urljoin(base_domain, href)

        opinion_response = requests.get(full_url, timeout=timeout)
        opinion_response.raise_for_status()
        opinion_soup = BeautifulSoup(opinion_response.content, 'lxml')

        opinion_tag = opinion_soup.find('blockquote', id='opinion_body')
        opinion_text = opinion_tag.get_text(strip=True) if opinion_tag else 'Story text does not exist.'
        story_opinions.append(opinion_text)

        date_tag = opinion_soup.find('time')
        opinion_date = date_tag.get_text(strip=True) if date_tag else 'Date does not exist.'
        story_dates.append(opinion_date)

        service_tag = opinion_soup.find('p', class_='service_location m-margin-w-1')
        opinion_related_service = service_tag.get_text(strip=True) if service_tag else 'Related Service does not exist.'
        story_services.append(opinion_related_service)

        # The summary block has nested elements; join their text with spaces
        # so adjacent pieces do not run together.
        summary_tag = opinion_soup.find('div', class_='inner')
        opinion_summary = summary_tag.get_text(separator=' ', strip=True) if summary_tag else 'No summary'
        story_summaries.append(opinion_summary)

        time.sleep(delay)

    return story_titles, story_dates, story_opinions, story_services, story_summaries


def scraping_all_pages(base_url, max_pages, target_stories=100):
    """Scrape consecutive listing pages until enough stories are collected.

    Pages ``1..max_pages`` are visited in order by filling ``base_url`` with
    the page number and passing the result to ``scrape_page``. Progress is
    printed after every page.

    Args:
        base_url: URL template containing one ``{}`` placeholder for the
            page number.
        max_pages: Highest page number that may be visited.
        target_stories: Stop as soon as at least this many stories have been
            collected. Pass ``None`` to always visit all ``max_pages`` pages.

    Returns:
        A tuple ``(titles, dates, opinions, services, summaries)`` of lists
        accumulated over every visited page.
    """
    all_titles = []
    all_dates = []
    all_opinions = []
    all_services = []
    all_summaries = []

    total_stories = 0

    for page in range(1, max_pages + 1):
        print(f"Collecting data from page: {page}")
        page_url = base_url.format(page)
        titles, dates, opinions, services, summaries = scrape_page(page_url)

        all_titles.extend(titles)
        all_dates.extend(dates)
        all_opinions.extend(opinions)
        all_services.extend(services)
        all_summaries.extend(summaries)

        total_stories += len(titles)
        print(f"Total stories: {total_stories}")
        if target_stories is not None and total_stories >= target_stories:
            break

        # Pause between listing pages only; nothing follows the last page.
        if page < max_pages:
            time.sleep(1)

    return all_titles, all_dates, all_opinions, all_services, all_summaries


# Crawl at most 25 listing pages and unpack the five parallel result lists.
titles, dates, opinions, services, summaries = scraping_all_pages(base_url, max_pages=25)

# Pair each CSV column header with its list; the tuple order sets the column order.
column_names = ('Story Title', 'Date Published', 'Story Text', 'Related Service', 'Summary')
column_values = (titles, dates, opinions, services, summaries)
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Write the table without the pandas index column.
df.to_csv('care_opinion.csv', index=False)
print("Saved results to csv file.")

Collecting data from page: 1
Total stories: 6
Collecting data from page: 2
Total stories: 12
Collecting data from page: 3
Total stories: 18
Collecting data from page: 4
Total stories: 24
Collecting data from page: 5
Total stories: 30
Collecting data from page: 6
Total stories: 36
Collecting data from page: 7
Total stories: 42
Collecting data from page: 8
Total stories: 48
Collecting data from page: 9
Total stories: 54
Collecting data from page: 10
Total stories: 60
Collecting data from page: 11
Total stories: 66
Collecting data from page: 12
Total stories: 72
Collecting data from page: 13
Total stories: 78
Collecting data from page: 14
Total stories: 84
Collecting data from page: 15
Total stories: 90
Collecting data from page: 16
Total stories: 96
Collecting data from page: 17
Total stories: 102
Saved results to csv file.


In [None]:
!pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


# --- Part 2: Web Scraping with PRAW (Reddit API) ---

In [None]:
import praw
import pandas as pd
from datetime import datetime
import csv
import logging

# Silence PRAW's warnings about running in an asynchronous environment by
# letting only ERROR and above through on the 'praw' logger.
_praw_logger = logging.getLogger('praw')
_praw_logger.setLevel(logging.ERROR)

In [None]:
import os
from datetime import timezone
from getpass import getpass

# Credentials come from the environment, falling back to a hidden prompt, so
# they are never stored in the notebook. Any client id/secret that has ever
# been committed to a repository should be regenerated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID') or getpass('Reddit client_id: '),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET') or getpass('Reddit client_secret: '),
    user_agent='test'
)

# Search settings: one keyword, searched in each subreddit, newest posts first.
keyword = 'hospital'
subreddits = ['hospitals', 'nursing', 'medicine']
limit_per_sub = 30

# One dict per post; turned into a DataFrame in a single step after the loops.
posts_data = []
for sub in subreddits:
    subreddit = reddit.subreddit(sub)
    print(f'Searching subreddit: {sub}')
    for post in subreddit.search(keyword, sort='new', limit=limit_per_sub):
        print(f'Processing post: {post.id}')

        # post.author is falsy when the account no longer exists.
        user_id = f'user_{post.author}' if post.author else 'Deleted'
        post_content = f"{post.title} {post.selftext}"
        # Case-insensitive check of title + body for the search keyword.
        keyword_found = keyword.lower() in post_content.lower()
        # created_utc is a UTC epoch; convert explicitly in UTC so the value
        # does not depend on the timezone of the machine running the notebook.
        created_at = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

        # The comment tree is deliberately not loaded here: the reply count is
        # already available as post.num_comments.
        posts_data.append({
            'Post_ID': post.id,
            'Title': post.title,
            'Content': post.selftext,
            'Author': user_id,
            'Subreddit': post.subreddit.display_name,
            'Date': created_at.strftime('%Y-%m-%d %H:%M:%S'),
            'Num_Comments': post.num_comments,
            'URL': post.url,
            'Keyword_Found': keyword_found,
            # NOTE(review): near-duplicate of 'Subreddit' (the name that was
            # searched vs. the display name); kept so the CSV schema is unchanged.
            'subreddit': sub
        })

df = pd.DataFrame(posts_data)
# Quote every field: post bodies may contain commas, quotes and newlines.
df.to_csv('reddit_topic_posts.csv', index=False, quoting=csv.QUOTE_ALL)

print('Finished scraping Reddit posts.')

Searching subreddit: hospitals
Processing post: nk4vu3
Processing post: mw6618
Processing post: kk207b
Processing post: kaukkg
Processing post: jqxsrp
Processing post: izdup1
Processing post: izdiyt
Processing post: hq8ttc
Processing post: h8ntet
Processing post: g7a28i
Processing post: fxjgun
Processing post: ft3yl6
Processing post: fmumr1
Processing post: fgzm16
Processing post: fdrakw
Processing post: f3n1ip
Processing post: ednkid
Processing post: edmcpy
Processing post: edm5n8
Processing post: edm1p8
Processing post: edlvx6
Processing post: edlra3
Processing post: edlnqb
Processing post: edlj4u
Processing post: edleyz
Processing post: edlae9
Processing post: edl0zd
Processing post: ed6ntd
Processing post: ed6crm
Processing post: ed5wbm
Searching subreddit: nursing
Processing post: 1mlpcd1
Processing post: 1mlmtee
Processing post: 1mlmm97
Processing post: 1mlgyrs
Processing post: 1mlfvyn
Processing post: 1mlavgk
Processing post: 1mlau8z
Processing post: 1ml803u
Processing post: 1ml