In [None]:
glassdoor_urls = [
    "https://www.glassdoor.com/Reviews/Daffodil-International-University-Reviews-E1394524.htm",
    "https://www.glassdoor.com/Reviews/International-Islamic-University-Chittagong-Reviews-E936727.htm",
    "https://www.glassdoor.com/Reviews/East-Delta-University-Reviews-E2352449.htm",
    "https://www.glassdoor.com/Reviews/North-South-University-Reviews-E465930.htm",
    "https://www.glassdoor.com/Reviews/East-West-University-Reviews-E128305.htm"
]


In [4]:
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from bs4 import BeautifulSoup
import re
import math

# Load environment variables and Firecrawl API key
load_dotenv()
api_key = os.getenv("FIRECRAWL_API_KEY")

# Initialize Firecrawl client
app = FirecrawlApp(api_key=api_key)

# Dictionary to store total reviews & total pages for each link
pages_info = {}

# Loop over each Glassdoor URL, scrape the first page,
# extract total reviews, and calculate total pages
for g_url in glassdoor_urls:
    print(f"Scraping the first page to get total reviews for: {g_url}")

    # 1) Scrape the first page
    result = app.crawl_url(
        url=g_url,
        params={
            "scrapeOptions": {
                "formats": ["html"]
            }
        }
    )

    # Ensure we got valid data
    if not result.get('data'):
        print("❌ No data returned. Possible error or blocking for this URL.\n")
        continue

    html = result['data'][0]['html']
    soup = BeautifulSoup(html, 'html.parser')

    # 2) Find the <span> containing the total review count
    span = soup.find("span", class_="PaginationContainer_paginationCount__DdbVG")
    if not span:
        print("❌ Could not find the pagination span for this URL.\n")
        continue

    text = span.get_text()  # Example: "Viewing 1 - 10 of 120 Reviews"
    match = re.search(r"Viewing\s+\d+\s*-\s*\d+\s+of\s+([\d,]+)\s+Reviews", text)

    if match:
        total_reviews = int(match.group(1).replace(",", ""))
        total_tabs = math.ceil(total_reviews / 10)
        pages_info[g_url] = (total_reviews, total_tabs)
        print("✅ Total reviews found:", total_reviews)
        print("🧾 Total review tabs (pages):", total_tabs, "\n")
    else:
        print("❌ Could not extract review count from span.\n")


Scraping the first page to get total reviews for: https://www.glassdoor.com/Reviews/Daffodil-International-University-Reviews-E1394524.htm
✅ Total reviews found: 102
🧾 Total review tabs (pages): 11 

Scraping the first page to get total reviews for: https://www.glassdoor.com/Reviews/International-Islamic-University-Chittagong-Reviews-E936727.htm
✅ Total reviews found: 25
🧾 Total review tabs (pages): 3 

Scraping the first page to get total reviews for: https://www.glassdoor.com/Reviews/East-Delta-University-Reviews-E2352449.htm
✅ Total reviews found: 11
🧾 Total review tabs (pages): 2 

Scraping the first page to get total reviews for: https://www.glassdoor.com/Reviews/North-South-University-Reviews-E465930.htm
✅ Total reviews found: 199
🧾 Total review tabs (pages): 20 

Scraping the first page to get total reviews for: https://www.glassdoor.com/Reviews/East-West-University-Reviews-E128305.htm
✅ Total reviews found: 45
🧾 Total review tabs (pages): 5 



In [5]:
from bs4 import BeautifulSoup
import os

# We'll now loop through 'pages_info' to scrape each URL's paginated links
# and save the reviews to separate .txt files (one per link).
for g_url, (total_reviews, total_tabs) in pages_info.items():
    # Create a descriptive filename, based on the last part of the URL
    # Example: "Daffodil-International-University-Reviews-E1394524.htm" -> "Daffodil-International-University-Reviews-E1394524.txt"
    base_name = os.path.splitext(os.path.basename(g_url))[0]  # remove ".htm"
    file_name = f"{base_name}.txt"

    print(f"Now scraping *all* pages for: {g_url}")
    print(f"Total reviews = {total_reviews}, Pages = {total_tabs}")
    print(f"Saving reviews in: {file_name}\n")

    # 1) Strip '.htm' to prepare for "_P{page}.htm"
    url_base = g_url.replace(".htm", "")

    # 2) Generate list of paginated URLs
    urls = [
        f"{url_base}{'_P' + str(page) if page > 1 else ''}.htm"
        for page in range(1, total_tabs + 1)
    ]

    # 3) Perform batch scrape
    result = app.batch_scrape_urls(
        urls=urls,
        params={"formats": ["html"]}
    )

    # 4) Open the file (write mode) to store all pages' reviews for this link
    with open(file_name, "w", encoding="utf-8") as f:
        # 5) Extract and write the review sections from each page
        for index, data in enumerate(result.get('data', []), start=1):
            html = data.get('html', '')
            soup = BeautifulSoup(html, 'html.parser')

            reviews_section = soup.find('div', id='ReviewsFeed', attrs={'data-test': 'reviews-list'})
            if reviews_section:
                # Write the raw text of the reviews section to the file.
                # (You could refine this to extract each review separately.)
                f.write(f"\n--- Page {index} of {g_url} ---\n")
                f.write(reviews_section.get_text(separator="\n", strip=True))
            # No need to show the output in console.
            # We're simply writing everything to the .txt file.

    print("-----\n")


Now scraping *all* pages for: https://www.glassdoor.com/Reviews/Daffodil-International-University-Reviews-E1394524.htm
Total reviews = 102, Pages = 11
Saving reviews in: Daffodil-International-University-Reviews-E1394524.txt

-----

Now scraping *all* pages for: https://www.glassdoor.com/Reviews/International-Islamic-University-Chittagong-Reviews-E936727.htm
Total reviews = 25, Pages = 3
Saving reviews in: International-Islamic-University-Chittagong-Reviews-E936727.txt

-----

Now scraping *all* pages for: https://www.glassdoor.com/Reviews/East-Delta-University-Reviews-E2352449.htm
Total reviews = 11, Pages = 2
Saving reviews in: East-Delta-University-Reviews-E2352449.txt

-----

Now scraping *all* pages for: https://www.glassdoor.com/Reviews/North-South-University-Reviews-E465930.htm
Total reviews = 199, Pages = 20
Saving reviews in: North-South-University-Reviews-E465930.txt

-----

Now scraping *all* pages for: https://www.glassdoor.com/Reviews/East-West-University-Reviews-E128305.h