In [4]:
# Glassdoor reviews URL (page 1) that the following cells scrape
glassdoor_url = "https://www.glassdoor.com/Reviews/Daffodil-International-University-Reviews-E1394524.htm"

In [5]:
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from bs4 import BeautifulSoup
import re
import math

# Load environment variables and read the Firecrawl API key from them
load_dotenv()
api_key = os.getenv("FIRECRAWL_API_KEY")

# Initialize Firecrawl client
app = FirecrawlApp(api_key=api_key)

# Fetch only the base URL (page 1). `limit` caps the crawl at one page so it
# does not follow links into the rest of the site; only the count shown on the
# first page is needed here.
result = app.crawl_url(
    url=glassdoor_url,
    params={
        "limit": 1,
        "scrapeOptions": {
            "formats": ["html"]
        }
    }
)

# Extract the HTML, failing with a readable message when nothing was returned
crawled_pages = result['data']
if not crawled_pages:
    raise RuntimeError(f"Firecrawl returned no pages for {glassdoor_url}")
html = crawled_pages[0]['html']
soup = BeautifulSoup(html, 'html.parser')

# Divisor used to turn the review total into a page count
# (assumes the site lists 10 reviews per page -- confirm if the layout changes)
REVIEWS_PER_PAGE = 10

# Reset on every run so a failed parse cannot leave stale values from an
# earlier execution in the kernel. One page is the fallback because page 1 was
# just fetched and later cells need `total_tabs` to exist.
total_reviews = None
total_tabs = 1

# Find the exact <span> containing review count
span = soup.find("span", class_="PaginationContainer_paginationCount__DdbVG")

if span:
    text = span.get_text()
    # Expected text looks like "Viewing 1 - 10 of 1,234 Reviews"; capture the total
    match = re.search(r"Viewing\s+\d+\s*-\s*\d+\s+of\s+([\d,]+)\s+Reviews", text)
    if match:
        # Drop thousands separators before converting to int
        total_reviews = int(match.group(1).replace(",", ""))
        total_tabs = math.ceil(total_reviews / REVIEWS_PER_PAGE)
        print("✅ Total reviews found:", total_reviews)
        print("🧾 Total review tabs (pages):", total_tabs)
    else:
        print("❌ Could not extract review count from span.")
else:
    print("❌ Review count span not found.")


✅ Total reviews found: 102
🧾 Total review tabs (pages): 11


In [6]:
from bs4 import BeautifulSoup

# Drop the `.htm` extension so a `_P{page}` suffix can be placed before it
url_base = glassdoor_url.replace(".htm", "")

# One URL per review page; only pages after the first carry a `_P{n}` suffix
urls = [
    url_base + ("" if page <= 1 else "_P" + str(page)) + ".htm"
    for page in range(1, total_tabs + 1)
]

# Request the HTML of every page in a single batch call
result = app.batch_scrape_urls(urls=urls, params={"formats": ["html"]})

# Report, page by page, whether the reviews container was found and dump it
for index, data in enumerate(result.get('data', []), start=1):
    html = data.get('html', '')
    soup = BeautifulSoup(html, 'html.parser')
    reviews_section = soup.find('div', id='ReviewsFeed', attrs={'data-test': 'reviews-list'})
    if not reviews_section:
        print(f"❌ Page {index} - Couldn't find the reviews section.")
        continue
    print(f"✅ Page {index} - Successfully extracted the reviews section.\n")
    print(reviews_section.prettify())


✅ Page 1 - Successfully extracted the reviews section.

<div data-test="reviews-list" id="ReviewsFeed">
 <ol class="ReviewsList_reviewsList__Qfw6M">
  <li>
   <div class="module-container_moduleContainer__tpBfv module-container_redesignContainer__rLCJ4" data-size-variant="md">
    <div data-test="review-details-container" id="empReview_24705290">
     <div class="review-details-bar_reviewTopContainer__a7VnD">
      <div class="review-details-bar_reviewTopLeftContainer__smvGD">
       <div class="review-details-bar_reviewRatingAndFeaturedContainer__J8iG9">
        <div class="review-details-bar_structureFeaturedContainer__vJOz1">
        </div>
        <div class="review-rating_ratingContainer__sQ_4_">
         <span class="review-rating_ratingLabel__0_Hk9" data-test="review-rating-label">
          3.0
         </span>
         <div aria-hidden="false" aria-live="polite" class="rating_RatingContainer__4Rsbr" role="status">
          <div aria-hidden="false" aria-live="polite" class="ra

In [7]:
from openai import OpenAI

# Load .env, then build the OpenAI client from OPENAI_API_KEY
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Cheap size check: counts every 4 characters as roughly one token
def is_too_long(text, limit_tokens=8192):
    """Return True when the estimated token count of `text` exceeds `limit_tokens`.

    The estimate is simply ``len(text) // 4``; no tokenizer is involved.
    """
    return (len(text) // 4) > limit_tokens

# Collect each review as the HTML string of one top-level <li> in the reviews list
review_items = []
for data in result.get('data', []):
    html = data.get('html', '')
    soup = BeautifulSoup(html, 'html.parser')
    review_list = soup.select_one('div#ReviewsFeed[data-test="reviews-list"] > ol')
    if review_list:
        # recursive=False keeps only direct children of the <ol>. A recursive
        # search would also return any <li> nested inside a review, which
        # would then be appended a second time as a separate "review".
        review_items.extend(
            str(li) for li in review_list.find_all('li', recursive=False)
        )

# Format each <li> with token check
def format_review_chunks(li_chunks, model="gpt-4o-mini", temperature=0.3, limit_tokens=7000):
    """Send each review's HTML to the chat model and collect the formatted text.

    Args:
        li_chunks: Iterable of review HTML strings, one per review.
        model: Chat model name passed to the completions call.
        temperature: Sampling temperature passed to the completions call.
        limit_tokens: Reviews whose estimated size (per `is_too_long`) exceeds
            this are skipped. The default of 7000 is the original safety buffer
            under 8192.

    Returns:
        List of model outputs for the reviews that were formatted, in input
        order. Skipped and failed reviews are reported via print and omitted.
    """
    system_prompt = (
        "You are a helpful assistant that formats raw Glassdoor review HTML "
        "into a clean structure with: Job Title, Pros, Cons, Rating."
    )
    formatted = []
    # Numbering starts at 1 and counts skipped reviews too, so the printed
    # review number always matches the position in `li_chunks`.
    for i, li_html in enumerate(li_chunks, start=1):
        if is_too_long(li_html, limit_tokens=limit_tokens):
            print(f"⚠️ Skipped Review {i} - Too long ({len(li_html)} chars)\n")
            continue
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Format this review:\n\n{li_html}"},
                ],
                temperature=temperature,
            )
            output = response.choices[0].message.content
            print(f"\n✅ Review {i}:\n{output}\n")
            formatted.append(output)
        except Exception as e:
            # Best effort: report the failure and carry on with the next review.
            print(f"❌ Failed on Review {i}: {e}")
    return formatted

# Format every collected review and keep the model outputs
formatted_reviews = format_review_chunks(li_chunks=review_items)



✅ Review 1:
**Job Title:** Lecturer

**Pros:** 
- I can't find anything good here.

**Cons:** 
1. Too narrow space
2. Lack of facility, particularly in research support
3. High teaching load
4. Bad management

**Rating:** 3.0


✅ Review 2:
**Job Title:** Lecturer  
**Pros:** Nothing Nothing Nothing Nothing Nothing  
**Cons:** Poor salary, Poor work environment, Worst management, Forced overtimes, Non paid overtime  
**Rating:** 1.0  


✅ Review 3:
**Job Title:** Lecturer  
**Pros:** Salaries are paid on time.  
**Cons:** Too much complications for each and every administrative works.  
**Rating:** 4.0  


✅ Review 4:
**Job Title:** Assistant Professor  
**Pros:** Very challenging and eye opener for the people who want to be a leader.  
**Cons:** A true challenge as the organization is truly international.  
**Rating:** 4.0  


✅ Review 5:
**Job Title:** Lecturer

**Pros:** The university provided buses for faculty to different routes, friendly students.

**Cons:** The work environment

KeyboardInterrupt: 