# Scraping GNC

Target listing page: <https://www.gnc.com/protein/protein-powder/?start=0&sz=60&sizeAdjusted=true>

In [4]:
import re
import urllib.parse as urlparse

import requests
import lxml.html as lx
import pandas as pd

In [7]:
import requests

# Listing page probed by this cell (first 60 protein-powder results).
LIST_URL = "https://www.gnc.com/protein/protein-powder/?start=0&sz=60&sizeAdjusted=true"

# Minimal browser-like User-Agent; no other request headers are sent here.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0"
}

# A timeout bounds the wait so a stalled connection cannot hang the kernel.
resp = requests.get(LIST_URL, headers=headers, timeout=15)
html_text = resp.text

# Quick probe: print the HTTP status and whether the expected product-card
# class names occur anywhere in the returned markup.
print(resp.status_code)
print("product-tile in HTML?:", "product-tile" in html_text)
print("grid-tile in HTML?:   ", "grid-tile" in html_text)

307
product-tile in HTML?: False
grid-tile in HTML?:    False


In [10]:
import requests

# Listing page to fetch for debugging.
url = "https://www.gnc.com/protein/protein-powder/?start=0&sz=60&sizeAdjusted=true"

# Browser-like request headers, sent in the hope of avoiding 403 Forbidden
# responses.
# NOTE(review): "br" is advertised in Accept-Encoding; requests can only decode
# Brotli bodies when a brotli package is installed — confirm, otherwise
# response.text may be unreadable.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

try:
    print(f"Attempting to fetch: {url}")
    response = requests.get(url, headers=headers, timeout=15)
except requests.RequestException as e:
    # Only genuine network/HTTP-layer failures are reported as connection errors.
    print(f"Connection failed: {e}")
else:
    # Check the status code
    print(f"Status Code: {response.status_code}")

    # Look for the product-card class name in the raw markup.
    if "product-tile" in response.text:
        print("SUCCESS: Found 'product-tile' in the HTML. We are ready to scrape.")
    else:
        print("WARNING: 'product-tile' NOT found. We might be blocked or the page is empty.")

    # Save the HTML to a file so the exact server response can be inspected.
    try:
        with open("gnc_debug.html", "w", encoding="utf-8") as f:
            f.write(response.text)
    except OSError as e:
        # A disk/permission problem is not a connection problem; say so.
        print(f"Could not save 'gnc_debug.html': {e}")
    else:
        print("Saved response to 'gnc_debug.html'. Please open this file in your browser or text editor.")

Attempting to fetch: https://www.gnc.com/protein/protein-powder/?start=0&sz=60&sizeAdjusted=true
Status Code: 307
Saved response to 'gnc_debug.html'. Please open this file in your browser or text editor.


In [9]:
import pandas as pd

def _gnc_first_price(text):
    """Return the first price-like number in ``text`` as a float, or None.

    A number preceded by ``$`` is preferred; otherwise the first number found
    is used. Thousands separators are removed before conversion.
    """
    number = r"(\d[\d,]*(?:\.\d+)?|\.\d+)"
    match = re.search(r"\$\s*" + number, text) or re.search(number, text)
    if match is None:
        return None
    return float(match.group(1).replace(",", ""))


def _parse_gnc_card(card):
    """Turn one ``div.product-tile`` element into a row dict, or None to skip it."""
    # --- Name and link ---
    name_links = card.cssselect("a.name-link")
    if not name_links:
        # No name link: treated as a promo tile or formatting glitch.
        return None
    name_el = name_links[0]
    href = name_el.get("href")
    # urljoin leaves absolute URLs untouched and resolves relative ones against
    # the site root; a missing href yields None instead of an exception.
    full_url = urlparse.urljoin("https://www.gnc.com/", href) if href else None
    name = name_el.text_content().strip()

    # --- Price ---
    # Prefer the specific value element; fall back to the whole pricing box.
    price_el = card.cssselect(".product-pricing .value")
    if not price_el:
        price_el = card.cssselect(".product-pricing")
    # A missing or unparseable price is recorded as None rather than 0.0, so it
    # cannot be mistaken for a real price downstream.
    price_value = _gnc_first_price(price_el[0].text_content()) if price_el else None

    # --- Rating and review count ---
    rating_el = card.cssselect("div.TTRatingBox")
    if rating_el:
        box = rating_el[0]
        raw_rating = box.get("data-starrating")
        rating_value = float(raw_rating) if raw_rating else None
        # NOTE(review): every digit in the box text is concatenated; if the text
        # also contains the rating itself the count would be inflated — confirm
        # against the live markup.
        clean_count = re.sub(r"\D", "", box.text_content().strip())
        review_count = int(clean_count) if clean_count else 0
    else:
        rating_value = None
        review_count = 0

    return {
        "store": "GNC",
        "product_name": name,
        "product_url": full_url,
        "price_total": price_value,
        "rating": rating_value,
        "review_count": review_count,
    }


def scrape_gnc_page(url, timeout=15):
    """Fetch a GNC listing page and return its product cards as a DataFrame.

    Parameters
    ----------
    url : str
        Listing page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 15).

    Returns
    -------
    pandas.DataFrame
        One row per parsed ``div.product-tile`` with columns ``store``,
        ``product_name``, ``product_url``, ``price_total``, ``rating`` and
        ``review_count``. The columns are present even when no product is found.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` for 4xx/5xx responses.

    Notes
    -----
    The request uses the module-level ``headers`` dict, which must be defined
    before this function is called.
    """
    columns = [
        "store",
        "product_name",
        "product_url",
        "price_total",
        "rating",
        "review_count",
    ]

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # raise_for_status() only covers 4xx/5xx; surface anything else that is not
    # a plain 200 (e.g. an unfollowed 3xx) instead of silently parsing it.
    if response.status_code != 200:
        print(
            f"WARNING: unexpected status {response.status_code} for {url}; "
            "the page may be blocked or redirected."
        )

    if not response.text.strip():
        # An empty body has nothing to parse.
        print("Found 0 products on this page.")
        return pd.DataFrame(columns=columns)

    html = lx.fromstring(response.text)
    cards = html.cssselect("div.product-tile")
    print(f"Found {len(cards)} products on this page.")

    rows = []
    for card in cards:
        try:
            row = _parse_gnc_card(card)
        except Exception as e:
            # Deliberate best-effort: one malformed card must not abort the page.
            print(f"Error parsing a card: {e}")
            continue
        if row is not None:
            rows.append(row)

    return pd.DataFrame(rows, columns=columns)

# Smoke test: scrape one listing page and preview the first rows.
df_gnc = scrape_gnc_page(url="https://www.gnc.com/protein-powder/")
df_gnc.head(5)

Found 0 products on this page.
