# Scraping GNC

https://www.gnc.com/protein/protein-powder/?start=0&sz=60&sizeAdjusted=true

using a stealth browser to grab html

In [4]:
import undetected_chromedriver as uc
import time
import random
import pandas as pd
import lxml.html as lx
import re
import glob
import os

In [None]:
# scrape HTML from all pages on GNC website
def scrape_all_gnc_pages():
    print("Launching Stealth Browser...")
    options = uc.ChromeOptions()
    # Force version 142
    driver = uc.Chrome(options=options, version_main=142)

    # there's a total of 476 products, we'll check 60 at a time
    offsets = range(0, 480, 60) 

    try:
        for start_val in offsets:
            url = f"https://www.gnc.com/protein/protein-powder/?start={start_val}&sz=60&sizeAdjusted=true"
            print(f"------------------------------------------------")
            print(f"Navigating to offset {start_val}...")
            
            driver.get(url)

            # manual check for first page
            if start_val == 0:
                print("Potential CAPTCHA?")
                time.sleep(20) # Long wait for first login
            else:
                # shorter wait for subsequent pages
                wait_time = random.uniform(5, 8)
                time.sleep(wait_time)

            # save the file
            filename = f"html/gnc_page_{start_val}.html"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            print(f"Saved {filename}")

    except Exception as e:
        print(f"Script crashed: {e}")

    finally:
        driver.quit()
        print("Done scraping.")

if __name__ == "__main__":
    scrape_all_gnc_pages()

Launching Stealth Browser...
------------------------------------------------
Navigating to offset 0...
>>> PLEASE SOLVE CAPTCHAS NOW IF PRESENT <<<
Saved html/gnc_page_0.html
------------------------------------------------
Navigating to offset 60...
Saved html/gnc_page_60.html
------------------------------------------------
Navigating to offset 120...
Saved html/gnc_page_120.html
------------------------------------------------
Navigating to offset 180...
Saved html/gnc_page_180.html
------------------------------------------------
Navigating to offset 240...
Saved html/gnc_page_240.html
------------------------------------------------
Navigating to offset 300...
Saved html/gnc_page_300.html
------------------------------------------------
Navigating to offset 360...
Saved html/gnc_page_360.html
------------------------------------------------
Navigating to offset 420...
Saved html/gnc_page_420.html
Done scraping.


In [9]:
# read a single HTML file and extract product data
def parse_single_html_file(filepath):

    # Read HTML content
    with open(filepath, "r", encoding="utf-8") as f:
        html_content = f.read()
    
    # Parse HTML
    html = lx.fromstring(html_content)
    cards = html.cssselect("div.product-tile")
    
    page_products = []

    for card in cards:
        try:
            # name and link
            name_el = card.cssselect("a.name-link")
            if not name_el: continue
            
            # link cleaning
            raw_link = name_el[0].get("href")
            full_link = raw_link if raw_link.startswith("http") else "https://www.gnc.com" + raw_link
            
            # name cleaning
            raw_name = name_el[0].text_content().strip()
            name = " ".join(raw_name.split())

            # price
            price_el = card.cssselect(".product-pricing .product-standard-price, .product-pricing .value")
            if not price_el:
                price_el = card.cssselect(".product-pricing") # Fallback
            
            price_text = price_el[0].text_content().strip() if price_el else ""
            match = re.search(r"(\d+\.\d+)", price_text)
            price = float(match.group(1)) if match else None

            # ratings
            star_div = card.cssselect("div[data-starrating]")
            if star_div:
                rating = float(star_div[0].get("data-starrating"))
            else:
                rating = None

            # reviews
            review_container = card.cssselect("div.product-review")
            if review_container:
                all_text = review_container[0].text_content()
                digits = re.sub(r"\D", "", all_text)
                reviews = int(digits) if digits else 0
            else:
                reviews = 0

            page_products.append({
                "Product Name": name,
                "Price": price,
                "Rating": rating,
                "Reviews": reviews,
                "Link": full_link
            })

        except Exception:
            continue
            
    return page_products

def create_master_list():
    # get all saved files
    all_files = glob.glob("html/gnc_page_*.html")
    all_files.sort()
    
    print(f"Found {len(all_files)} files to process.")
    
    master_data = []

    # loop through every file
    for filename in all_files:
        print(f"Processing {filename}...")
        products = parse_single_html_file(filename)
        master_data.extend(products)

    # create DataFrame
    df = pd.DataFrame(master_data)
    
    # remove Duplicates
    initial_count = len(df)
    df = df.drop_duplicates(subset=["Link"])
    final_count = len(df)
    
    print(f"\nSuccess! Merged {len(all_files)} files.")
    print(f"Total raw items: {initial_count}")
    print(f"Total unique items after cleaning: {final_count}")
    
    return df

In [10]:
# run the merger
df_gnc_all = create_master_list()

Found 8 files to process.
Processing html/gnc_page_0.html...
Processing html/gnc_page_120.html...
Processing html/gnc_page_180.html...
Processing html/gnc_page_240.html...
Processing html/gnc_page_300.html...
Processing html/gnc_page_360.html...
Processing html/gnc_page_420.html...
Processing html/gnc_page_60.html...

Success! Merged 8 files.
Total raw items: 546
Total unique items after cleaning: 486


In [11]:
# strip servings from title and put in new column
df_gnc_all["Servings"] = (
    df_gnc_all["Product Name"]
    .str.extract(r"\((\d+)\s*Serving[s]?\)", expand=False)
)

df_gnc_all["Servings"] = pd.to_numeric(df_gnc_all["Servings"], errors="coerce")

df_gnc_all["Product Name"] = df_gnc_all["Product Name"].str.replace(
    r"\s*\(\d+\s*Serving[s]?\)", "", regex=True
).str.strip()

# drop rows where Servings could not be determined (AKA Null)
df_gnc_all.dropna(subset=["Servings"], inplace=True)

In [12]:
# save to csv
df_gnc_all.to_csv("data/gnc_master_products.csv", index=False)

In [13]:
MASTER_FILE = "data/gnc_master_products.csv"
FINAL_OUTPUT_FILE = "data/gnc_nutrition.csv"
ITEMS_TO_RUN_THIS_SESSION = 500 

# wait if captcha is detected
def wait_for_captcha_solve(driver):
    """
    Checks if a CAPTCHA is on screen. If yes, it pauses the script
    indefinitely until the user solves it.
    """
    # Common text found on the PerimeterX block screen
    block_keywords = [
        "Press & Hold",
        "Access to this page has been denied"
    ]
    
    # check if any of these are currently visible
    is_blocked = any(keyword in driver.page_source for keyword in block_keywords)
    
    if is_blocked:
        print("\nCAPTCHA DETECTED!")
        print("The script is PAUSED. Please solve the 'Press & Hold' challenge manually.")
        
        # enter infinite loop until the block text disappears
        while True:
            time.sleep(1) # Check every second
            
            # re-read the page source to see if it changed
            try:
                page_text = driver.page_source
                still_blocked = any(keyword in page_text for keyword in block_keywords)
                
                if not still_blocked:
                    print("CAPTCHA Solved! Resuming scrape...")
                    time.sleep(3) # give the new page a moment to fully load
                    return # exit the loop and continue the script
            except:
                # if page is reloading, just wait
                continue

# extract nutrition info from text
def get_nutrition_from_text(page_text):
    data = {
        "protein_g": None,
        "calories": None,
        "serving_size_text": None,
        "servings_count": None
    }
    
    # 1. Protein
    match = re.search(r"Protein[\s\n]+(\d+(\.\d+)?)[\s\n]*g", page_text, re.IGNORECASE)
    if not match: match = re.search(r"(\d+(\.\d+)?)[\s\n]*g[\s\n]+(of[\s\n]+)?Protein", page_text, re.IGNORECASE)
    if not match: match = re.search(r"Protein[\s\n]*:[\s\n]*(\d+(\.\d+)?)[\s\n]*g", page_text, re.IGNORECASE)
    if match: data["protein_g"] = float(match.group(1))

    # 2. Calories
    cal_match = re.search(r"Calories[\s\n]*:?[\s\n]*(\d{2,4})", page_text, re.IGNORECASE)
    if cal_match: data["calories"] = int(cal_match.group(1))

    # 3. Serving Size
    serv_match = re.search(r"Serving Size[\s\n]*:?[\s\n]*(.{1,50})", page_text, re.IGNORECASE)
    if serv_match:
        raw_serv = serv_match.group(1).strip()
        clean_serv = re.split(r'\n|\r|Amount', raw_serv)[0]
        data["serving_size_text"] = clean_serv
        
    # 4. Servings Per Container
    count_match = re.search(r"Servings Per Container[\s\n]*:?[\s\n]*(\d+)", page_text, re.IGNORECASE)
    if count_match: data["servings_count"] = int(count_match.group(1))

    return data

def smart_scraper():
    print("--- INITIALIZING PATIENT SCRAPER ---")

    if not os.path.exists(MASTER_FILE):
        print(f"Error: Could not find {MASTER_FILE}")
        return
    df_master = pd.read_csv(MASTER_FILE)
    
    if os.path.exists(FINAL_OUTPUT_FILE):
        df_existing = pd.read_csv(FINAL_OUTPUT_FILE)
        # drop incomplete rows so they get retried
        df_valid = df_existing.dropna(subset=['protein_g'])
        if len(df_existing) - len(df_valid) > 0:
            print(f"ðŸ§¹ Cleaned {len(df_existing) - len(df_valid)} empty rows to retry.")
            df_valid.to_csv(FINAL_OUTPUT_FILE, index=False)
        completed_links = set(df_valid["Link"])
        df_current_results = df_valid
    else:
        completed_links = set()
        df_current_results = pd.DataFrame()

    queue_df = df_master[~df_master["Link"].isin(completed_links)]
    session_queue = queue_df.head(ITEMS_TO_RUN_THIS_SESSION)
    
    print(f"ðŸŽ¯ Target: {ITEMS_TO_RUN_THIS_SESSION} items. (Queued: {len(queue_df)})")

    options = uc.ChromeOptions()
    driver = uc.Chrome(options=options, version_main=142)

    try:
        count = 0
        for index, row in session_queue.iterrows():
            count += 1
            print(f"\n[{count}/{ITEMS_TO_RUN_THIS_SESSION}] Processing: {row['Product Name'][:40]}...")

            try:
                driver.get(row["Link"])
                
                # check for captcha
                wait_for_captcha_solve(driver)
                
                # random wait
                time.sleep(random.uniform(2.5, 4.0))

                # scroll
                driver.execute_script("window.scrollTo(0, 700);")
                time.sleep(1.0)
                
                # click Ingredients
                try:
                    btns = driver.find_elements("xpath", "//*[contains(text(), 'Ingredients')]")
                    for btn in btns:
                        if btn.is_displayed():
                            driver.execute_script("arguments[0].click();", btn)
                except:
                    pass
                
                time.sleep(1.5) 

                # extract
                body_text = driver.find_element("tag name", "body").text
                
                # double check for captcha
                wait_for_captcha_solve(driver)
                
                nutrition_data = get_nutrition_from_text(body_text)

                if nutrition_data['protein_g'] is None:
                    print("No protein found (Blank save for retry).")
                else:
                    print(f"Success: {nutrition_data['protein_g']}g Protein")

                # merge and save
                row_data = row.to_dict()
                row_data.update(nutrition_data)
                
                new_row_df = pd.DataFrame([row_data])
                df_current_results = pd.concat([df_current_results, new_row_df], ignore_index=True)
                df_current_results.to_csv(FINAL_OUTPUT_FILE, index=False)

            except Exception as e:
                print(f"Error: {e}")
                continue

    except KeyboardInterrupt:
        print("\nStopped manually.")

    finally:
        driver.quit()
        print(f"\nSaved to {FINAL_OUTPUT_FILE}")

if __name__ == "__main__":
    smart_scraper()

--- INITIALIZING PATIENT SCRAPER ---
ðŸ§¹ Cleaned 18 empty rows to retry.
ðŸŽ¯ Target: 500 items. (Queued: 19)

[1/500] Processing: DymatizeÂ® ISO 100Â® Whey Protein Isolate ...
No protein found (Blank save for retry).

[2/500] Processing: Beyond RawÂ® Dynamic Gainer High-Tech Mas...


  df_current_results = pd.concat([df_current_results, new_row_df], ignore_index=True)


No protein found (Blank save for retry).

[3/500] Processing: Beyond RawÂ® Dynamic Gainer High-Tech Mas...
No protein found (Blank save for retry).

[4/500] Processing: Beyond RawÂ® Dynamic Gainer High-Tech Mas...
No protein found (Blank save for retry).

[5/500] Processing: CTRLÂ® Meal Replacement Shake - Fruity Fl...
No protein found (Blank save for retry).

[6/500] Processing: DymatizeÂ® Super Mass Gainer - Rich Choco...
No protein found (Blank save for retry).

[7/500] Processing: DymatizeÂ® Super Mass Gainer - Fruity Peb...
No protein found (Blank save for retry).

[8/500] Processing: DymatizeÂ® Super Mass Gainerâ„¢ - Gourmet V...
No protein found (Blank save for retry).

[9/500] Processing: AboutTimeÂ® Vegan Protein - Natural Choco...
No protein found (Blank save for retry).

[10/500] Processing: GR8 Lifestyle Ultim8 Protein + Colostrum...
No protein found (Blank save for retry).

[11/500] Processing: CTRLÂ® Meal Replacement Shake - Cinnamon ...
No protein found (Blank save for r

In [None]:
nutrition_final = pd.read_csv("gnc_nutrition.csv")
print(nutrition_final.shape)
nutrition_final.head()

(449, 10)


Unnamed: 0,Product Name,Price,Rating,Reviews,Link,Servings,protein_g,calories,serving_size_text,servings_count
0,RYSE Loaded Protein - Jet-Puffed Marshmallow,49.99,4.5,136,https://www.gnc.com/whey-protein/556534.html,27.0,25.0,130.0,1 scoop,27.0
1,JymÂ® Pro Jym Protein - Tahitian Vanilla Bean,74.99,5.0,937,https://www.gnc.com/whey-protein/533909.html,45.0,24.0,140.0,1 Scoop(s),51.0
2,Axe & Sledge Supplementsâ„¢ Farm Fed 100% Whey P...,56.99,4.5,107,https://www.gnc.com/whey-protein/581725.html,28.0,25.0,130.0,,
3,Raw Nutrition Itholate Protein - Chocolate Pea...,54.99,4.5,35,https://www.gnc.com/whey-protein/604706.html,25.0,28.0,150.0,1 Scoop,25.0
4,RYSE Loaded Protein - Little Debbie Christmas ...,49.99,4.5,136,https://www.gnc.com/whey-protein/556566.html,27.0,25.0,140.0,1 Scoop,27.0
