# Scraping MyProtein Protein Powders

https://us.myprotein.com/c/nutrition/protein/

In [1]:
import re
import requests
import lxml.html as lx
import pandas as pd
import time

In [2]:
url = "https://us.myprotein.com/c/nutrition/protein/"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0"
}

result = requests.get(url, headers=headers)	
# make sure we actually got the page
result.raise_for_status()

# parse HTML into a tree
html = lx.fromstring(result.text) 

# grab all product cards
cards = html.cssselect("product-card-wrapper")
print(len(cards))

28


In [3]:
# test the first card
first_card = cards[0]

# 1. Product name
name_el = first_card.cssselect("a.product-item-title")[0]
name = name_el.text_content().strip()
print("NAME:", name)

# 2. Product link (relative URL)
href = name_el.get("href")
print("HREF:", href)

# 3. Price
price_el = first_card.cssselect("span.price")[0]
price_text = price_el.text_content().strip()
print("PRICE TEXT:", price_text)

NAME: Myprotein Clear Whey Isolate - MARVEL
HREF: /p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/
PRICE TEXT: $39.99â


In [4]:
# Preprocess price to get a clean numeric value
raw_price = price_text

# Remove non-numeric / non-dot characters:
clean_price = re.sub(r"[^\d\.]", "", raw_price)

# Convert to float
price_value = float(clean_price)

print("Clean price:", price_value)

Clean price: 39.99


In [5]:
# find the raw text test
first_card = cards[0]

review_el = first_card.cssselect("span.reviews-total")
if review_el:
    review_text = review_el[0].text_content().strip()
    print("RAW REVIEW TEXT:", review_text)
else:
    print("No reviews-total span found")

RAW REVIEW TEXT: (29)


In [6]:
# clean previous value
raw_reviews = review_text

# Remove anything that is not a digit
digits_only = re.sub(r"\D", "", raw_reviews)

review_count = int(digits_only) if digits_only else 0

print("Clean review count:", review_count)

Clean review count: 29


In [7]:
BASE_URL = "https://us.myprotein.com"

def scrape_page(url):
    """Scrapes a single MyProtein product-list page and returns a DataFrame."""

    result = requests.get(url, headers=headers)
    result.raise_for_status()
    html = lx.fromstring(result.text)

    cards = html.cssselect("product-card-wrapper")
    rows = []

    for card in cards:

        # product name + link
        name_el = card.cssselect("a.product-item-title")[0]
        name = name_el.text_content().strip()
        href = name_el.get("href")
        full_url = BASE_URL + href

        # price
        price_el = card.cssselect("span.price")[0]
        price_text = price_el.text_content().strip()
        clean_price = re.sub(r"[^\d\.]", "", price_text)
        price_value = float(clean_price) if clean_price else None

        # reviews
        review_el = card.cssselect("span.reviews-total")
        if review_el:
            digits_only = re.sub(r"\D", "", review_el[0].text_content())
            n_reviews = int(digits_only) if digits_only else 0
        else:
            n_reviews = 0

        rows.append({
            "store": "myprotein",
            "product_name": name,
            "product_url": full_url,
            "price_total": price_value,
            "raw_price_text": price_text,
            "n_reviews": n_reviews,
        })

    return pd.DataFrame(rows)

In [8]:
# get all entries from all pages
df1 = scrape_page("https://us.myprotein.com/c/nutrition/protein/")
df2 = scrape_page("https://us.myprotein.com/c/nutrition/protein/?pageNumber=2")
full_df = pd.concat([df1, df2], ignore_index=True)

In [9]:
full_df

Unnamed: 0,store,product_name,product_url,price_total,raw_price_text,n_reviews
0,myprotein,Myprotein Clear Whey Isolate - MARVEL,https://us.myprotein.com/p/sports-nutrition/my...,39.99,$39.99â,29
1,myprotein,Impact Whey Protein,https://us.myprotein.com/p/sports-nutrition/im...,19.99,$19.99â,4389
2,myprotein,Clear Whey Isolate,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,899
3,myprotein,Clear Whey Isolate (Sample),https://us.myprotein.com/p/sports-nutrition/cl...,3.99,$3.99â,53
4,myprotein,Pea Protein Isolate,https://us.myprotein.com/p/sports-nutrition/pe...,34.99,$34.99â,279
5,myprotein,Whey Forward,https://us.myprotein.com/p/sports-nutrition/wh...,29.99,$29.99â,165
6,myprotein,Impact Whey Isolate,https://us.myprotein.com/p/sports-nutrition/im...,29.99,$29.99â,2427
7,myprotein,Clear Whey MIKE AND IKEÂ® Flavors,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,210
8,myprotein,Myprotein Clear Whey Isolate - MARVEL (Sample),https://us.myprotein.com/p/sports-nutrition/my...,3.99,$3.99â,5
9,myprotein,Clear Whey Isolate Protein Drink,https://us.myprotein.com/p/sports-nutrition/cl...,49.99,$49.99â,5


In [10]:
# pick one product URL from your scraped DataFrame
# testing with the first entry
product_url = full_df.loc[0, "product_url"]

print("Scraping:", product_url)

result = requests.get(product_url, headers=headers)
result.raise_for_status()

html = lx.fromstring(result.text)

# find the table under the "Nutrition Facts" section
tables = html.xpath(
    "//strong[normalize-space()='Nutrition Facts']/ancestor::li[1]//table"
)

print("Found tables:", len(tables))

if tables:
    table = tables[0]
    rows = table.xpath(".//tr")
    for i, row in enumerate(rows):
        cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
        print(f"Row {i}:", cells)
else:
    print("No nutrition table found.")

Scraping: https://us.myprotein.com/p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/
Found tables: 1
Row 0: ['Serving Size', '1 scoop (25g)']
Row 1: ['Amount Per Serving', '%Daily Value*']
Row 2: ['Calories', '80', 'Calories from Fat', '0']
Row 3: ['Total Fat', '0 g', '0%']
Row 4: ['Saturated Fat', '0 g', '0%']
Row 5: ['Trans Fat', '0 g', '0%']
Row 6: ['Cholesterol', '<5 mg', '0%']
Row 7: ['Sodium', '10 mg', '0%']
Row 8: ['Total Carbohydrate', '<1 g', '0%']
Row 9: ['Dietary Fiber', '0 g', '0%']
Row 10: ['Total Sugars', '0 g', 'n/a']
Row 11: ['Added Sugar', '0 g', 'n/a']
Row 12: ['Protein', '20 g', '40%']
Row 13: ['Vitamin D', '0 mcg (0%)', 'Potassium', '0 mg (0%)']
Row 14: ['Calcium', '10 mg (1%)', 'Iron', '0.1 g (1%)']
Row 15: ['* Percent daily Values are based on a 2,000 calorie diet. Your daily', 'values may be higher or lower depending on your calorie needs:']


In [11]:
# preprocess nutrition info
nutrition = {
    "serving_size": None,
    "calories": None,
    "protein_g": None,
}

for row in rows:
    cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
    if not cells:
        continue

    label = cells[0]

    # Serving size row
    if label == "Serving Size" and len(cells) >= 2:
        nutrition["serving_size"] = cells[1]

    # Calories row
    elif label == "Calories" and len(cells) >= 2:
        # first numeric value after 'Calories'
        nutrition["calories"] = int("".join(ch for ch in cells[1] if ch.isdigit()))

    # Protein row
    elif label == "Protein" and len(cells) >= 2:
        val = cells[1]
        num = "".join(ch for ch in val if (ch.isdigit() or ch == "."))
        nutrition["protein_g"] = float(num) if num else 0.0

print(nutrition)

{'serving_size': '1 scoop (25g)', 'calories': 80, 'protein_g': 20.0}


In [12]:
# extract numeric value helper
def extract_number(text):
    """
    Extract the first numeric value from a string like:
    '2g', '2 g', '<1 g', 'Less than 1 g', '0.5g', etc.
    Returns float or None.
    """
    if not text:
        return None

    text = text.lower()
    text = text.replace("less than", "")
    text = text.replace("<", "")

    m = re.search(r"(\d+(\.\d+)?)", text)
    return float(m.group(1)) if m else None

# main nutrition parsing function
def parse_nutrition(url):
    print(f"Scanning: {url}...")
    try:
        resp = requests.get(url, headers=headers, timeout=12)
        resp.raise_for_status()
    except Exception as e:
        print("Request failed:", url, e)
        return {}

    # initialize the dictionary with None values so we always return a consistent structure
    parsed = {
        "serving_size": None,
        "calories": None,
        "protein_g": None,
    }

    html = lx.fromstring(resp.text)
    
    # first try finding a Standard Nutrition Table
    nutr_table = None
    for tbl in html.xpath("//table"):
        t_text = " ".join(tbl.xpath(".//text()")).lower()
        # must mention both calories and protein to count as a nutrition table
        if "calories" in t_text and "protein" in t_text:
            nutr_table = tbl
            break

    if nutr_table is not None:
        # parse rows of the table
        rows = nutr_table.xpath(".//tr")
        for row in rows:
            cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
            if not cells:
                continue

            label = cells[0].lower()
            rest = " ".join(cells[1:])

            # serving size in table
            if "serving size" in label and parsed["serving_size"] is None:
                parsed["serving_size"] = rest or cells[-1]
                continue

            # calories
            if label == "calories" and parsed["calories"] is None:
                for part in cells[1:]:
                    m = re.search(r"\d+", part)
                    if m:
                        parsed["calories"] = int(m.group(0))
                        break
                continue

            # protein
            if label == "protein" and parsed["protein_g"] is None:
                parsed["protein_g"] = extract_number(rest)
                continue
    else:
        print("No standard table found. Attempting text fallback...")

    # fallback for Serving Size
    # if table didn't exist OR table didn't have "Serving Size", run the regex on raw text.
    if parsed["serving_size"] is None:
        match = re.search(r'Serving Size[^0-9]{0,50}(\d[^<]*)', resp.text, re.IGNORECASE)
        if match:
            clean_text = match.group(1).strip()
            parsed["serving_size"] = clean_text
            print(f"Recovered Serving Size from text: {clean_text}")

    # fallback for Protein/Calories
    if parsed["protein_g"] is None:
         # look for "Protein 20g" or "20g Protein" pattern in raw text
         prot_match = re.search(r'(\d+(\.\d+)?)\s*g\s*Protein', resp.text, re.IGNORECASE)
         if not prot_match:
             prot_match = re.search(r'Protein\s*:?\s*(\d+(\.\d+)?)\s*g', resp.text, re.IGNORECASE)
         
         if prot_match:
             parsed["protein_g"] = float(prot_match.group(1))
             print(f"Recovered Protein from text: {parsed['protein_g']}g")

    return parsed

In [13]:
# test on single URL
test_url = "https://us.myprotein.com/p/sports-nutrition/origin-protein/15294393/"
print(parse_nutrition(test_url))

Scanning: https://us.myprotein.com/p/sports-nutrition/origin-protein/15294393/...
Recovered Serving Size from text: 2&nbsp;Scoop (45.5g)
{'serving_size': '2&nbsp;Scoop (45.5g)', 'calories': 160, 'protein_g': 30.0}


In [14]:
# run for all rows of data
nutrition_rows = [parse_nutrition(u) for u in full_df["product_url"]]
nut_df = pd.DataFrame(nutrition_rows)
full_with_nutrition = pd.concat(
    [full_df.reset_index(drop=True), nut_df.reset_index(drop=True)],
    axis=1
)

Scanning: https://us.myprotein.com/p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/...
Scanning: https://us.myprotein.com/p/sports-nutrition/impact-whey-protein/10852500/...
Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate/12095867/...
Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate-sample/12095872/...
Scanning: https://us.myprotein.com/p/sports-nutrition/pea-protein-isolate/10852589/...
Scanning: https://us.myprotein.com/p/sports-nutrition/whey-forward/13625704/...
Recovered Serving Size from text: 1 Scoop (25.2g)
Scanning: https://us.myprotein.com/p/sports-nutrition/impact-whey-isolate/10852482/...
Recovered Serving Size from text: 1 Scoop (29g)
Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-mike-and-ike-flavors/13121847/...
Recovered Serving Size from text: 1 Scoop (26.3g)
Scanning: https://us.myprotein.com/p/sports-nutrition/myprotein-clear-whey-isolate-marvel-sample/15368064/...
Scanning: https://us

In [15]:
# convert serving size to grams
def serving_size_to_grams(text):
    if not isinstance(text, str):
        return None
    
    t = text.lower()
    
    # common patterns: "1 scoop (25g)", "1 Large Scoop (25g)", "1 scoop (30 g)"
    m = re.search(r"\((\d+(\.\d+)?)\s*g\)", t)
    if m:
        return float(m.group(1))
    
    # fallback: "1 scoop 25g"
    m = re.search(r"(\d+(\.\d+)?)\s*g", t)
    if m:
        return float(m.group(1))
    
    return None

full_with_nutrition["serving_size_g"] = full_with_nutrition["serving_size"].apply(serving_size_to_grams)
full_with_nutrition.head()

Unnamed: 0,store,product_name,product_url,price_total,raw_price_text,n_reviews,serving_size,calories,protein_g,serving_size_g
0,myprotein,Myprotein Clear Whey Isolate - MARVEL,https://us.myprotein.com/p/sports-nutrition/my...,39.99,$39.99â,29,1 scoop (25g),80.0,20.0,25.0
1,myprotein,Impact Whey Protein,https://us.myprotein.com/p/sports-nutrition/im...,19.99,$19.99â,4389,1 scoop (25g),100.0,19.0,25.0
2,myprotein,Clear Whey Isolate,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,899,1 scoop (25g),80.0,20.0,25.0
3,myprotein,Clear Whey Isolate (Sample),https://us.myprotein.com/p/sports-nutrition/cl...,3.99,$3.99â,53,1 scoop (25g),80.0,20.0,25.0
4,myprotein,Pea Protein Isolate,https://us.myprotein.com/p/sports-nutrition/pe...,34.99,$34.99â,279,1 Large Scoop (25g),100.0,21.0,25.0


In [16]:
# rating parsing function
def parse_rating(url):
    """Extract average rating like '3.67 out of 5 stars'."""
    try:
        resp = requests.get(url, headers=headers, timeout=12)
        resp.raise_for_status()
    except Exception as e:
        print("Request failed:", url, e)
        return None
    
    text = resp.text

    # pattern:   3.67 out of 5 stars
    m = re.search(r"(\d\.\d+)\s+out of 5 stars", text)
    if m:
        return float(m.group(1))

    # backup pattern: raw number placed elsewhere (rare but possible)
    m = re.search(r"(\d\.\d+)\s*</span>", text)
    if m:
        return float(m.group(1))

    return None

In [None]:
# test on single URL
test_url = "https://us.myprotein.com/p/sports-nutrition/origin-protein/15294393/"
print(parse_rating(test_url))

3.67


In [18]:
# run for all rows of data
rating_rows = [parse_rating(u) for u in full_df["product_url"]]
full_with_nutrition["rating_avg"] = rating_rows

In [19]:
# extracts the amount of product by matching the SKU in the URL
def get_amount_by_specific_sku(url):
    print(f"Scanning: {url}...")
    try:
        # extract the SKU (Product ID) from the URL
        # URLs look like: .../clear-whey-isolate-sample/12095872/, we want the last few digits
        sku_match = re.search(r'/(\d+)/?$', url)
        if not sku_match:
            print("Could not find SKU in URL.")
            return None
        
        target_sku = sku_match.group(1)
        print(f"Target SKU: {target_sku}")

        resp = requests.get(url, headers=headers, timeout=10)
        html_text = resp.text

        # then find the SKU, then look forward for the Amount
        # we look for: "sku":12095872 ... "optionKey":"Amount","key":"0.88Oz"
        # we limit the search distance to ~1000 characters to prevent grabbing the wrong product
        pattern = (
            r'"sku":' + target_sku +
            r'(?:.|[\r\n]){1,1000}?' +
            r'"optionKey":"Amount","key":"([^"]+)"'
        )
        
        match = re.search(pattern, html_text)
        
        if match:
            found_amount = match.group(1)
            print(f"Found match: {found_amount}")
            return found_amount

        # if none of those work, try searching for the SKU *inside* quotes (sometimes they are strings)
        pattern_str = (
            r'"sku":"' + target_sku + 
            r'(?:.|[\r\n]){1,1000}?' + 
            r'"optionKey":"Amount","key":"([^"]+)"'
        )
        match_str = re.search(pattern_str, html_text)
        
        if match_str:
            found_amount = match_str.group(1)
            print(f"Found match (string format): {found_amount}")
            return found_amount

        print("SKU found, but no Amount key nearby. Using default fallback.")
        
        # if we can't match the specific SKU, try old method
        # This prevents the script from returning None if the page structure is weird
        fallback_match = re.search(r'"defaultVariant".{1,2000}?"optionKey":"Amount","key":"([^"]+)"', html_text, re.DOTALL)
        if fallback_match:
            return fallback_match.group(1)

        return None

    except Exception as e:
        print(f"Error: {e}")
        return None

# TEST
test_url_sample = "https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate-sample/12095872/"

result = get_amount_by_specific_sku(test_url_sample)
print(f"Result: {result}")

Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate-sample/12095872/...
Target SKU: 12095872
Found match: 0.88Oz
Result: 0.88Oz


In [20]:
# load data
print(f"Starting amount scrape for {len(full_with_nutrition)} products...")

# define a wrapper to add a tiny delay
def amount_scraper_wrapper(url):
    val = get_amount_by_specific_sku(url)
    time.sleep(0.5) # Sleep 0.5s to prevent rate limiting
    return val

# apply to the whole column
full_with_nutrition["Number of Servings"] = full_with_nutrition["product_url"].apply(amount_scraper_wrapper)

Starting amount scrape for 31 products...
Scanning: https://us.myprotein.com/p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/...
Target SKU: 14944142
Found match: 20servings
Scanning: https://us.myprotein.com/p/sports-nutrition/impact-whey-protein/10852500/...
Target SKU: 10852500
Found match: 0.55lb
Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate/12095867/...
Target SKU: 12095867
Found match: 1.1lb
Scanning: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate-sample/12095872/...
Target SKU: 12095872
Found match: 0.88Oz
Scanning: https://us.myprotein.com/p/sports-nutrition/pea-protein-isolate/10852589/...
Target SKU: 10852589
Found match: 2.2lb
Scanning: https://us.myprotein.com/p/sports-nutrition/whey-forward/13625704/...
Target SKU: 13625704
Found match: 20servings
Scanning: https://us.myprotein.com/p/sports-nutrition/impact-whey-isolate/10852482/...
Target SKU: 10852482
Found match: 0.55lb
Scanning: https://us.myprotein.com/p/sport

In [23]:
full_with_nutrition.to_csv("data/myprotein_nutrition.csv", index=False)