# Scraping

https://us.myprotein.com/c/nutrition/protein/

In [1]:
import requests
import lxml.html as lx

url = "https://us.myprotein.com/c/nutrition/protein/"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0"
}

result = requests.get(url, headers=headers)
result.raise_for_status()          # make sure we actually got the page

html = lx.fromstring(result.text)  # parse HTML into a tree

cards = html.cssselect("product-card-wrapper")
print(len(cards))

28


In [2]:

first_card = cards[0]

# 1. Product name
name_el = first_card.cssselect("a.product-item-title")[0]
name = name_el.text_content().strip()
print("NAME:", name)

# 2. Product link (relative URL)
href = name_el.get("href")
print("HREF:", href)

# 3. Price
price_el = first_card.cssselect("span.price")[0]
price_text = price_el.text_content().strip()
print("PRICE TEXT:", price_text)


NAME: Myprotein Clear Whey Isolate - MARVEL
HREF: /p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/
PRICE TEXT: $39.99â


In [3]:

import re

raw_price = price_text

# Remove non-numeric / non-dot characters:
clean_price = re.sub(r"[^\d\.]", "", raw_price)

# Convert to float
price_value = float(clean_price)

print("Clean price:", price_value)


Clean price: 39.99


In [4]:

first_card = cards[0]

review_el = first_card.cssselect("span.reviews-total")
if review_el:
    review_text = review_el[0].text_content().strip()
    print("RAW REVIEW TEXT:", review_text)
else:
    print("No reviews-total span found")

RAW REVIEW TEXT: (29)


In [5]:

import re

raw_reviews = review_text          # e.g. "(29)"

# Remove anything that is not a digit
digits_only = re.sub(r"\D", "", raw_reviews)

review_count = int(digits_only) if digits_only else 0

print("Clean review count:", review_count)

Clean review count: 29


In [6]:
import re
import pandas as pd
import requests
import lxml.html as lx

BASE_URL = "https://us.myprotein.com"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0"
}

def scrape_page(url):
    """Scrapes a single MyProtein product-list page and returns a DataFrame."""

    result = requests.get(url, headers=headers)
    result.raise_for_status()
    html = lx.fromstring(result.text)

    cards = html.cssselect("product-card-wrapper")
    rows = []

    for card in cards:

        # product name + link
        name_el = card.cssselect("a.product-item-title")[0]
        name = name_el.text_content().strip()
        href = name_el.get("href")
        full_url = BASE_URL + href

        # price
        price_el = card.cssselect("span.price")[0]
        price_text = price_el.text_content().strip()
        clean_price = re.sub(r"[^\d\.]", "", price_text)
        price_value = float(clean_price) if clean_price else None

        # reviews (optional)
        review_el = card.cssselect("span.reviews-total")
        if review_el:
            digits_only = re.sub(r"\D", "", review_el[0].text_content())
            n_reviews = int(digits_only) if digits_only else 0
        else:
            n_reviews = 0

        rows.append({
            "store": "myprotein",
            "product_name": name,
            "product_url": full_url,
            "price_total": price_value,
            "raw_price_text": price_text,
            "n_reviews": n_reviews,
        })

    return pd.DataFrame(rows)

In [7]:
df1 = scrape_page("https://us.myprotein.com/c/nutrition/protein/")
df2 = scrape_page("https://us.myprotein.com/c/nutrition/protein/?pageNumber=2")
full_df = pd.concat([df1, df2], ignore_index=True)

In [8]:
full_df

Unnamed: 0,store,product_name,product_url,price_total,raw_price_text,n_reviews
0,myprotein,Myprotein Clear Whey Isolate - MARVEL,https://us.myprotein.com/p/sports-nutrition/my...,39.99,$39.99â,29
1,myprotein,Impact Whey Protein,https://us.myprotein.com/p/sports-nutrition/im...,19.99,$19.99â,4389
2,myprotein,Clear Whey Isolate,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,899
3,myprotein,Clear Whey Isolate (Sample),https://us.myprotein.com/p/sports-nutrition/cl...,3.99,$3.99â,53
4,myprotein,Pea Protein Isolate,https://us.myprotein.com/p/sports-nutrition/pe...,34.99,$34.99â,279
5,myprotein,Whey Forward,https://us.myprotein.com/p/sports-nutrition/wh...,29.99,$29.99â,165
6,myprotein,Impact Whey Isolate,https://us.myprotein.com/p/sports-nutrition/im...,29.99,$29.99â,2427
7,myprotein,Clear Whey MIKE AND IKEÂ® Flavors,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,210
8,myprotein,Clear Whey Isolate Protein Drink,https://us.myprotein.com/p/sports-nutrition/cl...,49.99,$49.99â,5
9,myprotein,Myprotein Clear Whey Isolate - MARVEL (Sample),https://us.myprotein.com/p/sports-nutrition/my...,3.99,$3.99â,5


In [9]:

import requests
import lxml.html as lx

# pick one product URL from your scraped DataFrame
product_url = full_df.loc[0, "product_url"]   # e.g. Clear Whey Isolate - MARVEL

print("Scraping:", product_url)

result = requests.get(product_url, headers=headers)
result.raise_for_status()

html = lx.fromstring(result.text)

# 1. Find the table under the "Nutrition Facts" section
tables = html.xpath(
    "//strong[normalize-space()='Nutrition Facts']/ancestor::li[1]//table"
)

print("Found tables:", len(tables))

if tables:
    table = tables[0]

    # 2. Print each row as a list of cell texts
    rows = table.xpath(".//tr")
    for i, row in enumerate(rows):
        cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
        print(f"Row {i}:", cells)
else:
    print("No nutrition table found.")


Scraping: https://us.myprotein.com/p/sports-nutrition/myprotein-clear-whey-isolate-marvel/14944142/
Found tables: 1
Row 0: ['Serving Size', '1 scoop (25g)']
Row 1: ['Amount Per Serving', '%Daily Value*']
Row 2: ['Calories', '80', 'Calories from Fat', '0']
Row 3: ['Total Fat', '0 g', '0%']
Row 4: ['Saturated Fat', '0 g', '0%']
Row 5: ['Trans Fat', '0 g', '0%']
Row 6: ['Cholesterol', '<5 mg', '0%']
Row 7: ['Sodium', '10 mg', '0%']
Row 8: ['Total Carbohydrate', '<1 g', '0%']
Row 9: ['Dietary Fiber', '0 g', '0%']
Row 10: ['Total Sugars', '0 g', 'n/a']
Row 11: ['Added Sugar', '0 g', 'n/a']
Row 12: ['Protein', '20 g', '40%']
Row 13: ['Vitamin D', '0 mcg (0%)', 'Potassium', '0 mg (0%)']
Row 14: ['Calcium', '10 mg (1%)', 'Iron', '0.1 g (1%)']
Row 15: ['* Percent daily Values are based on a 2,000 calorie diet. Your daily', 'values may be higher or lower depending on your calorie needs:']


In [10]:

nutrition = {
    "serving_size": None,
    "calories": None,
    "total_fat_g": None,
    "total_carbs_g": None,
    "protein_g": None,
}

for row in rows:
    cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
    if not cells:
        continue

    label = cells[0]

    # Serving size row
    if label == "Serving Size" and len(cells) >= 2:
        nutrition["serving_size"] = cells[1]

    # Calories row: ['Calories', '80', 'Calories from Fat', '0']
    elif label == "Calories" and len(cells) >= 2:
        # first numeric value after 'Calories'
        nutrition["calories"] = int("".join(ch for ch in cells[1] if ch.isdigit()))

    # Total Fat row: ['Total Fat', '0 g', '0%']
    elif label == "Total Fat" and len(cells) >= 2:
        val = cells[1]
        # keep digits and decimal point only
        num = "".join(ch for ch in val if (ch.isdigit() or ch == "."))
        nutrition["total_fat_g"] = float(num) if num else 0.0

    # Total Carbohydrate row: ['Total Carbohydrate', '<1 g', '0%']
    elif label == "Total Carbohydrate" and len(cells) >= 2:
        val = cells[1].replace("<", "")  # treat "<1 g" as "1 g"
        num = "".join(ch for ch in val if (ch.isdigit() or ch == "."))
        nutrition["total_carbs_g"] = float(num) if num else 0.0

    # Protein row: ['Protein', '20 g', '40%']
    elif label == "Protein" and len(cells) >= 2:
        val = cells[1]
        num = "".join(ch for ch in val if (ch.isdigit() or ch == "."))
        nutrition["protein_g"] = float(num) if num else 0.0

print(nutrition)


{'serving_size': '1 scoop (25g)', 'calories': 80, 'total_fat_g': 0.0, 'total_carbs_g': 1.0, 'protein_g': 20.0}


In [11]:
import re
import requests
import lxml.html as lx

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) "
        "Gecko/20100101 Firefox/140.0"
    )
}


def extract_number(text):
    """
    Extract the first numeric value from a string like:
    '2g', '2 g', '<1 g', 'Less than 1 g', '0.5g', etc.
    Returns float or None.
    """
    if not text:
        return None

    text = text.lower()
    text = text.replace("less than", "")
    text = text.replace("<", "")

    m = re.search(r"(\d+(\.\d+)?)", text)
    return float(m.group(1)) if m else None


def parse_nutrition(url):
    try:
        resp = requests.get(url, headers=headers, timeout=12)
        resp.raise_for_status()
    except Exception as e:
        print("Request failed:", url, e)
        return {}

    html = lx.fromstring(resp.text)

    # 1. Find the first table that looks like a nutrition table
    nutr_table = None
    for tbl in html.xpath("//table"):
        t_text = " ".join(tbl.xpath(".//text()")).lower()
        # Must mention both calories and protein to count as a nutrition table
        if "calories" in t_text and "protein" in t_text:
            nutr_table = tbl
            break

    if nutr_table is None:
        print("No nutrition table found for:", url)
        return {}

    # 2. Parse rows of that table
    parsed = {
        "serving_size": None,
        "calories": None,
        "total_fat_g": None,
        "total_carbs_g": None,
        "protein_g": None,
    }

    rows = nutr_table.xpath(".//tr")
    for row in rows:
        cells = [t.strip() for t in row.xpath(".//text()") if t.strip()]
        if not cells:
            continue

        label = cells[0].lower()
        rest = " ".join(cells[1:])  # everything after the label

        # Serving size appears in lots of formats; just grab the text once
        if "serving size" in label and parsed["serving_size"] is None:
            parsed["serving_size"] = rest or cells[-1]
            continue

        # Calories row – multiple styles, so look for a number anywhere after label
        if label == "calories" and parsed["calories"] is None:
            for part in cells[1:]:
                m = re.search(r"\d+", part)
                if m:
                    parsed["calories"] = int(m.group(0))
                    break
            continue

        # Total fat
        if "total fat" in label and parsed["total_fat_g"] is None:
            parsed["total_fat_g"] = extract_number(rest)
            continue

        # Total carbohydrate
        if "total carbohydrate" in label and parsed["total_carbs_g"] is None:
            parsed["total_carbs_g"] = extract_number(rest)
            continue

        # Protein
        if label == "protein" and parsed["protein_g"] is None:
            parsed["protein_g"] = extract_number(rest)
            continue

    return parsed


In [12]:
test_url = "https://us.myprotein.com/p/sports-nutrition/origin-protein/15294393/"
print(parse_nutrition(test_url))

{'serving_size': None, 'calories': 160, 'total_fat_g': 2.0, 'total_carbs_g': 4.0, 'protein_g': 30.0}


In [13]:
nutrition_rows = [parse_nutrition(u) for u in full_df["product_url"]]
nut_df = pd.DataFrame(nutrition_rows)
full_with_nutrition = pd.concat(
    [full_df.reset_index(drop=True), nut_df.reset_index(drop=True)],
    axis=1
)

No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/clear-whey-isolate-with-all-natural-flavors/16546407/
No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/the-whey-hyrox/16675322/
No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/grass-fed-impact-whey-isolate/16401346/
No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/clear-protein-bundle/14920007/
No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/get-stronger-bundle/14970297/
No nutrition table found for: https://us.myprotein.com/p/sports-nutrition/impact-protein-bundle/14914588/


In [14]:
full_with_nutrition[full_with_nutrition["protein_g"].isnull()]

Unnamed: 0,store,product_name,product_url,price_total,raw_price_text,n_reviews,serving_size,calories,total_fat_g,total_carbs_g,protein_g
16,myprotein,Clear Whey Isolate | With All Natural Flavors,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,5,,,,,
18,myprotein,THE Whey - HYROX,https://us.myprotein.com/p/sports-nutrition/th...,54.99,$54.99â,0,,,,,
24,myprotein,Grass-Fed Impact Whey Isolate,https://us.myprotein.com/p/sports-nutrition/gr...,59.99,$59.99â,1,,,,,
26,myprotein,Clear Protein Bundle,https://us.myprotein.com/p/sports-nutrition/cl...,47.99,$47.99â,6,,,,,
27,myprotein,Get Stronger Bundle,https://us.myprotein.com/p/sports-nutrition/ge...,95.98,$95.98â,9,,,,,
29,myprotein,Impact Protein Bundle,https://us.myprotein.com/p/sports-nutrition/im...,57.99,$57.99â,9,,,,,


In [15]:
full_with_nutrition.iloc[10,:]['product_url']

'https://us.myprotein.com/p/sports-nutrition/weight-gainer-blend/10852477/'

In [16]:
import re

def serving_size_to_grams(text):
    if not isinstance(text, str):
        return None
    
    t = text.lower()
    
    # common patterns: "1 scoop (25g)", "1 Large Scoop (25g)", "1 scoop (30 g)"
    m = re.search(r"\((\d+(\.\d+)?)\s*g\)", t)
    if m:
        return float(m.group(1))
    
    # fallback: "1 scoop 25g"
    m = re.search(r"(\d+(\.\d+)?)\s*g", t)
    if m:
        return float(m.group(1))
    
    return None

full_with_nutrition["serving_size_g"] = full_with_nutrition["serving_size"].apply(serving_size_to_grams)
full_with_nutrition


Unnamed: 0,store,product_name,product_url,price_total,raw_price_text,n_reviews,serving_size,calories,total_fat_g,total_carbs_g,protein_g,serving_size_g
0,myprotein,Myprotein Clear Whey Isolate - MARVEL,https://us.myprotein.com/p/sports-nutrition/my...,39.99,$39.99â,29,1 scoop (25g),80.0,0.0,1.0,20.0,25.0
1,myprotein,Impact Whey Protein,https://us.myprotein.com/p/sports-nutrition/im...,19.99,$19.99â,4389,1 scoop (25g),100.0,1.0,3.0,19.0,25.0
2,myprotein,Clear Whey Isolate,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,899,1 scoop (25g),80.0,0.0,0.0,20.0,25.0
3,myprotein,Clear Whey Isolate (Sample),https://us.myprotein.com/p/sports-nutrition/cl...,3.99,$3.99â,53,1 scoop (25g),80.0,0.0,1.0,20.0,25.0
4,myprotein,Pea Protein Isolate,https://us.myprotein.com/p/sports-nutrition/pe...,34.99,$34.99â,279,1 Large Scoop (25g),100.0,2.0,0.5,21.0,25.0
5,myprotein,Whey Forward,https://us.myprotein.com/p/sports-nutrition/wh...,29.99,$29.99â,165,,90.0,0.0,3.0,20.0,
6,myprotein,Impact Whey Isolate,https://us.myprotein.com/p/sports-nutrition/im...,29.99,$29.99â,2427,,110.0,0.0,1.0,25.0,
7,myprotein,Clear Whey MIKE AND IKEÂ® Flavors,https://us.myprotein.com/p/sports-nutrition/cl...,39.99,$39.99â,210,,90.0,0.0,2.0,20.0,
8,myprotein,Clear Whey Isolate Protein Drink,https://us.myprotein.com/p/sports-nutrition/cl...,49.99,$49.99â,5,,80.0,0.0,1.0,20.0,
9,myprotein,Myprotein Clear Whey Isolate - MARVEL (Sample),https://us.myprotein.com/p/sports-nutrition/my...,3.99,$3.99â,5,1 Sachet (25g),80.0,0.0,1.0,20.0,25.0


In [17]:
resp = requests.get(test_url, headers=headers)
print("3.67 out of 5 stars" in resp.text)   # probably False
print("Customer Reviews" in resp.text)      # this might be True/False depending on SSR


True
True


In [18]:
import re
import requests
import lxml.html as lx

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0"
}

def parse_rating(url):
    """Extract average rating like '3.67 out of 5 stars'."""
    try:
        resp = requests.get(url, headers=headers, timeout=12)
        resp.raise_for_status()
    except Exception as e:
        print("Request failed:", url, e)
        return None
    
    text = resp.text

    # Pattern:   3.67 out of 5 stars
    m = re.search(r"(\d\.\d+)\s+out of 5 stars", text)
    if m:
        return float(m.group(1))

    # Backup pattern: raw number placed elsewhere (rare but possible)
    m = re.search(r"(\d\.\d+)\s*</span>", text)
    if m:
        return float(m.group(1))

    return None


In [19]:
test_url = "https://us.myprotein.com/p/sports-nutrition/origin-protein/15294393/"
print(parse_rating(test_url))


3.67


In [None]:
rating_rows = [parse_rating(u) for u in full_df["product_url"]]
full_with_nutrition["rating_avg"] = rating_rows

In [21]:
full_with_nutrition.to_csv("myprotein_nutrition.csv", index=False)