# Step-by-step walkthrough: extracting material data from https://materialdistrict.com

## 0. Setup

In [1]:
import time, random, re
from urllib import robotparser
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Site root; relative paths are resolved against it with urljoin().
BASE = "https://materialdistrict.com"
# Headers identifying this crawler; get() sends them with every request it makes.
# NOTE(review): the contact address inside User-Agent looks like a placeholder —
# replace it with a real one before crawling.
HDRS = {
    "User-Agent": "Academic research bot (non-commercial) <you@your.edu>",
    "Accept-Language": "en-US,en;q=0.9",
}

def get(url, **kw):
    """GET ``url`` via ``requests.get`` with the notebook's default headers.

    Parameters
    ----------
    url : str
        Address to fetch.
    **kw
        Forwarded to ``requests.get``. ``timeout`` defaults to 30 seconds.
        ``headers``, if given, is merged on top of ``HDRS`` (caller values win)
        instead of clashing with the default headers.

    Returns
    -------
    The ``requests`` response object. The HTTP status is not checked here;
    callers decide whether to call ``raise_for_status()``.
    """
    timeout = kw.pop("timeout", 30)
    # Pop caller headers so they cannot collide with the headers= keyword below.
    extra_headers = kw.pop("headers", None) or {}
    headers = {**HDRS, **extra_headers}
    return requests.get(url, headers=headers, timeout=timeout, **kw)

def show(text, max_chars=1200):
    """Print ``text``, cut off after ``max_chars`` characters.

    Non-string values are converted with ``str()`` first. When the text is
    longer than ``max_chars``, only the first ``max_chars`` characters are
    printed, followed by a truncation marker on its own line.
    """
    rendered = text if isinstance(text, str) else str(text)
    if len(rendered) > max_chars:
        rendered = rendered[:max_chars] + "\n... [truncated]"
    print(rendered)

## 1. Check `robots.txt`

In [2]:
# Download robots.txt once and print it (truncated) for a manual read-through.
robots_url = urljoin(BASE, "/robots.txt")
r = get(robots_url)
r.raise_for_status()
print("robots.txt URL:", robots_url)
show(r.text)

# Hand the already-downloaded text to robotparser so individual paths can be
# checked programmatically.
rp = robotparser.RobotFileParser()
rp.set_url(robots_url)
rp.parse(r.text.splitlines())

# Paths whose crawl permission we want to confirm.
paths_to_check = [
    "/",
    "/material/",
    "/material/page/1/",
    "/material/mycelium-foam/",
    "/wp-json/",
    "/wp-admin/",
]

# The check uses the same User-Agent string that get() sends.
ua = HDRS["User-Agent"]
print("\nCan we fetch these paths?")
verdicts = [(p, rp.can_fetch(ua, urljoin(BASE, p))) for p in paths_to_check]
for p, allowed in verdicts:
    print(f"{p:35} -> {allowed}")

robots.txt URL: https://materialdistrict.com/robots.txt
User-agent: *
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /themes/
Disallow: /85712959/

User-agent: Googlebot
Allow: /wp-admin/css
Allow: /wp-admin/js

Can we fetch these paths?
/                                   -> True
/material/                          -> True
/material/page/1/                   -> True
/material/mycelium-foam/            -> True
/wp-json/                           -> True
/wp-admin/                          -> False


## 2. Try the sitemap index

In [3]:
# Fetch the sitemap index and collect every <loc> entry it lists.
sitemap_index = urljoin(BASE, "/sitemap_index.xml")
resp = get(sitemap_index)
print("Sitemap index status:", resp.status_code)

# Bind `locs` on every run: if the request fails, later cells see an empty
# list instead of a NameError or a stale list left over from an earlier run.
locs = []
if resp.ok:
    soup = BeautifulSoup(resp.text, "xml")  # XML parser
    locs = [loc.get_text(strip=True) for loc in soup.find_all("loc")]
    print(f"Found {len(locs)} sitemap files.")
    # Show a few; the 55:65 window is an arbitrary slice of the list and
    # prints nothing when the index has fewer than 56 entries.
    for u in locs[55:65]:
        print("  -", u)
else:
    # Print the (truncated) error body to help diagnose the failure.
    show(resp.text)

Sitemap index status: 200
Found 83 sitemap files.
  - https://materialdistrict.com/brand-sitemap2.xml
  - https://materialdistrict.com/brand-sitemap3.xml
  - https://materialdistrict.com/material-sitemap.xml
  - https://materialdistrict.com/material-sitemap2.xml
  - https://materialdistrict.com/material-sitemap3.xml
  - https://materialdistrict.com/material-sitemap4.xml
  - https://materialdistrict.com/post_tag-sitemap.xml
  - https://materialdistrict.com/post_tag-sitemap2.xml
  - https://materialdistrict.com/persons-sitemap.xml
  - https://materialdistrict.com/sector-sitemap.xml


## 3. Keep only material sitemaps

In [4]:
# Keep only the sitemap files whose URL contains "material-sitemap"
# (compared case-insensitively).
material_sitemaps = []
for u in locs:
    if "material-sitemap" in u.lower():
        material_sitemaps.append(u)

print("Material sitemaps:", len(material_sitemaps))
for u in material_sitemaps:
    print(" -", u)


Material sitemaps: 4
 - https://materialdistrict.com/material-sitemap.xml
 - https://materialdistrict.com/material-sitemap2.xml
 - https://materialdistrict.com/material-sitemap3.xml
 - https://materialdistrict.com/material-sitemap4.xml


## 4. Extract all `/material/...` URLs from those sitemaps

In [5]:
def urls_from_sitemap(url: str):
    """Return the ``<loc>`` entries of one sitemap that contain ``/material/``.

    The sitemap is fetched through the notebook's ``get()`` wrapper, so it
    uses the same headers and timeout as the other requests.

    Parameters
    ----------
    url : str
        Address of a sitemap XML file.

    Returns
    -------
    list of str
        Matching URLs in document order (duplicates are not removed).

    Raises
    ------
    Whatever ``raise_for_status()`` raises for a non-success HTTP status.
    """
    r = get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "xml")
    locs = (loc.get_text(strip=True) for loc in soup.find_all("loc"))
    return [u for u in locs if "/material/" in u]

# Gather URLs from every material sitemap into a set, so duplicates collapse
# as they arrive.
unique_material_urls = set()
for sm in material_sitemaps:
    time.sleep(random.uniform(0.8, 1.5))   # polite delay
    subset = urls_from_sitemap(sm)
    print(f"{sm} -> {len(subset)} urls")
    unique_material_urls.update(subset)

# Sorted list of the unique URLs
all_material_urls = sorted(unique_material_urls)
print("\nTOTAL unique material URLs:", len(all_material_urls))
for u in all_material_urls[:10]:
    print("  -", u)

https://materialdistrict.com/material-sitemap.xml -> 1001 urls
https://materialdistrict.com/material-sitemap2.xml -> 1000 urls
https://materialdistrict.com/material-sitemap3.xml -> 1000 urls
https://materialdistrict.com/material-sitemap4.xml -> 163 urls

TOTAL unique material URLs: 3164
  - https://materialdistrict.com/material/
  - https://materialdistrict.com/material/013-denim/
  - https://materialdistrict.com/material/100-bacterial-dye/
  - https://materialdistrict.com/material/100-basalt-fabric/
  - https://materialdistrict.com/material/100-biobased-flax-panel/
  - https://materialdistrict.com/material/100-rejects-waxed-printed-cotton/
  - https://materialdistrict.com/material/2-000-000-000-precious-pieces/
  - https://materialdistrict.com/material/2-5d-print/
  - https://materialdistrict.com/material/2000n-pressed-shoe/
  - https://materialdistrict.com/material/2tec2-high-tech-flooring/


## 5. Sanity-check a few pages (status + `<title>`)

In [6]:
def peek_title(html: str) -> str:
    """Extract the text of the first ``<title>`` element with a regex.

    This is a quick scrape that avoids a full HTML parse. The opening tag may
    carry attributes (e.g. ``<title lang="en">``), and matching is
    case-insensitive.

    Parameters
    ----------
    html : str
        Raw HTML of a page.

    Returns
    -------
    str
        The title with surrounding whitespace stripped and newlines replaced
        by spaces, or ``""`` when no title element is found.
    """
    # \b keeps tags such as <titlebar> from matching; [^>]* allows attributes.
    m = re.search(r"<title\b[^>]*>(.*?)</title\s*>", html, flags=re.I | re.S)
    if not m:
        return ""
    return m.group(1).strip().replace("\n", " ")

# Pick up to five random material pages and check that each one loads.
sample = random.sample(all_material_urls, k=min(5, len(all_material_urls)))
print("Sampling", len(sample), "pages...\n")

for u in sample:
    # Use the shared get() wrapper so headers and timeout match the other cells.
    r = get(u)
    print(r.status_code, "-", u)
    if r.ok:
        print("   ", peek_title(r.text)[:120])
    time.sleep(random.uniform(0.8, 1.5))  # polite delay between requests

Sampling 5 pages...

200 - https://materialdistrict.com/material/pet-felt-acoustic-panels-stripes/
    PET Felt Acoustic Panels Stripes - MaterialDistrict
200 - https://materialdistrict.com/material/evergreen/
    Evergreen - MaterialDistrict
200 - https://materialdistrict.com/material/tectonitx/
    tectonitX™ - MaterialDistrict
200 - https://materialdistrict.com/material/pet-felt-wall-shingles-round/
    PET Felt Wall Shingles Round - MaterialDistrict
200 - https://materialdistrict.com/material/3d-printed-concrete/
    3D printed concrete - MaterialDistrict


## 6. Save URL list

In [7]:
import json, csv, pathlib

out_dir = pathlib.Path(".")
out_dir.mkdir(parents=True, exist_ok=True)

# The sitemaps also list the /material/ index page, which is not a material.
# Drop it by URL rather than by position, so a change in sort order or
# sitemap contents can never remove a real material page instead.
listing_url = urljoin(BASE, "/material/")
urls_to_save = [u for u in all_material_urls if u != listing_url]

csv_path = out_dir / "material_urls.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["url"])  # header row
    w.writerows([u] for u in urls_to_save)

print("Saved:",
      csv_path.resolve(), sep="\n - ")


Saved:
 - /Users/juanesfco/MaterialNet/data/material_urls.csv


## 7. Load URL list

In [8]:
import csv

# Read the saved CSV back and collect the "url" column into a list.
material_urls_list = []
with open('material_urls.csv', newline='', encoding='utf-8') as f:
    material_urls_list.extend(row['url'] for row in csv.DictReader(f))

print(f"Loaded {len(material_urls_list)} URLs.")
material_urls_list[:5]  # Show a sample

Loaded 3163 URLs.


['https://materialdistrict.com/material/013-denim/',
 'https://materialdistrict.com/material/100-bacterial-dye/',
 'https://materialdistrict.com/material/100-basalt-fabric/',
 'https://materialdistrict.com/material/100-biobased-flax-panel/',
 'https://materialdistrict.com/material/100-rejects-waxed-printed-cotton/']

## 8. Pick test URL

In [28]:
# Use the first loaded URL as the page for developing and checking the parsers.
TEST_URL = material_urls_list[0]
print("Test URL:", TEST_URL)

Test URL: https://materialdistrict.com/material/013-denim/


## 9. Helpers to fetch page information

In [23]:
import re, time, requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Shared session used by fetch_html() for every page request.
# NOTE(review): this User-Agent differs from HDRS["User-Agent"], the string
# the robots.txt check was run with — confirm that is intended.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) MaterialNetBot/0.1 (+research; contact me)"
)

def fetch_html(url, *, sleep=0.5, timeout=20):
    """Download ``url`` through the shared session and parse it.

    Parameters
    ----------
    url : str
        Page address.
    sleep : float, keyword-only
        Seconds to pause after the request, before the status is checked.
    timeout : float, keyword-only
        Timeout passed to the session's ``get``.

    Returns
    -------
    tuple
        ``(soup, html)``: the page parsed with the ``lxml`` parser, and the
        raw response text.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for a non-success HTTP status.
    """
    response = SESSION.get(url, timeout=timeout)
    # The pause comes first, so it also applies when the status check raises.
    time.sleep(sleep)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "lxml"), markup

# Matches any run of whitespace characters.
_ws_re = re.compile(r"\s+")
def _clean(s):
    """Collapse whitespace runs in ``s`` to single spaces and strip the ends.

    Returns ``None`` when ``s`` is falsy (``None`` or ``""``). A string made
    only of whitespace comes back as ``""``.
    """
    if not s:
        return None
    collapsed = _ws_re.sub(" ", s)
    return collapsed.strip()

## 10. Parsers for the desired fields

In [24]:
# Title
def parse_title(soup: BeautifulSoup):
    """Return the material's name from a parsed page, or ``None``.

    Lookup order: the ``<h1>`` inside ``.row.title``, then the first ``<h1>``
    anywhere, then the ``og:title`` meta tag with a trailing
    " - MaterialDistrict" suffix removed.
    """
    heading = soup.select_one(".row.title h1")
    if heading is None:
        heading = soup.find("h1")
    if heading is not None:
        heading_text = _clean(heading.get_text())
        if heading_text:
            return heading_text

    # Fall back to the Open Graph title.
    og = soup.find("meta", property="og:title")
    content = og.get("content") if og else None
    if content:
        # Remove suffix " - MaterialDistrict"
        return _clean(re.sub(r"\s*-\s*MaterialDistrict\s*$", "", content))
    return None


# Subtitle
def parse_subtitle_meta(soup: BeautifulSoup):
    """Read Category / Code / Country / Brand from the ``.row.subtitle`` strip.

    Each item is a ``.block`` holding an ``<h6>`` label followed by either a
    link or plain text. Labels are matched after title-casing; blocks with a
    missing, empty or unrecognised label are skipped.

    Returns
    -------
    dict
        Always has the four keys ``"Category"``, ``"Code"``, ``"Country"`` and
        ``"Brand"``; a value is ``None`` when the field was not found.
    """
    want = {"Category": None, "Code": None, "Country": None, "Brand": None}
    row = soup.select_one(".row.subtitle")
    if not row:
        return want

    for blk in row.select(".block"):
        label = blk.find("h6")
        if not label:
            continue
        label_text = _clean(label.get_text())
        if not label_text:
            # An empty <h6> cleans to None/""; calling .title() on it would raise.
            continue
        key = label_text.title()
        if key not in want:
            continue

        a = blk.find("a")
        if a:
            want[key] = _clean(a.get_text())
        else:
            # Text can be mixed; remove the <h6> text from the block's text
            raw = _clean(blk.get_text(" "))
            if raw:
                want[key] = _clean(raw.replace(label_text, "", 1)) or None

    return want

# Body text
def _find_main_post_block(soup: BeautifulSoup):
    """
    Locate the element holding the post body, or return None.

    Primary match: a <div class="block ..."> whose classes include both
    'material' and 'type-material' plus at least one class starting with
    'post-'. If none match, fall back to the div.block elements inside
    "article .col.two-thirds". When several candidates remain, the one with
    the longest stripped text wins.
    """
    def _is_post_block(div):
        classes = set(div.get("class") or [])
        if not {"material", "type-material"} <= classes:
            return False
        return any(c.startswith("post-") for c in classes)

    candidates = [div for div in soup.select("div.block") if _is_post_block(div)]
    if not candidates:
        # Fallback: blocks in the two-thirds column of the article
        candidates = soup.select("article .col.two-thirds div.block")
    if not candidates:
        return None

    def _text_length(d):
        return len(d.get_text(" ").strip())

    return max(candidates, key=_text_length)

def parse_post_paragraphs_list(soup: BeautifulSoup):
    """Return the cleaned text of each non-empty ``<p>`` in the main post block.

    The result is an empty list when no post block is found.
    """
    block = _find_main_post_block(soup)
    if not block:
        return []
    cleaned = (_clean(p.get_text(" ")) for p in block.find_all("p"))
    return [txt for txt in cleaned if txt]

def parse_post_text(soup: BeautifulSoup, joiner="\n\n"):
    """Return the post's paragraphs joined into one string with ``joiner``."""
    paragraphs = parse_post_paragraphs_list(soup)
    return joiner.join(paragraphs)

# Material properties
def _property_value(li, label):
    """Return the cleaned value of one property ``<li>``, or ``None``.

    A link inside ``<span class="important">`` is preferred, then any link in
    the item; otherwise the value is the item's text with the first
    occurrence of ``label`` removed.
    """
    important = li.find("span", class_="important")
    link = (important.find("a") if important else None) or li.find("a")
    if link:
        return _clean(link.get_text())

    # Plain-text value after the label span.
    li_text = _clean(li.get_text(" "))
    if not li_text:
        return None
    return _clean(li_text.replace(label, "", 1))


def parse_material_properties(soup: BeautifulSoup):
    """Parse the grouped property lists under ``.row.material-properties``.

    Each ``<ul>`` is one group. Its first ``<li>`` holds the group title in a
    ``<strong>``; every later ``<li>`` that contains a
    ``<span class="property">`` is one property.

    Returns
    -------
    dict
        ``{group name: {property label: value}}``. Lists without a title,
        groups without properties, and properties whose label is empty are
        left out, so a blank label is never stored under a ``None`` key.
        Empty dict when the section is missing.
    """
    root = soup.select_one(".row.material-properties")
    if not root:
        return {}

    groups = {}
    for ul in root.select(".col .block ul"):
        # Group title
        title_li = ul.find("li")
        if not title_li:
            continue
        strong = title_li.find("strong")
        group_name = _clean(strong.get_text()) if strong else None
        if not group_name:
            continue

        props = {}
        # Skip the first <li>, which is the group title.
        for li in ul.find_all("li")[1:]:
            label_el = li.find("span", class_="property")
            if not label_el:
                continue
            label = _clean(label_el.get_text())
            if not label:
                continue
            props[label] = _property_value(li, label)

        if props:
            groups[group_name] = props

    return groups

# Tags
def parse_material_tags(soup: BeautifulSoup):
    """Return the page's tags as a list, de-duplicated in first-seen order.

    Tags are the cleaned texts of ``a[rel='tag']`` links inside
    ``ul.taglist`` under ``.row.material-properties``. The result is an empty
    list when either container is missing.
    """
    root = soup.select_one(".row.material-properties")
    tag_ul = root.select_one("ul.taglist") if root else None
    if not tag_ul:
        return []

    cleaned = (_clean(a.get_text()) for a in tag_ul.select("a[rel='tag']"))
    # dict keys keep insertion order, which de-duplicates without reordering.
    return list(dict.fromkeys(t for t in cleaned if t))

# Complete parser
def parse_material_page(url: str):
    """Fetch one material page and return its parsed fields as a dict.

    Keys: ``url``, ``material_name``, ``Category``, ``Code``, ``Country``,
    ``Brand``, ``post_text`` (paragraphs joined by a blank line),
    ``post_paragraphs`` (list of str), ``material_properties`` (nested dict)
    and ``tags`` (list of str).

    Errors raised by ``fetch_html`` propagate to the caller.
    """
    soup, _raw_html = fetch_html(url)  # the raw HTML is not needed here
    meta = parse_subtitle_meta(soup)
    # Parse the paragraphs once and derive both text fields from that list.
    paragraphs = parse_post_paragraphs_list(soup)
    return {
        "url": url,
        "material_name": parse_title(soup),
        "Category": meta.get("Category"),
        "Code": meta.get("Code"),
        "Country": meta.get("Country"),
        "Brand": meta.get("Brand"),
        "post_text": "\n\n".join(paragraphs),
        "post_paragraphs": paragraphs,
        "material_properties": parse_material_properties(soup),
        "tags": parse_material_tags(soup),
    }

## 11. Run on test page

In [29]:
# Parse the test page and print each extracted field for a visual check.
data = parse_material_page(TEST_URL)
summary_fields = ("material_name","Category","Code","Country","Brand")
for k in summary_fields:
    print(f"{k:>16}:", data[k])
print("tags:", data["tags"])
print("properties groups:", list(data["material_properties"]))
preview = data["post_text"][:300]
print("post_text preview:", preview, "...\n")

   material_name: 013 Denim
        Category: Other naturals
            Code: ONA791
         Country: Netherlands
           Brand: 013 Denim
tags: ['Biobased', 'Recycling', 'Sustainable', 'Other naturals', 'fabric', 'textile']
properties groups: ['Sensorial', 'Technical']
post_text preview: 013 Denim is made 50% from recycled denim garments from Tilburg and 50% from organic cotton yarn from India. For the first production, the denim garments were collected in February 2017 by residents of Tilburg and 6 primary schools in the city. The denim garments were then recycled into denim yarn b ...



## 12. Read URLs from CSV

In [16]:
import pandas as pd

# The CSV is expected to have a column named 'url'; missing values are dropped.
urls_df = pd.read_csv("material_urls.csv")
url_column = urls_df["url"]
urls = url_column.dropna().tolist()
print(f"Loaded {len(urls)} URLs")

Loaded 3163 URLs


## 13. Crawl and parse all pages

In [18]:
# Parse every URL. A failure on one page must not stop the crawl, but it is
# recorded so the page can be retried or inspected afterwards.
results = []
failed_urls = []  # (url, error message) pairs

for i, url in enumerate(urls, 1):
    try:
        data = parse_material_page(url)
    except Exception as e:
        failed_urls.append((url, str(e)))
        print(f"[{i}/{len(urls)}] ERROR on {url}: {e}")
    else:
        results.append(data)
        print(f"[{i}/{len(urls)}] OK: {data['material_name']}")

if failed_urls:
    print(f"{len(failed_urls)} of {len(urls)} pages failed; see `failed_urls` to retry them.")

[1/3163] OK: 013 Denim
[2/3163] OK: 100% bacterial dye
[3/3163] OK: Basalt knitted fabric
[4/3163] OK: 100% Biobased Flax panel
[5/3163] OK: 100% rejects waxed printed cotton
[6/3163] OK: 2,000,000,000 Precious Pieces
[7/3163] OK: 2.5D print
[8/3163] OK: 2000N Pressed Shoe
[9/3163] OK: 2tec2 high tech flooring
[10/3163] OK: 2tec2 – Marble & Desert
[11/3163] OK: 2TEC2
[12/3163] OK: 3D Veneer
[13/3163] OK: 37.5 technology CVC Woven
[14/3163] OK: 37.5 technology WPB laminate
[15/3163] OK: 3D Bamboo Wall panels
[16/3163] OK: 3D biaxial knitted fabric
[17/3163] OK: 3D Brocade all-over Lurex
[18/3163] OK: 3D Composites
[19/3163] OK: 3D core
[20/3163] OK: 3d-design
[21/3163] OK: 3D Engineered Sound Barrier
[22/3163] OK: 3D Engraved Glass
[23/3163] OK: 3D Felt Tiles
[24/3163] OK: 3D knitted CC cubic acoustic fabric
[25/3163] OK: 3D knitted jersey
[26/3163] OK: 3D Leveling Cork
[27/3163] OK: 3D Metal
[28/3163] OK: 3D Plate
[29/3163] OK: 3D print on stretch fabric
[30/3163] OK: 3D printed concre

## 14. Normalize into DataFrame

In [19]:
def flatten_record(d):
    """Turn one parsed-page dict into a flat, single-level dict.

    ``url`` is required; the other scalar fields default to ``None`` when
    absent. ``tags`` becomes one comma-separated string, and every property
    in ``material_properties`` becomes its own ``"<group>_<property>"`` key.
    ``post_paragraphs`` is not carried over.
    """
    scalar_keys = ("material_name", "Category", "Code", "Country", "Brand", "post_text")

    flat = {"url": d["url"]}
    flat.update((key, d.get(key)) for key in scalar_keys)
    flat["tags"] = ", ".join(d.get("tags", []))

    # Flatten properties
    for group, props in d.get("material_properties", {}).items():
        flat.update((f"{group}_{name}", value) for name, value in props.items())
    return flat

# One flat dict per crawled page, then a single DataFrame built from all of them.
flat_results = list(map(flatten_record, results))
df = pd.DataFrame(flat_results)


## 15. Save to CSV

In [20]:
# Write the table without the index column, then report its dimensions.
df.to_csv("materials_data.csv", index=False)
n_rows, n_cols = df.shape
print("Saved materials_data.csv with", n_rows, "rows and", n_cols, "columns.")

Saved materials_data.csv with 3163 rows and 23 columns.
