# Zara Web Scraping

## Descripción

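Ejemplo de scraping a partir del sitemap de Zara: se descarga el sitemap, se extraen los enlaces de producto y, con Selenium, se obtienen nombre, precio, descripción e imagen de una selección de productos, que se guarda en un fichero CSV.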

## Imports

In [None]:
import requests
import datetime
import bs4
import pandas
import random
# Seed the generator by CALLING random.seed; assigning to it (`random.seed = 42`)
# would replace the function with the int 42 and leave the generator unseeded.
random.seed(42)
import time

## Configuración

In [None]:
# Settings read by the cells below.
CONFIGURATION = dict(
    # Gzipped sitemap that is downloaded and parsed for product links.
    SITEMAP_URL="https://www.zara.com/sitemaps/sitemap-images-es-es.xml.gz",
    # User-Agent sent with the sitemap request and configured on the browser.
    USER_AGENT="Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
    # Number of product links kept for scraping.
    PRODUCT_LIMIT=50,
    # Value handed to time.sleep() before each product page is loaded.
    THROTTLE=1,
)

## Sitemap

In [None]:
header="Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
!wget "$header" "https://www.zara.com/sitemaps/sitemap-images-es-es.xml.gz" -O index

--2024-03-27 02:15:26--  http://mozilla/5.0%20(Windows%20NT%206.3;%20Win64;%20x64;%20rv:109.0)%20Gecko/20100101%20Firefox/115.0
Resolving mozilla (mozilla)... failed: Name or service not known.
wget: unable to resolve host address ‘mozilla’
--2024-03-27 02:15:26--  https://www.zara.com/sitemaps/sitemap-images-es-es.xml.gz
Resolving www.zara.com (www.zara.com)... 23.48.203.82, 23.48.203.75, 2600:1408:ec00:20::1735:23a7, ...
Connecting to www.zara.com (www.zara.com)|23.48.203.82|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2024-03-27 02:15:26 ERROR 403: Forbidden.



In [None]:
# Download the sitemap with the configured User-Agent.
sitemap_response = requests.get(
    CONFIGURATION["SITEMAP_URL"],
    headers={"User-Agent": CONFIGURATION["USER_AGENT"]},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Stop here on a 4xx/5xx answer (e.g. 403 Forbidden) instead of saving the
# error page to disk as if it were the sitemap archive.
sitemap_response.raise_for_status()

In [None]:
# Save the raw response bytes under the file name the gunzip cell expects.
with open("sitemap-images-es-es.xml.gz", "wb+") as sitemap_archive:
  sitemap_archive.write(sitemap_response.content)

In [None]:
!gunzip "sitemap-images-es-es.xml.gz" -y

gzip: invalid option -- 'y'
Try `gzip --help' for more information.


In [None]:
# Read the decompressed sitemap as text. The encoding is stated explicitly so the
# result does not depend on the platform's default locale (the sitemap protocol
# requires UTF-8).
with open("sitemap-images-es-es.xml", encoding="utf-8") as file:
  sitemap_content = file.read()

In [None]:
# Name the parser explicitly: without it BeautifulSoup guesses one, emits a warning
# and may pick a different parser on another machine.
# NOTE(review): "lxml" (HTML mode) is assumed to be what the guess resolved to on
# Colab — confirm. Switching to the "xml" parser could change which elements
# select("loc") matches (namespaced tags such as image:loc) — verify before changing.
sitemap_parsed = bs4.BeautifulSoup(sitemap_content, "lxml")
products = sitemap_parsed.select("loc")
"total products", len(products)

  sitemap_parsed = bs4.BeautifulSoup(sitemap_content)


('total products', 4797)

In [None]:
# Keep only the text content (the URL) of every selected <loc> element.
products_links = [loc_tag.text for loc_tag in products]

## Parseo página de producto

In [None]:
def clean(element):
  """Return the whitespace-stripped ``text`` of ``element``.

  Falsy values (for example ``None``) are handed back unchanged, so the
  result of a failed lookup can be passed in directly.
  """
  return element.text.strip() if element else element

In [None]:
def parse_product_page(link: str, driver):
  """Load a product page and extract its details.

  Args:
    link: URL of the product page to open.
    driver: Selenium-style driver exposing ``get`` and ``find_element``.

  Returns:
    A dict with the keys ``name``, ``price``, ``description`` and ``image_link``.

  Raises:
    Any exception raised by ``driver.get`` / ``driver.find_element`` is
    propagated to the caller (for instance when an element is not found).

  Note:
    ``By`` is resolved as a global when the function is called, so the cell
    that imports it from selenium must have been executed first.
  """
  driver.get(link)

  def by_class(class_name: str):
    # Single-class lookup shared by the name, price and description fields.
    return driver.find_element(by=By.CLASS_NAME, value=class_name)

  # "media-image img" is not a class name but a descendant selector (an <img>
  # inside .media-image), so it is expressed as an explicit CSS selector
  # instead of relying on how a compound value passed as CLASS_NAME is handled.
  image = driver.find_element(by=By.CSS_SELECTOR, value=".media-image img")

  return {
      "name": by_class("product-detail-info__header-name").text.strip(),
      "price": by_class("money-amount__main").text.strip(),
      # Read through the innerText attribute rather than `.text`.
      "description": by_class("expandable-text__inner-content").get_attribute("innerText").strip(),
      "image_link": image.get_dom_attribute("src"),
  }

## Parseo selección de productos

### Secreto

In [None]:
!pip install -q google-colab-selenium

In [None]:
import google_colab_selenium
from selenium.webdriver.common.by import By
from selenium import webdriver

# Chrome flags: headless mode, no sandbox, no /dev/shm usage, and the configured User-Agent.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    f"user-agent={CONFIGURATION['USER_AGENT']}",
):
  options.add_argument(chrome_flag)

# Browser instance used by every product-page request below.
driver = google_colab_selenium.Chrome(options=options)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Create a request interceptor
def interceptor(request):
    """Overwrite the headers of an outgoing request with browser-like values.

    Args:
        request: object whose ``headers`` mapping is modified in place.
            NOTE(review): presumably a selenium-wire request — confirm.

    Every header is deleted before it is set: header collections that allow
    repeated names would otherwise keep the browser's value next to ours.

    The ``Cookie`` header is taken from the ``ZARA_COOKIE`` environment variable
    instead of being embedded in the notebook, because session cookies are
    short-lived secrets that should not be committed. When the variable is unset
    or empty, the request's own Cookie header is left untouched.
    """
    import os  # local import so this cell does not depend on an extra import cell

    spoofed_headers = {
        "User-Agent": CONFIGURATION["USER_AGENT"],
        "Accept": "*/*",
        "Accept-Language": "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.zara.com/es/es/wonder-rose-90-ml-p20120807.html",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
        "TE": "trailers",
    }

    session_cookie = os.environ.get("ZARA_COOKIE")
    if session_cookie:
        spoofed_headers["Cookie"] = session_cookie

    for header_name, header_value in spoofed_headers.items():
        # Guarded delete: works for mappings that raise on a missing key as well
        # as for ones that silently ignore it.
        if header_name in request.headers:
            del request.headers[header_name]
        request.headers[header_name] = header_value

# Set the interceptor on the driver
# NOTE(review): `request_interceptor` is the hook name used by selenium-wire drivers.
# `driver` here is created with google_colab_selenium.Chrome; if that driver does not
# implement the hook, this line only stores an attribute and `interceptor` is never
# called — confirm.
driver.request_interceptor = interceptor

In [None]:
def throttle():
  """Pause for the duration configured under ``CONFIGURATION["THROTTLE"]``."""
  pause = CONFIGURATION["THROTTLE"]
  time.sleep(pause)

### Parsing

In [None]:
parsed_products = []

# Work on a copy so that shuffling (when enabled) never reorders `products_links`.
prod_products_links = list(products_links)
# random.shuffle(prod_products_links)  # uncomment to visit a random sample instead of the first links
# Slice the copy: slicing `products_links` again would silently discard the shuffle.
prod_products_links = prod_products_links[:CONFIGURATION["PRODUCT_LIMIT"]]

for link in prod_products_links:
  print(link)
  throttle()
  try:
    parsed_products.append(parse_product_page(link, driver))
  except Exception as error:
    # `Exception` instead of a bare `except`, so interrupting the kernel still
    # stops the loop; the cause is printed so failures can be diagnosed.
    print("error", link, "-", repr(error))
  else:
    # Progress counts successfully parsed products only.
    index = len(parsed_products)
    print(f"({index}/{CONFIGURATION['PRODUCT_LIMIT']})", "-", index / CONFIGURATION["PRODUCT_LIMIT"] * 100, "%")

https://www.zara.com/es/es/wonder-rose-90-ml-p20120807.html
3.0 %
https://www.zara.com/es/es/mini-shorts-rayas-p06929405.html
4.0 %
https://www.zara.com/es/es/camiseta-rib-algodon-p00085900.html
5.0 %
https://www.zara.com/es/es/sudadera-minnie-mouse---disney-p00061566.html
6.0 %
https://www.zara.com/es/es/falda-pareo-estampada-midi-p09878109.html
7.0 %
https://www.zara.com/es/es/calcetines-rib-pack-3-p03992419.html
8.0 %
https://www.zara.com/es/es/vestido-corto-estampado-floral-cinturon-p02737839.html
9.0 %
https://www.zara.com/es/es/mocasin-piel-cadena-p12587310.html
10.0 %
https://www.zara.com/es/es/bermuda-denim-trf-trabillas-ajustables-p08727001.html
11.0 %
https://www.zara.com/es/es/camisa-elastica-rayas-p07545541.html
12.0 %
https://www.zara.com/es/es/mule-acharolado-tacon-p12234310.html
13.0 %
https://www.zara.com/es/es/sudadera-texto-relieve-p02224666.html
14.0 %
https://www.zara.com/es/es/jersey-punto-ochos-p02756755.html
15.0 %
https://www.zara.com/es/es/pack-tres-calcetines-

In [None]:
# One row per parsed product; the dict keys become the columns.
df = pandas.DataFrame(data=parsed_products)
df

Unnamed: 0,name,price,description,image_link
0,WONDER ROSE 90 ML,"10,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/dc08/e8c...
1,MINI SHORTS RAYAS,"22,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/05c7/94c...
2,CAMISETA RIB ALGODÓN,"15,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/photos///2023/I/0/1/p/...
3,SUDADERA MINNIE MOUSE © DISNEY,"15,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/045f/ab6...
4,FALDA PAREO ESTAMPADA MIDI,"29,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/8cbc/d41...
5,CALCETINES RIB PACK 3,"9,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/0a9f/7c6...
6,VESTIDO CORTO ESTAMPADO FLORAL CINTURÓN,"35,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/4e23/18f...
7,MOCASÍN PIEL CADENA,"39,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/photos///2024/V/1/1/p/...
8,BERMUDA DENIM TRF TRABILLAS AJUSTABLES,"25,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/6e3a/158...
9,CAMISA ELÁSTICA RAYAS,"29,95 EUR","COMPOSICIÓN, CUIDADOS & ORIGENCOMPOSICIÓNTraba...",https://static.zara.net/assets/public/b2df/f05...


Guardando el contenido en un .csv

In [None]:
# Timestamp the file name so each run writes a new file.
now = datetime.datetime.now()
current_time = now.strftime("%Y_%m_%d-%H_%M_%S")
output_path = "zara-" + current_time + ".csv"
df.to_csv(output_path, sep=";")