# Trustpilot scraping - Trade Republic

Este notebook scrapea reseñas de Trustpilot para `www.traderepublic.com` desde la página 3 hasta la 10, extrayendo texto y fecha, y guardando el resultado en CSV.

In [None]:
# Si falta alguna dependencia, descomenta y ejecuta:
# !pip install requests beautifulsoup4 pandas lxml

In [None]:
import time
from typing import List, Dict

import requests
import pandas as pd
from bs4 import BeautifulSoup


In [None]:
# Review listing URL for the target company; the page number is appended
# as a ``?page=N`` query string by the scraping loop.
BASE_URL = "https://es.trustpilot.com/review/www.traderepublic.com"

# First and last page to scrape (both inclusive).
START_PAGE = 3
END_PAGE = 10

# Output file name is derived from the page range so it cannot go stale
# when START_PAGE / END_PAGE are edited.
OUTPUT_CSV = f"traderepublic_reviews_p{START_PAGE}_p{END_PAGE}.csv"

# Request headers sent with every page fetch: a desktop Chrome User-Agent
# and a Spanish-first language preference.
# NOTE(review): presumably needed so the site serves the regular HTML page
# instead of rejecting the default python-requests client -- confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}


In [None]:
def parse_reviews_from_html(html: str) -> List[Dict[str, str]]:
    """Extract review text and date from the HTML of one review listing page.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        One dict per review, in the order the review articles are matched,
        with keys ``"review_text"`` and ``"review_date"``. Articles without
        a non-empty text paragraph are skipped. ``"review_date"`` is the
        stripped ``datetime`` attribute of the first ``<time>`` element in
        the article, or ``""`` when there is no such element or attribute.
    """
    document = BeautifulSoup(html, "lxml")
    parsed: List[Dict[str, str]] = []

    # Each review is an <article> carrying a data-service-review-id attribute.
    for card in document.select("article[data-service-review-id]"):
        body_node = card.select_one("p[data-service-review-text-typography]")
        if body_node is None:
            continue

        # Join nested text fragments with a single space, trimming each one.
        body = body_node.get_text(" ", strip=True)
        if not body:
            continue

        time_node = card.select_one("time")
        if time_node is None:
            published = ""
        else:
            published = time_node.get("datetime", "").strip()

        parsed.append({"review_text": body, "review_date": published})

    return parsed


In [None]:
# Accumulates the parsed reviews of every scraped page, in page order.
all_reviews = []

# A single Session is used for the whole run so the underlying HTTP
# connection is reused across pages instead of being re-established for
# every request. Note that a Session also keeps cookies between requests.
with requests.Session() as session:
    for page in range(START_PAGE, END_PAGE + 1):
        url = f"{BASE_URL}?page={page}"
        print(f"Scrapeando: {url}")

        response = session.get(url, headers=HEADERS, timeout=30)
        # Stop the run on any 4xx/5xx answer rather than parsing an error page.
        response.raise_for_status()

        page_reviews = parse_reviews_from_html(response.text)
        print(f"  -> Reseñas encontradas: {len(page_reviews)}")

        all_reviews.extend(page_reviews)

        # Pause between consecutive requests; there is nothing to wait for
        # after the last page.
        if page < END_PAGE:
            time.sleep(1.5)

print(f"\nTotal reseñas extraidas: {len(all_reviews)}")


In [None]:
# Build the table with an explicit column list so the column order (and the
# CSV header) is fixed regardless of what was collected.
csv_columns = ["review_text", "review_date"]
df = pd.DataFrame(all_reviews, columns=csv_columns)

# "utf-8-sig" prepends a UTF-8 byte-order mark to the file.
# NOTE(review): presumably chosen so spreadsheet tools detect the encoding
# of the accented characters -- confirm.
df.to_csv(OUTPUT_CSV, encoding="utf-8-sig", index=False)
print(f"CSV guardado en: {OUTPUT_CSV}")

# Last expression of the cell: shows the first rows as a preview.
df.head()