In [3]:
import re
from datetime import datetime
import time
import requests
from pprint import pprint
import json
import random
import subprocess
from urllib.parse import unquote

from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from urllib.parse import unquote, urlparse

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 5, Finished, Available)

In [4]:
# Function to read JSON data from a file
def read_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return []

# Function to write JSON data to a file
def write_json_file(file_path, data):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 6, Finished, Available)

## Products


In [None]:
# with open("/lakehouse/default/Files/links.txt", "w") as f:
#     f.writelines([link + "\n" for link in product_links])

In [None]:
with open("/lakehouse/default/Files/links.txt") as f:
    product_links = f.readlines()
product_links = [link.strip() for link in product_links]
len(product_links)

In [4]:
def scrape_product_page(soup):
    title_tag_id = ["productTitle", "title"]
    for tag_id in title_tag_id:
        title = soup.find(id=tag_id)
        if title is not None:
            break
    if title is None:
        return None
    title = title.get_text().strip()

    product_description = None
    # https://www.amazon.es/Soundcore-A3947-Liberty-4-NC/dp/B0BZV7M23Q/ref=cs_sr_dp_1
    product_description_tag = soup.find('div', id="feature-bullets")
    if product_description_tag is None:
        # https://www.amazon.es/Headphones-Auriculares-inal%C3%A1mbricos-meditaci%C3%B3n-relajaci%C3%B3n/dp/B087FW1835/ref=cs_sr_dp_1
        product_description_tag = soup.find('div', id='productFactsDesktopExpander')
        if product_description_tag is not None:
            product_description_elements = product_description_tag.find_all("span")
            product_description = ""
            for element in product_description_elements:
                product_description += element.get_text().strip() + "\n"
            product_description = product_description.strip()
    else:
        product_description = product_description_tag.find("ul")
        if product_description:
            product_description = product_description.get_text().strip()

    product_features = None
    product_features_tag = soup.find("div", id="productOverview_feature_div")
    if product_features_tag:
        product_features_table = product_features_tag.find("table")
        if product_features_table:
            feature_spans = product_features_table.find_all("span")
            product_features = ""
            for i, span in enumerate(feature_spans):
                if i % 2 == 0:
                    product_features += "\n"
                else:
                    product_features += " : "
                product_features += span.get_text().strip()
            product_features = product_features.strip()

    try:
        overall_rating_tag_id = "averageCustomerReviews"
        overall_rating = soup.find(id=overall_rating_tag_id).find(class_="a-size-base a-color-base").get_text().strip()
    except:
        overall_rating = soup.find("span", attrs={"data-hook": "rating-out-of-text"})
        if overall_rating:
            overall_rating = re.search("\d+([\.,]\d+)?", overall_rating).group()

    img_tag_id = "imgTagWrapperId"
    img_tag = soup.find(id=img_tag_id)
    img_url = img_tag.find("img").get("src")

    seller_tag_id = "bylineInfo"
    seller_url = soup.find(id=seller_tag_id).get("href")
    if '?' in seller_url:
        seller_url = seller_url[0:seller_url.index('?')]
    seller_url = "https://www.amazon.es" + seller_url

    price = None
    try:
        price_tag_id = "corePriceDisplay_desktop_feature_div"
        price = soup.find(id=price_tag_id)
        price = price.find("span", class_="aok-offscreen").get_text().strip()
        price = re.search("\d+([\.,]\d+)?", price).group()
    except:
        price = soup.find("span", class_="a-price-whole")
        if price:
            price = price.get_text().strip()
        else:
            # find all strings containing euro symbol and take the avg of the numbers
            texts_with_euro = soup.find_all(string=lambda s: '€' in s)
            matches = []
            for text in texts_with_euro:
                match = re.search("(\d+([,.]\d+)?)\xa0€", text)
                if match:
                    matches.append(float(match.group(1).replace(",", ".")))
            if len(matches) > 0:
                price = sum(matches) / len(matches)

    num_reviews = None
    try:
        num_reviews_tag_id = "acrCustomerReviewText"
        num_reviews_tag = soup.find(id=num_reviews_tag_id).get_text().strip()
        num_reviews = re.search("\d+([\.,]\d+)?", num_reviews_tag).group()
    except:
        num_reviews_tag = soup.find("span", attrs={"data-hook": "total-review-count"})
        if num_reviews_tag:
            num_reviews = re.search("\d+([\.,]\d+)?", num_reviews_tag).group()

    try:
        rating_histogram_tag_id = "cm_cr_dp_d_rating_histogram"
        rating_histogram_tag = soup.find(id=rating_histogram_tag_id).find(id="histogramTable")
        histogram = {}
        for i, div in enumerate(rating_histogram_tag.find_all("div", class_="a-meter")):
            value = div.get("aria-valuenow")
            histogram[5 - i] = value
    except:
        histogram = None

    try:
        more_reviews_tag = soup.find(string="Ver más opiniones").parent
        more_reviews_link = "https://www.amazon.es" + more_reviews_tag.get("href")
    except:
        more_reviews_link = None

    return {
        "title": title,
        "product_description": product_description,
        "rating": overall_rating,
        "img_url": img_url,
        "seller_url": seller_url,
        "price": price,
        "num_reviews": num_reviews,
        "rating_histogram": histogram,
        "more_reviews": more_reviews_link,
        "date_accessed": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "product_features": product_features
    }

StatementMeta(, , , Waiting, )

In [5]:
def scrape_products(product_links):
    sessions = [
        requests.Session(),
        requests.Session(),
        requests.Session(),
        requests.Session(),
        requests.Session()
    ]
    headers = {
        'Host': 'www.amazon.es',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Referer': 'https://www.amazon.es/',
        'Alt-Used': 'www.amazon.es',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'DNT': '1',
        'Sec-GPC': '1',
        'Priority': 'u=1',
        'TE': 'trailers'
    }
    products = read_json_file("/lakehouse/default/Files/products.json")
    p = len(products)
    n = 0
    for i, url in enumerate(tqdm(product_links[p:])):
        print(url)
        response = sessions[n % 5].get(url, headers=headers)
        sessions[n % 5].close()
        sessions[n % 5] = requests.Session()
        n += 1

        if response.status_code != 200:
            raise Exception(f"Status code {response.status_code}")

        soup = BeautifulSoup(response.text)
        product_info = scrape_product_page(soup)
        if product_info is None:
            print("FORCE STOPPED - TITLE NOT FOUND")
            break
        product_info['product_url'] = url
        products.append(product_info)
        write_json_file("products.json", products)
        time.sleep(5)

StatementMeta(, , , Waiting, )

In [5]:
products = read_json_file("/lakehouse/default/Files/products.json")
len(products)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 7, Finished, Available)

1186

In [7]:
completed = {product['product_url'] for product in products}
print(len(completed))

for i, url in enumerate(tqdm(product_links)):
    if url in completed:
        continue

    if url.startswith("https://www.amazon.es/sspa/click?"):
        found_url = re.search("url=(.+sspa)", url).group(1)
        decoded_url = unquote(unquote(found_url))
        url = "https://www.amazon.es" + decoded_url 

    curl_command = [
        'curl',
        url,
        '--compressed',
        '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
        '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        '-H', 'Accept-Language: en-US,en;q=0.5',
        '-H', 'Accept-Encoding: gzip, deflate, br, zstd',
        '-H', 'Connection: keep-alive',
        '-H', 'Cookie: session-id=258-2658992-6910547; session-id-time=2082787201l; i18n-prefs=EUR; csm-hit=tb:QNWX441JR9KDNHQ1V2B5+s-QNWX441JR9KDNHQ1V2B5|1717169495662&t:1717169495663&adb:adblk_yes; ubid-acbes=260-0513493-7860444; lc-acbes=es_ES; session-token=LasDwk1tZDPLJa5GvWM9erbg6oY3FzZuduvJ3P3rt6jXAwfAZa9DqNDVb4Vhu38rAXzroIDRTp0z77m2MNF21rqQwhFIfE5/n+y0dh8/ETg5zIHZvJPNSbe/BLzLhEciut3wFQl7HY3HtZFS92eTZOnP+bjEWCmNfaSDb1FIeE7f6Vsn1iK7dERu1D/aYGwUwQQn3+Y/rkxhDa4YexeIWIZX6R5VipQ21ykwkB0J0ZDMqYd9tktK2XH4Ab9T6imjtakgciwRFLNlDVVBPnUurVgUQriaIoYqFtxkeZSBK3oWLY6F+6Pz9J4kpmK4YvUEdaRydUXgEZpyssVOQ93frwyYbqgt5+Vw',
        '-H', 'Upgrade-Insecure-Requests: 1',
        '-H', 'Sec-Fetch-Dest: document',
        '-H', 'Sec-Fetch-Mode: navigate',
        '-H', 'Sec-Fetch-Site: cross-site',
        '-H', 'DNT: 1',
        '-H', 'Sec-GPC: 1',
        '-H', 'Priority: u=1',
        '-H', 'TE: trailers'
    ]
    # Run the curl command
    result = subprocess.run(curl_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Store the output in a variable
    output = result.stdout

    # Scrape the webpage
    soup = BeautifulSoup(output)
    product_info = scrape_product_page(soup)
    if product_info is None:
        print("FORCE STOPPED - TITLE NOT FOUND")
        break
    product_info['product_url'] = url
    # pprint(product_info)
    # print()
    products.append(product_info)
    write_json_file("/lakehouse/default/Files/products.json", products)
    time.sleep(random.randint(5, 10))

StatementMeta(, , , Waiting, )

810


  0%|          | 0/1183 [00:00<?, ?it/s]

In [None]:
products[0]

StatementMeta(, , , Waiting, )

{'title': 'JBL T110 Auriculares In Ear con Pure Bass - Con manejo de un solo botón y micrófono, color negro',
 'product_description': 'Auriculares cómodos, seguros y sin enredos que se ajustan bien a la oreja para que no se muevan mientras caminas o haces deporte. Se mantienen siempre en su lugar    Sonido emblemático JBL en un diseño compacto y elegante con calidad de sonido JBL, sonido con bajos profundos y potentes    Controla la reproducción de tu música y las llamadas pulsando un solo botón. Con cable plano para evitar enredos y micrófono integrado    Audífonos con unidades de 9 mm que ofrecen bajos notables: sonido de calidad para escuchar en casa, en la oficina o el transporte    Contenido del envío: 1x Auriculares alámbricos JBL T110, 3 tamaños de almohadillas (S, M, L), tarjeta de advertencia y garantía, auriculares color negro',
 'rating': '4,4',
 'img_url': 'https://m.media-amazon.com/images/I/415tPIP3TkL.__AC_SX300_SY300_QL70_ML2_.jpg',
 'seller_url': 'https://www.amazon.es

## Reviews

In [6]:
def get_review_id(html):
    return re.match(r'<div id=\\"(\w+)\\"', html).group(1)

def get_rating(html):
    return re.search(r'<i data-hook=\\"review-star-rating\\".*review-rating\\"><span class=\\"a-icon-alt\\">(.+ estrellas)</span>', html).group(1)

def get_title(html):
    return re.search('<span>([^<]+)</span', html).group(1)

def get_date(html):
    return re.search(r'<span data-hook=\\"review-date\\" class=\\"a-size-base a-color-secondary review-date\\">([^<]+)</span>', html).group(1)

def get_text(html):
    html = re.sub(r'<br[ /]+>', '', html)
    end = re.search(r'<span data-hook=\\"review-body\\"[^>]*>', html).span()[1]
    simple_non_empty_span_pattern = re.compile("<span>([^<]+)</span>", re.DOTALL)
    match = simple_non_empty_span_pattern.search(html[end:])
    return match.group(1) if match else None

def get_num_helpful(html):
    '<span data-hook=\"helpful-vote-statement\" class=\"a-size-base a-color-tertiary cr-vote-text\">A 3 personas les ha parecido esto útil</span>'
    match = re.search(r'<span data-hook=\\"helpful-vote-statement\\"[^>]+>([^<]+)</span>', html)
    if match:
        span = match.group(1)
        match = re.search('\d+', span)
        if match:
            num_helpful = match.group()
            return int(num_helpful)
        elif 'A una persona le ha parecido esto' in span:
            return 1
        else:
            return 0
    else:
        return 0

def parse_api_response(response):
    parsed_response = response.split("&&&")
    reviews = []
    for review in parsed_response:
        review = review.strip()
        if review.startswith('["append","#cm_cr-review_list","<div id='):
            reviews.append(review[32:-1])
    parsed_reviews = []
    for i, review in enumerate(reviews):
        review_date = get_date(review)
        if 'España' not in review_date:
            continue
        parsed_review = {
            'rating': get_rating(review),
            'review_title': get_title(review),
            'review_date': review_date,
            'review_text': get_text(review),
            'helpful_vote_statement': get_num_helpful(review),
            'review_id': get_review_id(review)
        }
        parsed_reviews.append(parsed_review)
    return parsed_reviews

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 8, Finished, Available)

In [7]:
products = read_json_file("/lakehouse/default/Files/products.json")
len(products)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 9, Finished, Available)

1186

In [8]:
product = products[0]
product_url = product['product_url']
print(product_url)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 10, Finished, Available)

https://www.amazon.es/JBL-T110-Auriculares-intraaurales-micr%C3%B3fono/dp/B01MG62Z5M/ref=cs_sr_dp_1


In [9]:
product_url = product['product_url']
asin = product_url.split("/")[5]
print(asin)
initial_request = [
    'curl',
    'https://www.amazon.es/hz/reviews-render/ajax/reviews/get/ref=cm_cr_getr_d_paging_btm_prev_1',
    '--compressed',
    '-X', 'POST',
    '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
    '-H', 'Accept: text/html,*/*',
    '-H', 'Accept-Language: en-US,en;q=0.5',
    '-H', 'Accept-Encoding: gzip, deflate, br, zstd',
    '-H', 'X-Requested-With: XMLHttpRequest',
    '-H', 'Content-Type: application/x-www-form-urlencoded;charset=UTF-8',
    '-H', 'Origin: https://www.amazon.es',
    '-H', 'Alt-Used: www.amazon.es',
    '-H', 'Connection: keep-alive',
    '-H', 'Cookie: session-id=258-2658992-6910547; session-id-time=2082787201l; i18n-prefs=EUR; csm-hit=tb:s-N3TMYGNZAK5TQXNZ818D|1717226938652&t:1717226938748&adb:adblk_yes; ubid-acbes=260-0513493-7860444; lc-acbes=es_ES; session-token=prqhSfrw7JIZ/Z1uqLXFHcGExicr5zBsVc0E9pMJojgubJy9nH7LWx1csP6j76LqQQ3I1UbvQXJCmo//u+PKUvRjOE2z9X5uQTemW6XsMcHY01o9tcquglnXqZgFyxDIXTiAxs9+/ikdYK0lSqhd+UQ+znVjirl37+gwk0qad/Bxvdt8gHipE/odWZgtIeuGCpjsDahYcAr39VcmBiGY4WkIbSGAgMaPCEKBSOzPtwEQPO2mTXgS8FO83e15HHlkCMuzltpH3fFvNWanDjG0MxyUgx9ls/Bax8xokmHURoCeJ1V/Pvc2JhQRcZs8mCj4YM1OIqhPlsC3YyarJII/dtfzlc0fhrS6',
    '-H', 'Sec-Fetch-Dest: empty',
    '-H', 'Sec-Fetch-Mode: cors',
    '-H', 'Sec-Fetch-Site: same-origin',
    '-H', 'DNT: 1',
    '-H', 'Sec-GPC: 1',
    '-H', 'Priority: u=1',
    '-H', 'TE: trailers',
    '--data-raw', f'sortBy=&reviewerType=all_reviews&formatType=&mediaType=&filterByStar=&filterByAge=&pageNumber=1&filterByLanguage=&filterByKeyword=&shouldAppend=undefined&deviceType=desktop&canShowIntHeader=undefined&reftag=cm_cr_getr_d_paging_btm_prev_1&pageSize=10&asin={asin}&scope=reviewsAjax2'
]
result = subprocess.run(initial_request, capture_output=True, text=True)
output = result.stdout
parsed_reviews = parse_api_response(output)
print(len(parsed_reviews))
pprint(parsed_reviews[0])


StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 11, Finished, Available)

B01MG62Z5M
10
{'helpful_vote_statement': 17,
 'rating': '5,0 de 5 estrellas',
 'review_date': 'Revisado en España el 17 de marzo de 2024',
 'review_id': 'R2B0UH3FZRGFRG',
 'review_text': 'Después de probar los JBL T110, me siento en la capacidad de '
                'compartir mi experiencia personal con estos auriculares. Lo '
                'primero que captó mi atención fue su promesa de ofrecer un '
                'sonido Pure Bass, algo que JBL suele manejar excepcionalmente '
                'bien en sus productos. Afortunadamente, no me decepcionaron. '
                'La profundidad y claridad del bajo en mis canciones favoritas '
                'fue notable, brindando una experiencia auditiva envolvente '
                'que pocas veces he encontrado en auriculares de este rango de '
                'precio.El diseño de los T110 es otro punto a favor. Su '
                'estética en color negro es elegante y discreta, ideal para '
                'aquellos que, como yo,

In [10]:
all_reviews = read_json_file('/lakehouse/default/Files/reviews.json')
print(len(all_reviews))
completed = {review['product_id'] for review in all_reviews}
print(len(completed))

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 12, Finished, Available)

11439
672


In [11]:
all_reviews = read_json_file('/lakehouse/default/Files/reviews.json')
print(len(all_reviews))
completed = {review['product_id'] for review in all_reviews}
print(len(completed))

for product in tqdm(products, total=len(products)):
    product_url = product['product_url']
    asin = product_url.split("/")[5]
    if asin in completed:
        continue
    request = [
        'curl',
        'https://www.amazon.es/hz/reviews-render/ajax/reviews/get/ref=cm_cr_getr_d_paging_btm_prev_1',
        '--compressed',
        '-X', 'POST',
        '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
        '-H', 'Accept: text/html,*/*',
        '-H', 'Accept-Language: en-US,en;q=0.5',
        '-H', 'Accept-Encoding: gzip, deflate, br, zstd',
        '-H', 'X-Requested-With: XMLHttpRequest',
        '-H', 'Content-Type: application/x-www-form-urlencoded;charset=UTF-8',
        '-H', 'Origin: https://www.amazon.es',
        '-H', 'Alt-Used: www.amazon.es',
        '-H', 'Connection: keep-alive',
        '-H', 'Cookie: session-id=258-2658992-6910547; session-id-time=2082787201l; i18n-prefs=EUR; csm-hit=tb:5324XC4K1PHZ2Y14CFQ9+sa-K7H91KK0170ZK5XXHKP3-X57ZPX3TGAVK761Y5F8B|1717181910709&t:1717181910709&adb:adblk_yes; ubid-acbes=260-0513493-7860444; lc-acbes=es_ES; session-token=Zwt2rPubpA1lUNKHNeuehp1+WfP+pvziBpRfnaFr/t/8qDIIJbm2nC97V7GL9ibeXUJnFc3HdmDd3E7BFKhlilEZ6Wci7vIdCqLCj0o5zFnnlgehDqo0zPwqbIAdX4j7ZO/9woXJ/UtcVZV0y8GLNWJO4qeI900X7HkzLYExKrSkYcnyB0cym92jefxMo0Jz6SVJjJj4e6Ze6U9D0ONf5dt5vTr5FgncUD9w3SW0BUkKV9tl440hqNB+SmKoQk/Q/NzXs3wrfXxQylZTtccoRgWF4fKS9p+uBORkXZQex/j+q0nUN7JEqvz1okSaRLNg8s4NeoFdvXhCV4EASF8fMtzOYzi3HnNr',
        '-H', 'Sec-Fetch-Dest: empty',
        '-H', 'Sec-Fetch-Mode: cors',
        '-H', 'Sec-Fetch-Site: same-origin',
        '-H', 'DNT: 1',
        '-H', 'Sec-GPC: 1',
        '-H', 'Priority: u=1',
        '-H', 'TE: trailers',
        '--data-raw', f'sortBy=&reviewerType=all_reviews&formatType=&mediaType=&filterByStar=&filterByAge=&pageNumber=1&filterByLanguage=&filterByKeyword=&shouldAppend=undefined&deviceType=desktop&canShowIntHeader=undefined&reftag=cm_cr_getr_d_paging_btm_prev_1&pageSize=10&asin={asin}&scope=reviewsAjax2'
    ]
    result = subprocess.run(request, capture_output=True, text=True)
    output = result.stdout
    parsed_reviews = parse_api_response(output)
    for review in parsed_reviews:
        review['product_id'] = asin
    n = 1
    while len(parsed_reviews) > 0:
        # time.sleep(random.randint(5, 10))
        all_reviews.extend(parsed_reviews)
        n += 1
        request = [
            'curl',
            f'https://www.amazon.es/hz/reviews-render/ajax/reviews/get/ref=cm_cr_getr_d_paging_btm_next_{n}',
            '--compressed',
            '-X', 'POST',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
            '-H', 'Accept: text/html,*/*',
            '-H', 'Accept-Language: en-US,en;q=0.5',
            '-H', 'Accept-Encoding: gzip, deflate, br, zstd',
            '-H', 'X-Requested-With: XMLHttpRequest',
            '-H', 'Content-Type: application/x-www-form-urlencoded;charset=UTF-8',
            '-H', 'Origin: https://www.amazon.es',
            '-H', 'Alt-Used: www.amazon.es',
            '-H', 'Connection: keep-alive',
            '-H', 'Cookie: session-id=258-2658992-6910547; session-id-time=2082787201l; i18n-prefs=EUR; csm-hit=tb:s-N3TMYGNZAK5TQXNZ818D|1717226938652&t:1717226938748&adb:adblk_yes; ubid-acbes=260-0513493-7860444; lc-acbes=es_ES; session-token=prqhSfrw7JIZ/Z1uqLXFHcGExicr5zBsVc0E9pMJojgubJy9nH7LWx1csP6j76LqQQ3I1UbvQXJCmo//u+PKUvRjOE2z9X5uQTemW6XsMcHY01o9tcquglnXqZgFyxDIXTiAxs9+/ikdYK0lSqhd+UQ+znVjirl37+gwk0qad/Bxvdt8gHipE/odWZgtIeuGCpjsDahYcAr39VcmBiGY4WkIbSGAgMaPCEKBSOzPtwEQPO2mTXgS8FO83e15HHlkCMuzltpH3fFvNWanDjG0MxyUgx9ls/Bax8xokmHURoCeJ1V/Pvc2JhQRcZs8mCj4YM1OIqhPlsC3YyarJII/dtfzlc0fhrS6',
            '-H', 'Sec-Fetch-Dest: empty',
            '-H', 'Sec-Fetch-Mode: cors',
            '-H', 'Sec-Fetch-Site: same-origin',
            '-H', 'DNT: 1',
            '-H', 'Sec-GPC: 1',
            '-H', 'Priority: u=1',
            '-H', 'TE: trailers',
            '--data-raw', f'sortBy=&reviewerType=all_reviews&formatType=&mediaType=&filterByStar=&filterByAge=&pageNumber={n}&filterByLanguage=&filterByKeyword=&shouldAppend=undefined&deviceType=desktop&canShowIntHeader=undefined&reftag=cm_cr_getr_d_paging_btm_next_{n}&pageSize=10&asin={asin}&scope=reviewsAjax2'
        ]
        result = subprocess.run(request, capture_output=True, text=True)
        output = result.stdout
        parsed_reviews = parse_api_response(output)
        for review in parsed_reviews:
            review['product_id'] = asin
    write_json_file('/lakehouse/default/Files/reviews.json', all_reviews)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 13, Finished, Available)

11439
672


  0%|          | 0/1186 [00:00<?, ?it/s]

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 14, Finished, Available)

In [12]:
parse_api_response(output)

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 15, Finished, Available)

[]

In [15]:
print(len(all_reviews))
completed = {review['product_id'] for review in all_reviews}
print(len(completed))

StatementMeta(, 3a582ded-1a43-4b40-ad80-9887abfb5b13, 18, Finished, Available)

13978
949


In [None]:
parsed_response = output.split("&&&")
revs = []
for rev in parsed_response:
    rev = rev.strip()
    if rev.startswith('["append","#cm_cr-review_list","<div id='):
        revs.append(rev[32:-1])
len(revs)

StatementMeta(, , , Waiting, )

10

In [None]:
parsed_revs = []
for i, rev in enumerate(revs):
    print(i)
    review_date = get_date(rev)
    if 'España' not in review_date:
        continue
    parsed_rev = {
        'rating': get_rating(rev),
        'review_title': get_title(rev),
        'review_date': review_date,
        'review_text': get_text(rev),
        'helpful_vote_statement': get_num_helpful(rev),
        'review_id': get_review_id(rev)
    }
    parsed_revs.append(parsed_rev)
pprint(parsed_revs)

StatementMeta(, , , Waiting, )

0
1
2
3
4
5
6
7
8
9
[{'helpful_vote_statement': 0,
  'rating': '5,0 de 5 estrellas',
  'review_date': 'Revisado en España el 16 de febrero de 2024',
  'review_id': 'R3GE7VU8DP0PLB',
  'review_text': 'Llegaron muy rápido y sin problema. Auriculares muy '
                 'correctos y muy buena relación calidad/precio',
  'review_title': 'Auriculares económicos que cumplen muy bien su función'},
 {'helpful_vote_statement': 1,
  'rating': '5,0 de 5 estrellas',
  'review_date': 'Revisado en España el 1 de febrero de 2024',
  'review_id': 'R3DLP6YLJFRVM2',
  'review_text': 'Tudo o que eu precisava',
  'review_title': 'Classico'},
 {'helpful_vote_statement': 0,
  'rating': '5,0 de 5 estrellas',
  'review_date': 'Revisado en España el 4 de noviembre de 2023',
  'review_id': 'R2SD7XVIU5X2P7',
  'review_text': 'Buenos auriculares',
  'review_title': 'Buenos auriculares'},
 {'helpful_vote_statement': 1,
  'rating': '5,0 de 5 estrellas',
  'review_date': 'Revisado en España el 9 de enero de 2024'