In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json
import csv
import random
from tqdm import tqdm

In [None]:
# Browser-like headers sent with every request to xistore.by.
#
# Only real HTTP header fields belong here: requests chooses the method and the
# protocol version itself, so entries such as "method" or "http_version" would be
# transmitted as literal (meaningless) header lines.
request_headers = {
    "Host": "xistore.by",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
    # Advertise only the encodings requests always decodes transparently.
    # "br" / "zstd" are decoded only when optional packages are installed; without
    # them the response body would reach BeautifulSoup still compressed.
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Sec-GPC": "1",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded browser session cookie. It will expire and should not
    # live in a shared notebook; consider loading it from an environment variable.
    "Cookie": "hg-client-security=2raddTr8odLrsgYb67RiYQe1LA9; PHPSESSID=wJ2S6mmhESmmdK1Z148IspYnFyPx3lD2; VISIT_USER_ID=BA2F506F-4E40-C9BA-82FF-E3A1546CDCA6; BITRIX_SM_5_SALE_UID=123462632; xistore_banner_show=167986; _gcl_au=1.1.628187606.1736801209; _ga_63ZME06VVY=GS1.1.1736801209.1.0.1736801209.60.0.0; _ga=GA1.1.83084367.1736801210",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Referer": "https://xistore.by/catalog/telefony/?PAGEN_1=1",
    "Priority": "u=0, i",
}

In [None]:
all_phones_links = []

# XI-store parser
# Catalog listing parser: walks the paginated listing and collects the absolute URL
# of every product card. The list keeps the name `all_phones_links` because later
# cells read it, although the category requested here is smart TVs.
for i in range(1, 6):
    url = f"https://xistore.by/catalog/smart_televizory/?PAGEN_1={i}"
    try:
        # The timeout (seconds) keeps a stalled connection from hanging the notebook.
        r = requests.get(url, headers=request_headers, timeout=30)
        # Do not parse error pages: they would silently contribute zero links.
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching catalog page {url}: {e}")
    else:
        soup = BeautifulSoup(r.content, 'html.parser')
        # Find all product links on the page
        for a in soup.find_all('a', class_='search__page_item-name'):
            href = a.get('href')
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            # The code assumes site-relative hrefs, e.g.
            # /catalog/apple/smartfon_iphone_14_pro/
            # -> https://xistore.by/catalog/apple/smartfon_iphone_14_pro/
            all_phones_links.append('https://xistore.by' + href)
        # Running total, reported once per page.
        print(len(all_phones_links))
    # Random pause between pages to avoid hammering the server.
    time.sleep(random.randint(1, 3))

In [None]:
# De-duplicate while keeping first-seen order. dict.fromkeys preserves insertion
# order, whereas list(set(...)) yields an order for strings that can differ between
# kernel sessions (hash randomization), making the scrape and CSV row order
# non-reproducible.
all_phones_links = list(dict.fromkeys(all_phones_links))

In [None]:
def save_to_csv(product_data_list, filename='notebooks.csv'):
    """Write scraped product records to a CSV file.

    Each record is a dict keyed by the column names listed in ``fieldnames``.
    The list/dict-valued columns (``reviews``, ``shops_availability``, ``images``,
    ``characteristics``) are serialized to JSON strings so they fit in one cell.

    The input records are NOT modified: serialization happens on a shallow copy,
    so the function can be called repeatedly on the same list without
    double-encoding the JSON columns.

    Args:
        product_data_list: iterable of product dicts.
        filename: destination path; the file is overwritten.

    Raises:
        ValueError: if a record contains a key that is not in ``fieldnames``
            (csv.DictWriter's default behavior).
    """
    fieldnames = ['name', 'description', 'reviews', 'shops_availability', 'price', 'full_price', 'brand', 'remind_status', 'url', 'category', 'images', 'characteristics']
    json_fields = ('reviews', 'shops_availability', 'images', 'characteristics')

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for product_data in product_data_list:
            # Work on a copy so the caller's dict keeps its real lists/dicts.
            row = dict(product_data)
            for field in json_fields:
                # Absent columns are left out; DictWriter writes them as empty cells.
                if field in row:
                    row[field] = json.dumps(row[field])
            writer.writerow(row)

In [None]:
def _parse_price(element):
    """Turn a price tag into a float; return None if the tag is missing or unparsable.

    NOTE(review): `.text` already strips markup, so the '<sup>' / '</sup>'
    replacements below normally never match. If the page renders the fractional
    part inside a <sup> tag, its digits are simply concatenated to the integer
    part (e.g. "1299" + "00" -> 129900.0). Confirm against the live markup.
    """
    if element is None:
        return None
    price_str = element.text.replace(' ', '').replace('<sup>', '.').replace('</sup>', '').strip()
    if not price_str:
        return None
    try:
        return float(price_str)
    except ValueError:
        print(f"Could not parse price from {price_str!r}")
        return None


all_product_data = []

# Collect detail information: one dict per product page, appended to all_product_data.
# Pages that cannot be fetched or parsed are reported and skipped so that a single
# bad page does not abort the whole run.
for link in tqdm(all_phones_links):
    try:
        # The timeout (seconds) keeps a stalled connection from hanging the loop.
        response = requests.get(link, headers=request_headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')

        product_data = {}

        # Name
        product_data['name'] = soup.find('h1').text.strip()

        # Description: prefer the visible description block, fall back to the meta tag.
        description_block = soup.find('div', class_='detail-text-description basic--content')
        if description_block is not None:
            description = description_block.text.strip()
        else:
            meta_description = soup.find('meta', attrs={'itemprop': 'description'})
            description = meta_description.get('content', '').strip() if meta_description is not None else ""
        product_data['description'] = description

        # Reviews (placeholder: review scraping is not implemented)
        product_data['reviews'] = []

        # Shops availability: one entry per shop, grouped on the page by city block.
        # The loop variable keeps the name `script` because a later cell inspects it.
        shops_list = []
        city_blocks = soup.find_all('div', class_=lambda text: text and 'wrapped-city city-wrap' in text)
        for script in city_blocks:
            try:
                city = script.find('div', class_='pa-heading').text.strip()
                for shop_block in script.find_all('div', class_='pa-result_item'):
                    shop_availability = shop_block.find('div', class_=lambda x: x and x.startswith('count-code'))
                    # NOTE(review): a first class ending in 'available-res' is treated
                    # as NOT available (presumably the class reads '...unavailable-res').
                    # Verify against the live markup.
                    shops_list.append({
                        'city': city,
                        'name': shop_block.find('div', class_='pa-result_title').text.strip(),
                        'availability': not shop_availability.attrs['class'][0].endswith('available-res'),
                        'worktime': shop_block.find('div', class_='pa-result_time').find('p', class_='pa-result_info-data').text.strip(),
                        'adress': shop_block.find('div', class_='pa-result_adress').find('p', class_='pa-result_info-data').text.strip(),
                    })
            except (AttributeError, KeyError, IndexError):
                # A missing sub-element yields None (-> AttributeError) or a missing
                # attribute (-> KeyError). Shops already collected are kept; the rest
                # of this city block is skipped.
                print(f"Error parsing availability data for {link}")
        product_data['shops_availability'] = shops_list

        # Price: always present in the record; None when the page shows no price.
        price_element = soup.find('span', class_='count price-color')
        product_data['price'] = _parse_price(price_element)

        # Discount: the pre-discount price, or 0.0 when there is none.
        old_price_element = soup.find('span', class_='old-price')
        if old_price_element is not None and price_element is not None:
            product_data['full_price'] = _parse_price(old_price_element) or 0.0
        else:
            product_data['full_price'] = 0.0

        # Brand (meta tag)
        # NOTE(review): reads itemprop="name"; confirm this tag really carries the brand.
        brand_meta = soup.find('meta', attrs={'itemprop': 'name'})
        product_data['brand'] = brand_meta.get('content', '') if brand_meta is not None else ''

        # Remind status: True when at least one shop reports availability.
        product_data['remind_status'] = any(shop['availability'] for shop in shops_list)

        # URL of the product page itself (not the catalog page URL left in `url`).
        product_data['url'] = link

        # Category
        category_element = soup.find('a', class_='current')  # Assuming category in breadcrumb
        product_data['category'] = category_element.text.strip() if category_element else "Unknown"

        # Images (tags without a src attribute are skipped)
        product_data['images'] = [img['src'] for img in soup.find_all('img', itemprop='image') if img.has_attr('src')]

        # Characteristics: flat list of {'name', 'value'} pairs across all lists.
        product_data['characteristics'] = []
        for container in soup.select('.characteristic--list'):
            for item in container.select('.characteristic--item'):
                name_tag = item.select_one('.name')
                value_tag = item.select_one('.characteristic')
                if name_tag is None or value_tag is None:
                    # Incomplete row: nothing meaningful to record.
                    continue
                product_data['characteristics'].append({'name': name_tag.text.strip(), 'value': value_tag.text.strip()})

        all_product_data.append(product_data)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {link}: {e}")
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Unexpected page layout (e.g. no <h1>): report and move on to the next link.
        print(f"Error parsing page {link}: {e}")
    # Random pause after every attempt, successful or not, to stay polite to the server.
    time.sleep(random.randint(1, 3))

In [None]:
# Persist every scraped product record to televizory.csv; save_to_csv JSON-encodes
# the list-valued columns (reviews, shops_availability, images, characteristics).
save_to_csv(all_product_data, filename='televizory.csv')

In [None]:
# Ad-hoc inspection: `product_data` is whatever the last iteration of the scraping
# loop left behind, so this shows the category of that single product only.
product_data['category']

In [None]:
# Ad-hoc inspection: shop entries inside the last city block visited by the scraping
# loop. `script` is a leftover loop variable, so on a fresh kernel this raises
# NameError if no scraped page contained a city block.
script.find_all('div', class_='pa-result_item')

In [None]:
# Dump the most recently parsed page (`soup`) to disk for offline inspection.
# The encoding is set explicitly: with the platform default (not UTF-8 on every OS)
# non-ASCII page text can raise UnicodeEncodeError.
with open('test.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())