In [4]:
# Original Version
#!/usr/bin/python3

# Requirements
#
#  apt install python3-pip
#  pip install requests beautifulsoup4

import requests
import xml.etree.ElementTree as ET
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import csv
import os


# factored out error handling, we expect all urls to yield status 200
def fetch(url):
    """Fetch *url* with an HTTP GET request and return the response.

    Args:
        url: the URL to request.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Exception: if the response status is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # status_code is an integer, so it is formatted via an f-string;
        # concatenating it to a str would raise a TypeError and hide the
        # actual HTTP error.
        raise Exception(f'Aborting, status {response.status_code} for URL {url}')
    return response

# Responses that were already fetched, keyed by URL, so every URL is
# requested only once.
cache = {}


def fetch_cached(url):
    """Return the response for *url*, fetching it only on the first call."""
    if url in cache:
        return cache[url]
    response = fetch(url)
    cache[url] = response
    return response

def get_id_from_url(url):
    """Return the path segment that directly follows '/dp/' in *url*.

    Example: '.../dp/9-122/some-slug/' yields '9-122'.
    """
    after_marker = url.split('/dp/')[1]
    product_id, _, _ = after_marker.partition('/')
    return product_id

def get_item_group_id(url):
    """Return the item group id encoded in a product URL.

    The path segment after '/dp/' is cut at its first '-', so both
    '.../dp/9/slug/' and '.../dp/9-122/slug/' yield '9'.
    """
    segment = url.split('/dp/')[1].partition('/')[0]
    # Everything from the first '-' on is dropped; only the leading part
    # identifies the group.
    group_id, _, _ = segment.partition('-')
    return group_id

def scrape_product_urls(base_url):
    """Download the sitemap at *base_url* and return the text of all its loc elements."""
    loc_path = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    sitemap = ET.fromstring(fetch(base_url).content)
    return [loc.text for loc in sitemap.findall(loc_path)]

# Function to extract variation URLs from JSON data
def extract_variation_urls(product_url):
    """Return the variation list the shop reports for a product.

    The item group id is taken from *product_url* and passed to the shop's
    'allVariations' controller; its JSON answer is returned as is.

    Raises:
        Exception: if the request fails or the answer is not a JSON list.
    """
    group_id = get_item_group_id(product_url)
    variation_url = f'https://www.packingpanic.de/index.php?fc=module&module=papa_sidecar&controller=allVariations&id={group_id}'
    variations = json.loads(fetch(variation_url).text)
    if isinstance(variations, list):
        return variations
    raise Exception('list expected for ' + variation_url)

def extract_image_links(data):
    """Return all but the first entry of data['image'] joined by ','.

    Yields '' when there is no 'image' entry or it holds at most one item.
    """
    images = data.get('image', [])
    if len(images) <= 1:
        return ''
    return ','.join(images[1:])

def map_availability(availability):
    """Translate a schema.org availability URL into the feed's availability value.

    Values that are not one of the four known URLs map to ''.
    """
    feed_values = {
        'http://schema.org/OutOfStock': 'out_of_stock',
        'http://schema.org/InStock': 'in_stock',
        'http://schema.org/PreOrder': 'preorder',
        'http://schema.org/BackOrder': 'backorder',
    }
    try:
        return feed_values[availability]
    except KeyError:
        return ''

#
# Retrieves the category page by url and extracts a category from its meta keywords
# We use the following scheme for the keywords: facebook-category-123, google-category-4711
#
def get_category_from_keywords(url, prefix):
    """Return the category id announced in a category page's meta keywords.

    Args:
        url: URL of the category page; an empty value yields ''.
        prefix: keyword prefix to look for, e.g. 'google-category-'.

    Returns:
        The third '-'-separated part of the first keyword that starts with
        *prefix* (e.g. '4711' for 'google-category-4711'), or '' when the page
        has no keywords meta tag or no keyword matches.

    Raises:
        Exception: if the category page cannot be fetched.
    """
    if not url:
        return ''
    response = fetch_cached(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta = soup.find('meta', {'name': 'keywords'})
    # find() yields None when the page has no keywords meta tag; treat that
    # (and a tag without content) like a page without matching keywords
    # instead of failing on a None subscript.
    keywords = (meta.get('content') or '') if meta is not None else ''
    for keyword in keywords.split(','):
        keyword = keyword.strip()
        if keyword.startswith(prefix):
            return keyword.split('-')[2]
    return ''

def scrape_product_data(url):
    """Scrape one product page and return its feed fields as a dict.

    The values come from the page's JSON-LD blocks of type 'Product' and
    'BreadcrumbList'; the category ids are looked up on the category page
    linked from the second-to-last breadcrumb.

    Args:
        url: URL of the product (or product variation) page. It must contain
            '/dp/' because the product id and item group id are parsed from it.

    Returns:
        A dict with the keys title, description, link, price, price_value,
        availability, gtin, brand, image_link, additional_image_link,
        product_id, item_group_id, color, size, custom_label,
        facebook_category, google_category, product_type and
        idealo_category_path.

    Raises:
        Exception: if a page cannot be fetched or the product page has no
            'Product' or 'BreadcrumbList' JSON-LD block.
        ValueError: if the offer's price cannot be converted to a float.
    """
    # Everything below works on the url argument only, so the function does
    # not depend on any variable of the calling script.
    response = fetch(url)
    result = {}
    soup = BeautifulSoup(response.content, 'html.parser')
    # get all JSON-LD script elements
    scripts = soup.find_all('script', {'type': 'application/ld+json'})
    # find the json data for Product and BreadcrumbList
    data = None
    breadcrumbs = None
    for script in scripts:
        obj = json.loads(script.string)
        # only JSON objects carry an '@type' we can dispatch on
        if not isinstance(obj, dict):
            continue
        ld_type = obj.get('@type', '')
        if ld_type == 'Product':
            data = obj
        elif ld_type == 'BreadcrumbList':
            breadcrumbs = obj
    # fail with a readable message instead of an unbound-variable error below
    if data is None:
        raise Exception('Product data missing for URL ' + url)
    if breadcrumbs is None:
        raise Exception('BreadcrumbList missing for URL ' + url)

    result['title'] = data.get('name', '').strip()
    result['description'] = data.get('description', '').strip()
    result['link'] = url

    offers = data.get('offers', {})
    price_value = float(offers.get('price', ''))
    result['price'] = f"{offers.get('price', '')} {offers.get('priceCurrency', '')}"
    result['price_value'] = price_value
    result['availability'] = map_availability(offers.get('availability', ''))

    result['gtin'] = data.get('gtin13', '')

    brand_data = data.get('brand', {})
    result['brand'] = brand_data.get('name', '').strip()

    # first image is the main image, the remaining ones are "additional"
    image_links = offers.get('image', [])
    result['image_link'] = image_links[0] if image_links else ''
    result['additional_image_link'] = extract_image_links(offers)

    # Extract ID and Item Group ID from the URL
    result['product_id'] = get_id_from_url(url)
    result['item_group_id'] = get_item_group_id(url)

    result['color'] = data.get('color', '')
    result['size'] = data.get('size', '')

    # Determine the custom label based on price ranges
    if price_value < 5:
        custom_label = 'Price A'
    elif price_value < 25:
        custom_label = 'Price B'
    elif price_value < 100:
        custom_label = 'Price C'
    else:
        custom_label = 'Price D'
    result['custom_label'] = custom_label

    # the immediately assigned category is the second last from the BreadcrumbList
    # extract the url for the category page
    items = breadcrumbs.get('itemListElement', [])
    category_url = ''
    if len(items) > 1:
        category_url = items[-2]['item']

    result['facebook_category'] = get_category_from_keywords(category_url, 'facebook-category-')
    result['google_category'] = get_category_from_keywords(category_url, 'google-category-')
    result['product_type'] = data.get('category', '')

    # Construct path of category names separated by ' & ', leaving out the
    # first and the last breadcrumb. Slicing keeps the parsed list intact.
    result['idealo_category_path'] = ''
    if len(items) > 2:
        result['idealo_category_path'] = ' & '.join([x['name'] for x in items[1:-1]])

    return result

def strip_url_fragment(url):
    """Return *url* without its first '#' and everything after it."""
    without_fragment, _, _ = url.partition('#')
    return without_fragment

sitemap_url = 'https://www.packingpanic.de/2_de_0_sitemap.xml'
product_urls = scrape_product_urls(sitemap_url)

# One list of row dicts per export target; each list becomes its own TSV file.
google_export = []
idealo_export = []
geizhals_export = []
guenstiger_export = []
billiger_export = []

for url in product_urls:
    # only sitemap entries below /dp/ are handled as products
    if not url.startswith('https://www.packingpanic.de/dp/'):
        continue
    print(url)
    variations = extract_variation_urls(url)
    # without any variation the product is exported under its own URL
    variation_urls = variations if any(variations) else [url]

    for product_url in variation_urls:
        print('+' + product_url)
        data = scrape_product_data(product_url)
        # some times gtin is missing if product is not SKU
        if data['gtin'] == '' and data['availability'] == 'out_of_stock':
            continue
        google_data = {
            'id': data['product_id'],
            'item_group_id': data['item_group_id'],
            # 'language': 'de',
            'product_type': data['product_type'],
            'google_product_category': data['google_category'],
            'title': data['title'],
            'link': data['link'],
            'price': data['price'],
            'color': data['color'],
            'size': data['size'],
            'availability': data['availability'],
            'sale_price': '',
            'image_link': data['image_link'],
            'additional_image_link': data['additional_image_link'],
            'gtin': data['gtin'],
            # DO NOT export MPN since not always in the product data
            'mpn': '',
            'brand': data['brand'],
            'description': data['description'],
            'condition': 'new',
            'custom_label_0': data['custom_label'],
            'feed_label': data['product_type'],
            'age_group': 'adult',
        }
        google_export.append(google_data)

        # delivery is charged with 3.50 unless the price reaches 35
        deliveryCosts = '3.50'
        if data['price_value'] >= 35:
            deliveryCosts = '0.00'
        # add variant text to the product title
        variant_suffix = ''
        if data['color']:
            variant_suffix = ', ' + data['color']
        if data['size']:
            variant_suffix += ', ' + data['size']
        # TODO: Extract this from product page!
        deliveryText = 'Lieferung in 1-3 Werktagen'
        availabilityText = 'Nicht lagernd'
        if data['availability'] == 'in_stock':
            availabilityText = 'sofort lieferbar, ' + deliveryText

        # All image URLs as one comma separated value. Empty parts are left
        # out so that a product with one (or no) image does not end up with
        # a dangling ','.
        image_urls = ','.join(
            link for link in (data['image_link'], data['additional_image_link']) if link
        )

        # similar to google, order is same as appearance in the spec
        # https://www.solute.de/ger/support/leitfaden_shop_integration/leitfaden_shop_integration.php
        billiger_data = {
            'id': data['product_id'],
            'gtin': data['gtin'],
            'title': data['title'] + variant_suffix,
            'brand': data['brand'],
            'desc': data['description'],
            'link': strip_url_fragment(data['link']) + '?utm_source=solute',
            # TODO: a bit unclear what this is about
            'target_url': strip_url_fragment(data['link']),
            'images': image_urls,
            # price and availability
            'price': data['price'],
            'dlv_cost': deliveryCosts,
            'dlv_time': deliveryText,
            'availability': data['availability'].replace('_', ' '),
            # category
            'shop_cat': data['idealo_category_path'],
            'google_product_category_ID': data['google_category'],
            # product details
            'item_group_id': data['item_group_id'],
            'size': data['size'],
            'color': data['color'],
        }
        billiger_export.append(billiger_data)

        # amend google data for guenstiger_export
        guenstiger_data = google_data.copy()
        guenstiger_data['availability_guenstiger'] = availabilityText
        guenstiger_data['shipment_cost_guenstiger'] = deliveryCosts
        guenstiger_export.append(guenstiger_data)

        # export only available products to idealo
        if data['availability'] == 'in_stock':
            idealo_export.append({
                'sku': data['product_id'],
                'brand': data['brand'],
                'title': data['title'] + variant_suffix,
                'description': data['description'],
                'url': strip_url_fragment(data['link']),
                'price': data['price'],
                'colour': data['color'],
                'size': data['size'],
                'categoryPath': data['idealo_category_path'],
                'imageUrls': image_urls,
                'eans': data['gtin'],
                'deliveryTime': deliveryText,
                'deliveryCosts_dhl': deliveryCosts,
            })

        geizhals_data = {
            'Artikelnummer': data['product_id'],
            'Produktbezeichnung': data['title'] + variant_suffix,
            'Herstellername': data['brand'],
            'Preis': data['price_value'],
            'Deeplink': strip_url_fragment(data['link']),
            'MPN': '',
            'Verfügbarkeit': availabilityText,
            'EAN': data['gtin'],
            'Kategorie': data['idealo_category_path'],
            'Bild': data['image_link'],
            'Beschreibung': data['description'],
            'Versand DE Vorkasse': deliveryCosts,
        }
        geizhals_export.append(geizhals_data)
                    
def write_tsv(name, data, directory='/Users/bakhrululum19/Downloads/Downloads'):
    """Write a list of row dicts as a tab separated file with a header line.

    Args:
        name: file name of the export, e.g. 'google-products-de.tsv'.
        data: list of dicts; the keys of the first dict become the columns.
        directory: directory the file is written to. Defaults to the path
            that used to be hard-coded here.

    An empty *data* list has no first row to take the header from, so nothing
    is written and a message is printed instead of failing with an
    IndexError (which would also abort all exports that follow).
    """
    file_path = os.path.join(directory, name)

    if not data:
        print('No data to export, skipping', file_path)
        return

    # newline='' lets the csv module control the line endings itself
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        fieldnames = list(data[0].keys())
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(data)

    print('Data successfully scraped and saved to', file_path)

# Write one TSV file per export target
for export_name, export_rows in (
    ('google-products-de.tsv', google_export),
    ('idealo-products-de.tsv', idealo_export),
    ('guenstiger-products-de.tsv', guenstiger_export),
    ('geizhals-products-de.tsv', geizhals_export),
    ('billiger-products-de.tsv', billiger_export),
):
    write_tsv(export_name, export_rows)

https://www.packingpanic.de/dp/6/usb-a-zu-usb-c-kabel-50cm/
+https://www.packingpanic.de/dp/6/usb-a-zu-usb-c-kabel-50cm/
https://www.packingpanic.de/dp/7/usb-cable-type-a-to-type-c-25cm/
+https://www.packingpanic.de/dp/7/usb-cable-type-a-to-type-c-25cm/
https://www.packingpanic.de/dp/9/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/
+https://www.packingpanic.de/dp/9-122/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-m/farbe-go_berry
+https://www.packingpanic.de/dp/9-123/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-s/farbe-go_berry
+https://www.packingpanic.de/dp/9-124/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-l/farbe-go_berry
+https://www.packingpanic.de/dp/9-125/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-xs/farbe-go_berry
+https://www.packingpanic.de/dp/9-126/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-xl/farbe-go_berry
+https://www.packingpanic.de/dp/9-12

In [6]:
# Original Version
#!/usr/bin/python3

# Requirements
#
#  apt install python3-pip
#  pip install requests beautifulsoup4
#
# Change in this version: adds a 'gender' field to the Google export data

import requests
import xml.etree.ElementTree as ET
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import csv
import os


# factored out error handling, we expect all urls to yield status 200
def fetch(url):
    """Fetch *url* with an HTTP GET request and return the response.

    Args:
        url: the URL to request.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Exception: if the response status is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # status_code is an integer, so it is formatted via an f-string;
        # concatenating it to a str would raise a TypeError and hide the
        # actual HTTP error.
        raise Exception(f'Aborting, status {response.status_code} for URL {url}')
    return response

# Responses that were already fetched, keyed by URL, so every URL is
# requested only once.
cache = {}


def fetch_cached(url):
    """Return the response for *url*, fetching it only on the first call."""
    if url in cache:
        return cache[url]
    response = fetch(url)
    cache[url] = response
    return response

def get_id_from_url(url):
    """Return the path segment that directly follows '/dp/' in *url*.

    Example: '.../dp/9-122/some-slug/' yields '9-122'.
    """
    after_marker = url.split('/dp/')[1]
    product_id, _, _ = after_marker.partition('/')
    return product_id

def get_item_group_id(url):
    """Return the item group id encoded in a product URL.

    The path segment after '/dp/' is cut at its first '-', so both
    '.../dp/9/slug/' and '.../dp/9-122/slug/' yield '9'.
    """
    segment = url.split('/dp/')[1].partition('/')[0]
    # Everything from the first '-' on is dropped; only the leading part
    # identifies the group.
    group_id, _, _ = segment.partition('-')
    return group_id

def scrape_product_urls(base_url):
    """Download the sitemap at *base_url* and return the text of all its loc elements."""
    loc_path = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    sitemap = ET.fromstring(fetch(base_url).content)
    return [loc.text for loc in sitemap.findall(loc_path)]

# Function to extract variation URLs from JSON data
def extract_variation_urls(product_url):
    """Return the variation list the shop reports for a product.

    The item group id is taken from *product_url* and passed to the shop's
    'allVariations' controller; its JSON answer is returned as is.

    Raises:
        Exception: if the request fails or the answer is not a JSON list.
    """
    group_id = get_item_group_id(product_url)
    variation_url = f'https://www.packingpanic.de/index.php?fc=module&module=papa_sidecar&controller=allVariations&id={group_id}'
    variations = json.loads(fetch(variation_url).text)
    if isinstance(variations, list):
        return variations
    raise Exception('list expected for ' + variation_url)

def extract_image_links(data):
    """Return all but the first entry of data['image'] joined by ','.

    Yields '' when there is no 'image' entry or it holds at most one item.
    """
    images = data.get('image', [])
    if len(images) <= 1:
        return ''
    return ','.join(images[1:])

def map_availability(availability):
    """Translate a schema.org availability URL into the feed's availability value.

    Values that are not one of the four known URLs map to ''.
    """
    feed_values = {
        'http://schema.org/OutOfStock': 'out_of_stock',
        'http://schema.org/InStock': 'in_stock',
        'http://schema.org/PreOrder': 'preorder',
        'http://schema.org/BackOrder': 'backorder',
    }
    try:
        return feed_values[availability]
    except KeyError:
        return ''

#
# Retrieves the category page by url and extracts a category from its meta keywords
# We use the following scheme for the keywords: facebook-category-123, google-category-4711
#
def get_category_from_keywords(url, prefix):
    """Return the category id announced in a category page's meta keywords.

    Args:
        url: URL of the category page; an empty value yields ''.
        prefix: keyword prefix to look for, e.g. 'google-category-'.

    Returns:
        The third '-'-separated part of the first keyword that starts with
        *prefix* (e.g. '4711' for 'google-category-4711'), or '' when the page
        has no keywords meta tag or no keyword matches.

    Raises:
        Exception: if the category page cannot be fetched.
    """
    if not url:
        return ''
    response = fetch_cached(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta = soup.find('meta', {'name': 'keywords'})
    # find() yields None when the page has no keywords meta tag; treat that
    # (and a tag without content) like a page without matching keywords
    # instead of failing on a None subscript.
    keywords = (meta.get('content') or '') if meta is not None else ''
    for keyword in keywords.split(','):
        keyword = keyword.strip()
        if keyword.startswith(prefix):
            return keyword.split('-')[2]
    return ''

def scrape_product_data(url):
    """Scrape one product page and return its feed fields as a dict.

    The values come from the page's JSON-LD blocks of type 'Product' and
    'BreadcrumbList'; the category ids are looked up on the category page
    linked from the second-to-last breadcrumb.

    Args:
        url: URL of the product (or product variation) page. It must contain
            '/dp/' because the product id and item group id are parsed from it.

    Returns:
        A dict with the keys title, description, link, price, price_value,
        availability, gtin, brand, image_link, additional_image_link,
        product_id, item_group_id, color, size, custom_label,
        facebook_category, google_category, product_type,
        idealo_category_path and gender. gender is 'male', 'female',
        'unisex' or '' when the breadcrumbs do not name one.

    Raises:
        Exception: if a page cannot be fetched or the product page has no
            'Product' or 'BreadcrumbList' JSON-LD block.
        ValueError: if the offer's price cannot be converted to a float.
    """
    # Everything below works on the url argument only, so the function does
    # not depend on any variable of the calling script.
    response = fetch(url)
    result = {}
    soup = BeautifulSoup(response.content, 'html.parser')
    # get all JSON-LD script elements
    scripts = soup.find_all('script', {'type': 'application/ld+json'})
    # find the json data for Product and BreadcrumbList
    data = None
    breadcrumbs = None
    for script in scripts:
        obj = json.loads(script.string)
        # only JSON objects carry an '@type' we can dispatch on
        if not isinstance(obj, dict):
            continue
        ld_type = obj.get('@type', '')
        if ld_type == 'Product':
            data = obj
        elif ld_type == 'BreadcrumbList':
            breadcrumbs = obj
    # fail with a readable message instead of an unbound-variable error below
    if data is None:
        raise Exception('Product data missing for URL ' + url)
    if breadcrumbs is None:
        raise Exception('BreadcrumbList missing for URL ' + url)

    result['title'] = data.get('name', '').strip()
    result['description'] = data.get('description', '').strip()
    result['link'] = url

    offers = data.get('offers', {})
    price_value = float(offers.get('price', ''))
    result['price'] = f"{offers.get('price', '')} {offers.get('priceCurrency', '')}"
    result['price_value'] = price_value
    result['availability'] = map_availability(offers.get('availability', ''))

    result['gtin'] = data.get('gtin13', '')

    brand_data = data.get('brand', {})
    result['brand'] = brand_data.get('name', '').strip()

    # first image is the main image, the remaining ones are "additional"
    image_links = offers.get('image', [])
    result['image_link'] = image_links[0] if image_links else ''
    result['additional_image_link'] = extract_image_links(offers)

    # Extract ID and Item Group ID from the URL
    result['product_id'] = get_id_from_url(url)
    result['item_group_id'] = get_item_group_id(url)

    result['color'] = data.get('color', '')
    result['size'] = data.get('size', '')

    # Determine the custom label based on price ranges
    if price_value < 5:
        custom_label = 'Price A'
    elif price_value < 25:
        custom_label = 'Price B'
    elif price_value < 100:
        custom_label = 'Price C'
    else:
        custom_label = 'Price D'
    result['custom_label'] = custom_label

    # the immediately assigned category is the second last from the BreadcrumbList
    # extract the url for the category page
    items = breadcrumbs.get('itemListElement', [])
    category_url = ''
    if len(items) > 1:
        category_url = items[-2]['item']

    result['facebook_category'] = get_category_from_keywords(category_url, 'facebook-category-')
    result['google_category'] = get_category_from_keywords(category_url, 'google-category-')
    result['product_type'] = data.get('category', '')

    # Construct path of category names separated by ' & ', leaving out the
    # first and the last breadcrumb. Slicing keeps the parsed list intact.
    result['idealo_category_path'] = ''
    category_items = items
    if len(items) > 2:
        category_items = items[1:-1]
        result['idealo_category_path'] = ' & '.join([x['name'] for x in category_items])

    # Derive the gender from the category breadcrumbs. The key is always set
    # (default '') so callers can read result['gender'] for every product.
    result['gender'] = ''
    for item in category_items:
        if 'position' in item and 'name' in item:
            position = item['position']
            name = item['name'].lower()
            if position == 2 and 'kleidung' in name:
                result['gender'] = ''
            elif position == 3:
                if 'maenner' in name or 'männer' in name:
                    result['gender'] = 'male'
                elif 'frauen' in name:
                    result['gender'] = 'female'
                elif 'unisex' in name:
                    result['gender'] = 'unisex'

    return result

def strip_url_fragment(url):
    """Return *url* without its first '#' and everything after it."""
    without_fragment, _, _ = url.partition('#')
    return without_fragment

sitemap_url = 'https://www.packingpanic.de/2_de_0_sitemap.xml'
product_urls = scrape_product_urls(sitemap_url)

# One list of row dicts per export target; each list becomes its own TSV file.
google_export = []
idealo_export = []
geizhals_export = []
guenstiger_export = []
billiger_export = []

for url in product_urls:
    # only sitemap entries below /dp/ are handled as products
    if not url.startswith('https://www.packingpanic.de/dp/'):
        continue
    print(url)
    variations = extract_variation_urls(url)
    # without any variation the product is exported under its own URL
    variation_urls = variations if any(variations) else [url]

    for product_url in variation_urls:
        print('+' + product_url)
        data = scrape_product_data(product_url)
        # some times gtin is missing if product is not SKU
        if data['gtin'] == '' and data['availability'] == 'out_of_stock':
            continue
        google_data = {
            'id': data['product_id'],
            'item_group_id': data['item_group_id'],
            # 'language': 'de',
            'product_type': data['product_type'],
            'google_product_category': data['google_category'],
            'title': data['title'],
            'link': data['link'],
            'price': data['price'],
            'color': data['color'],
            'size': data['size'],
            'availability': data['availability'],
            'sale_price': '',
            'image_link': data['image_link'],
            'additional_image_link': data['additional_image_link'],
            'gtin': data['gtin'],
            # DO NOT export MPN since not always in the product data
            'mpn': '',
            'brand': data['brand'],
            'description': data['description'],
            'condition': 'new',
            'custom_label_0': data['custom_label'],
            'feed_label': data['product_type'],
            'age_group': 'adult',
            # a product without a recognised gender category is exported with
            # an empty gender instead of aborting the run with a KeyError
            'gender': data.get('gender', ''),
        }
        google_export.append(google_data)

        # delivery is charged with 3.50 unless the price reaches 35
        deliveryCosts = '3.50'
        if data['price_value'] >= 35:
            deliveryCosts = '0.00'
        # add variant text to the product title
        variant_suffix = ''
        if data['color']:
            variant_suffix = ', ' + data['color']
        if data['size']:
            variant_suffix += ', ' + data['size']
        # TODO: Extract this from product page!
        deliveryText = 'Lieferung in 1-3 Werktagen'
        availabilityText = 'Nicht lagernd'
        if data['availability'] == 'in_stock':
            availabilityText = 'sofort lieferbar, ' + deliveryText

        # All image URLs as one comma separated value. Empty parts are left
        # out so that a product with one (or no) image does not end up with
        # a dangling ','.
        image_urls = ','.join(
            link for link in (data['image_link'], data['additional_image_link']) if link
        )

        # similar to google, order is same as appearance in the spec
        # https://www.solute.de/ger/support/leitfaden_shop_integration/leitfaden_shop_integration.php
        billiger_data = {
            'id': data['product_id'],
            'gtin': data['gtin'],
            'title': data['title'] + variant_suffix,
            'brand': data['brand'],
            'desc': data['description'],
            'link': strip_url_fragment(data['link']) + '?utm_source=solute',
            # TODO: a bit unclear what this is about
            'target_url': strip_url_fragment(data['link']),
            'images': image_urls,
            # price and availability
            'price': data['price'],
            'dlv_cost': deliveryCosts,
            'dlv_time': deliveryText,
            'availability': data['availability'].replace('_', ' '),
            # category
            'shop_cat': data['idealo_category_path'],
            'google_product_category_ID': data['google_category'],
            # product details
            'item_group_id': data['item_group_id'],
            'size': data['size'],
            'color': data['color'],
        }
        billiger_export.append(billiger_data)

        # amend google data for guenstiger_export
        guenstiger_data = google_data.copy()
        guenstiger_data['availability_guenstiger'] = availabilityText
        guenstiger_data['shipment_cost_guenstiger'] = deliveryCosts
        guenstiger_export.append(guenstiger_data)

        # export only available products to idealo
        if data['availability'] == 'in_stock':
            idealo_export.append({
                'sku': data['product_id'],
                'brand': data['brand'],
                'title': data['title'] + variant_suffix,
                'description': data['description'],
                'url': strip_url_fragment(data['link']),
                'price': data['price'],
                'colour': data['color'],
                'size': data['size'],
                'categoryPath': data['idealo_category_path'],
                'imageUrls': image_urls,
                'eans': data['gtin'],
                'deliveryTime': deliveryText,
                'deliveryCosts_dhl': deliveryCosts,
            })

        geizhals_data = {
            'Artikelnummer': data['product_id'],
            'Produktbezeichnung': data['title'] + variant_suffix,
            'Herstellername': data['brand'],
            'Preis': data['price_value'],
            'Deeplink': strip_url_fragment(data['link']),
            'MPN': '',
            'Verfügbarkeit': availabilityText,
            'EAN': data['gtin'],
            'Kategorie': data['idealo_category_path'],
            'Bild': data['image_link'],
            'Beschreibung': data['description'],
            'Versand DE Vorkasse': deliveryCosts,
        }
        geizhals_export.append(geizhals_data)
                    
def write_tsv(name, data, directory='/Users/bakhrululum19/Downloads/Downloads'):
    """Write a list of row dicts as a tab separated file with a header line.

    Args:
        name: file name of the export, e.g. 'google-products-de.tsv'.
        data: list of dicts; the keys of the first dict become the columns.
        directory: directory the file is written to. Defaults to the path
            that used to be hard-coded here.

    An empty *data* list has no first row to take the header from, so nothing
    is written and a message is printed instead of failing with an
    IndexError (which would also abort all exports that follow).
    """
    file_path = os.path.join(directory, name)

    if not data:
        print('No data to export, skipping', file_path)
        return

    # newline='' lets the csv module control the line endings itself
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        fieldnames = list(data[0].keys())
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(data)

    print('Data successfully scraped and saved to', file_path)

# Write one TSV file per export target
for export_name, export_rows in (
    ('google-products-de.tsv', google_export),
    ('idealo-products-de.tsv', idealo_export),
    ('guenstiger-products-de.tsv', guenstiger_export),
    ('geizhals-products-de.tsv', geizhals_export),
    ('billiger-products-de.tsv', billiger_export),
):
    write_tsv(export_name, export_rows)

https://www.packingpanic.de/dp/6/usb-a-zu-usb-c-kabel-50cm/
+https://www.packingpanic.de/dp/6/usb-a-zu-usb-c-kabel-50cm/
https://www.packingpanic.de/dp/7/usb-cable-type-a-to-type-c-25cm/
+https://www.packingpanic.de/dp/7/usb-cable-type-a-to-type-c-25cm/
https://www.packingpanic.de/dp/9/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/
+https://www.packingpanic.de/dp/9-122/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-m/farbe-go_berry
+https://www.packingpanic.de/dp/9-123/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-s/farbe-go_berry
+https://www.packingpanic.de/dp/9-124/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-l/farbe-go_berry
+https://www.packingpanic.de/dp/9-125/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-xs/farbe-go_berry
+https://www.packingpanic.de/dp/9-126/icebreaker-merino-tech-lite-ii-short-sleeve-t-shirt-herren/#/grosse-xl/farbe-go_berry
+https://www.packingpanic.de/dp/9-12