In [1]:
# HTML session requesting
from requests_html import HTMLSession
# HTML parsing 
from bs4 import BeautifulSoup
# XML validation
from lxml import etree
# Regex
import re
# Time
import time
# Date
from dateutil.parser import parse

# Address of the page that the processing section downloads and parses.
URL = "https://comfy.ua/"

# Utils

In [2]:
def is_decimal(item) -> bool:
    """Return True if ``item`` is a string holding an unsigned decimal number.

    Accepted forms are plain digits ("42") and digits with a single dot
    ("3.14", "5.", ".5"). Everything else yields False, including the empty
    string, a lone ".", signed numbers and values that are not strings.
    """
    if not isinstance(item, str):
        return False
    # fullmatch (rather than ^...$) so a trailing newline is not tolerated;
    # both alternatives require at least one digit.
    return re.fullmatch(r"[0-9]+(?:\.[0-9]*)?|\.[0-9]+", item) is not None

def is_date(string, fuzzy=False):
    """Return True if ``string`` can be parsed as a date by ``dateutil``.

    Args:
        string: Text to test.
        fuzzy: Forwarded to ``dateutil.parser.parse``.

    Returns:
        True when parsing succeeds, False when the parser rejects the text.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # The parser raises OverflowError (not ValueError) when the numbers in
        # the text are too large for a date; that is still "not a date".
        return False
    return True

def is_time_format(input):
    """Tell whether ``input`` is a clock time written as ``%H:%M``."""
    try:
        time.strptime(input, '%H:%M')
    except ValueError:
        matches = False
    else:
        matches = True
    return matches

def base_type_def(data: str) -> str:
    """Classify a text value by the simplest base type that describes it.

    Surrounding whitespace is ignored. The checks run in a fixed order and the
    first one that matches wins.

    Args:
        data: Raw text value.

    Returns:
        One of "integer", "float", "bool", "time", "date", "string", or the
        string "None" for an empty (or whitespace-only) value.
    """
    data = data.strip()

    # Checked first so that an empty value can never be claimed by one of the
    # looser checks below.
    if data == "":
        return "None"

    if data.isnumeric():
        return "integer"
    if is_decimal(data):
        return "float"
    if data.lower() in ['true', 'false']:
        return "bool"
    if is_time_format(data):
        return "time"
    if is_date(data):
        return "date"
    return "string"

In [3]:
def replace_by_dict(data: str, value_dict: dict) -> str:
    """Apply every ``old -> new`` substitution of ``value_dict`` to ``data``.

    The substitutions run one after another in the dict's iteration order,
    each one working on the output of the previous one.
    """
    replaced = data
    for old, new in value_dict.items():
        replaced = replaced.replace(old, new)
    return replaced

In [4]:
def write_to_file(string: str, path_name: str, encoding: str = 'UTF-8') -> None:
    """Write ``string`` to ``path_name``, replacing any existing content.

    Args:
        string: Text to store.
        path_name: Destination file path.
        encoding: Text encoding used for the file.
    """
    with open(path_name, mode='w', encoding=encoding) as sink:
        sink.write(string)

In [5]:
def validate_xml(xml_path: str, xsd_path: str):
    """Check the XML file at ``xml_path`` against the XSD file at ``xsd_path``.

    Returns whatever ``XMLSchema.validate`` reports for the parsed document.
    """
    schema = etree.XMLSchema(etree.parse(xsd_path))
    document = etree.parse(xml_path)
    return schema.validate(document)

In [6]:
def tree_to_xmlstring(node: list, version: str = '1.0', encoding: str = 'UTF-8') -> str:
    """Serialize a node tree into an XML document string, prolog included.

    Args:
        node: Sequence of ``(name, attributes, content)`` nodes; it is passed
            unchanged to ``tree_to_xmlstring_inner``.
        version: XML version written into the prolog, "1.0" or "1.1".
        encoding: Encoding name written into the prolog. It is only declared
            here; the function returns a ``str`` and encodes nothing.

    Returns:
        The XML prolog line followed by the serialized nodes.

    Raises:
        ValueError: If ``version`` or ``encoding`` is not one of the accepted
            values.
    """
    if version not in ('1.0', '1.1'):
        raise ValueError("Specified version doesn't exist!")

    if encoding not in (
        "ASCII",
        "UTF-8",
        "UTF-16",
        "US-ASCII",
        "ISO-8859-1",
    ):
        raise ValueError("Specified encoding doesn't exist!")

    prolog = '<?xml version="{ver}" encoding="{enc}"?>\n'.format(ver=version, enc=encoding)
    return prolog + tree_to_xmlstring_inner(node)


# Characters replaced by entities in element text and attribute values.
_XML_ESCAPES = str.maketrans({
    '&':    '&amp;',
    '<':    '&lt;',
    '>':    '&gt;',
    '\'':   '&apos;',
    '"':    '&quot;'
})


def tree_to_xmlstring_inner(node_cluster: list, level: int = 0) -> str:
    """Serialize sibling nodes (and their descendants) as indented XML.

    Each node is indexed as ``node[0]`` (element name), ``node[1]`` (mapping
    of attribute names to values) and ``node[2]`` (content). The content
    decides the output form:

    * ``""``   - the node is skipped entirely;
    * ``None`` - a self-closing element;
    * a list   - child nodes, serialized recursively one level deeper;
    * anything else - converted with ``str`` and written as escaped text.

    Args:
        node_cluster: Nodes to serialize, in output order.
        level: Nesting depth; every level indents by two spaces.

    Returns:
        The serialized elements, one per line, or "" for no nodes.
    """
    indent = '  ' * level
    parts = []
    for node in node_cluster:
        name, attrs, content = node[0], node[1], node[2]
        if content == "":
            continue

        # Attribute values are escaped as well: a raw quote, '&' or '<' would
        # otherwise break out of the quoted value and yield malformed XML.
        attr_text = ''.join(
            ' {attr_name}="{attr_val}"'.format(
                attr_name=key, attr_val=str(value).translate(_XML_ESCAPES))
            for key, value in attrs.items()
        )
        opening = '{pad}<{name}{attrs}'.format(pad=indent, name=name, attrs=attr_text)

        if content is None:
            parts.append(opening + '/>\n')
        elif isinstance(content, list):
            parts.append(
                opening + '>\n'
                + tree_to_xmlstring_inner(content, level + 1)
                + indent + '</{name}>\n'.format(name=name)
            )
        else:
            parts.append('{open}>{data}</{name}>\n'.format(
                open=opening, data=str(content).translate(_XML_ESCAPES), name=name))

    return ''.join(parts)

# Processing

In [7]:
# Give up on an unresponsive server instead of blocking the kernel forever.
REQUEST_TIMEOUT_SECONDS = 30

session = HTMLSession()
response = session.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
# Stop here on an HTTP error status rather than parsing an error page.
response.raise_for_status()
# Pass the already-decoded text straight to the parser; re-encoding it to
# bytes would make BeautifulSoup guess the encoding a second time.
soup = BeautifulSoup(response.text, 'lxml')

In [8]:
# Class string that identifies the <div> elements collected as products.
PRODUCT_CARD_CLASS = 'products-list-item prc__item products-list-item--grid'
products = soup.find_all('div', attrs={'class': PRODUCT_CARD_CLASS})

In [9]:
def add_located(prd, element: str, elem_class: str, needs_split: bool):
    """Return the stripped text of the first matching child of ``prd``.

    Args:
        prd: Object whose ``find(element, attrs=...)`` performs the lookup.
        element: Tag name to look for.
        elem_class: Value of the ``class`` attribute to look for.
        needs_split: When true, return the text split on single spaces.

    Returns:
        The stripped text, or the list of its space-separated pieces when
        ``needs_split`` is true. ``""`` is returned when the lookup yields
        nothing usable.
    """
    try:
        text = prd.find(element, attrs={'class': elem_class}).text.strip()
    except AttributeError:
        # Raised when the lookup result has no ``.text`` (for example ``None``
        # for a missing element). Other errors are left to propagate instead
        # of being hidden behind an empty value.
        return ""
    return text.split(' ') if needs_split else text

In [10]:
def _price_text(tokens) -> str:
    """Build the price string from the space-split pieces of a price label.

    Takes the first run of digits of each piece, keeps the first two runs and
    joins them with '.'. Pieces without digits are ignored. Returns "" when no
    piece contains a digit.
    """
    digit_runs = []
    for token in tokens:
        found = re.search(r"\d+", token)
        if found:
            digit_runs.append(found.group())
    # NOTE(review): a label such as "12 999" becomes "12.999"; confirm that
    # the '.' separator is what res.xsd expects.
    return '.'.join(digit_runs[:2])


def _price_node(product, node_name: str, css_class: str):
    """Return the ``(name, attributes, text)`` node for one price label, or None."""
    tokens = add_located(product, "div", css_class, True)
    if tokens == "":
        return None
    price_text = _price_text(tokens)
    if price_text == "":
        # A label without any digits carries no price; emit no node for it.
        return None
    return (node_name, {"currency": '₴'}, price_text)


product_tree = [
    ("products", {}, [])
]

for product in products:
    product_data = [
        ("name", {}, add_located(product, "a", "products-list-item__name", False)),
        ("rating", {}, add_located(product, "div", "rating-box__active", False)),
        ("review_count", {}, add_located(product, "a", "products-list-item__reviews", False)),
    ]

    # Price block
    for node_name, css_class in (
        ("price_old", "products-list-item__actions-price-old"),
        ("price_new", "products-list-item__actions-price-current"),
    ):
        price_node = _price_node(product, node_name, css_class)
        if price_node is not None:
            product_data.append(price_node)

    discount = add_located(product, "span", "products-list-item__actions-price-discount", False)
    if discount != "":
        product_data.append(("discount", {}, discount))

    product_tree[0][2].append(("product", {}, product_data))

In [11]:
# Serialize the collected tree, then store it as res.xml.
xml_document = tree_to_xmlstring(product_tree)
write_to_file(xml_document, "res.xml")

# Validation

In [12]:
# Kept as the last expression so the validation result is shown as cell output.
validate_xml(xml_path="res.xml", xsd_path="res.xsd")

True