In [60]:
from typing import Optional


class Node:
	"""One element of a parsed document tree, stored in a flat, id-linked form.

	Nodes refer to each other through integer ids (``parent_id`` and
	``children``) instead of holding object references.
	"""

	def __init__(
		self,
		id: int,
		tag_name: str,
		parent_id: Optional[int],
		children: list[int],
		attrs: dict,
		text: Optional[str],
	):
		# The assignment order below fixes the key order of ``__dict__``;
		# keep ``text`` ahead of ``attrs`` so serialised output is unchanged.
		self.id: int = id
		self.tag_name: str = tag_name
		self.parent_id: Optional[int] = parent_id
		self.children: list[int] = children
		self.text: Optional[str] = text
		self.attrs: dict = attrs

In [61]:
# File extensions (leading dot included) that the converters accept and emit.
SUPPORTED_FORMATS = [
	'.html',
	'.json',
	'.xml',
	'.csv',
	'.tsv',
]

In [62]:
import os

# Prompt for the input file; the extension (with its leading dot) is what
# later selects the parser.
file_path = input('relative file path: ')
file_extension = os.path.splitext(file_path)[-1]

In [63]:
from bs4 import BeautifulSoup

def convert_html_to_dataclass(html_string: str) -> list[Node]:
	"""Parse an HTML string into a flat list of Node objects.

	Elements are visited depth-first; each node's ``id`` is its index in the
	returned list. Only tags become nodes. A node's ``text`` is filled only
	when the element's single child is a string; in every other case it is
	``''``.

	Args:
		html_string: HTML markup to parse.

	Returns:
		The nodes in depth-first order, top-level elements first.
	"""
	nodes = []

	soup = BeautifulSoup(html_string, 'html.parser')

	def own_text(soup_node) -> str:
		# ``Tag.string`` recurses into an only-child tag, so a wrapper such as
		# <div><p>hi</p></div> would receive the same text as its child and
		# the text would be emitted twice on export. Look at the direct child
		# only, and store a plain ``str`` rather than a NavigableString that
		# keeps a reference to the parse tree.
		contents = soup_node.contents
		if len(contents) == 1 and contents[0].name is None:
			return str(contents[0])
		return ''

	def parse_node(soup_node, parent_id=None):
		node_id = len(nodes)
		node = Node(
			id=node_id,
			tag_name=soup_node.name,
			parent_id=parent_id,
			children=[],
			# Copy so later edits to the node do not write through to the soup.
			attrs=dict(soup_node.attrs),
			text=own_text(soup_node)
		)
		nodes.append(node)

		for child in soup_node.children:
			# Strings have no tag name; only real tags become nodes.
			if child.name is not None:
				child_node_id = parse_node(child, node_id)
				node.children.append(child_node_id)

		return node_id

	for element in soup.find_all(recursive=False):
		parse_node(element)

	return nodes

In [64]:
def convert_xml_to_dataclass(xml_string: str) -> list[Node]:
	"""Parse an XML string into a flat list of Node objects.

	Elements are visited depth-first starting at the document root; each
	node's ``id`` is its index in the returned list.

	Args:
		xml_string: XML markup to parse.

	Returns:
		The nodes in depth-first order, root first.

	Raises:
		xml.etree.ElementTree.ParseError: if ``xml_string`` is not well-formed.
	"""
	import xml.etree.ElementTree as ET

	nodes = []

	root = ET.fromstring(xml_string)

	def parse_node(element, parent_id=None):
		# The parameter must not share a name with the Node built below:
		# rebinding it made the child loop iterate the Node instead of the
		# XML element and raise TypeError on every input.
		node_id = len(nodes)
		node = Node(
			id=node_id,
			tag_name=element.tag,
			parent_id=parent_id,
			children=[],
			# Copy so the node does not alias the element's attribute dict.
			attrs=dict(element.attrib),
			text=element.text if element.text else ''
		)
		nodes.append(node)

		for child in element:
			child_node_id = parse_node(child, node_id)
			node.children.append(child_node_id)

		return node_id

	parse_node(root)

	return nodes

In [70]:
def convert_json_to_dataclass(json_string: str) -> list[Node]:
	"""Build Node objects from a JSON document.

	The decoded document is iterated and every item is read through the keys
	``id``, ``tag_name``, ``parent_id``, ``children``, ``attrs`` and ``text``.
	Values are passed to Node unchanged.

	Args:
		json_string: JSON text to decode.

	Returns:
		One Node per item, in document order.
	"""
	import json

	return [
		Node(
			id=record['id'],
			tag_name=record['tag_name'],
			parent_id=record['parent_id'],
			children=record['children'],
			attrs=record['attrs'],
			text=record['text']
		)
		for record in json.loads(json_string)
	]

In [66]:
# Column order shared by the CSV and TSV representations of a node.
_NODE_COLUMNS = ['id', 'tag_name', 'parent_id', 'children', 'attrs', 'text']


def _parse_delimited_nodes(content: str, delimiter: str) -> list[Node]:
	"""Parse delimiter-separated node rows (CSV or TSV) into Node objects.

	Columns are read in the order id, tag_name, parent_id, children, attrs,
	text. Compared with naive per-line parsing this:

	* lets the csv module see the whole text, so quoted fields may contain
	  line breaks;
	* skips blank lines (for example the one produced by a trailing newline);
	* skips a leading header row that names the columns;
	* reads an empty (or ``None``) parent_id as ``None``;
	* reads children from either ``1,2`` or the list form ``[1, 2]``, and an
	  empty cell as no children;
	* converts id to ``int`` and attrs to a ``dict`` when the cell holds a
	  dict literal (an empty cell becomes ``{}``; any other text is kept as
	  the raw string).

	Args:
		content: Full text of the file.
		delimiter: Field separator, ``','`` or ``'\\t'``.

	Returns:
		One Node per data row, in file order.

	Raises:
		ValueError: if id, parent_id or a child id is not an integer.
		IndexError: if a data row has fewer than six columns.
	"""
	import ast
	import csv
	import io

	def parse_parent_id(cell):
		cell = cell.strip()
		# The csv module writes None as an empty cell.
		return None if cell in ('', 'None') else int(cell)

	def parse_children(cell):
		# Accept the bare "1,2" form as well as the "[1, 2]" list form.
		parts = cell.strip().strip('[]').split(',')
		return [int(part) for part in parts if part.strip()]

	def parse_attrs(cell):
		stripped = cell.strip()
		if not stripped:
			return {}
		try:
			# literal_eval only accepts Python literals; it never runs code.
			parsed = ast.literal_eval(stripped)
		except (ValueError, SyntaxError, TypeError):
			return cell
		return parsed if isinstance(parsed, dict) else cell

	nodes = []
	reader = csv.reader(io.StringIO(content, newline=''), delimiter=delimiter)
	header_possible = True
	for row in reader:
		if not any(cell.strip() for cell in row):
			continue
		if header_possible:
			# Only the first non-blank row can be the header.
			header_possible = False
			if [cell.strip() for cell in row] == _NODE_COLUMNS:
				continue
		nodes.append(Node(
			id=int(row[0]),
			tag_name=row[1],
			parent_id=parse_parent_id(row[2]),
			children=parse_children(row[3]),
			attrs=parse_attrs(row[4]),
			text=row[5]
		))
	return nodes


def convert_csv_to_dataclass(csv_content: str) -> list[Node]:
	"""Parse comma-separated node rows into Node objects.

	Args:
		csv_content: Full text of a CSV file with the columns id, tag_name,
			parent_id, children, attrs, text (header row optional).

	Returns:
		One Node per data row, in file order.
	"""
	return _parse_delimited_nodes(csv_content, ',')


def convert_tsv_to_dataclass(csv_content: str) -> list[Node]:
	"""Parse tab-separated node rows into Node objects.

	Args:
		csv_content: Full text of a TSV file with the columns id, tag_name,
			parent_id, children, attrs, text (header row optional).

	Returns:
		One Node per data row, in file order.
	"""
	return _parse_delimited_nodes(csv_content, '\t')

In [67]:
def convert_to_dataclass(file_path: str, file_extension: str) -> Optional[list[Node]]:
	"""Read a file and parse it with the converter matching its extension.

	Args:
		file_path: Path of the file to read.
		file_extension: Extension including the leading dot (for example
			``'.html'``); matched case-insensitively.

	Returns:
		The parsed nodes, or ``None`` (after printing a message) when the
		extension is not supported.
	"""
	# Match '.HTML' the same way as '.html'.
	extension = file_extension.lower()

	if extension not in SUPPORTED_FORMATS:
		print('unsupported file format')
		return None

	if extension == '.html':
		converter = convert_html_to_dataclass
	elif extension == '.csv':
		converter = convert_csv_to_dataclass
	elif extension == '.tsv':
		converter = convert_tsv_to_dataclass
	elif extension == '.json':
		converter = convert_json_to_dataclass
	elif extension == '.xml':
		converter = convert_xml_to_dataclass
	else:
		# Listed in SUPPORTED_FORMATS but no reader exists: report it the
		# same way instead of calling None.
		print('unsupported file format')
		return None

	# The exporter writes UTF-8, so read UTF-8 here instead of the platform's
	# locale encoding; 'utf-8-sig' also drops a leading byte-order mark.
	with open(file_path, 'r', encoding='utf-8-sig') as file:
		file_content = file.read()

	return converter(file_content)
	

In [68]:
import codecs


# Column order used for the CSV and TSV exports.
_NODE_FIELDS = ['id', 'tag_name', 'parent_id', 'children', 'attrs', 'text']

# HTML elements that take neither content nor a closing tag.
_HTML_VOID_TAGS = frozenset({
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
	'input', 'link', 'meta', 'source', 'track', 'wbr',
})

# HTML elements whose text is raw and must not be entity-escaped.
_HTML_RAW_TEXT_TAGS = frozenset({'script', 'style'})


def _attr_to_str(value) -> str:
	"""Flatten an attribute value to the single string used in markup."""
	if isinstance(value, list):
		# List values are joined with spaces.
		return ' '.join(str(item) for item in value)
	return value if isinstance(value, str) else str(value)


def _write_json(nodes, file) -> None:
	"""Write the nodes' attribute dicts as an indented UTF-8 JSON array."""
	import json
	json.dump([node.__dict__ for node in nodes], codecs.getwriter('utf-8')(file), indent=4, ensure_ascii=False)


def _write_delimited(nodes, file, delimiter) -> None:
	"""Write a header row plus one row per node using the given delimiter."""
	import csv
	writer = csv.writer(codecs.getwriter('utf-8')(file), delimiter=delimiter)
	writer.writerow(_NODE_FIELDS)
	for node in nodes:
		writer.writerow([node.id, node.tag_name, node.parent_id, node.children, node.attrs, node.text])


def _write_xml(nodes, file) -> None:
	"""Write the node tree as XML under a synthetic <root> element."""
	import xml.etree.ElementTree as ET

	def build(node):
		element = ET.Element(node.tag_name)
		for attr, value in node.attrs.items():
			# ElementTree can only serialise string attribute values.
			element.set(attr, _attr_to_str(value))
		if node.text:
			element.text = str(node.text)
		for child_id in node.children:
			# Child ids are used as positions in ``nodes``.
			element.append(build(nodes[child_id]))
		return element

	root = ET.Element('root')
	for node in nodes:
		if node.parent_id is None:
			root.append(build(node))

	ET.ElementTree(root).write(file, encoding='utf-8', xml_declaration=True)


def _write_html(nodes, file) -> None:
	"""Write the node tree as UTF-8 HTML markup."""
	from html import escape

	def render(node):
		tag = node.tag_name
		# Attribute values are escaped so quotes or '&' cannot break the tag.
		attrs = ''.join(
			f' {attr}="{escape(_attr_to_str(value), quote=True)}"'
			for attr, value in node.attrs.items()
		)
		text = str(node.text) if node.text else ''
		if tag.lower() not in _HTML_RAW_TEXT_TAGS:
			# Stored text is unescaped; writing '<' or '&' verbatim would
			# change the document when it is parsed again.
			text = escape(text, quote=False)
		if tag.lower() in _HTML_VOID_TAGS and not text and not node.children:
			# A closing tag such as </br> is invalid for void elements.
			return f'<{tag}{attrs}>'
		inner = text + ''.join(render(nodes[child_id]) for child_id in node.children)
		return f'<{tag}{attrs}>{inner}</{tag}>'

	markup = ''.join(render(node) for node in nodes if node.parent_id is None)
	file.write(markup.encode('utf-8'))


def convert_from_dataclass(nodes: list[Node], input_file_name: str):
	"""Export the nodes once per supported format.

	For every extension in SUPPORTED_FORMATS a file named
	``output/{input_file_name}{extension}`` is (re)created and written as
	UTF-8. Missing directories are created. An extension without a writer
	leaves an empty file.

	Args:
		nodes: Nodes to export; child ids are used as positions in this list.
		input_file_name: Name (or relative path) the output names are built
			from.
	"""
	writers = {
		'.json': _write_json,
		'.csv': lambda items, file: _write_delimited(items, file, ','),
		'.tsv': lambda items, file: _write_delimited(items, file, '\t'),
		'.xml': _write_xml,
		'.html': _write_html,
	}

	for extension in SUPPORTED_FORMATS:
		file_path = f'output/{input_file_name}{extension}'
		if os.path.exists(file_path):
			os.remove(file_path)

		os.makedirs(os.path.dirname(file_path), exist_ok=True)

		# Binary mode: ElementTree writes encoded bytes, and the text formats
		# wrap the handle in a UTF-8 stream writer.
		with open(file_path, 'wb') as file:
			writer = writers.get(extension)
			if writer is not None:
				writer(nodes, file)


In [71]:
parsed = convert_to_dataclass(file_path, file_extension)
# convert_to_dataclass prints a message and returns None for an unsupported
# extension; skip the export then instead of failing on a None node list.
if parsed is not None:
	convert_from_dataclass(parsed, file_path)