In [None]:
domain = "docs.sillytavern.app"
docs_url = "https://docs.sillytavern.app/"
!wget -e robots=off --recursive --mirror --no-clobber --page-requisites --html-extension --convert-links --restrict-file-names=windows --domains {domain} --no-parent {docs_url}

In [None]:
%pip install -q langchain langchain_openai python-dotenv portkey-ai pyyaml lxml

In [None]:
import os
import dotenv
from langchain_openai import ChatOpenAI
from portkey_ai import PORTKEY_GATEWAY_URL, createHeaders

dotenv.load_dotenv()


def portkey_llm(model="openai/gpt-4o-mini", temperature=0.5):
    PORTKEY_API_KEY = os.getenv("PORTKEY_API_KEY")
    OPEN_ROUTER_API_KEY = os.getenv("OPEN_ROUTER_API_KEY")
    OPEN_ROUTER_URL = os.getenv("OPEN_ROUTER_URL")

    headers = createHeaders(provider="openrouter", api_key=PORTKEY_API_KEY)
    # base_url = "http://localhost:8787/v1"
    # base_url = OPEN_ROUTER_URL
    base_url = PORTKEY_GATEWAY_URL
    # print(headers)

    chat = ChatOpenAI(model=model,
                      api_key=OPEN_ROUTER_API_KEY,
                      base_url=base_url,
                      default_headers=headers,
                      temperature=temperature)
    return chat

In [None]:
chat = portkey_llm()
response = chat.invoke("什么是生命，请回复中文")
print(response)

In [None]:
import re
import yaml
from lxml import html
from lxml.etree import ElementTree

file_path = './docs.sillytavern.app/index.html'

with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# html_content = '''
# <html>
#   <body>
#     <div class="content">
#       <p id="para2">This is another paragraph.</p>
#     </div>
#     <div id="para3">abc<b>123</b></div>
#     <span>Some text here</span>
#     <!-- This is a comment and should be skipped -->
#   </body>
# </html>
# '''
    
# 解析 HTML 内容
tree = html.fromstring(content)

# 获取根元素
root = tree.getroottree()

# 存储元素信息的数组
elements_info = []
index = 0

# 遍历所有元素并获取信息
def get_element_info(element, root):
    global index
    # 检查节点是否为元素节点
    if isinstance(element, html.HtmlElement) and element.tag not in ('script', 'style', 'noscript', 'head', 'link'):
        # 仅处理具有文本内容且没有子元素的元素
        text = element.text or ''
        if re.search(r'\w', text):
            xpath = root.getpath(element)
            text = element.text.strip()
            attributes = element.attrib
            index += 1
            elements_info.append({
                'index': index,
                'xpath': xpath,
                'text': text,
                # 'attributes': attributes
            })
    
    # 递归遍历子元素
    for child in element:
        get_element_info(child, root)


# 获取所有元素的信息
get_element_info(tree, root)

# 只提取 index 和 text
filtered_texts = [{"index": item["index"], "text": item["text"]} for item in elements_info]

# 转换为 YAML 格式
yaml_data = yaml.dump(filtered_texts, allow_unicode=True)
print(yaml_data)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

system_template = """
You will be given a YAML formatted input containing entries with "id" and "{imt_source_field}" fields. 

For each entry in the YAML, translate the contents of the "{imt_source_field}" field into {to}. Write the translation back into the "{{imt_source_field}}" field for that entry.

Here is an example of the expected format:

<example>
Input:
  - id: 1
    {imt_source_field}: Source
Output:
  - id: 1
    {imt_trans_field}: Translation
</example>

Please return the translated YAML directly without wrapping <yaml> tag or include any additional information.
"""

user_template = """
Here is the input:

<yaml>
{{yaml}}
</yaml>
"""

prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", "{text}")]
)

parser = StrOutputParser()
chain = prompt_template | portkey_llm() | parser
response = chain.invoke({"text": yaml_data, "to": "chinese", "imt_source_field": "text", "imt_trans_field": "translation"})


In [None]:

translated_texts = yaml.load(response, Loader=yaml.FullLoader)
# 将 source_texts 和 translated_texts 通过index 合并
for source_text in elements_info:
    for translated_text in translated_texts:
        if source_text["index"] == translated_text["index"]:
            source_text["translated"] = translated_text["translation"]

print(elements_info[0])

In [None]:
elements_info

In [None]:
# 如何根据 xpath 在 elements_info 找到对应的标签，并将标签的 text 替换成translated

for element_info in elements_info:
    element = tree.xpath(element_info["xpath"])[0]
    element.text = element_info["translated"]

# 将修改后的 HTML 内容写回文件
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html.tostring(tree, pretty_print=True, encoding='unicode'))



In [81]:
import re
import yaml
from bs4 import BeautifulSoup
from lxml import etree

file_path = './docs.sillytavern.app/index.html'

with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

content = '''
<html>
  <body>
    <div class="content">
      <p id="para2">This is another paragraph.</p>
    </div>
    <div id="para3">abc<b>123</b></div>
    <span>Some text here</span>
    <!-- This is a comment and should be skipped -->
  </body>
</html>
'''

# 使用BeautifulSoup解析HTML内容
soup = BeautifulSoup(content, 'html.parser')

# 将BeautifulSoup对象转换为lxml对象
dom = etree.HTML(str(soup))

# 存储元素信息的数组
elements_info = []
index = 0

# 遍历所有元素并获取信息
def get_element_info(element, root):
    global index
    # 检查节点是否为元素节点
    if isinstance(element, etree._Element) and element.tag not in ('script', 'style', 'noscript', 'head', 'link'):
        # 仅处理具有文本内容且没有子元素的元素
        text = element.text or ''
        if re.search(r'\w', text):
            xpath = root.getpath(element)
            text = text.strip()
            index += 1
            elements_info.append({
                'index': index,
                'tab': element.tag,
                'xpath': xpath,
                'text': text,
            })
    
    # 递归遍历子元素
    for child in element:
        get_element_info(child, root)

# 获取根元素
root = dom.getroottree()

# 获取所有元素的信息
get_element_info(dom, root)

# # 只提取 index 和 text
# filtered_texts = [{"index": item["index"], "text": item["text"]} for item in elements_info]

# # 转换为 YAML 格式
# yaml_data = yaml.dump(filtered_texts, allow_unicode=True)
# print(yaml_data)
elements_info


[{'index': 1,
  'tab': 'p',
  'xpath': '/html/body/div[1]/p',
  'text': 'This is another paragraph.'},
 {'index': 2, 'tab': 'div', 'xpath': '/html/body/div[2]', 'text': 'abc'},
 {'index': 3, 'tab': 'b', 'xpath': '/html/body/div[2]/b', 'text': '123'},
 {'index': 4,
  'tab': 'span',
  'xpath': '/html/body/span',
  'text': 'Some text here'},
 {'index': 5,
  'tab': <cyfunction Comment at 0x109d9b5e0>,
  'xpath': '/html/body/comment()',
  'text': 'This is a comment and should be skipped'}]

In [82]:
import re
import yaml
from bs4 import BeautifulSoup

file_path = './docs.sillytavern.app/index.html'

with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

content = '''
<html>
  <body>
    <div class="content">
      <p id="para2">This is another paragraph.</p>
    </div>
    <div id="para3">abc<b>123</b></div>
    <span>Some text here</span>
    <!-- This is a comment and should be skipped -->
  </body>
</html>
'''

# 使用BeautifulSoup解析HTML内容
soup = BeautifulSoup(content, 'html.parser')

# 存储元素信息的数组
elements_info = []
index = 0

# 获取元素的 CSS 选择器
def get_css_selector(element):
    parts = []
    for parent in element.parents:
        siblings = parent.find_all(element.name, recursive=False)
        if len(siblings) == 1:
            parts.append(element.name)
        else:
            index = siblings.index(element) + 1
            parts.append(f'{element.name}:nth-of-type({index})')
        element = parent
    parts.reverse()
    return ' > '.join(parts)

# 遍历所有元素并获取信息
def get_element_info(element):
    global index
    if element.name and element.name not in ('script', 'style', 'noscript', 'head', 'link'):
        text = element.get_text(strip=True)
        if re.search(r'\w', text):
            css_selector = get_css_selector(element)
            text = text.strip()
            index += 1
            elements_info.append({
                'index': index,
                'tag': element.name,
                'css_selector': css_selector,
                'text': text,
            })
    
    # 递归遍历子元素
    for child in element.find_all(recursive=False):
        get_element_info(child)

# 获取所有元素的信息
get_element_info(soup)

# 打印所有元素信息，包括CSS选择器
for element_info in elements_info:
    print(element_info)

# 如果需要，可以将elements_info写入YAML文件
# yaml_data = yaml.dump(elements_info, allow_unicode=True)
# print(yaml_data)


{'index': 1, 'tag': '[document]', 'css_selector': '', 'text': 'This is another paragraph.abc123Some text here'}
{'index': 2, 'tag': 'html', 'css_selector': 'html', 'text': 'This is another paragraph.abc123Some text here'}
{'index': 3, 'tag': 'body', 'css_selector': 'html > body', 'text': 'This is another paragraph.abc123Some text here'}
{'index': 4, 'tag': 'div', 'css_selector': 'html > body > div:nth-of-type(1)', 'text': 'This is another paragraph.'}
{'index': 5, 'tag': 'p', 'css_selector': 'html > body > div:nth-of-type(1) > p', 'text': 'This is another paragraph.'}
{'index': 6, 'tag': 'div', 'css_selector': 'html > body > div:nth-of-type(2)', 'text': 'abc123'}
{'index': 7, 'tag': 'b', 'css_selector': 'html > body > div:nth-of-type(2) > b', 'text': '123'}
{'index': 8, 'tag': 'span', 'css_selector': 'html > body > span', 'text': 'Some text here'}


In [11]:
from bs4 import BeautifulSoup

# 示例 HTML 文本
html_doc = """
<html>
    <body>
        <div>
            <p>Some <b>bold</b> text.</p>
            <ul>
                <li>List item 1</li>
                <li>List item 2</li>
            </ul>
            <div>Another <i>italic</i> text.</div>
        </div>
    </body>
</html>
"""

def extract_elements_with_css_selectors(html):
    soup = BeautifulSoup(html, 'html.parser')
    elements = []

    def get_css_selector(element):
        path = []
        while element and element.name != '[document]':
            sibling = element.previous_sibling
            sibling_count = 0
            while sibling:
                if sibling.name == element.name:
                    sibling_count += 1
                sibling = sibling.previous_sibling
            if sibling_count:
                path.insert(0, f"{element.name}:nth-of-type({sibling_count + 1})")
            else:
                path.insert(0, element.name)
            element = element.parent
        return " > ".join(path)

    def contains_specified_tags(element):
        specified_tags = {'div', 'ul'}
        for child in element.descendants:
            if child.name in specified_tags:
                return True
        return False

    def traverse(node):
        for child in node.children:
            if child.name:  # 如果是标签
                if contains_specified_tags(child):  # 如果包含指定的标签
                    traverse(child)  # 递归遍历子元素
                else:
                    css_selector = get_css_selector(child)
                    elements.append((css_selector, child.name, ''.join(str(e) for e in child.contents)))
            elif child.string and child.string.strip():  # 如果是文本
                parent = node.parent if node.name == '[document]' else node
                css_selector = get_css_selector(parent)
                if parent.name not in ['div', 'ul']:  # 如果父节点不是 div 或 ul
                    elements.append((css_selector, parent.name, ''.join(str(e) for e in parent.contents)))
                else:  # 如果父节点是 div 或 ul，只添加文本内容
                    elements.append((css_selector, 'text', child.string.strip()))

    traverse(soup.body)
    return elements

elements = extract_elements_with_css_selectors(html_doc)
for element in elements:
    print(element)


('html > body > div > p', 'p', 'Some <b>bold</b> text.')
('html > body > div > ul', 'ul', '\n<li>List item 1</li>\n<li>List item 2</li>\n')
('html > body > div > div', 'div', 'Another <i>italic</i> text.')
