In [1]:
from langchain.document_loaders import TextLoader
import html2text
from bs4 import BeautifulSoup
import json
import re
from typing import List
from langchain.docstore.document import Document

In [36]:
def extract_links_to_metadata(docs: List["Document"]) -> List["Document"]:
    """Move markdown link targets out of each document's text into its metadata.

    For every document, each distinct target of a ``[text](target)`` link is
    stored under ``doc.metadata['links']['link<N>']`` and every ``](target)``
    in ``doc.page_content`` is rewritten to ``]($<N>)``. ``N`` is the 1-based
    position of the target's first occurrence among all matched links, so the
    numbering can contain gaps when a target is repeated.

    The documents are modified in place.

    Args:
        docs: Documents exposing ``page_content`` (str) and ``metadata`` (dict).

    Returns:
        The same list that was passed in.
    """
    for doc in docs:
        content = doc.page_content
        # Create the 'links' mapping on demand so documents built without it
        # do not raise KeyError.
        links_meta = doc.metadata.setdefault('links', {})
        placeholders = {}

        for i, link in enumerate(re.findall(r'\[.*?\]\((.*?)\)', content), start=1):
            # Skip repeated targets, and targets that are already placeholders
            # from an earlier pass: re-registering "$1" as a link would
            # overwrite the real URL stored in the metadata.
            if link in placeholders or re.fullmatch(r'\$\d+', link):
                continue
            placeholders[link] = f'${i}'
            links_meta[f'link{i}'] = link

        # Replace on the "](target)" form so only link targets are touched,
        # never the same text appearing elsewhere in the content.
        for link, placeholder in placeholders.items():
            content = content.replace(f']({link})', f']({placeholder})')

        doc.page_content = content

    return docs

In [41]:
# Load the HTML file (the page source was saved to disk beforehand).
loader = TextLoader("aidevs.html", encoding="utf-8")
html = loader.load()[0]

# Keep only the section we need.
soup = BeautifulSoup(html.page_content, 'html.parser')
authors = soup.select_one("#instructors")

# Convert to markdown for readability; a missing section yields empty input.
authors_html = str(authors) if authors else ""
markdown = html2text.html2text(authors_html)
with open("aidevs.md", 'w', encoding="utf-8") as file:
    file.write(markdown)

# Split on the ".jpeg)" marker and drop everything before its first occurrence.
chunks = markdown.split(".jpeg)")[1:]

docs = []
for chunk in chunks:
    # Author name: the text after "### ", possibly continuing on the next line.
    author_match = re.search(r'### (.*(?:\n.*)?) ', chunk)
    author = author_match.group(1).replace(' \n', '').strip() if author_match else ''

    # Build the metadata; 'links' is filled in by extract_links_to_metadata.
    metadata = {
        'source': 'aidevs',
        'section': 'instructors',
        'author': author,
        'links': {},
    }

    # Strip newlines and backslashes, then collapse whitespace runs.
    # re.sub is required here: str.replace would look for the literal text
    # "\s{2,}" and never match.
    flattened = re.sub(r'[\n\\]', '', chunk)
    doc = Document(
        page_content=re.sub(r'\s{2,}', ' ', flattened),
        metadata=metadata
    )
    docs.append(doc)

# Move link targets out of the text and into the metadata.
docs = extract_links_to_metadata(docs)

# Save to JSON, with the same explicit encoding as the other files.
with open('aidevs.json', 'w', encoding="utf-8") as f:
    json.dump([doc.dict() for doc in docs], f, indent=2)