In [1]:
# | exporti
import os

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Set, Any, Union

import datetime as dt
import dateutil.parser as dateutil_parser

import urllib.parse as url_parse
from tqdm import tqdm

from bs4 import BeautifulSoup
import markdownify as md


from keybert import KeyBERT
from langchain.embeddings import HuggingFaceBgeEmbeddings

from langchain_openai import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain_core.documents.base import Document


from domolibrary_extensions.utils import detect_encoding, remove_query_params_from_url

In [2]:
# | hide
from pprint import pprint

In [3]:
# | export
class CustomConverter(md.MarkdownConverter):
    """custom converter for converting html into markdown
    has extensions for handling domo kb conversions
    """

    def convert_ul(self, el, text, convert_as_inline):
        res = "\n"
        classList = el.get("class")

        if classList and ("article-list" in classList or "section-list" in classList):
            res = "\n***\n"

        res = res + super().convert_list(el, text, convert_as_inline)
        return res

    def convert_div(self, el, text, convert_as_inline):
        classList = el.get("class")

        if classList and "mt-video-widget" in classList:
            # print(el)
            # custom transformation
            # unwrap child nodes of <a class="searched_found">
            text = ""
            for child in el.children:
                # print(child.get('src'))
                text += child.get("src")
            text = f'{"{{< video"} {text}{" >}}"}'
            print(text)
            return text

        # default transformation
        return super().convert_a(el, text, convert_as_inline)

In [30]:
def read_html_file(
    file_path, is_convert_to_soup: bool = True
) -> Union[str, BeautifulSoup]:
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    page_encoding = detect_encoding(file_path)

    with open(file_path, encoding=page_encoding["encoding"]) as fp:
        if is_convert_to_soup:
            return BeautifulSoup(fp, "lxml")

        return fp.read()

In [31]:
read_html_file("./SCRAPE/_s_article_360042923054/index.html", False)



In [4]:
@dataclass
class Article_Image:
    url: str
    relative_url: str
    name: str

    def __eq__(self, other):
        if not isinstance(other, Article_Image):
            return False
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)


@dataclass
class Article_Url:
    url: str
    relative_url: str
    inner_text: str

    def __eq__(self, other):
        if not isinstance(other, Article_Url):
            return False
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)


@dataclass
class Article:
    file_path: str

    url: str = None
    url_id: str = None
    base_url: str = "https://domo-support.domo.com"

    soup: BeautifulSoup = field(repr=False, default=None)

    urls: Set[str] = None
    images: Set[Article_Image] = None

    def __post_init__(self):
        self._read_html_file()
        self._get_images()
        self._get_urls()
        self._generate_url()

    def _generate_url(self):
        if not hasattr(self, "url_entity_prefix"):
            return
        if self.url_id and self.base_url and self.url_entity_prefix:
            self.url = url_parse.urljoin(
                url_parse.urljoin(self.base_url, self.url_entity_prefix), self.url_id
            )

    def _read_html_file(self):
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(self.file_path)

        page_encoding = detect_encoding(self.file_path)

        with open(self.file_path, encoding=page_encoding["encoding"]) as fp:
            self.soup = BeautifulSoup(fp, "lxml")

    @staticmethod
    def md_soup(soup, **options):
        """conerts soup to markdown text"""

        return CustomConverter(**options).convert_soup(soup)

    @staticmethod
    def _process_url_with_base_url(url, base_url):
        if not url:
            return None

        url = f"{base_url if (url.startswith('/') and base_url) else ''}{url}"

        if base_url and not url.startswith(base_url):
            return None

        return remove_query_params_from_url(url)

    def _get_images(
        self,
        soup=None,  # pass a soup to just exctract images from the selected content.  Default will exctract all images on the page
        base_url: str = None,  # pass to limit URLs to a specific base
    ):
        "extract image urls from soup"

        soup = soup or self.soup
        base_url = base_url or self.base_url

        self.images = set()

        for item in soup.find_all("img"):
            raw_url = item.get("src", False)
            url = self._process_url_with_base_url(raw_url, base_url)

            if not url:
                continue

            self.images.add(
                Article_Image(
                    url=url,
                    relative_url=raw_url,
                    name=item.get("alt", None),
                )
            )

        return self.images

    def _get_urls(self, soup=None, base_url=None, is_truncate=False):
        base_url = base_url or self.base_url
        soup = soup or self.soup

        self.urls = set()

        for soup_link in soup.find_all("a"):
            raw_url = soup_link.get("href")
            url = self._process_url_with_base_url(raw_url, base_url)

            if not url:
                continue

            url = "/".join(url.split("/")[:6])

            if url[-1] == "/":
                url = url[:-1]

            self.urls.add(
                Article_Url(
                    url=url,
                    relative_url=url.replace(base_url, ""),
                    inner_text=soup_link.text,
                )
            )
        return self.urls

    @classmethod
    def from_factory_path(cls, file_path, path_separator="/", url_separator="_"):
        url_id = file_path.split(path_separator)[-2].split(url_separator)[-1]

        return cls(file_path=file_path, url_id=url_id)

In [5]:
file_path = "./SCRAPE/_s_article_360042923054/index.html"

assert os.path.exists(file_path)

article = Article.from_factory_path(file_path=file_path)

pprint(article)

Article(file_path='./SCRAPE/_s_article_360042923054/index.html',
        url=None,
        url_id='360042923054',
        base_url='https://domo-support.domo.com',
        urls={Article_Url(url='https://domo-support.domo.com/s/knowledge-base',
                          relative_url='/s/knowledge-base',
                          inner_text='Knowledge BaseBrowse thousands of '
                                     'articles in Domo’s comprehensive KB.'),
              Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC',
                          relative_url='/s/topic/0TO5w000000ZamwGAC',
                          inner_text='Release NotesStay current with new '
                                     'product enhancements.'),
              Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC',
                          relative_url='/s/topic/0TO5w000000ZamzGAC',
                          inner_text='Transforming Data In Domo'),
              Artic

In [6]:
# | export
class ArticleKB_ProcessSoupError(Exception):
    def __init__(self, url, search_term):
        super().__init__(f"search term {search_term} does not exist in {url}")


@dataclass
class Article_KB(Article):
    article_id: str = None

    md_str: str = field(default=None, repr=False)

    title: str = None
    views: int = None
    created: dt.date = None
    last_updated: dt.date = None

    base_url: str = "https://domo-support.domo.com"
    url_entity_prefix: str = "/s/article/"

    def __post_init__(self):
        super().__post_init__()

        self._process_soup()

    def _process_soup(self, debug_prn: bool = False):
        soup = self.soup

        search_term = "slds-form-element"

        table = soup.find_all(class_=[search_term])

        if not table or table == []:
            raise ArticleKB_ProcessSoupError(url=self.url, search_term=search_term)

        tarticle = []
        for row in table:
            # print("❤️")

            cells = row.find(class_="slds-form-element__label")

            if list(cells.strings):
                content = row.find(class_="slds-form-element__control")
                tarticle.append((list(cells.strings)[0], content))

        kb_soup = dict(tarticle)

        self.title = self.md_soup(kb_soup.get("Title"))
        self.md_str = self.md_soup(kb_soup.get("Article Body"))
        self.article_id = self.md_soup(kb_soup.get("Article Number"))
        self.views = int(
            self.md_soup(kb_soup.get("Article Total View Count")).replace(",", "")
        )
        self.created = dateutil_parser.parse(
            self.md_soup(kb_soup.get("Article Created Date"))
        )

        self.last_updated = dateutil_parser.parse(
            self.md_soup(kb_soup.get("First Published Date"))
        )

        self._get_images(
            base_url="https://domo-support.domo.com/servlet/rtaImage",
        )

        return kb_soup

In [16]:
# !pip install --upgrade html2text
!pip install playwright

Collecting playwright
  Downloading playwright-1.41.2-py3-none-manylinux1_x86_64.whl.metadata (3.6 kB)
Collecting greenlet==3.0.3 (from playwright)
  Using cached greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting pyee==11.0.1 (from playwright)
  Downloading pyee-11.0.1-py3-none-any.whl.metadata (2.7 kB)
Downloading playwright-1.41.2-py3-none-manylinux1_x86_64.whl (37.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m206.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:05[0m
[?25hUsing cached greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (620 kB)
Downloading pyee-11.0.1-py3-none-any.whl (15 kB)
Installing collected packages: pyee, greenlet, playwright
  Attempting uninstall: greenlet
    Found existing installation: greenlet 3.0.1
    Uninstalling greenlet-3.0.1:
      Successfully uninstalled greenlet-3.0.1
Successfully installed greenlet-3.0.3 playwright-1.41.2 pyee-1

In [13]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.document_transformers import Html2TextTransformer

# Load HTML

file_path = "./SCRAPE/_s_article_360042923054/content.html"

test_article_kb = Article_KB.from_factory_path(file_path=file_path)

html2text = Html2TextTransformer()

html2text.transform_documents([Document(page_content=str(test_article_kb.soup))])

[Document(page_content='# Viewing DataFlow Details\n\n##\n\nOct 24, 2022•Knowledge\n\n### Information\n\nTitle\n\nViewing DataFlow Details\n\nArticle Body\n\nAll DataFlows have a Details view that provides basic information for the\nDataFlow. The view is divided into four tabs— **Settings** , **DataSets** ,\n**History** and **Versions**.  \n  \n  \n\nThe **Settings** tab lets you control scheduling for a DataFlow. When you\ncheck a box for a component DataSet, the DataFlow updates whenever that\nDataSet updates.\n\nThe **DataSets** tab shows you the input and output DataSets in this DataFlow.\nYou can click a DataSet to open the details view for that DataSet.\n\nThe **Lineage** tab displays the flow of data from each DataSet that has been\ncombined and/or transformed to create this DataFlow.\n\nThe **History** tab provides information about runs, such as the number of\nsuccessful and failed runs, the average number of successful runs, the run\nhistory, whether a run was executed manual

In [19]:
test_article_kb = Article_KB.from_factory_path(file_path=file_path)

def _get_content_path(file_path, alternative_file= 'content.html'):

    content_path = file_path.replace('index.html', alternative_file)

    if os.path.exists(content_path):
        file_path = content_path
    
    return content_path
    



def to_document( article_kb) :
    file_path = _get_content_path(article_kb.file_path)

    html = file_path
    
    html2text = Html2TextTransformer()
    
    return html2text.transform_documents([Document(page_content = )])
    

'./SCRAPE/_s_article_360042923054/content.html'

In [None]:
# !pip install --upgrade git+https://github.com/UKPLab/sentence-transformers
# !pip install --upgrade keybert ctransformers[cuda]
# !pip install --upgrade git+https://github.com/huggingface/transformers
# !pip install -U langchain-openai

# Base Embed Models

In [None]:
class Embedding(ABC):
    model_name: str
    embedding: Any = field(repr=False)

    @abstractmethod
    def to_json(self):
        pass


@dataclass
class Embedding_Vector(ABC):
    model_name: str
    embedding: List[float]

    def to_json(self):
        return {self.model_name: self.embedding}


@dataclass
class Embedding_Semantic(ABC):
    model_name: str
    embedding: List[str]

    def to_json(self):
        return {self.model_name: self.embedding}


class EmbedModel(ABC):
    model_name: str
    embed_model = None

    @abstractmethod
    def generate(text_str) -> Embedding:
        pass


@dataclass
class EmbedModels:
    models: List[EmbedModel]

    def embed_document(self, document: Document):
        return [model.generate(document.page_content) for model in self.models]

# Semantic Search Embeddings

In [None]:
# | export


@dataclass
class EmbedModel_BERT(EmbedModel):
    model_name = "key_bert"
    embed_model = KeyBERT()

    def generate(self, text_str) -> Embedding:
        embedding = self.embed_model.extract_keywords(text_str)

        return Embedding_Semantic(
            model_name=self.model_name,
            embedding=[embed[0] for embed in embedding],
        )

  return self.fget.__get__(instance, owner)()


In [None]:
embed_model = EmbedModel_BERT()
embed_model.generate(text_str=test_article_kb.md_str).embedding

['dataflows',
 'dataflow',
 'dataflow_details_update',
 'data_center_dataflows_icon',
 'dataflow_failed']

In [None]:
# | export

DEFAULT_EXTRACTOR_PROMPT = """"
I have the following document:
[DOCUMENT]

Based on the information above, extract the keywords that best describe the topic of the text.
Use the following format separated by commas:
<keywords>
"""


@dataclass
class EmbedModel_LLM_Keyword(EmbedModel):
    model_name: str

    llm: ChatOpenAI  # chat model
    embed_model: BaseCombineDocumentsChain  # q and a chain

    prompt: str = DEFAULT_EXTRACTOR_PROMPT

    @classmethod
    def from_llm(cls, llm, prompt=None):
        return cls(
            model_name=getattr(llm, "model", "llm"),
            embed_model=load_qa_chain(llm=llm, chain_type="stuff"),
            llm=llm,
            prompt=prompt or cls.prompt,
        )

    @staticmethod
    def process_candidate_keywords(documents, candidate_keywords):
        """Create a common format for candidate keywords."""
        if candidate_keywords is None:
            candidate_keywords = [None for _ in documents]
        elif isinstance(candidate_keywords[0][0], str) and not isinstance(
            candidate_keywords[0], list
        ):
            candidate_keywords = [[keyword for keyword, _ in candidate_keywords]]
        elif isinstance(candidate_keywords[0][0], tuple):
            candidate_keywords = [
                [keyword for keyword, _ in keywords] for keywords in candidate_keywords
            ]
        return candidate_keywords

    def generate(
        self, documents: List[str], candidate_keywords: List[List[str]] = None
    ):
        """Extract topics

        Arguments:
            documents: The documents to extract keywords from
            candidate_keywords: A list of candidate keywords that the LLM will fine-tune
                        For example, it will create a nicer representation of
                        the candidate keywords, remove redundant keywords, or
                        shorten them depending on the input prompt.

        Returns:
            all_keywords: All keywords for each document
        """
        if isinstance(documents, str):
            documents = [documents]

        all_keywords = []
        candidate_keywords = self.process_candidate_keywords(
            documents, candidate_keywords
        )

        for document, candidates in tqdm(zip(documents, candidate_keywords)):
            prompt = self.prompt.replace("[DOCUMENT]", document)

            if candidates is not None:
                prompt = prompt.replace("[CANDIDATES]", ", ".join(candidates))

            input_document = Document(page_content=document)

            keywords = self.embed_model.run(
                input_documents=[input_document], question=prompt
            ).strip()

            keywords = [keyword.strip() for keyword in keywords.split(",")]
            all_keywords.append(keywords)

        return Embedding_Semantic(
            model_name=self.model_name,
            embedding=[word for word_ls in all_keywords for word in word_ls],
        )

In [None]:
IP_ADDRESS = "192.168.1.47"

# Create your LLM
llm = ChatOpenAI(
    openai_api_base=f"http://{IP_ADDRESS}:1234/v1/",
    openai_api_key="not-needed",
    model="local_model",
    temperature=0,
)

embed_model = EmbedModel_LLM_Keyword.from_llm(llm=llm)

embed_model.generate(test_article_kb.md_str[0:500])

  warn_deprecated(
1it [00:00,  1.17it/s]


Embedding_Semantic(model_name='llm', embedding=['DataFlows', 'Details view', 'Settings tab', 'Scheduling', 'Component DataSets', 'Updates', 'History tab', 'Versions tab'])

# Vector Embeddings

In [None]:
@dataclass
class EmbedModel_Vector(EmbedModel):
    embed_model: Any

    model_name: str = "BAAI/bge-large-en-v1.5"

    def __post_init__(self):
        self.embed_model = self.embed_model or self._generate_default_bge()

    def _generate_default_bge(self) -> HuggingFaceBgeEmbeddings:
        return HuggingFaceBgeEmbeddings(
            model_name=self.model_name,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},
            query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:",
        )

    def generate(self, text_str) -> Embedding:
        return Embedding_Vector(
            model_name=self.model_name,
            embedding=self.embed_model.embed_query(text_str),
        )

In [None]:
embed_model = EmbedModel_Vector()
embed_model.generate(text_str=test_article_kb.md_str)

Embedding_Vector(model_name='BAAI/bge-large-en-v1.5', embedding=[0.013312671333551407, -0.016575293615460396, -0.007060738280415535, 0.013203060254454613, 0.0064755086787045, -0.02625228464603424, -0.018036076799035072, -0.0185143630951643, 0.03139659762382507, 0.034727998077869415, -0.017303980886936188, -0.006018421147018671, 0.04716987907886505, 0.00042789289727807045, -0.027312355116009712, -0.003358651651069522, -0.01903102546930313, 0.008929788134992123, -0.08264664560556412, -0.011443071998655796, -0.006540792994201183, 0.01401978824287653, -0.032939571887254715, -0.006329911760985851, -0.010003392584621906, 0.024286111816763878, 0.007584300357848406, 0.0046102190390229225, 0.07873106747865677, 0.026844600215554237, -0.05086640268564224, -0.018811525776982307, -0.003930436912924051, -0.012134368531405926, 0.008047644048929214, 0.03136052191257477, 0.035953275859355927, 0.008619380183517933, -0.04196415841579437, -0.07135218381881714, -0.033480383455753326, -0.0009060698794201016

In [None]:
IP_ADDRESS = "192.168.1.47"

# Create your LLM
llm = ChatOpenAI(
    openai_api_base=f"http://{IP_ADDRESS}:1234/v1/",
    openai_api_key="not-needed",
    model="local_model",
    temperature=0,
)

embed_models = EmbedModels(
    models=[
        EmbedModel_Vector(),
        EmbedModel_BERT(),
        EmbedModel_LLM_Keyword.from_llm(llm=llm),
    ]
)

embed_models.embed_document(Document(page_content=test_article_kb.md_str[0:500]))

1it [00:00,  1.06it/s]


[Embedding_Vector(model_name='BAAI/bge-large-en-v1.5', embedding=[0.04083440080285072, -0.014177300035953522, -0.016509387642145157, 0.025863030925393105, -0.007573246955871582, -0.027903538197278976, -0.034056417644023895, -0.007666123565286398, -0.0005695745348930359, 0.03668223321437836, -0.004378751385957003, -0.007205614820122719, 0.043338268995285034, 0.010865678079426289, -0.01913772150874138, -0.002682173391804099, -0.024772513657808304, -0.002878405386582017, -0.06642758846282959, 0.002663470571860671, -0.02639744244515896, 0.026730041950941086, -0.019090425223112106, -0.0237107016146183, -0.017907805740833282, 0.04793381690979004, -0.014039643108844757, 0.016547689214348793, 0.07427317649126053, 0.047829654067754745, -0.03408940136432648, -0.00923977605998516, 0.006629142910242081, 0.01814369671046734, -0.01415366493165493, 0.05060886964201927, 0.05027303472161293, 0.0014901588438078761, -0.04533084109425545, -0.05859171971678734, -0.029387427493929863, -0.025959627702832222,

# Embedding Handler

In [None]:
from langchain.text_splitter import MarkdownTextSplitter
from typing import Any


@dataclass
class ChildDocument:
    document: Document
    id: int
    embed_models: EmbedModels
    embeddings: List[any] = None

    def __post_init__(self):
        self.embeddings = self.embed_models.embed_document(self.document)


@dataclass
class ParentDocument:
    document: Document
    id: int
    embed_models: EmbedModels
    child_splitter: MarkdownTextSplitter

    children: list[ChildDocument] = None
    embeddings: List[any] = None

    def __post_init__(self):
        self.embeddings = self.embed_models.embed_document(self.document)

        children = self.child_splitter.split_documents([self.document])

        self.children = [
            ChildDocument(
                child_doc,
                embed_models=self.embed_models,
                id=f"{self.id}-{index}",
            )
            for index, child_doc in enumerate(children)
        ]

    @classmethod
    def from_document(
        cls,
        document,
        id,
        embed_models: EmbedModels,
        child_splitter: MarkdownTextSplitter,
    ):
        return cls(
            document=document,
            id=id,
            embed_models=embed_models,
            child_splitter=child_splitter,
        )


@dataclass
class MasterDocument:
    document: Document
    embed_models: EmbedModels
    parent_splitter: MarkdownTextSplitter = field(repr=False)
    child_splitter: MarkdownTextSplitter = field(repr=False)

    parent_documents: List[Document] = None

    def __post_init__(self):
        parent_documents = self.parent_splitter.split_documents([self.document])

        self.parent_documents = [
            ParentDocument.from_document(
                document=document,
                id=index,
                embed_models=self.embed_models,
                child_splitter=self.child_splitter,
            )
            for index, document in enumerate(parent_documents)
        ]

    @classmethod
    def from_text(
        cls,
        text_str,
        embed_models: EmbedModels,
        parent_chunk_size=1000,
        child_chunk_size=200,
        chunk_overlap=50,
    ):
        return cls(
            document=Document(page_content=text_str),
            embed_models=embed_models,
            parent_splitter=MarkdownTextSplitter(
                chunk_size=parent_chunk_size, chunk_overlap=chunk_overlap
            ),
            child_splitter=MarkdownTextSplitter(
                chunk_size=child_chunk_size, chunk_overlap=chunk_overlap
            ),
        )

In [None]:
master_document = MasterDocument.from_text(
    text_str=test_article_kb.md_str, embed_models=embed_models
)

pprint(master_document.parent_documents[0])

1it [00:01,  1.78s/it]
1it [00:00,  1.49it/s]
1it [00:00,  1.50it/s]
1it [00:00,  1.35it/s]
1it [00:00,  1.91it/s]
1it [00:00,  1.50it/s]
1it [00:01,  1.55s/it]
1it [00:00,  1.45it/s]
1it [00:00,  2.08it/s]
1it [00:00,  3.05it/s]
1it [00:00,  1.76it/s]
1it [00:00,  2.46it/s]
1it [00:00,  1.64it/s]
1it [00:01,  1.43s/it]
1it [00:00,  1.99it/s]
1it [00:00,  3.06it/s]
1it [00:00,  1.54it/s]
1it [00:00,  2.45it/s]
1it [00:00,  3.89it/s]
1it [00:00,  1.92it/s]
1it [00:00,  1.02it/s]
1it [00:00,  2.34it/s]
1it [00:00,  1.92it/s]
1it [00:00,  2.57it/s]
1it [00:00,  2.33it/s]
1it [00:00,  3.00it/s]
1it [00:00,  2.17it/s]
1it [00:00,  1.44it/s]
1it [00:01,  1.57s/it]
1it [00:00,  2.19it/s]
1it [00:00,  2.67it/s]
1it [00:00,  1.86it/s]
1it [00:00,  2.43it/s]
1it [00:00,  1.79it/s]

ParentDocument(document=Document(page_content='All DataFlows have a Details view that provides basic information for the DataFlow. The view is divided into four tabs—**Settings**, **DataSets**, **History** and **Versions**.  \n  \n![dataflow_details_update.png](/servlet/rtaImage?eid=ka05w00000124Tn&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOsT)  \n\xa0\n\n\nThe **Settings** tab lets you control scheduling for a DataFlow. When you check a box for a component DataSet, the DataFlow updates whenever that DataSet updates.\n\n\nThe **DataSets** tab shows you the input and output DataSets in this DataFlow. You can click a DataSet to open the details view for that DataSet.\xa0\n\n\nThe **Lineage**\xa0tab displays the flow of data from each DataSet that has been combined and/or transformed to create this DataFlow.'),
               id=0,
               embed_models=EmbedModels(models=[EmbedModel_Vector(embed_model=HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_l




In [None]:
# import socket
# def get_ip_address():
#     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
#     s.connect(("8.8.8.8", 80))
#     return s.getsockname()

# get_ip_address()

In [None]:
from langchain_core.documents.base import Document

class 
def DomoKBLoader(file_path):
    article = Article_KB.from_factory_path(file_path)

    keywords = kw_model.extract_keywords(article.md_str)

    metadata = {
        "title": article.title,
        "views": article.views,
        "created": article.created.strftime('%Y-%m-%d'),
        "last_updated": article.last_updated.strftime('%Y-%m-%d'),
        "url" : article.url,
        "id" : article.article_id,
        "related_urls": [url.url for url in article.urls],
        "keywords": keywords
    }

    return Document(page_content=article.md_str, metadata=metadata)


document = DomoKBLoader(file_path)
document.metadata

SyntaxError: invalid syntax (1849403605.py, line 3)