In [1]:
#| exporti
import os

from dataclasses import dataclass, field
from typing import List, Set
import datetime as dt
import dateutil.parser as dateutil_parser
import urllib.parse as url_parse
from tqdm import tqdm

from bs4 import BeautifulSoup
import markdownify as md

from keybert import KeyBERT


from langchain_openai import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain_core.documents.base import Document



from gdoc_sync.utils import detect_encoding, remove_query_params_from_url


In [2]:
#| hide
from pprint import pprint

In [3]:
#| export 
class CustomConverter(md.MarkdownConverter):
    """custom converter for converting html into markdown
    has extensions for handling domo kb conversions
    """
    def convert_ul(self, el, text, convert_as_inline):
        res = '\n'
        classList = el.get("class")

        if classList and ("article-list" in classList or "section-list" in classList):
            res = '\n***\n'

        res = res + super().convert_list(el, text, convert_as_inline)
        return res

    def convert_div(self, el, text, convert_as_inline):
        classList = el.get("class")
        
        if classList and "mt-video-widget" in classList:
            # print(el)
            # custom transformation
            # unwrap child nodes of <a class="searched_found">
            text = ""
            for child in el.children:
                # print(child.get('src'))
                text += child.get('src')
            text = f'{"{{< video"} {text}{" >}}"}'
            print(text)
            return text

        # default transformation
        return super().convert_a(el, text, convert_as_inline)

In [4]:
@dataclass
class Article_Image:
    url: str
    relative_url: str
    name: str

    def __eq__(self, other):
        if not isinstance(other, Article_Image):
            return False
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)


@dataclass
class Article_Url:
    url: str
    relative_url: str
    inner_text: str

    def __eq__(self, other):
        if not isinstance(other, Article_Url):
            return False
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)


@dataclass
class Article:
    file_path: str

    url: str = None
    url_id: str = None
    base_url: str = 'https://domo-support.domo.com'

    soup: BeautifulSoup = field(repr=False, default=None)

    urls: Set[str] = None
    images: Set[Article_Image] = None

    def __post_init__(self):
        self._read_html_file()
        self._get_images()
        self._get_urls()
        self._generate_url()


    def _generate_url(self):
        if not hasattr(self, 'url_entity_prefix'):
            return
        if self.url_id and self.base_url and self.url_entity_prefix:
            self.url = url_parse.urljoin(url_parse.urljoin(self.base_url, self.url_entity_prefix), self.url_id)


    def _read_html_file(self):
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(self.file_path)

        page_encoding = detect_encoding(self.file_path)

        with open(self.file_path, encoding=page_encoding["encoding"]) as fp:
            self.soup = BeautifulSoup(fp, "lxml")

    @staticmethod
    def md_soup(soup, **options):
        """conerts soup to markdown text"""

        return CustomConverter(**options).convert_soup(soup)

    @staticmethod
    def _process_url_with_base_url(url, base_url):

        if not url:
            return None
        
        url = f"{base_url if (url.startswith('/') and base_url) else ''}{url}"

        if base_url and not url.startswith(base_url):
            return None

        return remove_query_params_from_url(url)

    def _get_images(
        self,
        soup=None,  # pass a soup to just exctract images from the selected content.  Default will exctract all images on the page
        base_url: str = None,  # pass to limit URLs to a specific base
    ):
        "extract image urls from soup"

        soup = soup or self.soup
        base_url = base_url or self.base_url

        self.images = set()

        for item in soup.find_all("img"):
            raw_url = item.get("src", False)
            url = self._process_url_with_base_url(raw_url, base_url)

            if not url:
                continue

            self.images.add(
                Article_Image(
                    url=url,
                    relative_url=raw_url,
                    name=item.get("alt", None),
                )
            )

        return self.images

    def _get_urls(self, soup=None, base_url=None, is_truncate = False):
        base_url = base_url or self.base_url
        soup = soup or self.soup

        self.urls = set()

        for soup_link in soup.find_all("a"):
            raw_url = soup_link.get("href")
            url = self._process_url_with_base_url(raw_url, base_url)
            
            if not url:
                continue

            url = "/".join(url.split("/")[:6])

            if url[-1] == '/' : 
                url = url[:-1]
   
            self.urls.add(
                Article_Url(url=url, relative_url=url.replace(base_url, ''), inner_text=soup_link.text)
            )
        return self.urls
    
    @classmethod
    def from_factory_path(cls, file_path, path_separator= "/", url_separator= "_"):         
        url_id = file_path.split(path_separator)[-2].split(url_separator)[-1]
        
        return cls(
            file_path = file_path,
            url_id = url_id)
    
    

In [5]:
file_path = './SCRAPE/_s_article_360042923054/index.html'

assert os.path.exists(file_path)

article = Article.from_factory_path(file_path= file_path)

pprint(article)

Article(file_path='./SCRAPE/_s_article_360042923054/index.html',
        url=None,
        url_id='360042923054',
        base_url='https://domo-support.domo.com',
        urls={Article_Url(url='https://domo-support.domo.com/s/knowledge-base',
                          relative_url='/s/knowledge-base',
                          inner_text='Knowledge BaseBrowse thousands of '
                                     'articles in Domo’s comprehensive KB.'),
              Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC',
                          relative_url='/s/topic/0TO5w000000ZamwGAC',
                          inner_text='Release NotesStay current with new '
                                     'product enhancements.'),
              Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC',
                          relative_url='/s/topic/0TO5w000000ZamzGAC',
                          inner_text='Transforming Data In Domo'),
              Artic

In [6]:
# | export
class ArticleKB_ProcessSoupError(Exception):
    def __init__(self, url, search_term):
        super().__init__(f"search term {search_term} does not exist in {url}")
        
@dataclass
class Article_KB(Article):
    article_id: str = None
    
    md_str: str = field(default=None, repr=False)
    
    title: str = None
    views: int = None
    created: dt.date = None
    last_updated: dt.date = None

    base_url:str  = "https://domo-support.domo.com"
    url_entity_prefix : str="/s/article/"

    def __post_init__(self):
        super().__post_init__()
        
        self._process_soup()


    def _process_soup(self, debug_prn: bool = False):
        soup = self.soup
        
        search_term = "slds-form-element"

        table = soup.find_all(class_=[search_term])

        if not table or table == []:
            raise ArticleKB_ProcessSoupError(
                url=self.url, search_term=search_term)

        tarticle = []
        for row in table:
            # print("❤️")

            cells = row.find(class_="slds-form-element__label")

            if list(cells.strings):
                content = row.find(class_="slds-form-element__control")
                tarticle.append((list(cells.strings)[0], content))

        kb_soup = dict(tarticle)

        self.title = self.md_soup(kb_soup.get("Title"))
        self.md_str = self.md_soup(kb_soup.get("Article Body"))
        self.article_id = self.md_soup(kb_soup.get("Article Number"))
        self.views = int(self.md_soup(kb_soup.get("Article Total View Count")).replace(',', ''))
        self.created = dateutil_parser.parse(self.md_soup(
            kb_soup.get("Article Created Date")))

        self.last_updated = dateutil_parser.parse(
            self.md_soup(kb_soup.get("First Published Date"))
        )

        self._get_images(
            base_url="https://domo-support.domo.com/servlet/rtaImage",
        )

        return kb_soup
    


In [7]:
test_article_kb = Article_KB.from_factory_path(file_path=file_path)

pprint(test_article_kb)

Article_KB(file_path='./SCRAPE/_s_article_360042923054/index.html',
           url='https://domo-support.domo.com/s/article/360042923054',
           url_id='360042923054',
           base_url='https://domo-support.domo.com',
           urls={Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZanUGAS',
                             relative_url='/s/topic/0TO5w000000ZanUGAS',
                             inner_text='DataFlow Management'),
                 Article_Url(url='https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC',
                             relative_url='/s/topic/0TO5w000000ZamzGAC',
                             inner_text='Transforming Data In Domo'),
                 Article_Url(url='https://domo-support.domo.com/s/article/360043429933',
                             relative_url='/s/article/360043429933',
                             inner_text='Beast Mode Functions Reference Guide'),
                 Article_Url(url='https://domo-support.domo.com/s/art

In [8]:
# !pip install --upgrade git+https://github.com/UKPLab/sentence-transformers
# !pip install keybert ctransformers[cuda]
# !pip install --upgrade git+https://github.com/huggingface/transformers
# !pip install -U langchain-openai

In [19]:
#| export
def get_keywords(doc):
      "crude method using BERT to extract keywords from a body of text"
      
      kw_model = KeyBERT()
      return kw_model.extract_keywords(doc)

In [10]:
get_keywords(test_article_kb.md_str)

  return self.fget.__get__(instance, owner)()


[('dataflows', 0.6305),
 ('dataflow', 0.6222),
 ('dataflow_details_update', 0.5574),
 ('data_center_dataflows_icon', 0.4504),
 ('dataflow_failed', 0.3781)]

In [32]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from abc import ABC, abstractmethod


class EmbedModel(ABC):

    @abstractmethod
    def generate(text_str):
        pass


class EmbedModel_Keyword(EmbedModel):
    model_name: str
    embedding_model = None

    def __init__(self, model_name="BAAI/bge-large-en-v1.5"):
        self.model_name = model_name
        self._generate_vector_embedding_model(
            model_name=self.model_name
        )

        super().__init__(self)

    def _generate_vector_embedding_model(self, model_name="BAAI/bge-large-en-v1.5"):
        model_kwargs = {"device": "cuda"}
        encode_kwargs = {
            "normalize_embeddings": True
        }  # set True to compute cosine similarity

        self.embedding_model =  HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
            query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:",
        )

        return self.embedding_model

    def generate(self, text_str):
        return self.embedding_model.embed_query(text_str)

In [30]:
embedding_model = EmbedModel_Keyword()

embedding = embedding_model.generate("hi this is harrison")
embedding[0:5]

TypeError: EmbedModel_Keyword._generate_vector_embedding_model() got multiple values for argument 'model_name'

In [13]:
# Assuming TokenTextSplitter is defined in a module named 'module_name'
from langchain.text_splitter import MarkdownTextSplitter


def handle_embed_document(
    document: Document,
    child_splitter: MarkdownTextSplitter,
    parent_id: int,
    embedding_model=None,
):
    embedding_model = embedding_model or generate_embedding_model()
    child_documents = child_splitter.split_documents([document])
    return {
        "parent_text": document.page_content,
        "parent_id": parent_id,
        "parent_embedding": embedding_model.embed_query(document.page_content),
        "children": [
            {
                "text": c.page_content,
                "id": f"{parent_id}-{child_index}",
                "embedding": embedding_model.embed_query(c.page_content),
            }
            for child_index, c in enumerate(child_documents)
        ],
    }


def tokenize_document(text_str):
    documents = [Document(page_content=text_str)]

    # Ingest Parent-Child node pairs
    parent_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=48)
    child_splitter = MarkdownTextSplitter(chunk_size=200, chunk_overlap=48)

    parent_documents = parent_splitter.split_documents(documents)

    # TODO implement a cacheing method

    return [handle_embed_document(
            document=parent, parent_id=index, child_splitter=child_splitter
        ) for index, parent in enumerate(parent_documents)]

In [14]:
tokenize_document(test_article_kb.md_str)

[{'parent_text': 'All DataFlows have a Details view that provides basic information for the DataFlow. The view is divided into four tabs—**Settings**, **DataSets**, **History** and **Versions**.  \n  \n![dataflow_details_update.png](/servlet/rtaImage?eid=ka05w00000124Tn&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOsT)  \n\xa0\n\n\nThe **Settings** tab lets you control scheduling for a DataFlow. When you check a box for a component DataSet, the DataFlow updates whenever that DataSet updates.\n\n\nThe **DataSets** tab shows you the input and output DataSets in this DataFlow. You can click a DataSet to open the details view for that DataSet.\xa0\n\n\nThe **Lineage**\xa0tab displays the flow of data from each DataSet that has been combined and/or transformed to create this DataFlow.',
  'parent_id': 0,
  'parent_embedding': [0.025597775354981422,
   0.0040980177000164986,
   -0.006029421463608742,
   0.013953696936368942,
   -0.0041069104336202145,
   -0.02132282219827175,
   -0.03482699394226

In [22]:
# | export

DEFAULT_EXTRACTOR_PROMPT = """"
I have the following document:
[DOCUMENT]

Based on the information above, extract the keywords that best describe the topic of the text.
Use the following format separated by commas:
<keywords>
"""


class KeywordExtractor:
    llm: ChatOpenAI  # chat model

    chain: BaseCombineDocumentsChain = None  # q and a chain
    prompt = DEFAULT_EXTRACTOR_PROMPT

    def __init__(self , llm):
        self.llm = llm
        self.chain = load_qa_chain(llm=self.llm, chain_type="stuff")

    @staticmethod
    def process_candidate_keywords(documents, candidate_keywords):
        """Create a common format for candidate keywords."""
        if candidate_keywords is None:
            candidate_keywords = [None for _ in documents]
        elif isinstance(candidate_keywords[0][0], str) and not isinstance(
            candidate_keywords[0], list
        ):
            candidate_keywords = [[keyword for keyword, _ in candidate_keywords]]
        elif isinstance(candidate_keywords[0][0], tuple):
            candidate_keywords = [
                [keyword for keyword, _ in keywords] for keywords in candidate_keywords
            ]
        return candidate_keywords

    def extract_keywords(
        self, documents: List[str], candidate_keywords: List[List[str]] = None
    ):
        """Extract topics

        Arguments:
            documents: The documents to extract keywords from
            candidate_keywords: A list of candidate keywords that the LLM will fine-tune
                        For example, it will create a nicer representation of
                        the candidate keywords, remove redundant keywords, or
                        shorten them depending on the input prompt.

        Returns:
            all_keywords: All keywords for each document
        """
        all_keywords = []
        candidate_keywords = self.process_candidate_keywords(
            documents, candidate_keywords
        )

        for document, candidates in tqdm(zip(documents, candidate_keywords)):

            prompt = self.prompt.replace("[DOCUMENT]", document)

            if candidates is not None:
                prompt = prompt.replace("[CANDIDATES]", ", ".join(candidates))

            input_document = Document(page_content=document)

            keywords = self.chain.run(
                input_documents=[input_document], question=prompt
            ).strip()

            keywords = [keyword.strip() for keyword in keywords.split(",")]
            all_keywords.append(keywords)

        return all_keywords

In [24]:
IP_ADDRESS = '192.168.57.100'

# Create your LLM
llm = ChatOpenAI(
    openai_api_base=f"http://{IP_ADDRESS}:1234/v1/",
    openai_api_key="not-needed",
    model="local_model",
    temperature=0,
)

kw_extractor = KeywordExtractor(llm = llm)
kw_extractor.extract_keywords(documents = [test_article_kb.md_str[0:500]])

1it [00:01,  1.24s/it]


[['DataFlows',
  'Details view',
  'Settings tab',
  'Scheduling',
  'Component DataSets',
  'Updates',
  'History tab',
  'Versions tab']]

In [None]:
from langchain_core.documents.base import Document

class 
def DomoKBLoader(file_path):
    article = Article_KB.from_factory_path(file_path)

    keywords = kw_model.extract_keywords(article.md_str)

    metadata = {
        "title": article.title,
        "views": article.views,
        "created": article.created.strftime('%Y-%m-%d'),
        "last_updated": article.last_updated.strftime('%Y-%m-%d'),
        "url" : article.url,
        "id" : article.article_id,
        "related_urls": [url.url for url in article.urls],
        "keywords": keywords
    }

    return Document(page_content=article.md_str, metadata=metadata)


document = DomoKBLoader(file_path)
document.metadata

  warn_deprecated(


AttributeError: 'tuple' object has no attribute 'page_content'