In [24]:
import sys
sys.setrecursionlimit(1000) # set to python default

In [25]:
import re
from typing import List, Optional

from markdownify import markdownify
from markdown import markdownFromFile, markdown
from bs4 import BeautifulSoup
from bs4.element import Tag

class TreeOfContents:
    """Tree abstraction for markdown source"""

    source_type = BeautifulSoup
    ## Attributes
    valid_tags = ('a', 'abbr', 'address', 'area', 'article', 'aside', 'audio',
        'b', 'base', 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup', 'data',
        'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div', 'dl', 'dt',
        'em', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr',
        'html', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'keygen', 'label',
        'legend', 'li', 'link', 'main', 'map', 'mark', 'menu', 'menuitem',
        'meta', 'meter', 'nav', 'noscript', 'object', 'ol', 'optgroup',
        'option', 'output', 'p', 'param', 'picture', 'pre', 'progress', 'q',
        'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select', 'small',
        'source', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'table',
        'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time',
        'title', 'tr', 'track', 'u', 'ul', 'var', 'video', 'wbr')
    allowed_attrs = ('string', 'name')
    header_name_pattern = r"^h([1-6])"
    max_header_level = 7
    ## md->html
    default_md_extensions = ['markdown.extensions.fenced_code','markdown.extensions.tables']
     
    def __init__(
        self,
        source: Tag,
        branches=(),
        children_tags: List[Tag]=(),
        depth: Optional[int]=None
    ):
        """
        Construct TreeOfContents object using source

        source (Tag): parsed source
        :param list TreeOfContents branches: list of direct children
        :param list SourceType descendants: all descendants
        """
        if source is None:
            raise ValueError('NoneType source passed into TreeOfContents')
        self.source = source
        self.depth = depth
        # self.depth = depth or self.parseTopDepth()
        # MODIFIED - make branches with source.children & expand descendants later
        # self.descendants = descendants or list(source.descendants)
        self.branches: List["TreeOfContents"] = branches or self.parseBranches(children_tags)
        self.descendants: List["TreeOfContents"] = self.expandDescendants()

    @classmethod
    def getHeadingLevel(cls, bs) -> Optional[int]:
        """
        >>> bsify = lambda html: BeautifulSoup(html, 'html.parser')
        >>> bs = bsify('<h1>Hello</h1>').h1
        >>> TOC.getHeadingLevel(bs)
        1
        >>> bs2 = bsify('<p>Hello</p>').p
        >>> TOC.getHeadingLevel(bs2)

        >>> bs3 = bsify('<article>Hello</article>').article
        >>> TOC.getHeadingLevel(bs3)

        """
        # MODIFIED - get header_num by pattern
        header_match = re.search(cls.header_name_pattern, bs.name)
        if header_match:
            try:
                header_num = int(header_match.group(1))
            except (ValueError, IndexError, TypeError):
                return None
            return header_num
        else:
            return None
        # try:
        #     return int(bs.name[1])
        # except (ValueError, IndexError, TypeError):
        #     return None

    def parseTopDepth(self) -> int:
        """
        Parse highest heading in markdown

        >>> TOC.fromHTML('<h2>haha</h2><h1>hoho</h1>').parseTopDepth()
        1
        >>> TOC.fromHTML('<h3>haha</h3><h2>hoho</h2>').parseTopDepth()
        2
        """
        for i in range(1, self.max_header_level):
            if getattr(self.source, 'h{}'.format(i)):
                return i

    def expandDescendants(self) -> List[Tag]:
        """
        Expand descendants from list of branches

        :param list branches: list of immediate children as TreeOfContents objs
        :return: list of all descendants
        """
        descendants = []
        for b in self.branches:
            descendants.append(b)
            descendants.extend(b.expandDescendants())
        return descendants
        # return sum([b.descendants for b in self.branches], []) + \
            # [b.source for b in self.branches]

    def parseBranches(self, children_tags: List[Tag]) -> List["TreeOfContents"]:
        """
        Parse top level of markdown

        :param list elements: list of source objects
        :return: list of filtered TreeOfContents objects
        """
        print("DEPTH", self.depth, self.source)
        # parsed, parent, cond = [], False, lambda b: (b.string or '').strip()
        # parsed, parent, cond = [], False, lambda b: b.name and b.name in self.valid_tags
        parent_level = self.getHeadingLevel(self.source)
        if parent_level is None:
            parent_level = 0

        parsed_branches = []
        cur_level = self.max_header_level ## set to 7 as max depth
        # loop through descendant tags
        # print('-'*30)
        # print(str(self))
        cond = lambda b: b.name and b.name in self.valid_tags
        for branch in filter(cond, children_tags):
            print(self.depth, branch.name, len(list(branch.children)), repr(branch))
            
            ## Check if branch is header (h1, h2,..)
            level = self.getHeadingLevel(branch)
            
            ## If List, paragraph -> add items separately
            # if self.name in ["ul", "ol", "p", "em"]:
            #     node = {'level': 7, 'source': branch, 'descendants': list(branch.children)}
            #     parsed_branches.append(node)
            #     continue
            
            # if is header
            if level is not None and level<=cur_level:
                # split out to new branch
                cur_level = level
                node = {'level': level, 'source': branch, 'descendants': list(branch.children)}
                parsed_branches.append(node)
            # if item is not under a header
            elif cur_level==self.max_header_level:
                node = {'level': self.max_header_level, 'source': branch, 'descendants': list(branch.children)}
                parsed_branches.append(node)
                continue
            else:
                if not parsed_branches:
                    node = {'level': self.max_header_level, 'source': branch, 'descendants': list(branch.children)}
                    parsed_branches.append(node)
                else:
                    parsed_branches[-1]['descendants'].append(branch)
        # print("PARSED_BRANCH", parsed_branches)
        ## Make TOC
        new_depth = self.depth+1
        print("-"*30)
        print("PARSED BRANCHES:", len(parsed_branches))
        for branch in parsed_branches:
            print(branch)
        # print(parsed_branches)
        print("-"*30)
        branches = [TreeOfContents(depth=new_depth, source=x['source'], children_tags=x['descendants']) for x in parsed_branches]
        return branches
    
    def getText(self) -> str:
        return str(self)
    
    def getBranch(self, idx: int) -> "TreeOfContents":
        return self.branches[idx]
    
    def getDescendantsHTML(self, ) -> str:
        html_texts = []
        for desc in self.descendants:
            html_texts.append(repr(desc))
        return "".join(html_texts)
    
    def getDescendantsMarkdown(self, ) -> str:
        return markdownify(self.getDescendantsHTML())

    def __getattr__(self, attr, *default):
        """Check source for attributes"""
        tag = attr[:-1]
        if attr=="source":
            return self.source
        if attr in self.allowed_attrs:
            return getattr(self.source, attr, *default)
        if attr in self.valid_tags:
            return next(filter(lambda t: t.name == attr, self.branches), None)
        if len(default):
            return default[0]
        if attr[-1] == 's' and tag in self.valid_tags:
            condition = lambda t: t.name == tag
            return filter(condition, self.branches)
        raise AttributeError("'TreeOfContents' object has no attribute '%s'" % attr)

    def __repr__(self):
        """Display contents"""
        # return str(self)
        # MODIIFED - return str(source) to access html
        return str(self.source)

    def __str__(self):
        """Display contents"""
        if self.string:
            return self.string
        elif self.source.get_text():
            return self.source.get_text()
        else:
            return ''
        # return self.string or ''

    def __iter__(self):
        """Iterator over children"""
        return iter(self.branches)

    def __getitem__(self, i):
        return self.branches[i]

    @classmethod
    def fromMarkdown(cls, md: str, *args, **kwargs):
        """
        Creates abstraction using path to file

        :param str path: path to markdown file
        :return: TreeOfContents object
        """
        if not kwargs.get('extensions', None):
            kwargs['extensions'] = cls.default_md_extensions
        html_text = markdown(md, *args, **kwargs)
        return cls.fromHTML(html_text)

    @classmethod
    def fromHTML(cls, html: str, *args, **kwargs):
        """
        Creates abstraction using HTML

        :param str html: HTML
        :return: TreeOfContents object
        """
        source = BeautifulSoup(html, 'html.parser', *args, **kwargs)
        # parsed = []
        # parsed, parent, cond = [], False, lambda b: b.name and b.name in TreeOfContents.valid_tags
        # for branch in filter(cond, source.children):
        #     parsed.append({'root':branch.string, 'source':branch})
            
        # branches = [TOC(depth=2, **kwargs) for kwargs in parsed]
        return cls(
            # '[document]',
            source=source,
            depth=0,
            # branches = branches
            children_tags=source.children
            # children_tags=source.descendants
        )

In [26]:
import pandas as pd

In [27]:
## Load Sample
df = pd.read_parquet("sample.parquet")
print(df.shape, df.columns)

(10000, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [28]:
idx = 8603
# idx = 2677 # html 변환 오류
text = df.iloc[idx]["markdown"]

In [45]:
print(text)

Towards the exploitation of Llm-based chatbot for providing legal support to Palestinian cooperatives

###### Abstract

With the ever-increasing utilization of natural language processing (NLP), we started to witness over the past few years a significant transformation in our interaction with legal texts. This technology has advanced the analysis and enhanced the understanding of complex legal terminology and contexts. The development of recent large language models (LLMs), particularly ChatGPT, has also introduced a revolutionary contribution to the way that legal texts can be processed and comprehended. In this paper, we present our work on a cooperative-legal question-answering LLM-based chatbot, where we developed a set of legal questions about Palestinian cooperatives, associated with their regulations and compared the auto-generated answers by the chatbot to their correspondences that are designed by a legal expert. To evaluate the proposed chatbot, we have used 50 queries genera

In [46]:
# sections = get_sections(-1, text)
toc = TreeOfContents.fromMarkdown(text)

DEPTH 0 <p>Towards the exploitation of Llm-based chatbot for providing legal support to Palestinian cooperatives</p>
<h6>Abstract</h6>
<p>With the ever-increasing utilization of natural language processing (NLP), we started to witness over the past few years a significant transformation in our interaction with legal texts. This technology has advanced the analysis and enhanced the understanding of complex legal terminology and contexts. The development of recent large language models (LLMs), particularly ChatGPT, has also introduced a revolutionary contribution to the way that legal texts can be processed and comprehended. In this paper, we present our work on a cooperative-legal question-answering LLM-based chatbot, where we developed a set of legal questions about Palestinian cooperatives, associated with their regulations and compared the auto-generated answers by the chatbot to their correspondences that are designed by a legal expert. To evaluate the proposed chatbot, we have used

In [54]:
for branch in toc.branches:
    print("BRANCH", branch.name, branch.depth)
    print(branch.getText())
    print(branch.getDescendantsMarkdown())
    print("-"*30)
# print(toc.branches)
# print(toc.getDescendantsHTML())
# print(toc.getDescendantsMarkdown())

BRANCH p 1
Towards the exploitation of Llm-based chatbot for providing legal support to Palestinian cooperatives

------------------------------
BRANCH h6 1
Abstract
With the ever\-increasing utilization of natural language processing (NLP), we started to witness over the past few years a significant transformation in our interaction with legal texts. This technology has advanced the analysis and enhanced the understanding of complex legal terminology and contexts. The development of recent large language models (LLMs), particularly ChatGPT, has also introduced a revolutionary contribution to the way that legal texts can be processed and comprehended. In this paper, we present our work on a cooperative\-legal question\-answering LLM\-based chatbot, where we developed a set of legal questions about Palestinian cooperatives, associated with their regulations and compared the auto\-generated answers by the chatbot to their correspondences that are designed by a legal expert. To evaluate t

# Test with snippet

In [36]:
## sample taken from 62
with open("samples/err1_626_max_recursion.txt", "r") as f:
    sample = f.read()

In [37]:
TreeOfContents.fromMarkdown(sample)

DEPTH 0 <p>Finally, (106), (107) and (108) and are combined together to give</p>
<p>[\begin{split}&amp;(\hat{J}<em>{l},\hat{N}</em>{\alpha\uparrow})=-(\hat{J}<em>{ l},\hat{N}</em>{\alpha\downarrow})\ &amp;\approx\sum_{n,\mathbf{k}}|\left\langle\alpha|g_{n\mathbf{k}} \right\rangle|^{2}\big{(}[M_{l}^{+}(\mathbf{k})]<em>{n,n}^{1,1}-[M</em>{l}^{-}( \mathbf{k})]<em>{n,n}^{1,1}\big{)}\ &amp;=\sum</em>{n,\mathbf{k}}|\left\langle\alpha|g_{n\mathbf{k}}\right\rangle |^{2}\frac{\Delta^{2}}{2E_{n\mathbf{k}}^{2}}\Bigg{[}f(E_{n\mathbf{k}})-\frac{ \beta}{2\cosh^{2}!\left(\frac{\beta E_{n\mathbf{k}}}{2}\right)}\Bigg{]} \partial_{l}\varepsilon_{n\mathbf{k}}\,.\end{split} \tag{109}]</p>
<p>In the isolated band limit, only the term (n=\bar{n}) corresponding to the partially filled band gives a nonzero contribution. Note that, to obtain the correct result for the correlation function (109), it is important to retain all of the matrix elements of the current operator ([\hat{J}<em>{l}(\mathbf{k})]</em>{m,n}

<p>Finally, (106), (107) and (108) and are combined together to give</p>
<p>[\begin{split}&amp;(\hat{J}<em>{l},\hat{N}</em>{\alpha\uparrow})=-(\hat{J}<em>{ l},\hat{N}</em>{\alpha\downarrow})\ &amp;\approx\sum_{n,\mathbf{k}}|\left\langle\alpha|g_{n\mathbf{k}} \right\rangle|^{2}\big{(}[M_{l}^{+}(\mathbf{k})]<em>{n,n}^{1,1}-[M</em>{l}^{-}( \mathbf{k})]<em>{n,n}^{1,1}\big{)}\ &amp;=\sum</em>{n,\mathbf{k}}|\left\langle\alpha|g_{n\mathbf{k}}\right\rangle |^{2}\frac{\Delta^{2}}{2E_{n\mathbf{k}}^{2}}\Bigg{[}f(E_{n\mathbf{k}})-\frac{ \beta}{2\cosh^{2}!\left(\frac{\beta E_{n\mathbf{k}}}{2}\right)}\Bigg{]} \partial_{l}\varepsilon_{n\mathbf{k}}\,.\end{split} \tag{109}]</p>
<p>In the isolated band limit, only the term (n=\bar{n}) corresponding to the partially filled band gives a nonzero contribution. Note that, to obtain the correct result for the correlation function (109), it is important to retain all of the matrix elements of the current operator ([\hat{J}<em>{l}(\mathbf{k})]</em>{m,n}), even 