In [21]:
from typing import List, Literal

import matplotlib.pyplot as plt
import pandas as pd


In [22]:
from datetime import datetime
from pydantic import BaseModel, Field

class ArxivPaperSection(BaseModel):
    header: Literal["h2", "h3", "h4", "h5", "h6"]
    title:str = Field(..., description="Section title")
    text: str = Field("", description = "Section contents")
    children: List["ArxivPaperSection"] = Field(list(), description="child sections")

class ArxivPaperMetadata(BaseModel):
    authors: str
    published_date: datetime
    link: str

class ArxivPaper(BaseModel):
    id: str
    title: str
    abstract: str
    sections: List[ArxivPaperSection]
    metadata: ArxivPaperMetadata = Field(None, description="paper metadata")

In [23]:
## Load Sample
df = pd.read_parquet("sample.parquet")
print(df.shape, df.columns)

(10000, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [24]:
df.iloc[0]

id                                                       2310.03187
title             Synthesis of Data-Driven Nonlinear State Obser...
abstract          This paper focuses on the model-free synthesis...
authors                                                 Wentao Tang
published_date                                 2023-10-04T22:19:53Z
link                              http://arxiv.org/abs/2310.03187v1
markdown          # Synthesis of Data-Driven Nonlinear State Obs...
Name: 62774, dtype: object

In [25]:
## md2py based parser
from markdownify import markdownify as md
from src.custom_md2py import md2py, TreeOfContents

In [26]:
from markdown import markdownFromFile, markdown
from bs4 import BeautifulSoup

def parseBranches(descendants):
    """
    Parse top level of markdown

    :param list elements: list of source objects
    :return: list of filtered TreeOfContents objects
    """
    parsed, parent, cond = [], False, lambda b: (b.string or '').strip()
    for branch in filter(cond, descendants):
        print(branch)
        # if self.getHeadingLevel(branch) == self.depth:
        #     parsed.append({'root':branch.string, 'source':branch})
        #     parent = True
        # elif not parent:
        #     parsed.append({'root':branch.string, 'source':branch})
        # else:
        #     parsed[-1].setdefault('descendants', []).append(branch)
    return parsed
list_sample = '''# some list
- head
    - tail
    - tail2
- head2'''
list_html = markdown(list_sample)
print("converted html:", repr(list_html))
source = BeautifulSoup(list_html, 'html.parser')
print("source:", source.name)
for child_i, child in enumerate(source.children):
    print("CHILD {}: {}".format(child_i, child.name))
    print(repr(child.string), repr(str(child)))
    print('-'*30)
# print("children",list(source.children))
# descendants = list(source.children)
# print(type(descendants[0]))
# print("items:", descendants[0].li, descendants[0].attrs)

# print("STRING", descendants[0].string, repr(str(descendants[0])))
# # print(len(descendants))
# # parseBranches(descendants)

converted html: '<h1>some list</h1>\n<ul>\n<li>head<ul>\n<li>tail</li>\n<li>tail2</li>\n</ul>\n</li>\n<li>head2</li>\n</ul>'
source: [document]
CHILD 0: h1
'some list' '<h1>some list</h1>'
------------------------------
CHILD 1: None
'\n' '\n'
------------------------------
CHILD 2: ul
None '<ul>\n<li>head<ul>\n<li>tail</li>\n<li>tail2</li>\n</ul>\n</li>\n<li>head2</li>\n</ul>'
------------------------------


In [27]:
html_list_sample = '''<ul>
<li>head<ul>
<li>tail</li>
<li>tail2</li>
</ul>
</li>
<li>head2</li>
</ul>'''

soup = BeautifulSoup(html_list_sample, 'html.parser')
# soup.find("ul")
print(soup.li)
print("attrs:",soup.attrs)
found_list = soup.find("ul")
print(found_list.name, found_list.find('li'))
print(type(soup.find("ul")))

<li>head<ul>
<li>tail</li>
<li>tail2</li>
</ul>
</li>
attrs: {}
ul <li>head<ul>
<li>tail</li>
<li>tail2</li>
</ul>
</li>
<class 'bs4.element.Tag'>


In [34]:
import re
## markdown text -> List[Section]
header_name_pattern = r"^h([1-6])"

def get_toc_text(toc: TreeOfContents) -> str:
    texts = []
    for child in toc.expandDescendants(toc):
        header_match = re.search(header_name_pattern, child.name)
        if header_match:
            header_num = header_match.group(1)
            ## header
            text = "".join(["#"*int(header_num), child.string])
        elif child.name=="ul":
            # List
            text = md(str(child))
            pass
        else:
            print(child.name, repr(child.string), str(child))
            text = md(str(child))
            # text = child.string
        texts.append(text)
    print(texts)
    return "\n\n".join(texts)


In [29]:
test_sample = '''# Title1
## Title2
writing something about title2
### Title2-1
something something
s2omething2 something2
### Title2-2
something something
s2omething2 something2

- something
- something

ddd

| a | b |
| --- | --- |
| hi | hey |

## Title 3
sth3
### Title3-2
something3 something3
s2omething3-2 something3-2
## Title 4'''

### Title2-1
toc = md2py(test_sample)
print(toc.depth)
document = toc.branches[0]
print(document.depth)
for child_i, child in enumerate(document.branches):
    print("child {}".format(child_i), child.name, child.string, child.depth)
    child_text = get_toc_text(child)
    print("text:",repr(child_text))
    print('-'*30)
    
    # for subchild_i, subchild in enumerate(child.branches):
    #     print(subchild_i, subchild.name, subchild.string)
    #     print(vars(subchild))
    
    #     subchild_children = [x.name for x in subchild.branches]
    #     print(subchild_children)
    #     subchild_children_text = [x.string for x in subchild.branches]
    #     print(subchild_children_text)
    #     print('-'*30)
    
    # break
    

1
2
child 0 h2 Title2 3
p 'something something\ns2omething2 something2'
p 'something something\ns2omething2 something2'
p 'ddd'
p '| a | b |\n| --- | --- |\n| hi | hey |'
p 'writing something about title2'
['something something\ns2omething2 something2', 'something something\ns2omething2 something2', '* something\n* something\n', 'ddd', '| a | b |\n| --- | --- |\n| hi | hey |', 'writing something about title2', '###Title2-1', '###Title2-2']
text: 'something something\ns2omething2 something2\n\nsomething something\ns2omething2 something2\n\n* something\n* something\n\n\nddd\n\n| a | b |\n| --- | --- |\n| hi | hey |\n\nwriting something about title2\n\n###Title2-1\n\n###Title2-2'
------------------------------
child 1 h2 Title 3 3
p 'something3 something3\ns2omething3-2 something3-2'
p 'sth3'
['something3 something3\ns2omething3-2 something3-2', 'sth3', '###Title3-2']
text: 'something3 something3\ns2omething3-2 something3-2\n\nsth3\n\n###Title3-2'
------------------------------
child 2 h2

In [37]:
def split_case1(toc: TreeOfContents):
    """Case 1: contents under h1 title"""

def split_case2(document: TreeOfContents):
    """Case 2: title & contents in same level"""
    sections = []
    for child in document.branches:
        # p - title, h6 - abstract
        if not child.name=="h2":
            continue
        print("CHILD:", child.name, child.string, str(child))
        child_text = get_toc_text(child)
        print(child_text)
        
        print('-'*30)
        # for subchild in child.expandDescendants(child):
        #     print(subchild.name, subchild.string)
        #     print(md(str(subchild)))
        #     print(vars(subchild).keys())
        #     print(len(subchild.contents))
        #     print(repr(subchild.contents))
        
        # section_texts = [
            
        #     for subchild in child.expandDescendants(child)
        # ]
        section = ArxivPaperSection(
            header = child.name,
            title = child.string,
            text = child_text
        )
        sections.append(section)
    return sections

def split_markdown_text(text: str) -> List[ArxivPaperSection]:
    toc = md2py(text) # md -> html -> bs4
    print(vars(toc).keys())
    print("TOC", toc.name)
    l1_branches = toc.branches
    print(len(l1_branches))
    
    sections = []
    ## Case 1 (h1 title)
    if len(l1_branches)==1 and l1_branches[0].name=="h1":
        print(l1_branches[0])
        pass
    ## Case2
    elif len(l1_branches)>1:
        sections = split_case2(toc)
        
    return sections
    
idx = 0 ## Case 1
idx = 14 ## Case 2
sample = df.iloc[idx]['markdown']
split_markdown_text(sample)

dict_keys(['source', 'depth', 'descendants', 'branches'])
TOC [document]
11
CHILD: h2 1 Introduction 1 Introduction
p None <p>Chart figures serve as the visual summary of tabular data, which helps to convey rich context in various documents, such as scientific papers, textbooks, and technical news. An intelligent agent that can understand and communicate chart plots can lead to many useful applications. For example, a virtual doctor who knows how to answer the patient's question on a complex medical report or a reading assistant who can summarize the key findings from scientific papers in brief language. In the past few years, there has been a growing interest in our community to explore chart understanding in vision and language (V+L) tasks and many related benchmarks like Chart Question Answering <strong>(CQA)</strong>Masry et al. (2022); Kafle et al. (2018); Methani et al. (2020) and Chart Summarization <strong>(CS)</strong>Kantharaj et al. (2022) are introduced.</p>
p 'While preval

[ArxivPaperSection(header='h2', title='1 Introduction', text='Chart figures serve as the visual summary of tabular data, which helps to convey rich context in various documents, such as scientific papers, textbooks, and technical news. An intelligent agent that can understand and communicate chart plots can lead to many useful applications. For example, a virtual doctor who knows how to answer the patient\'s question on a complex medical report or a reading assistant who can summarize the key findings from scientific papers in brief language. In the past few years, there has been a growing interest in our community to explore chart understanding in vision and language (V\\+L) tasks and many related benchmarks like Chart Question Answering **(CQA)**Masry et al. (2022\\); Kafle et al. (2018\\); Methani et al. (2020\\) and Chart Summarization **(CS)**Kantharaj et al. (2022\\) are introduced.\n\n\n\nWhile prevalent in the research community, automatic chart understanding remains a challeng

In [None]:
def load_arxiv_paper(row):
    pass