In [1]:
from typing import List, Literal

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [3]:
from datetime import datetime
from pydantic import BaseModel, Field

class ArxivPaperSection(BaseModel):
    header: Literal["h2", "h3", "h4", "h5", "h6"]
    title:str = Field(..., description="Section title")
    text: str = Field("", description = "Section contents")
    children: List["ArxivPaperSection"] = Field(list(), description="child sections")

class ArxivPaperMetadata(BaseModel):
    authors: str
    published_date: datetime
    link: str

class ArxivPaper(BaseModel):
    id: str
    title: str
    abstract: str
    sections: List[ArxivPaperSection]
    metadata: ArxivPaperMetadata = Field(None, description="paper metadata")

In [4]:
## Load Sample
df = pd.read_parquet("sample.parquet")
print(df.shape, df.columns)

(10000, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [5]:
df.iloc[0]

id                                                       2310.03187
title             Synthesis of Data-Driven Nonlinear State Obser...
abstract          This paper focuses on the model-free synthesis...
authors                                                 Wentao Tang
published_date                                 2023-10-04T22:19:53Z
link                              http://arxiv.org/abs/2310.03187v1
markdown          # Synthesis of Data-Driven Nonlinear State Obs...
Name: 62774, dtype: object

In [6]:
## md2py based parser
from markdownify import markdownify as md
from src.custom_md2py import md2py, TreeOfContents

In [7]:
def get_toc_text(toc: TreeOfContents) -> str:
    html_texts = []
    for desc in toc.descendants:
        html_texts.append(repr(desc))
    md_text = md("".join(html_texts))
    return md_text


In [8]:
from abc import ABC, abstractmethod
class BaseArxivPaperSectionSplitter:
    def __init__(self):
        pass
    
    @abstractmethod
    def split(self, toc: TreeOfContents) -> List[ArxivPaperSection]:
        raise NotImplementedError("split must be implemented")
    
    @classmethod
    def is_type(cls):
        return False
    
class Case1Filter(BaseArxivPaperSectionSplitter):
    """Case1: everything under h1 title"""
    @classmethod
    def is_type(cls, toc: TreeOfContents):
        if len(toc.branches)==1 and toc.branches[0].name=="h1":
            return True
        return False
    
    def split(self, toc: TreeOfContents) -> List[ArxivPaperSection]:
        h1_child = list(toc.branches)[0]
        sections = []
        for child in h1_child.branches:
            # p - title, h6 - abstract
            if not child.name=="h2":
                continue
            # print("CHILD:", child.name, child.string, str(child))
            child_text = get_toc_text(child)
            # print("title:",child.name, repr(child), child.string)
            section = ArxivPaperSection(
                header = child.name,
                title = child.getText(),
                text = child.getDescendantsMarkdown()
            )
            sections.append(section)
        return sections
    
class Case2Filter(BaseArxivPaperSectionSplitter):
    """Case2: title & contents in same level"""
    @classmethod
    def is_type(cls, toc: TreeOfContents):
        if len(toc.branches)>1:
            return True
        return False
    
    def split(self, toc: TreeOfContents) -> List[ArxivPaperSection]:
        sections = []
        for child in toc.branches:
            # p - title, h6 - abstract
            if not child.name=="h2":
                continue
            # print("CHILD:", child.name, child.string, str(child))
            child_text = get_toc_text(child)
            
            section = ArxivPaperSection(
                header = child.name,
                title = child.getText(),
                text = child.getDescendantsMarkdown()
            )
            sections.append(section)
        return sections

In [10]:

idx = 0 ## Case 1
# idx = 14 ## Case 2
sample = df.iloc[idx]['markdown']

toc = md2py(sample) 
## Test Filters
sections = []
found_filter = False
for filter_cls in BaseArxivPaperSectionSplitter.__subclasses__():
    try:
        if filter_cls.is_type(toc):
            print("FOUND",filter_cls)
            found_filter = True
            sections = filter_cls().split(toc)
    except Exception as e:
        print("ERROR")
        raise e
    
print("SECTIONS:", len(sections))
for section in sections:
    print(section)

FOUND <class '__main__.Case1Filter'>
SECTIONS: 5
header='h2' title='I Introduction' text='For nonlinear systems that arise from realistic engineering applications such as transport\\-reaction processes, modern control theory relies on *state\\-space representations* for their modeling, analysis, and control \\[1, 2, 3]. Recent advances in nonlinear control have highlighted the role of data\\-driven (machine learning) techniques in identifying governing equations or underlying dynamical structures \\[4, 5, 6], analyzing system and control\\-theoretic properties \\[7, 8], and synthesizing model\\-free controllers \\[9, 10, 11]. In these efforts, it is often assumed that the *state* information is available for analysis or control; for example, in reinforcement learning (RL) literature, it is common to apply stochastic first\\-order optimization to learn a value (cost) function or (Q) function based on temporal actions and state measurements. In many (if not most) control engineering appl

In [21]:
## Test Filters
no_filter_idxs = []
for idx in tqdm(range(df.shape[0])):
    sample = df.iloc[idx]['markdown']
    toc = md2py(sample)
    sections = []
    found_filter = False
    for filter_cls in BaseArxivPaperSectionSplitter.__subclasses__():
        try:
            if filter_cls.is_type(toc):
                # print("FOUND",filter_cls)
                found_filter = True
                sections = filter_cls().split(toc)
        except Exception as e:
            print("ERROR")
            raise e
        if not found_filter:
            no_filter_idxs.append(idx)
print("total {} no filter {}".format(df.shape[0], len(no_filter_idxs)))
        

100%|██████████| 10000/10000 [11:50<00:00, 14.07it/s]

total 10000 no filter 1520





In [22]:
no_filter_idxs[:3]

[14, 18, 25]

In [23]:
idx = 14
sample = df.iloc[idx]['markdown']
print(sample)

Enhanced Chart Understanding in Vision and Language Task via Cross-modal Pre-training on Plot Table Pairs

###### Abstract

Building cross-model intelligence that can understand charts and communicate the salient information hidden behind them is an appealing challenge in the vision and language (V+L) community. The capability to uncover the underlined table data of chart figures is a critical key to automatic chart understanding. We introduce ChartT5, a V+L model that learns how to interpret table information from chart images via cross-modal pre-training on plot table pairs. Specifically, we propose two novel pre-training objectives: Masked Header Prediction (MHP) and Masked Value Prediction (MVP) to facilitate the model with different skills to interpret the table information. We have conducted extensive experiments on chart question answering and chart summarization to verify the effectiveness of the proposed pre-training strategies. In particular, on the ChartQA benchmark, our Cha

In [16]:
from bs4 import BeautifulSoup
sample_html = '''<h2>V Analytical characterization of (\mathbb{L}<em>{x,M}) and (\mathbb{H}</em>{x,M})</h2>'''
soup = BeautifulSoup(sample_html, 'html.parser')
children = list(soup.children)
print(repr(soup))
print(soup.string)
print(str(soup))
print(repr(soup.get_text()))

<h2>V Analytical characterization of (\mathbb{L}<em>{x,M}) and (\mathbb{H}</em>{x,M})</h2>
None
<h2>V Analytical characterization of (\mathbb{L}<em>{x,M}) and (\mathbb{H}</em>{x,M})</h2>
'V Analytical characterization of (\\mathbb{L}{x,M}) and (\\mathbb{H}{x,M})'


In [17]:
idx = 104
sample = df.iloc[idx]['markdown']
toc = md2py(sample)
sections = []
found_filter = False
for filter_cls in BaseArxivPaperSectionSplitter.__subclasses__():
    try:
        if filter_cls.is_type(toc):
            # print("FOUND",filter_cls)
            found_filter = True
            sections = filter_cls().split(toc)
    except Exception as e:
        print("ERROR")
        raise e

In [18]:
print(sample)

# Low-ground/High ground capacity regions analysis for Bosonic Gaussian Channels

###### Abstract

We present a comprehensive characterization of the interconnections between single-mode, phase-insensitive Gaussian Bosonic Channels resulting from channel concatenation. This characterization enables us to identify, in the parameter space of these maps, two distinct regions: low-ground and high-ground. In the low-ground region, the information capacities are smaller than a designated reference value, while in the high-ground region, they are provably greater. As a direct consequence, we systematically outline an explicit set of upper bounds for the quantum and private capacity of these maps, which combine known upper bounds and composition rules, improving upon existing results.

pacs: 03.67.-a, 03.67.Ac, 03.65.Ta.

## I Introduction

The efficiency of classical communication lines can be expressed using a single, simple formula [1; 2]. However, when it comes to quantum communication lin

In [19]:
def split_case1(toc: TreeOfContents):
    """Case 1: contents under h1 title"""

def split_case2(document: TreeOfContents):
    """Case 2: title & contents in same level"""
    sections = []
    for child in document.branches:
        # p - title, h6 - abstract
        if not child.name=="h2":
            continue
        print("CHILD:", child.name, child.string, str(child))
        child_text = get_toc_text(child)
        print(child_text)
        
        print('-'*30)
        # for subchild in child.expandDescendants(child):
        #     print(subchild.name, subchild.string)
        #     print(md(str(subchild)))
        #     print(vars(subchild).keys())
        #     print(len(subchild.contents))
        #     print(repr(subchild.contents))
        
        # section_texts = [
            
        #     for subchild in child.expandDescendants(child)
        # ]
        section = ArxivPaperSection(
            header = child.name,
            title = child.string,
            text = child_text
        )
        sections.append(section)
    return sections

def split_markdown_text(text: str) -> List[ArxivPaperSection]:
    toc = md2py(text) # md -> html -> bs4
    print(vars(toc).keys())
    print("TOC", toc.name)
    l1_branches = toc.branches
    print(len(l1_branches))
    
    sections = []
    ## Case 1 (h1 title)
    if len(l1_branches)==1 and l1_branches[0].name=="h1":
        print(l1_branches[0])
        pass
    ## Case2
    elif len(l1_branches)>1:
        sections = split_case2(toc)
        
    return sections
    
idx = 0 ## Case 1
idx = 14 ## Case 2
sample = df.iloc[idx]['markdown']
split_markdown_text(sample)

dict_keys(['source', 'depth', 'branches', 'descendants'])
TOC [document]
10
CHILD: h2 1 Introduction 1 Introduction
Chart figures serve as the visual summary of tabular data, which helps to convey rich context in various documents, such as scientific papers, textbooks, and technical news. An intelligent agent that can understand and communicate chart plots can lead to many useful applications. For example, a virtual doctor who knows how to answer the patient's question on a complex medical report or a reading assistant who can summarize the key findings from scientific papers in brief language. In the past few years, there has been a growing interest in our community to explore chart understanding in vision and language (V\+L) tasks and many related benchmarks like Chart Question Answering **(CQA)**Masry et al. (2022\); Kafle et al. (2018\); Methani et al. (2020\) and Chart Summarization **(CS)**Kantharaj et al. (2022\) are introduced.

While prevalent in the research community, automa

[ArxivPaperSection(header='h2', title='1 Introduction', text='Chart figures serve as the visual summary of tabular data, which helps to convey rich context in various documents, such as scientific papers, textbooks, and technical news. An intelligent agent that can understand and communicate chart plots can lead to many useful applications. For example, a virtual doctor who knows how to answer the patient\'s question on a complex medical report or a reading assistant who can summarize the key findings from scientific papers in brief language. In the past few years, there has been a growing interest in our community to explore chart understanding in vision and language (V\\+L) tasks and many related benchmarks like Chart Question Answering **(CQA)**Masry et al. (2022\\); Kafle et al. (2018\\); Methani et al. (2020\\) and Chart Summarization **(CS)**Kantharaj et al. (2022\\) are introduced.\n\nWhile prevalent in the research community, automatic chart understanding remains a challenging 

In [20]:
def load_arxiv_paper(row):
    pass