In [1]:
from datetime import datetime
from typing import List, Literal, Optional

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from tqdm import tqdm

In [5]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration, populated by pydantic-settings.

    The config below points the loader at ``../.env`` (read as UTF-8) and
    sets ``extra="ignore"`` so entries without a matching field are skipped.
    """

    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    # Required (no default). This notebook appends the value to ``sys.path``
    # right after the settings object is created.
    pipeline_src_dir: str
settings = Settings()

import sys

# Expose the configured source directory to the import system; presumably the
# project packages imported in the following cell live there (confirm).
# The membership check keeps ``sys.path`` free of duplicate entries when this
# cell is executed more than once in the same kernel.
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [6]:
from core.data.paper import ArxivPaperSection, ArxivPaperMetadata, ArxivPaper
from core.parser.md2py import TreeOfContents

from modules.extractor.section_splitter import MarkdownArxivPaperSectionSplitter


# 1. Data 클래스 정의

In [2]:
class ArxivPaperSection(BaseModel):
    """One section of a paper together with its nested subsections.

    The model is recursive: ``children`` holds further ``ArxivPaperSection``
    instances.
    """

    # Heading tag of the section; the Literal restricts it to "h2" through "h6".
    header: Literal["h2", "h3", "h4", "h5", "h6"]
    title: str = Field(..., description="Section title")
    text: str = Field("", description="Section contents")
    # default_factory builds a fresh list for every instance instead of
    # handing one pre-built list object to Field as the default value.
    children: List["ArxivPaperSection"] = Field(
        default_factory=list, description="child sections"
    )

class ArxivPaperMetadata(BaseModel):
    """Metadata record attached to an ``ArxivPaper`` via its ``metadata`` field."""

    # All authors in a single string (not a list).
    authors: str
    # Publication timestamp; presumably parsed from the ISO-8601 strings in
    # the sample data -- confirm against the loader.
    published_date: datetime
    # Link to the paper; presumably the arXiv abstract URL -- confirm.
    link: str

class ArxivPaper(BaseModel):
    """A paper: identifier, title, abstract, section tree and optional metadata."""

    id: str
    title: str
    abstract: str
    # Top-level sections of the paper.
    sections: List[ArxivPaperSection]
    # Annotated as Optional because the default is None; the previous
    # non-optional annotation contradicted that default.
    metadata: Optional[ArxivPaperMetadata] = Field(
        None, description="paper metadata"
    )

# 2. Test with Samples

In [3]:
## Load Sample
SAMPLE_SIZE = 100
SAMPLE_SEED = 42  # fixed seed so a fresh "Restart & Run All" draws the same rows

df = pd.read_parquet("sample.parquet")
# Clamp the sample size so a file with fewer rows than SAMPLE_SIZE does not
# raise; random_state makes the selection reproducible.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
print(df.shape, df.columns)

(100, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [4]:
# Display the first sampled row to see which fields each record carries.
df.iloc[0]

id                                                       2304.05628
title             Bounding the Lagrangian Hofer metric via barcodes
abstract          We provide an upper bound on the Lagrangian Ho...
authors                                           Patricia Dietzsch
published_date                                 2023-04-12T06:01:06Z
link                              http://arxiv.org/abs/2304.05628v1
markdown          # Bounding the Lagrangian Hofer metric via bar...
Name: 32259, dtype: object

In [7]:
def get_sections(idx, text, splitter_classes=None):
    """Split one markdown document using the first splitter that accepts it.

    Every candidate class is asked ``is_type(text)``; the first one that
    answers truthy is instantiated and the result of its ``split(text)`` is
    returned. Later candidates are not consulted.

    Args:
        idx: Caller-supplied identifier of the document. It is echoed back in
            the result and used in the printed diagnostics.
        text: Markdown source of the document.
        splitter_classes: Optional iterable of candidate splitter classes.
            When omitted, the direct subclasses of
            ``MarkdownArxivPaperSectionSplitter`` are used.

    Returns:
        A tuple ``(idx, sections)``. ``sections`` is ``None`` when no
        candidate accepts the text, or when a ``RecursionError`` is raised
        while probing or splitting.

    Raises:
        Exception: Any error other than ``RecursionError`` raised by
            ``is_type`` or ``split`` is printed and then re-raised.
    """
    if splitter_classes is None:
        # Note: __subclasses__() lists direct subclasses only.
        splitter_classes = MarkdownArxivPaperSectionSplitter.__subclasses__()

    for splitter_cls in splitter_classes:
        try:
            if splitter_cls.is_type(text):
                return idx, splitter_cls().split(text)
        except RecursionError as e:
            # Treated as a soft failure: report it and flag the document.
            print("{} RECURSION ERROR {}".format(idx, str(e)))
            return idx, None
        except Exception as e:
            print("{} ERROR {}".format(idx, str(e)))
            # Bare raise keeps the original traceback intact.
            raise
    return idx, None

In [8]:
# Markdown source of every sampled paper, in DataFrame row order.
markdown_column = df["markdown"]
texts = markdown_column.values.tolist()

# Each document is paired with its position so a result can be traced back
# to the row it came from.
results = [get_sections(*indexed_text) for indexed_text in enumerate(texts)]

In [9]:
# Failed Count: entries whose second element (the section list) is None.
failed_entries = [entry for entry in results if entry[1] is None]
failed_count = len(failed_entries)
print("Total {} failed {}".format(len(results), failed_count))

Total 100 failed 0


In [10]:
# Show the (index, sections) pair produced for the first document.
results[0]

(0,
 [ArxivPaperSection(header='h2', title='1. Introduction and main results', text="Let ((M,\\\\omega\\=\\-\\\\mathrm{d}\\\\lambda)) be an exact symplectic manifold. Consider the group (\\\\mathrm{Ham}(M)) of compactly supported Hamiltonian diffeomorphisms. Any compactly supported Hamiltonian function (H\\\\in C^{\\\\infty}(\\[0,1]\\\\times M)) generates a Hamiltonian flow ({\\\\phi\\_{t}^{H}}). The Hofer norm of a Hamiltonian diffeomorphism (\\\\phi\\\\in\\\\mathrm{Ham}(M)) is given by\n\n\\[\\|\\|\\\\phi\\|\\|*{H}\\=\\\\inf\\\\left{\\\\int*{0}^{1}\\\\max\\_{x\\\\in M}H\\_{t}(x)\\-\\\\min\\_{x\\\\in M}H\\_{t}( x)\\\\,\\\\mathrm{d}t\\\\,\\\\big{\\|}\\\\,\\\\phi\\_{1}^{H}\\=\\\\phi\\\\right}.]\n\n*{H}\\=\\\\inf\\\\left{\\\\int*Let (L) and (L^{\\\\prime}) be closed connected Lagrangian submanifolds in (M) that are Hamiltonian isotopic. The Lagrangian Hofer distance between (L) and (L^{\\\\prime}) is defined by\n\n\\[d\\_{H}(L,L^{\\\\prime})\\=\\\\inf\\\\left{\\|\\|\\\\phi\\|\\|\\_{H}\\\