In [1]:
from datetime import datetime
from typing import List, Literal, Optional

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from tqdm import tqdm

In [2]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded via pydantic-settings.

    Values are read from the ``../.env`` file (decoded as UTF-8); the model is
    configured with ``extra="ignore"``.
    """
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    # Directory that this notebook appends to sys.path after instantiating the
    # settings — presumably the root of the `core` / `modules` packages
    # imported in the following cell (TODO confirm).
    pipeline_src_dir: str
settings = Settings()

import sys

# Make the pipeline sources importable from this notebook. The membership
# check keeps the cell idempotent: re-running it no longer appends the same
# directory to sys.path again and again.
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [3]:
from core.data.paper import ArxivPaperSection, ArxivPaperMetadata, ArxivPaper
from core.parser.md2py import TreeOfContents

from modules.extractor.section_splitter import MarkdownArxivPaperSectionSplitter


# 1. Data class definitions

In [4]:
class ArxivPaperSection(BaseModel):
    """A single section of a parsed arXiv paper, with optional nested children.

    NOTE(review): this notebook-local definition shadows the class of the same
    name imported from ``core.data.paper`` earlier in the notebook — confirm
    which definition is meant to be authoritative.
    """

    # "p" is accepted alongside h2-h6 because the splitter output displayed
    # later in this notebook contains sections with header='p' (body text that
    # precedes the first sub-heading). NOTE(review): confirm against the
    # definition in core.data.paper.
    header: Literal["h2", "h3", "h4", "h5", "h6", "p"]
    title: str = Field(..., description="Section title")
    text: str = Field("", description="Section contents")
    # default_factory builds a fresh list for every instance instead of
    # declaring one shared mutable default object.
    children: List["ArxivPaperSection"] = Field(
        default_factory=list, description="child sections"
    )

class ArxivPaperMetadata(BaseModel):
    """Bibliographic metadata attached to an arXiv paper.

    NOTE(review): shadows the class of the same name imported from
    ``core.data.paper`` earlier in this notebook.
    """
    # Author names as one string (the sample row shown in this notebook holds
    # a comma-separated list).
    authors: str
    # Publication timestamp (the sample row stores it as an ISO-8601 string,
    # e.g. 2023-07-29T19:46:09Z).
    published_date: datetime
    # Link to the paper (the sample row holds an arxiv.org/abs URL).
    link: str

class ArxivPaper(BaseModel):
    """A parsed arXiv paper: identifier, title, abstract and its section list.

    NOTE(review): shadows the class of the same name imported from
    ``core.data.paper`` earlier in this notebook.
    """

    id: str
    title: str
    abstract: str
    sections: List[ArxivPaperSection]
    # The default is None, so the annotation must be Optional: otherwise the
    # declared type disagrees with the default and an explicitly passed
    # ``metadata=None`` is rejected by validation.
    metadata: Optional[ArxivPaperMetadata] = Field(
        None, description="paper metadata"
    )

# 2. Test with Samples

In [6]:
## Load Sample
SAMPLE_SIZE = 100  # number of papers to draw for this test run
SAMPLE_SEED = 42   # fixed seed so Restart & Run All reproduces the same rows

df_raw = pd.read_parquet("../sample.parquet")
# Cap the sample size at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement).
df = df_raw.sample(n=min(SAMPLE_SIZE, len(df_raw)), random_state=SAMPLE_SEED)
print(df.shape, df.columns)

(100, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [8]:
# Display the first sampled row to see which fields each paper carries.
df.iloc[0]

id                                                       2307.16062
title             Using Implicit Behavior Cloning and Dynamic Mo...
abstract          Reinforcement learning (RL) for motion plannin...
authors           Zengjie Zhang, Jayden Hong, Amir Soufi Enayati...
published_date                                 2023-07-29T19:46:09Z
link                              http://arxiv.org/abs/2307.16062v2
markdown          Using Implicit Behavior Cloning and Dynamic Mo...
Name: 24219, dtype: object

In [9]:
def get_sections(idx, text):
    """Split one markdown document into sections with the first matching splitter.

    Iterates over the direct subclasses of ``MarkdownArxivPaperSectionSplitter``
    and uses the first one whose ``is_type(text)`` is truthy to split ``text``.

    Args:
        idx: Identifier of the document; echoed back in the result and used in
            the error messages.
        text: Markdown content of the paper.

    Returns:
        A tuple ``(idx, sections)``. ``sections`` is whatever the matching
        splitter's ``split`` returns, or ``None`` when no splitter matched or
        a ``RecursionError`` occurred.

    Raises:
        Exception: any non-recursion error raised by a splitter is logged and
            re-raised unchanged.
    """
    sections = None
    for filter_cls in MarkdownArxivPaperSectionSplitter.__subclasses__():
        try:
            if filter_cls.is_type(text):
                sections = filter_cls().split(text)
                break
        except RecursionError as e:
            # Recursion overflow is treated as a soft failure for this document.
            print("{} RECURSION ERROR {}".format(idx, str(e)))
            return idx, None
        except Exception as e:
            print("{} ERROR {}".format(idx, str(e)))
            # Bare raise re-raises the active exception with its traceback intact.
            raise
    return idx, sections

In [10]:
# Markdown body of every sampled paper, as a plain Python list.
texts = df["markdown"].to_numpy().tolist()

# Split each document; the positional index is passed along so that every
# result can be traced back to its input.
results = list(map(get_sections, range(len(texts)), texts))

In [11]:
# Failed Count: a result whose section list is None counts as a failure.
failed_count = sum(parsed is None for _idx, parsed in results)
total_count = len(results)
print("Total {} failed {}".format(total_count, failed_count))

Total 100 failed 0


In [12]:
# First result: the (idx, sections) tuple returned by get_sections.
results[0]

(0,
 [ArxivPaperSection(header='h2', title='I Introduction', text='The next\\-generation manufacturing is expected to have a higher level of automation and involve less human power. Intelligent robots are needed to actively learn skills instead of being programmed by experts explicitly \\[1, 2]. Reinforcement learning (RL) is a powerful approach that enables robots to automatically learn an ideal manipulation policy via trial and error. A typical application of RL is robot motion planning which requires the robot to move from an initial position to a goal position without colliding with the obstacles in the environment \\[3]. As illustrated in Fig. 1, motion planning is an essential problem for more complicated tasks such as grasping \\[4], assembly \\[5], and manipulation \\[6]. The conventional approaches used for robot motion planning include optimization\\-based methods, such as trajectory optimization \\[7] and sequential convex optimization \\[8], and the sampling\\-based methods

In [13]:
# Child sections of the third top-level section of the first result.
results[0][1][2].children

[ArxivPaperSection(header='p', title='', text='In this section, we first introduce the overall framework of the proposed IBC\\-DMP RL method. Then, we present a novel Multi\\-DoF DMP model for motion planning of a high degree\\-of\\-freedom (DoF) robot. Finally, based on the proposed framework, we clarify the problem to be solved in this paper.', children=[]),
 ArxivPaperSection(header='h3', title='*Overall Framework*', text='*Overall Framework*The overall framework of IBC\\-DMP RL is illustrated in Fig. 2\\. The basic model used to generate robot trajectories is *Multi\\-DoF DMP*, an adapted version of the conventional DMP model introduced in Sec. II\\-A. It also serves as the environment in the robot motion planning problem. The human demonstration encodes a demonstration policy (\\\\tilde{\\\\mathbf{f}}) which reflects how humans behave in a motion planning task. An *IBC\\-DMP agent* is used to generate the desired motion planning policy (\\\\mathbf{f}) for the task. The agent is tr