In [1]:
from datetime import datetime
from typing import List, Literal

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from tqdm import tqdm

In [2]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded via pydantic-settings.

    Values are read from the process environment and from the ``../.env``
    file (UTF-8). Entries that do not match a declared field are ignored
    (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    # Directory that gets added to sys.path below; presumably the root that
    # holds the `core` and `modules` packages imported later — verify in .env.
    pipeline_src_dir: str
settings = Settings()

import sys
# Only append when missing so that re-running this cell stays idempotent and
# does not accumulate duplicate sys.path entries.
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [3]:
from core.data.paper import ArxivPaperSection, ArxivPaperMetadata, ArxivPaper
from core.parser.md2py import TreeOfContents

from modules.extractor.section_splitter import MarkdownArxivPaperSectionSplitter

# 1. 테스트 샘플

In [4]:
# Minimal markdown sample: one h1 title, one h2 section ("1. Test section 1")
# with two intro lines, followed by two h3 subsections (1.1 and 1.2).
text = '''# Test Title

## 1. Test section 1
something something intro to section
something something intro2 to section

### 1.1 Something section 1
something something intro to section1.1

### 1.2 Something section 1
something something intro to section 1.2'''

In [5]:
# Try every registered MarkdownArxivPaperSectionSplitter subclass and use the
# first one whose `is_type` accepts the sample text.
#
# `found_filter` and `sections` are initialised up front so that this cell
# never leaves `sections` undefined (or silently reuses a stale value from an
# earlier kernel run) when no splitter matches or the loop aborts early.
found_filter = False
sections = []
for filter_cls in MarkdownArxivPaperSectionSplitter.__subclasses__():
    try:
        if filter_cls.is_type(text):
            sections = filter_cls().split(text)
            # Mark success only after the split actually produced a result.
            found_filter = True
            break
    except RecursionError as e:
        # Best effort: report the recursion problem and stop trying splitters.
        print("RECURSION ERROR {}".format(str(e)))
        break
    except Exception as e:
        print("ERROR {}".format(str(e)))
        # Bare `raise` re-raises the active exception with its original traceback.
        raise

if not found_filter:
    print("WARNING: no section splitter produced sections for the sample text")

In [6]:
# Print the text of the first section returned by the splitter; print() is
# used so the embedded newlines render as separate lines.
print(sections[0].text)

something something intro to section
something something intro2 to section

### 1\.1 Something section 1

something something intro to section1\.1

### 1\.2 Something section 1

something something intro to section 1\.2




In [7]:
## Test langchain MarkdownHeaderTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [8]:
# Split only on level-3 markdown headers; each header's text ends up under the
# "h3" key of the resulting document metadata.
headers_to_split_on = [("###", "h3")]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [9]:
# Split the first section's text at its h3 headers. In the output shown below,
# the text preceding the first h3 header comes back with empty metadata.
md_header_splits = markdown_splitter.split_text(sections[0].text)
md_header_splits

[Document(metadata={}, page_content='something something intro to section\nsomething something intro2 to section'),
 Document(metadata={'h3': '1\\.1 Something section 1'}, page_content='something something intro to section1\\.1'),
 Document(metadata={'h3': '1\\.2 Something section 1'}, page_content='something something intro to section 1\\.2')]

In [10]:
# Convert each langchain split into an ArxivPaperSection:
#   - splits carrying an "h3" metadata entry become h3 sections titled with it
#   - splits without one become untitled paragraph ("p") sections
split_sections = []
for doc in md_header_splits:
    is_h3 = "h3" in doc.metadata
    split_sections.append(
        ArxivPaperSection(
            header="h3" if is_h3 else "p",
            title=doc.metadata["h3"] if is_h3 else "",
            text=doc.page_content,
        )
    )

In [11]:
# Display the manually built sections for comparison with the module output.
split_sections

[ArxivPaperSection(header='p', title='', text='something something intro to section\nsomething something intro2 to section', children=[]),
 ArxivPaperSection(header='h3', title='1\\.1 Something section 1', text='something something intro to section1\\.1', children=[]),
 ArxivPaperSection(header='h3', title='1\\.2 Something section 1', text='something something intro to section 1\\.2', children=[])]

# Test Module version

In [13]:
# Children of the first section as produced by the project splitter itself; in
# the output shown below they match the manually built `split_sections`.
sections[0].children

[ArxivPaperSection(header='p', title='', text='something something intro to section\nsomething something intro2 to section', children=[]),
 ArxivPaperSection(header='h3', title='1\\.1 Something section 1', text='something something intro to section1\\.1', children=[]),
 ArxivPaperSection(header='h3', title='1\\.2 Something section 1', text='something something intro to section 1\\.2', children=[])]