In [4]:
import os

from langchain_text_splitters import MarkdownHeaderTextSplitter
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from src.config import settings

In [5]:
# Load the train parquet found under settings.data_dir.
df = pd.read_parquet(os.path.join(settings.data_dir, "arxiver/data/train.parquet"))
## Sample 10k
# A fixed seed keeps the sampled rows (and therefore every idx-based cell that
# follows) identical across kernel restarts; the size is clamped so the cell
# does not raise when the parquet holds fewer rows than requested.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
print(df.shape, df.columns)

(10000, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [31]:
## Initialize Splitter
# Pair each markdown heading marker with the metadata key the splitter should
# file it under; only the top two heading levels are split on.
headers_to_split_on = list(zip(("#", "##"), ("Header 1", "Header 2")))
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Test with Sample

In [38]:
# Test Sample
'''
Abstract is usually defined as
'###### Abstract'
'''
# Positional row (not index label) of the paper to inspect in the sampled frame.
idx = 0
# idx = 15
# Take the markdown column first, then pick the row by position.
sample = df['markdown'].iloc[idx]
# print(len(sample), sample[:200])
print(sample)

# Enhanced Low-Dimensional Sensing Mapless Navigation

###### Abstract

In this study, we present two distinct approaches of Deep Reinforcement Learning (Deep-RL) algorithms for a mobile robot. The research methodology primarily involves a comparative analysis between a Deep-RL strategy grounded in the foundational Deep Q-Network (DQN) algorithm, and the Double Deep Q-Network (DDQN) algorithm. The agents in these approaches leverage 24 measurements from laser range sampling, coupled with the agent's positional differentials and orientation relative to the target. This amalgamation of data influences the agents' determinations regarding navigation, ultimately dictating the robot's velocities. By embracing this parsimonious sensory framework as proposed, we successfully showcase the training of an agent for proficiently executing navigation tasks and adeptly circumventing obstacles. Notably, this accomplishment is attained without a dependency on intricate sensory inputs like those inher

In [39]:
# Run the header splitter over the sample paper, report how many chunks it
# produced, and leave the chunk list as the cell's displayed value.
sample_splits = splitter.split_text(sample)
print(len(sample_splits))
sample_splits

10


[Document(metadata={'Header 1': 'Enhanced Low-Dimensional Sensing Mapless Navigation'}, page_content="###### Abstract  \nIn this study, we present two distinct approaches of Deep Reinforcement Learning (Deep-RL) algorithms for a mobile robot. The research methodology primarily involves a comparative analysis between a Deep-RL strategy grounded in the foundational Deep Q-Network (DQN) algorithm, and the Double Deep Q-Network (DDQN) algorithm. The agents in these approaches leverage 24 measurements from laser range sampling, coupled with the agent's positional differentials and orientation relative to the target. This amalgamation of data influences the agents' determinations regarding navigation, ultimately dictating the robot's velocities. By embracing this parsimonious sensory framework as proposed, we successfully showcase the training of an agent for proficiently executing navigation tasks and adeptly circumventing obstacles. Notably, this accomplishment is attained without a depend

In [45]:
import re
# ATX heading: 1-6 '#' characters followed by whitespace or end of line.
# Requiring the separator keeps '#hashtag' and seven-or-more '#' runs as text.
_HEADER_RE = re.compile(r'^(#{1,6})(?:\s+(.*))?$')
# Fenced code block delimiter: up to 3 spaces of indent, then ``` or ~~~ (3+).
_FENCE_RE = re.compile(r'^ {0,3}(`{3,}|~{3,})(.*)$')


def _append_text(node, lines):
    """Join buffered lines and append them to ``node['text']`` (no-op if empty list)."""
    if not lines:
        return
    block = '\n'.join(lines).strip()
    existing = node.setdefault('text', '')
    # Separate from any text already stored on the node with a newline.
    node['text'] = f'{existing}\n{block}' if existing else block


def parse_markdown_hierarchy(text):
    """Parse markdown into a nested tree of headings.

    Each heading becomes a dict with:
      * ``'header'``   -- ``'h1'`` .. ``'h6'``
      * ``'value'``    -- the heading title, stripped
      * ``'children'`` -- list of nested heading nodes (deeper levels)
      * ``'text'``     -- body lines that followed the heading, joined with
        newlines and stripped; present only when at least one non-heading
        line (possibly blank) followed the heading.

    A heading is nested under the closest preceding heading with a strictly
    smaller level, so levels may be skipped (e.g. an h6 directly under an h1).

    Lines inside fenced code blocks (``` or ~~~) are always treated as body
    text, so code comments starting with '#' are not mistaken for headings.
    An unclosed fence extends to the end of the document.

    Args:
        text: Markdown source as a single string.

    Returns:
        List of top-level heading nodes. Text that appears before the first
        heading is attached to an internal root node and is not returned.
    """
    root = {'children': []}
    # Stack of open headings, innermost last; the root sits at level 0 so it
    # is never popped (real headings are level >= 1).
    stack = [{'level': 0, 'node': root}]
    pending = []       # body lines collected since the last heading
    open_fence = None  # marker string of the currently open code fence

    for line in text.strip().split('\n'):
        fence = _FENCE_RE.match(line)
        if fence:
            marker, rest = fence.groups()
            if open_fence is None:
                # A backtick fence may not carry backticks in its info string;
                # such a line is inline code, not a fence opener.
                if not (marker[0] == '`' and '`' in rest):
                    open_fence = marker
            elif (marker[0] == open_fence[0]
                  and len(marker) >= len(open_fence)
                  and not rest.strip()):
                open_fence = None
            pending.append(line)
            continue

        header = None if open_fence else _HEADER_RE.match(line)
        if header is None:
            pending.append(line)
            continue

        # A new heading closes the body of the previous one.
        _append_text(stack[-1]['node'], pending)
        pending = []

        marks, title = header.groups()
        level = len(marks)
        # Unwind to the nearest ancestor with a strictly smaller level.
        while stack[-1]['level'] >= level:
            stack.pop()

        node = {
            'header': f'h{level}',
            'value': (title or '').strip(),
            'children': []
        }
        stack[-1]['node']['children'].append(node)
        stack.append({'level': level, 'node': node})

    # Flush whatever body text trails the final heading.
    _append_text(stack[-1]['node'], pending)
    return root['children']
# Build the nested heading tree for the sample paper selected earlier.
hierarchical_structure = parse_markdown_hierarchy(sample)

In [46]:
import pprint
# Pretty-print the nested heading tree, wrapping lines at 100 columns.
pprint.PrettyPrinter(width=100).pprint(hierarchical_structure)

[{'children': [{'children': [],
                'header': 'h6',
                'text': 'In this study, we present two distinct approaches of Deep Reinforcement '
                        'Learning (Deep-RL) algorithms for a mobile robot. The research '
                        'methodology primarily involves a comparative analysis between a Deep-RL '
                        'strategy grounded in the foundational Deep Q-Network (DQN) algorithm, and '
                        'the Double Deep Q-Network (DDQN) algorithm. The agents in these '
                        'approaches leverage 24 measurements from laser range sampling, coupled '
                        "with the agent's positional differentials and orientation relative to the "
                        "target. This amalgamation of data influences the agents' determinations "
                        "regarding navigation, ultimately dictating the robot's velocities. By "
                        'embracing this parsimonious sensory 