In [1]:
import os
import pandas as pd
from pydantic import BaseModel, Field

from semantic_chunkers import StatisticalChunker
# from semantic_router.encoders import OpenAIEncoder
from src.encoder import OpenAIEncoder

In [3]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded through pydantic-settings.

    Values come from the process environment and from the ``../.env`` file
    (decoded as UTF-8). Entries that do not correspond to a field declared
    here are ignored (``extra="ignore"``). Every field is a required string.
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Embedding service settings (handed to OpenAIEncoder in a later cell).
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str

    # Filesystem locations: the sample parquet data read by pd.read_parquet,
    # and the directory appended to sys.path for the pipeline imports.
    sample_data_dir: str
    pipeline_src_dir: str
# Instantiate the configuration; all Settings fields are declared without
# defaults, so pydantic is expected to fail here if one is not provided.
settings = Settings()

import sys

# Make the pipeline source tree importable for the imports in the next cell.
# Guarded with a membership check so that re-running this cell does not keep
# appending the same directory to sys.path.
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [4]:
from core.data.paper import ArxivPaperSection, ArxivPaperMetadata, ArxivPaper
from core.parser.md2py import TreeOfContents

from modules.extractor.section_splitter import MarkdownArxivPaperSectionSplitter

# 1. Load Encoder

In [6]:
# Build the encoder from the project's own OpenAIEncoder (src.encoder) rather
# than the semantic_router one, so that local models can be used. The model
# name, endpoint URL and key all come from `settings`.
encoder = OpenAIEncoder(
    api_key=settings.embedding_api_key,
    base_url=settings.embedding_base_url,
    name=settings.embedding_model,
)

In [7]:
# StatisticalChunker options
# https://github.com/aurelio-labs/semantic-chunkers/blob/43ee0ac6ecdf76790de7d4ac4029f59438f34769/semantic_chunkers/chunkers/statistical.py#L49
# NOTE(review): the per-option remarks below are inferred from the parameter
# names -- confirm against the linked source before relying on them.
chunker = StatisticalChunker(
    encoder=encoder,
    dynamic_threshold=True,
    window_size=5,
    min_split_tokens=256,   # presumably the lower bound on chunk size (tokens)
    max_split_tokens=2048,  # presumably the upper bound on chunk size (tokens)
    plot_chunks=True,
    enable_statistics=True,
)

# 2. Load Sample

In [5]:
## Load Sample
# Fixed sample size and seed so that the same papers are selected on every
# run (Restart & Run All reproduces the outputs below).
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

df = pd.read_parquet(settings.sample_data_dir)
# Cap n at the number of rows: DataFrame.sample raises when asked for more
# rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
print(df.shape, df.columns)

(100, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [9]:
# Show the first sampled row to inspect the available columns and their values.
df.iloc[0]

id                                                       2302.05334
title             The Role of Codeword-to-Class Assignments in E...
abstract          Error-correcting codes (ECC) are used to reduc...
authors           Itay Evron, Ophir Onn, Tamar Weiss Orzech, Hai...
published_date                                 2023-02-10T15:48:51Z
link                              http://arxiv.org/abs/2302.05334v1
markdown          # The Role of Codeword-to-Class Assignments in...
Name: 61715, dtype: object

In [8]:
def get_sections(idx, text):
    """Split one paper's markdown into sections using the first matching splitter.

    Each direct subclass of ``MarkdownArxivPaperSectionSplitter`` is tried in
    turn. The first one whose ``is_type(text)`` is truthy is instantiated and
    the result of its ``split(text)`` is returned.

    Args:
        idx: Caller-chosen identifier of the document. It is echoed back in
            the result and used as a prefix in error messages.
        text: Markdown source of the paper.

    Returns:
        A ``(idx, sections)`` tuple. ``sections`` is ``None`` when no splitter
        accepts the text, or when a splitter raises ``RecursionError``.

    Raises:
        Exception: Any error other than ``RecursionError`` raised by a
            splitter is printed and then re-raised unchanged.
    """
    for splitter_cls in MarkdownArxivPaperSectionSplitter.__subclasses__():
        try:
            if splitter_cls.is_type(text):
                return idx, splitter_cls().split(text)
        except RecursionError as exc:
            # Treated as a soft failure: report it and mark this document as
            # failed instead of aborting the whole run.
            print(f"{idx} RECURSION ERROR {exc}")
            return idx, None
        except Exception as exc:
            print(f"{idx} ERROR {exc}")
            # Bare raise keeps the original traceback intact.
            raise
    # No splitter recognised the text.
    return idx, None

In [10]:
# Collect the markdown of every sampled paper and run the section splitter
# on each one, keyed by its position in the sample.
texts = df["markdown"].tolist()
results = [get_sections(*indexed_text) for indexed_text in enumerate(texts)]

# Failed Count: entries for which get_sections produced no sections.
failed_count = len([outcome for outcome in results if outcome[1] is None])
print("Total {} failed {}".format(len(results), failed_count))

Total 100 failed 0


In [11]:
# Show the (idx, sections) pair returned by get_sections for the first paper.
results[0]

(0,
 [ArxivPaperSection(header='h2', title='1 Introduction', text='Error\\-correcting codes (ECC) have been long used in machine learning as a reduction from multiclass classification tasks to binary classification tasks (Dietterich and Bakiri, 1994\\). This scheme encodes classes using rows of a binary matrix called a codebook. The codebook columns induce binary partitions of classes, or subproblems, to be learned using any binary classification algorithm.\n\nRecently, error\\-correcting codes have been used as output embeddings of deep networks (Yang et al., 2015; Rodriguez et al., 2018; Kusupati et al., 2021\\), on top of features extracted by deep CNNs (Dori et al., 2018\\), and as a means to combine ensembles of several networks (Zheng et al., 2018\\). Moreover, they were recently used for their robustness in adversarial learning (Verma and Swami, 2019; Gupta and Amin, 2021; Song et al., 2021\\) and for their redundancy in regression tasks (Shah et al., 2022\\) and heterogeneous d

# 3. Test Chunking