In [2]:
import os

from langchain_text_splitters import MarkdownHeaderTextSplitter
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from src.config import settings

In [3]:
# Load the arXiver training split from the configured data directory.
df = pd.read_parquet(os.path.join(settings.data_dir, "arxiver/data/train.parquet"))
## Sample 10k
# A fixed seed keeps positional lookups such as df.iloc[0] pointing at the same
# paper on every Restart & Run All; capping n avoids a ValueError when the
# parquet holds fewer than 10k rows.
df = df.sample(n=min(10000, len(df)), random_state=42)
print(df.shape, df.columns)

(10000, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [4]:
## Initialize Splitter
# One (header prefix, label) pair per level; only the two top levels are split on.
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2)]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Test with Sample

In [5]:
# Test Sample
# Note: the abstract is usually introduced by a level-6 header, i.e. a line
# reading '###### Abstract'.
idx = 0  # positional row in the sampled frame (alternative: 15)
sample = df['markdown'].iloc[idx]
print(sample)

3D-Mol: A Novel Contrastive Learning Framework for Molecular Property Prediction with 3D Information

###### Abstract

Molecular property prediction offers an effective and efficient approach for early screening and optimization of drug candidates. Although deep learning based methods have made notable progress, most existing works still do not fully utilize 3D spatial information. This can lead to a single molecular representation representing multiple actual molecules. To address these issues, we propose a novel 3D structure-based molecular modeling method named 3D-Mol. In order to accurately represent complete spatial structure, we design a novel encoder to extract 3D features by deconstructing the molecules into three geometric graphs. In addition, we use 20M unlabeled data to pretrain our model by contrastive learning. We consider conformations with the same topological structure as positive pairs and the opposites as negative pairs, while the weight is determined by the dissimila

In [6]:
# Run the header splitter on the sample document, report how many chunks it
# produced, then display the chunks themselves as the cell output.
sample_splits = splitter.split_text(sample)
print(len(sample_splits))
sample_splits

7


[Document(metadata={}, page_content='3D-Mol: A Novel Contrastive Learning Framework for Molecular Property Prediction with 3D Information  \n###### Abstract  \nMolecular property prediction offers an effective and efficient approach for early screening and optimization of drug candidates. Although deep learning based methods have made notable progress, most existing works still do not fully utilize 3D spatial information. This can lead to a single molecular representation representing multiple actual molecules. To address these issues, we propose a novel 3D structure-based molecular modeling method named 3D-Mol. In order to accurately represent complete spatial structure, we design a novel encoder to extract 3D features by deconstructing the molecules into three geometric graphs. In addition, we use 20M unlabeled data to pretrain our model by contrastive learning. We consider conformations with the same topological structure as positive pairs and the opposites as negative pairs, while 

In [7]:
import re
def parse_markdown_hierarchy(text):
    """Parse Markdown text into a nested tree of its ATX headers.

    Every header line (one to six ``#`` characters) becomes a dict::

        {'header': 'h<level>', 'value': <header text>, 'children': [...]}

    and additionally gets a ``'text'`` key when non-header lines follow it
    before the next header; those lines are joined with newlines and
    stripped.  A header is nested under the closest preceding header whose
    level is strictly smaller; if there is none it is a top-level entry.

    A line counts as a header only when the run of ``#`` starts at column 0
    and is followed by whitespace or by the end of the line, so ``#hashtag``
    and runs of seven or more ``#`` stay body text.  Lines inside fenced code
    blocks (delimited by three or more backticks or tildes) are never treated
    as headers.

    Args:
        text: Markdown source as a single string.

    Returns:
        list[dict]: The top-level header nodes in document order.  Text that
        appears before the first header is attached to an internal root node
        and is therefore not part of the result.
    """
    # ATX header: 1-6 '#', then either whitespace + title or nothing at all.
    header_pattern = re.compile(r'^(#{1,6})(?:\s+(.*))?$')
    # Opening/closing marker of a fenced code block.
    fence_pattern = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    # Synthetic level-0 root; only its children are returned.
    root = {'children': []}
    # Chain of currently open headers, innermost last.
    stack = [{'level': 0, 'node': root}]
    # Body lines seen since the most recent header.
    pending = []
    # Marker of the fence we are currently inside, or None.
    open_fence = None

    def flush_pending():
        # Attach the accumulated body lines to the most recent header node.
        if not pending:
            return
        node = stack[-1]['node']
        chunk = '\n'.join(pending).strip()
        existing = node.get('text', '')
        node['text'] = f'{existing}\n{chunk}' if existing else chunk
        pending.clear()

    for line in text.strip().split('\n'):
        fence_match = fence_pattern.match(line)
        if fence_match:
            marker = fence_match.group(1)
            if open_fence is None:
                open_fence = marker
            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                # Same fence character and at least as long: closes the block.
                open_fence = None
            pending.append(line)
            continue
        if open_fence is not None:
            # Inside a code block: '# ...' is code, not a header.
            pending.append(line)
            continue

        header_match = header_pattern.match(line)
        if header_match is None:
            pending.append(line)
            continue

        # Text collected so far belongs to the previous header.
        flush_pending()
        level = len(header_match.group(1))
        # Close every open header at the same or a deeper level; the root
        # (level 0) is never popped because header levels start at 1.
        while stack[-1]['level'] >= level:
            stack.pop()
        node = {
            'header': f'h{level}',
            'value': (header_match.group(2) or '').strip(),
            'children': []
        }
        stack[-1]['node']['children'].append(node)
        # Keep the new header open so later text/sub-headers attach to it.
        stack.append({'level': level, 'node': node})

    # Trailing body text after the last header.
    flush_pending()
    return root['children']
# Build the header tree for the sample document.
hierarchical_structure = parse_markdown_hierarchy(sample)

In [8]:
import pprint
# Pretty-print the parsed header tree; width=100 allows longer lines than
# pprint's default of 80 columns.
pprint.pprint(hierarchical_structure, width=100)

[{'children': [],
  'header': 'h6',
  'text': 'Molecular property prediction offers an effective and efficient approach for early '
          'screening and optimization of drug candidates. Although deep learning based methods '
          'have made notable progress, most existing works still do not fully utilize 3D spatial '
          'information. This can lead to a single molecular representation representing multiple '
          'actual molecules. To address these issues, we propose a novel 3D structure-based '
          'molecular modeling method named 3D-Mol. In order to accurately represent complete '
          'spatial structure, we design a novel encoder to extract 3D features by deconstructing '
          'the molecules into three geometric graphs. In addition, we use 20M unlabeled data to '
          'pretrain our model by contrastive learning. We consider conformations with the same '
          'topological structure as positive pairs and the opposites as negative pairs, wh