In [1]:
import glob
import sys
import os
from pathlib import Path

# Make the notebook's working directory importable so that the local
# word_to_markdown.py module can be found.
project_dir = os.path.abspath('.')
# Insert at the front so the local module wins over any installed one; skip
# the insert when it is already first so re-running the cell adds no duplicates.
if sys.path[:1] != [project_dir]:
    sys.path.insert(0, project_dir)

# Debug output to verify paths
print(f"Current working directory: {os.getcwd()}")
print(f"Added to path: {project_dir}")
print(f"Python path: {sys.path}")

# Check that the module file exists before importing it
module_path = os.path.join(project_dir, "word_to_markdown.py")
print(f"Module path exists: {os.path.exists(module_path)}")

# Import the converter from the local script
from word_to_markdown import WordToMarkdownConverter

volume_path = "test_files"

# Collect all DOCX files. glob returns matches in arbitrary order, so sort
# them to keep the conversion order (and the printed log) reproducible.
all_docx = sorted(glob.glob(f"{volume_path}/*.docx"))
if not all_docx:
    print(f"No .docx files found in {volume_path}")

# Initialize the converter
converter = WordToMarkdownConverter(
    preserve_tables=True,
    preserve_images=True,
    preserve_lists=True
)

# Create an output directory for the converted markdown files
output_dir = "markdown_files"
os.makedirs(output_dir, exist_ok=True)

# Convert each DOCX file to Markdown; a failure on one file is reported and
# does not stop the remaining conversions.
converted_files = []
for docx_file in all_docx:
    try:
        # Same file name as the input, but with a .md extension, inside output_dir
        markdown_name = Path(docx_file).with_suffix('.md').name
        output_path = os.path.join(output_dir, markdown_name)

        # Perform the conversion and keep whatever convert_file returns
        # (presumably the written path -- confirm against word_to_markdown.py)
        converted_file = converter.convert_file(docx_file, output_path)
        converted_files.append(converted_file)
        print(f"Successfully converted: {docx_file}")
    except Exception as e:
        print(f"Failed to convert {docx_file}: {e}")

print(f"Converted {len(converted_files)} files. Output files are in the {output_dir} directory.")



Current working directory: /home/alibina/repo/MunichRe/policy-extraction
Added to path: /home/alibina/repo/MunichRe/policy-extraction
Python path: ['/home/alibina/repo/MunichRe/policy-extraction', '/home/alibina/miniconda3/envs/ai_scientist/lib/python311.zip', '/home/alibina/miniconda3/envs/ai_scientist/lib/python3.11', '/home/alibina/miniconda3/envs/ai_scientist/lib/python3.11/lib-dynload', '', '/home/alibina/miniconda3/envs/ai_scientist/lib/python3.11/site-packages']
Module path exists: True
Saved image: markdown_files/images/image_5006.png
Saved image: markdown_files/images/image_9631.png
Converted: test_files/test.docx → markdown_files/test.md
Created HTML preview: markdown_files/test.preview.html
Created HTML preview: markdown_files/test.preview.html
Successfully converted: test_files/test.docx
Converted 1 files. Output files are in the markdown_files directory.


In [2]:
import sys
# add the markdown-indexer directory to the Python path so its `src` package can be imported
sys.path.insert(0, 'markdown-indexer')

from src.markdown_processor import MarkdownProcessor
from src.embeddings import EmbeddingGenerator
from src.storage.faiss_storage import FaissStorage
import os

def index_markdown_with_faiss(markdown_text,
                              model_name,
                              chunk_size=4000,
                              chunk_overlap=200,
                              max_table_size=2000,
                              output_dir=None):
    """Chunk a Markdown document, embed every chunk and index the embeddings.

    Args:
        markdown_text (str): Markdown content to index; must be non-empty.
        model_name (str): Model name handed to ``EmbeddingGenerator``; must be
            non-empty.
        chunk_size (int): Forwarded to ``MarkdownProcessor``.
        chunk_overlap (int): Forwarded to ``MarkdownProcessor``.
        max_table_size (int): Forwarded to ``MarkdownProcessor``.
        output_dir (str | None): When given, the directory is created if
            needed and receives ``chunks.jsonl``, ``faiss_index`` and
            ``documents.pkl``. When omitted, nothing is written to disk.

    Returns:
        None. A one-line summary of the number of indexed chunks is printed.

    Raises:
        ValueError: If ``model_name`` or ``markdown_text`` is empty, or if the
            processor produces no chunks from the text.
    """
    # Validate the inputs before touching the filesystem or loading a model.
    if not model_name:
        raise ValueError("Model name must be provided.")
    if not markdown_text:
        raise ValueError("Markdown text must be provided.")

    # Resolve the optional output locations.
    if output_dir:
        # exist_ok avoids failing if the directory appears between a check and the call.
        os.makedirs(output_dir, exist_ok=True)
        output_chunks = os.path.join(output_dir, "chunks.jsonl")
        output_index = os.path.join(output_dir, "faiss_index")
        output_documents = os.path.join(output_dir, "documents.pkl")
    else:
        output_chunks = output_index = output_documents = None

    # Initialize the Markdown processor
    processor = MarkdownProcessor(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        max_table_size=max_table_size
    )

    # Parse first, then chunk the parsed blocks.
    parsed_blocks = processor.parse_markdown(markdown_text)
    chunks = processor.chunk_text(parsed_blocks)

    # Without chunks there is no embedding to read the dimension from; fail
    # with a clear message instead of an IndexError further down.
    if len(chunks) == 0:
        raise ValueError("No chunks were produced from the markdown text.")

    # Save chunks to a file if an output directory was requested
    if output_chunks:
        processor.save_chunks(chunks, output_chunks)

    # Generate one embedding per chunk
    embedding_generator = EmbeddingGenerator(model_name=model_name)
    embeddings = [embedding_generator.generate_embeddings(chunk['content']) for chunk in chunks]

    # The vector size is read from axis 1 of the first embedding.
    # NOTE(review): assumes every embedding is 2-D, e.g. (1, dim), and that all
    # share the same dim -- confirm against EmbeddingGenerator.
    dimension = embeddings[0].shape[1]
    faiss_storage = FaissStorage(dimension=dimension)

    # Index each embedding together with the text it was computed from
    for chunk, embedding in zip(chunks, embeddings):
        faiss_storage.add((embedding, chunk['content']))

    # Save the FAISS index and documents
    if output_index and output_documents:
        faiss_storage.save(output_index, output_documents)

    print(f"Indexed {len(chunks)} chunks into FAISS.")

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Read the Markdown produced by the conversion cell. The path is relative to
# the notebook's working directory (like the "output" directory used below),
# so the cell does not depend on one particular user's home directory.
with open("markdown_files/test.md", "r") as f:
    markdown_text = f.read()

# Specify the Hugging Face model name
model_name = "distilbert-base-uncased"

# Chunk, embed and index the markdown text; artifacts are written to "output"
index_markdown_with_faiss(
    markdown_text,
    model_name,
    chunk_size=1000,
    chunk_overlap=200,
    max_table_size=2000,
    output_dir="output"
    )

Indexed 2 chunks into FAISS.


In [16]:
# imports and helper functions for loading the chunked data and the FAISS index
import json
import os
import pandas as pd
import faiss

def load_chunked_data(file_path):
    """
    Load chunked data from a JSONL file.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of aborting the load with a decode error.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def load_faiss_index(index_path, documents_path):
    """
    Read a persisted FAISS index together with its pickled documents.

    The index path is checked first, then the documents path; nothing is
    loaded unless both exist.

    Note: the documents are read with ``pd.read_pickle``, which unpickles the
    file -- only point this at files you created or otherwise trust.

    Args:
        index_path (str): Location of the FAISS index file.
        documents_path (str): Location of the pickled documents file.

    Returns:
        tuple: ``(index, documents)`` as returned by ``faiss.read_index`` and
        ``pd.read_pickle`` respectively.

    Raises:
        FileNotFoundError: If either path does not exist.
    """
    if os.path.exists(index_path):
        if os.path.exists(documents_path):
            # Both artifacts are present: load the index, then the documents.
            return faiss.read_index(index_path), pd.read_pickle(documents_path)
        raise FileNotFoundError(f"Documents file not found: {documents_path}")
    raise FileNotFoundError(f"Index file not found: {index_path}")

def search_faiss_index(index, query_vector, k=5):
    """
    Search the FAISS index for the nearest neighbors of a query vector.

    The query is converted to a C-contiguous float32 array before searching,
    so lists and arrays of other dtypes (e.g. float64) are accepted as well.
    A single 1-D vector is treated as a batch of one query.

    Args:
        index (faiss.Index): FAISS index (any object with a ``search(x, k)`` method).
        query_vector (array-like): One query of shape ``(d,)`` or a batch of
            queries of shape ``(n, d)``.
        k (int): Number of nearest neighbors to return.

    Returns:
        tuple: ``(distances, indices)`` exactly as returned by ``index.search``.
    """
    import numpy as np

    # FAISS works on contiguous float32 matrices of shape (n_queries, d).
    queries = np.ascontiguousarray(query_vector, dtype=np.float32)
    if queries.ndim == 1:
        queries = queries.reshape(1, -1)

    distances, indices = index.search(queries, k)
    return distances, indices

In [17]:
# Artifacts to load for the search demo
jsonl_file = "output/chunks.jsonl"
faiss_index_file = "output/faiss_index"
documents_file = "output/documents.pkl"

# Restore the chunk records, then the FAISS index with its documents
chunked_data = load_chunked_data(jsonl_file)
index, documents = load_faiss_index(faiss_index_file, documents_file)

# Demo query: reuse the first indexed vector (replace with your own query vector)
query_vector = index.reconstruct(0)

# Ask for the five nearest neighbors
distances, indices = search_faiss_index(index, query_vector, k=5)

# Results are batched; row 0 holds the neighbors of the single query
for i, (idx, distance) in enumerate(zip(indices[0], distances[0])):
    if idx == -1:  # -1 indicates no neighbor found
        continue
    print(f"Match {i+1}, Distance: {distance}")
    print(documents[idx])
    print("---")



Match 1, Distance: 0.0
<!-- Styles for better image display -->
<style>
img.markdown-image {
  display: block;
  max-width: 100%;
  height: auto;
  margin: 20px 0;
  border-radius: 5px;
}
</style>

**Annotation Benchmarking**

**Chemical Reaction Figures**

**Models:**

- AI4Chem/ChemVLM-8B
- GPT-4 turbo-2024-04-09

**Input prompt:**

- You are a chemical annotation assistant. Analyze chemical reaction images, and annotate all relevant information such as reactants, products, catalysts, conditions, and mechanisms. Don't include any reactions or formulas in your annotations. Explain what they mean and serve for. Ensure chemical accuracy, use standard conventions, and maintain completeness and clarity in the annotations. **Sample document:**

- ChemBioChem - 2020 - Norvaiša - Porphyrins as Colorimetric and Photometric Biosensors in Modern Bioanalytical Systems.pdf
---
Match 2, Distance: 137.2635040283203
|         |                                                |                        

In [14]:
# Inspect the shape of the query vector (shown as the cell's output)
query_vector.shape

(768,)