# OpenChemIE Demo Project

## Install

In [1]:
# Install Poppler (CLI tools + C++ development headers) and the build tooling.
# apt-get is used instead of apt because apt's command-line interface is not
# meant for scripted/non-interactive use; options go before the package list.
!apt-get update -qq
!apt-get install -y build-essential poppler-utils pkg-config libpoppler-cpp-dev

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                [0m
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:4 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]3m
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]m[33m[33m
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1196 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3436 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [32.9 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB][0m[33m[33m[33m
Get:11 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [45.2 kB]
Get:12 http://s

In [2]:
# Install OpenChemIE
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install --upgrade pip setuptools
%pip install --upgrade --ignore-installed blinker
%pip install 'OpenChemIE @ git+https://github.com/CrystalEye42/OpenChemIE'

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hDownloading setuptools-75.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 68.2.2
    Uninstalling setuptools-68.2.2:
      Successfully uninstalled setuptools-68.2.2
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.3.1 setuptools-75.6.0
[0mCollecting blinker
  Downloading

## Imports & Environment

In [3]:
import torch
from openchemie import OpenChemIE
import json
import numpy as np

# Query CUDA once, report it, and place the model on the matching device.
cuda_available = torch.cuda.is_available()
print("CUDA is available: ", cuda_available)
model = OpenChemIE(device="cuda" if cuda_available else "cpu")

CUDA is available:  True


In [4]:
import os
import zipfile
import shutil
import pprint

pp = pprint.PrettyPrinter(indent=4)

# Unpack the bundled examples once; skipped when ./examples already exists.
if os.path.exists("examples.zip") and not os.path.exists("examples"):
    with zipfile.ZipFile('examples.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

# Remove the __MACOSX folder if extraction produced one
# (presumably metadata from an archive created on macOS).
if os.path.exists("__MACOSX"):
    shutil.rmtree("__MACOSX")

# Working directories; makedirs(exist_ok=True) makes the cell safe to re-run.
INPUT_DIR = "inputs"
os.makedirs(INPUT_DIR, exist_ok=True)

OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Copy every example file into the input directory.
if os.path.isdir("examples"):
    for filename in os.listdir("examples"):
        source_file = os.path.join("examples", filename)
        # shutil.copy only handles regular files; skip folders such as
        # .ipynb_checkpoints instead of crashing on them.
        if not os.path.isfile(source_file):
            continue
        destination_file = os.path.join(INPUT_DIR, filename)
        shutil.copy(source_file, destination_file)
else:
    # Neither examples.zip nor an extracted examples/ folder is present.
    print("No 'examples' directory found; nothing copied to", INPUT_DIR)

### Example

In [3]:
# pdf_path = './examples/acs.joc.2c00749.pdf'
# results = model.extract_molecules_from_figures_in_pdf(pdf_path)
# pp.pprint(results)

## Helper Functions

In [9]:
import json
import numpy as np
from uuid import uuid4
import tempfile
from datetime import datetime
from PIL import Image
import cv2
from typing import Union, Any

# System temporary directory as reported by tempfile.gettempdir().
# NOTE(review): not referenced by any other cell visible in this notebook.
TEMPDIR = tempfile.gettempdir() # For downloaded files

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars.

    Arrays are serialized as (nested) lists via ``tolist()``; NumPy scalar
    types (e.g. ``np.float32``, ``np.int64``, ``np.bool_``) are converted to
    the matching built-in Python value via ``item()``. Anything else is
    deferred to ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # NumPy scalars are not instances of int/float/bool (np.float64
            # aside), so the stock encoder rejects them without this branch.
            return obj.item()
        # Add more custom serialization logic if needed
        return super().default(obj)

def u():
    """Return a newly generated random UUID (uuid4) in string form."""
    fresh_id = uuid4()
    return str(fresh_id)


def save_image_from_ndarray(ndarray: np.ndarray, dir: str = '/workspace/outputs', name: Union[str, None] = None) -> str:
    """Save a numpy ndarray as a PNG image and return the file path.

    Args:
        ndarray: Image data handed to ``cv2.imwrite``.
        dir: Directory the file is written into.
        name: File name without extension. When omitted, a fresh UUID is
            generated for every call.

    Returns:
        The path ``<dir>/<name>.png``.
    """
    if name is None:
        # A `str(uuid4())` default in the signature is evaluated only once,
        # when the function is defined, so every call would reuse the same
        # file name and overwrite the previous image. Generate it per call.
        name = str(uuid4())
    file_path = os.path.join(dir, f"{name}.png")
    # NOTE(review): the result of cv2.imwrite is not checked here, so a failed
    # write still returns the path.
    cv2.imwrite(file_path, ndarray)
    return file_path


def save_pil_image(image: "Image.Image", dir: str = '/workspace/outputs', name: Union[str, None] = None) -> str:
    """Save a PIL image as a PNG file and return the file path.

    Args:
        image: Object whose ``save(path)`` method writes the image.
        dir: Directory the file is written into.
        name: File name without extension. When omitted, a fresh UUID is
            generated for every call.

    Returns:
        The path ``<dir>/<name>.png``.
    """
    if name is None:
        # A `str(uuid4())` default in the signature is evaluated only once,
        # when the function is defined, so every call would reuse the same
        # file name and overwrite the previous image. Generate it per call.
        name = str(uuid4())
    file_path = os.path.join(dir, f"{name}.png")
    image.save(file_path)
    return file_path


def save_json(obj: dict, fname: str, fdir: str):
    """Write ``obj`` as JSON to ``fdir/fname``, encoding with NumpyEncoder."""
    target_path = os.path.join(fdir, fname)
    with open(target_path, 'w') as out_stream:
        json.dump(obj, out_stream, cls=NumpyEncoder)


def process_results(results, out_dir) -> Union[dict, list]:
    """Walk a dict/list result tree, writing images to disk along the way.

    Every numpy array or PIL image found inside ``results`` is saved into
    ``out_dir`` and replaced by the path of the written file. A new structure
    is returned; values of any other type are passed through unchanged. If
    ``results`` itself is neither a dict nor a list it is returned as-is.
    """
    def _convert(node: Any) -> Any:
        # Containers are rebuilt recursively; image-like leaves become paths.
        if isinstance(node, dict):
            return {field: _convert(child) for field, child in node.items()}
        if isinstance(node, list):
            return [_convert(child) for child in node]
        if isinstance(node, np.ndarray):
            return save_image_from_ndarray(node, out_dir)
        if isinstance(node, Image.Image):
            return save_pil_image(node, out_dir)
        return node

    # Only containers are traversed at the top level.
    if isinstance(results, (dict, list)):
        return _convert(results)
    return results


def is_image(file_path):
    """Return True when Pillow can open and verify ``file_path`` as an image.

    Any IOError or SyntaxError raised while opening or verifying the file is
    treated as "not an image" and yields False.
    """
    try:
        with Image.open(file_path) as candidate:
            # verify() raises if the file content is not a valid image
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True


def load_image_pillow(file_path, convert_mode='RGB'):
    """Open ``file_path`` with Pillow and return it converted to ``convert_mode``.

    The source file handle is closed before returning; the converted image
    is the object handed back to the caller.
    """
    with Image.open(file_path) as source:
        return source.convert(convert_mode)


def load_image_cv2(file_path, convert_color=True):
    """Load an image using OpenCV and, if requested, return it in RGB order.

    Args:
        file_path: Path of the image file to read.
        convert_color: When True, convert the loaded array to RGB; when
            False, return exactly what ``cv2.imread`` produced.

    Returns:
        The loaded image array.

    Raises:
        ValueError: If OpenCV could not read the file (``imread`` returned None).
    """
    img = cv2.imread(file_path, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise ValueError(f"Failed to load image: {file_path}")
    if convert_color:
        if img.ndim == 2:
            # IMREAD_UNCHANGED keeps single-channel files as 2-D arrays, which
            # COLOR_BGR2RGB rejects; expand grayscale to three channels instead.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        else:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


## Procedure

In [13]:
# Process the input files (PDFs and images)

# Extraction methods to run, keyed by the label used in the output file names.
# PDF methods are called with a file path.
pdf_methods = dict(
    molecules_from_figures=model.extract_molecules_from_figures_in_pdf,
    molecules_from_text=model.extract_molecules_from_text_in_pdf,
    reactions_from_figures=model.extract_reactions_from_figures_in_pdf,
    reactions_from_text=model.extract_reactions_from_text_in_pdf,
    reactions_combined=model.extract_reactions_from_text_in_pdf_combined,
    reactions_figures_tables=model.extract_reactions_from_figures_and_tables_in_pdf,
    molecule_corefs=model.extract_molecule_corefs_from_figures_in_pdf,
)

# Image methods are called with a list of loaded images.
img_methods = dict(
    molecules_from_figures=model.extract_molecules_from_figures,
    reactions_from_figures=model.extract_reactions_from_figures,
    molecule_bboxes=model.extract_molecule_bboxes_from_figures,
    molecule_corefs=model.extract_molecule_corefs_from_figures,
)

def run_openchemie(xfile, methods, output_dir='/workspace/outputs', use_pillow=True):
    """
    Process a file (PDF or Image) using specified methods.

    A new job id is generated for each call and all output goes into
    ``<output_dir>/<job_id>/``: one ``<key>_<job_id>.json`` per method that
    succeeded, plus a ``results_<job_id>_<timestamp>.json`` summary holding
    the input path, job id, start/end times and every method's result.
    A method that raises is reported, recorded as ``None`` in the summary,
    and does not stop the remaining methods.

    Args:
        xfile (str): Path to the input file. A ``.pdf`` extension selects PDF
            handling; anything else must pass ``is_image`` (Pillow-based check).
        methods (dict): Mapping of result key -> callable. PDF callables get
            the file path, image callables get a one-element list holding the
            loaded image.
        output_dir (str): Directory to save outputs.
        use_pillow (bool): Load images with Pillow when True, OpenCV otherwise.

    Returns:
        None. Unsupported files and images that fail to load are reported
        and skipped.

    Raises:
        Exception: Any error outside the per-method handling is printed and
            re-raised.
    """
    try:
        # PDFs are recognized by extension; only non-PDFs need the (more
        # expensive) content-based image check.
        _, ext = os.path.splitext(xfile)
        is_pdf = ext.lower() == '.pdf'
        is_img = not is_pdf and is_image(xfile)

        if not (is_pdf or is_img):
            print(f"Unsupported file type for {xfile}. Skipping.")
            return

        # Initialize results
        job_id = u()
        results = {
            'input': xfile,
            'job': job_id,
            'start': datetime.now().strftime("%Y-%m-%d %H:%M"),
        }

        # Create job-specific output directory
        job_output_dir = os.path.join(output_dir, job_id)
        print("Out Dir:", job_output_dir)
        os.makedirs(job_output_dir, exist_ok=True)

        # Build the argument handed to every method
        if is_pdf:
            # For PDFs, pass the file path directly to methods
            data = xfile
        else:
            try:
                loader = load_image_pillow if use_pillow else load_image_cv2
                data = [loader(xfile)]
            except Exception as e:
                print(f"Failed to load image {xfile}: {e}")
                return

        # Run each method; a failure in one must not abort the others
        for key, method in methods.items():
            try:
                r = process_results(method(data), job_output_dir)
                results[key] = r

                out_file = f"{key}_{job_id}.json"
                save_json(r, out_file, job_output_dir)

                print(f"Processed {key} for {xfile}")
            except Exception as e:
                results[key] = None
                print(f"Error in {key}: {e}\nContinuing.")

        # Finalize results
        now = datetime.now()
        results['end'] = now.strftime("%Y-%m-%d %H:%M")

        out_file = f"results_{job_id}_{now.strftime('%Y%m%d%H%M')}.json"
        save_json(results, out_file, job_output_dir)

        print("Saved results to", out_file)
    except Exception as e:
        print(f"Error processing file {xfile}: {e}")
        # Bare raise re-raises the active exception with its traceback intact
        raise

# Walk the input directory and run the image pipeline on every entry that
# Pillow recognizes as an image (see is_image); everything else is reported.
input_dir = '/workspace/inputs'
images = []
for filename in os.listdir(input_dir):
    fpath = os.path.join(input_dir, filename)
    ext = os.path.splitext(filename)[1].lower()

    # PDF processing is currently disabled:
    # if ext == ".pdf":
    #     run_openchemie(fpath, pdf_methods)
    if not is_image(fpath):
        print(f"Skipping unsupported file: {filename}")
        continue

    # Set use_pillow=False to use cv2
    run_openchemie(fpath, img_methods, use_pillow=True)

Skipping unsupported file: acs.joc.2c00749.pdf
Out Dir: /workspace/outputs/683a80ab-499c-432f-b14b-cb60e636fda5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecules_from_figures for /workspace/inputs/img1.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed reactions_from_figures for /workspace/inputs/img1.png
Processed molecule_bboxes for /workspace/inputs/img1.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecule_corefs for /workspace/inputs/img1.png
Saved results to results_683a80ab-499c-432f-b14b-cb60e636fda5_202412191350.json
Skipping unsupported file: .ipynb_checkpoints
Out Dir: /workspace/outputs/a49c51c1-aa9a-4b87-ab8b-f3df28b6a21c


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecules_from_figures for /workspace/inputs/img3.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed reactions_from_figures for /workspace/inputs/img3.png
Processed molecule_bboxes for /workspace/inputs/img3.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecule_corefs for /workspace/inputs/img3.png
Saved results to results_a49c51c1-aa9a-4b87-ab8b-f3df28b6a21c_202412191350.json
Out Dir: /workspace/outputs/737be751-2768-46da-a251-09630edce8f8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecules_from_figures for /workspace/inputs/img2.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed reactions_from_figures for /workspace/inputs/img2.png
Processed molecule_bboxes for /workspace/inputs/img2.png


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processed molecule_corefs for /workspace/inputs/img2.png
Saved results to results_737be751-2768-46da-a251-09630edce8f8_202412191350.json


## Test cli.py

In [19]:
# !rm -rf ./outputs
# !mkdir outputs

In [None]:
# Batch mode: run cli.py over ./inputs and write results under ./outputs/test.
# Per the recorded output, each PDF/image gets its own UUID-named result folder,
# and files inside ./inputs/.ipynb_checkpoints are picked up as well.
!python cli.py --input-dir ./inputs --output-dir ./outputs/test

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing PDF: ./inputs/acs.joc.2c00749.pdf, results will be saved in: ./outputs/test/9e58b30b-78f2-4d0a-9896-ad56209cd63a
Processing Image: ./inputs/img3.png, results will be saved in: ./outputs/test/3886cf56-32c5-4b11-9b08-f32e6dbfb10c
Processing Image: ./inputs/img1.png, results will be saved in: ./outputs/test/0017ccac-99ed-4f6e-94fc-69cec034acdb
Processing Image: ./inputs/.ipynb_checkpoints/img1-checkpoint.png, results will be saved in: ./outputs/test/5b4882e2-e843-44d7-9945-07c1afb06232
Processing Image: ./inputs/img2.png, results will be saved in: ./outputs/test/c23bb727-863d-4174-96ea-1f4cd35c398c
Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embe

In [26]:
# URL mode: per the recorded output, cli.py downloads the file to a temporary
# directory, detects its MIME type, and saves results under /workspace
# (presumably the working directory, since no --output-dir is given).
!python cli.py --url "https://www.nature.com/articles/s41557-024-01687-7.pdf"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Results will be saved in: /workspace
Downloading https://www.nature.com/articles/s41557-024-01687-7.pdf to /tmp/tmp6r5u7b3s
Detected MIME type: application/pdf
File is a PDF.
Processing PDF: /tmp/tmp6r5u7b3s/0b2dfa79-9cc2-4502-8f00-0d968f6be33b.pdf, results will be saved in: /workspace/10b2705f-1b99-4ccd-aefa-d8ffc27f4de4
Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1764,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 2094
}

Config of the decoder: <class 