# 3. Information Extraction

> **Purpose:**  
> This notebook contains the experiments using the **pdf2aas** pipeline to extract **technical properties** from PDF product datasheets.  
> For each product, the extraction process is executed multiple times under various **degradation conditions** to assess robustness and reproducibility.

## 3.0 Imports

In [1]:
from pdf2aas.dictionary import Dictionary, CDD, ECLASS, ETIM
from pdf2aas.extractor import PropertyLLM, PropertyLLMSearch, CustomLLMClientHTTP, CustomLLMClient
from pdf2aas import preprocessor
from openai import OpenAI
from pdf2aas.generator import AASSubmodelTechnicalData, AASTemplate
from pdf2aas.model import PropertyDefinition, Property
import os
import json
import hashlib
import numpy as np
import pandas as pd
from typing import List, Optional, Union
from pydantic import BaseModel, Field

from datetime import datetime
from text_perturbation import degrade_prompt
import subprocess
import glob
from time import sleep


# Absolute paths inside the container; everything else is derived from data_path.
data_path = "/app/data/"
processing_path = os.path.join(data_path, "processed/sample")  # Folder where files are processed
metadata_path = os.path.join(processing_path, "metadata.csv")  # metadata file
log_path = os.path.join(processing_path, "experiment_log.csv")
configs_path = os.path.join(processing_path, "configs")
# exist_ok=True replaces the check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(configs_path, exist_ok=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## 3.1 Customized Classes

This section defines customized helper classes and configurations used to extend or modify default components of the `pdf2aas` pipeline.

### 3.1.1 Custom Ollama Client for Open Source LLMs

A custom client interface is implemented to interact with open-source Large Language Models (LLMs) via **Ollama**.
This class adapts the standard API calls and allows for local model inference.

In [5]:
import requests
from copy import deepcopy
class CustomOllamaClient(CustomLLMClient):
    """Client that sends chat requests to an Ollama HTTP endpoint.

    The JSON reply is reduced to a single string by following ``result_path``
    (dot / bracket notation, e.g. ``message.content``).
    """

    def __init__(
        self,
        endpoint: str = "http://host.docker.internal:50000/api/chat", # url to access host machine from inside the container
        result_path: Optional[str] = "message.content",
        headers: Optional[dict] = None,
        timeout: float = 30000, # long timeout for large models
        retries: int = 0,
    ):
        """Store the connection settings.

        Args:
            endpoint: URL of the chat endpoint.
            result_path: Path into the JSON reply that holds the answer text.
                If empty/None the whole reply is returned as a string.
            headers: HTTP headers; JSON content/accept headers are used when omitted.
            timeout: Timeout handed to ``requests.post``.
            retries: Number of additional attempts after a failed request.
        """
        self.endpoint = endpoint
        self.result_path = result_path
        self.headers = headers or {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        self.timeout = timeout
        self.retries = retries
        # Model used by the most recent request. Initialised here so that
        # unload_model() is safe to call before any completion was requested.
        self.recent_model: Optional[str] = None

    def create_completions(
        self,
        messages: list[dict[str, str]],
        model: str,
        temperature: float,
        max_tokens: int,
        response_format: dict | BaseModel,
    ) -> tuple[str | None, str | None]:
        """Send one non-streaming chat request and return its result.

        Args:
            messages: Chat messages (``role`` / ``content`` dicts).
            model: Name of the model to run.
            temperature: Sampling temperature.
            max_tokens: Upper bound for generated tokens (``None`` = no bound sent).
            response_format: Either a JSON-schema dict or a pydantic model whose
                schema is sent as the ``format`` field.

        Returns:
            ``(text, raw)`` where ``text`` is the value found at ``result_path``
            (or ``None`` if the path does not exist) and ``raw`` is the decoded
            JSON reply.

        Raises:
            Exception: If every attempt failed with a ``requests`` error.
        """
        import logging

        # Pydantic models expose model_json_schema(); plain dicts are forwarded
        # unchanged. Errors while building the schema are no longer swallowed.
        schema_factory = getattr(response_format, "model_json_schema", None)
        if callable(schema_factory):
            response_format = schema_factory()

        # Ollama reads sampling parameters from "options" (temperature,
        # num_predict); top-level "temperature"/"max_tokens" are not part of its
        # API. The top-level keys are kept for backward compatibility with
        # endpoints that may rely on them.
        options = {
            key: value
            for key, value in (("temperature", temperature), ("num_predict", max_tokens))
            if value is not None
        }
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "format": response_format,  # Full JSON schema expected here
            "stream": False
        }
        if options:
            payload["options"] = options
        self.recent_model = model
        headers = deepcopy(self.headers)

        result = None
        last_error = None
        # max(..., 0) guarantees at least one attempt even for a negative setting.
        for attempt in range(max(self.retries, 0) + 1):
            try:
                response = requests.post(
                    self.endpoint,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                logging.debug("Ollama response: %s", response)
                response.raise_for_status()
                result = response.json()
                break
            except requests.exceptions.RequestException as exc:
                logging.exception("Error on attempt %s calling Ollama endpoint.", attempt)
                last_error = exc

        if result is None:
            raise Exception("Error calling Ollama") from last_error

        return self.evaluate_result_path(result), result

    def evaluate_result_path(self, raw_result: dict) -> Optional[str]:
        """Follow ``result_path`` through ``raw_result`` and return the value as a string.

        ``a.b[0].c`` style paths are supported: list levels are indexed with the
        integer value of the path segment, every other level is looked up by key.
        Returns ``None`` when the path cannot be resolved.
        """
        if not self.result_path:
            return str(raw_result)

        try:
            keys = self.result_path.replace("[", ".").replace("]", "").split(".")
            for key in keys:
                if isinstance(raw_result, list):
                    raw_result = raw_result[int(key)]
                else:
                    raw_result = raw_result[key]
            return str(raw_result)
        except (KeyError, IndexError, ValueError, TypeError):
            # IndexError covers list positions that do not exist in the reply.
            return None

    def unload_model(self) -> None:
        """Ask the server to unload the most recently used model.

        Posts ``keep_alive: 0`` for ``recent_model`` to the ``/api/generate``
        counterpart of the configured endpoint and then waits ten seconds.
        Does nothing if no model has been used yet.

        Raises:
            Exception: If the request fails.
        """
        import logging

        if self.recent_model:
            stop_url = self.endpoint.replace("/api/chat", "/api/generate")
            headers = deepcopy(self.headers)

            payload = {
                "model": self.recent_model,
                "keep_alive": 0
            }

            try:
                response = requests.post(
                    stop_url,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                response.raise_for_status()
                print("Ollama has been successfully reset.")
                # Pause before the next model is requested.
                sleep(10)
            except requests.exceptions.RequestException as exc:
                logging.exception("Failed to reset Ollama runtime.")
                raise Exception("Error resetting Ollama") from exc


### 3.1.2 Custom Pydantic Template for LLM Output

A custom **Pydantic model** defines the expected structure of LLM outputs.
It ensures that the extracted property values conform to the predefined data schema and can be validated automatically.

In [6]:
# Schema of a single entry in the LLM answer.
# NOTE(review): documented with comments on purpose — pydantic is expected to copy
# class docstrings into the generated JSON schema, which would change what is
# sent to the model as response format; confirm before converting to docstrings.
class PropertyItem(BaseModel):
    property: str
    # Either a single scalar or a list of scalars.
    value: Union[str, int, float, List[Union[str, int, float]]]
    unit: str
    reference: str

# Top-level answer object: a list of PropertyItem entries under the key "result".
class ExtractionResult(BaseModel):
    result: List[PropertyItem]
# Bare expression; its value is not assigned or used.
issubclass(ExtractionResult, BaseModel)
# Lookup table from the "response_format" name used in experiment configs to the model class.
response_formats = {
    "ExtractionResult": ExtractionResult
    }


### 3.1.3 Custom `pdf2aas` Property Extractor

An adapted property extraction class integrates the custom LLM client and output schema.
This allows controlled variation of extraction parameters and supports experimental comparison across model configurations.

In [7]:
class CustomPropertyLLMSearch(PropertyLLMSearch):
    """PropertyLLMSearch variant that can perturb its prompts before sending them."""

    def extract(
        self,
        datasheet: list[str] | str,
        property_definition: PropertyDefinition | list[PropertyDefinition],
        raw_prompts: list | None = None,
        raw_results: list | None = None,
        prompt_hint: str | None = None,
    ) -> list[Property]:
        """Extract properties for the given definitions from the datasheet text.

        A system and a user message are built from the datasheet, the
        `property_definition` and the optional `prompt_hint`. If the attribute
        `prompt_degradation_intensity` is greater than zero, both message texts
        are passed through `degrade_prompt` with that intensity first. The
        attribute is created with value 0.0 when it has not been set.

        If `raw_prompts` is a list, the messages are appended to it;
        `raw_results` is handed on to the LLM call unchanged.

        The LLM answer is parsed into properties, which are then combined with
        `property_definition` and returned.
        """
        # Lazily default to "no degradation" for instances that never set it.
        try:
            intensity = self.prompt_degradation_intensity
        except AttributeError:
            intensity = self.prompt_degradation_intensity = 0.0

        # A list of pages/sections is flattened into one text.
        text = "\n".join(datasheet) if isinstance(datasheet, list) else datasheet

        system_message = {"role": "system", "content": self.system_prompt_template}
        user_message = {
            "role": "user",
            "content": self.create_prompt(text, property_definition, hint=prompt_hint),
        }
        messages = [system_message, user_message]

        if intensity > 0.0:
            # System message first, then user message (same order as before).
            for message in messages:
                message['content'] = degrade_prompt(message['content'], intensity)

        if isinstance(raw_prompts, list):
            raw_prompts.append(messages)

        llm_answer = self._prompt_llm(messages, raw_results)
        parsed = self._parse_properties(self._parse_result(llm_answer))
        return self._add_definitions(parsed, property_definition)



### 3.1.4 Example LLM Call

A short demonstration of a single LLM call is provided to illustrate the interaction between the text prompt and the output model.

In [8]:
# Which information the model gets out of ['definition','unit','datatype', 'values']
use_in_prompt = ['unit', 'datatype']

# Either use Ollama...
# client = CustomOllamaClient()
# model = "tinyllama"

# ...or OpenAI
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
    base_url=os.environ.get('OPENAI_BASE_URL'),
)
model = "gpt-4o-mini"


extractor = CustomPropertyLLMSearch(
    model_identifier=model,
    client=client,
    property_keys_in_prompt=use_in_prompt,
)
extractor.prompt_degradation_intensity = 1.0
# `response_format` is the attribute pdf_to_aas() sets as well; the previous
# `result_format` only created an unused attribute on the instance.
extractor.response_format = response_formats["ExtractionResult"]
# Keyword arguments make clear which list is which; no definitions and no hint
# are passed in this demo.
result = extractor.extract(
    ["hi", "my name is John", "I am 30 years old", "I live in New York"],
    property_definition=[],
    raw_prompts=[],
    raw_results=[],
    prompt_hint=None,
)

Extracted 3 properties for 0 definitions.


### 3.1.5 Custom Property Dictionary

A modified version of the property dictionary, used to retrieve templates for the LLM prompts.

In [9]:
class CustomDictionary(Dictionary):
    """Dictionary subclass with a single release and no web URLs."""

    supported_releases = ['0.0']

    def get_class_url(self, class_id: str) -> str | None:
        """Return the web URL of a class; always None for this dictionary."""
        return None

    def get_property_url(self, property_id: str) -> str | None:
        """Return the web URL of a property; always None for this dictionary."""
        return None


# Instantiate the only supported release and fill it from the stored JSON file.
dictionary = CustomDictionary(release='0.0')
dictionary.load_from_file("temp/dict/CustomDictionary-0.0.json")

True

## 3.2 Experiment Functions

This section defines the experiment orchestration functions, including helper routines for running and logging multiple extraction trials under different degradation conditions.

In [10]:
def load_product(product_id, metadata_df, data_path, classification_system="ECLASS"):
    """Locate a product's datasheet and assemble its classification record.

    Args:
        product_id: Name of the product folder inside `data_path`.
        metadata_df: DataFrame with the columns ``product_id``,
            ``Classification_System_Version`` and ``Class_Id`` (only read for
            ``"ECLASS"``).
        data_path: Folder that contains one sub-folder per product.
        classification_system: ``"ECLASS"`` (release and class id come from the
            metadata) or ``"CustomDictionary"`` (release ``"0.0"``, class id =
            product id).

    Returns:
        ``(datasheet_path, product_classification)`` where the second element
        is a dict with the keys ``classification_system``, ``class_id`` and
        ``release``.

    Raises:
        FileNotFoundError: Neither ``<id>_Datasheet.pdf`` nor ``<id>.pdf`` exists.
        IndexError: ``"ECLASS"`` was requested but the metadata has no row for
            the product.
        ValueError: The classification system is not supported.
    """
    product_path = os.path.join(data_path, product_id)
    # Preferred file name first, plain "<id>.pdf" as fallback.
    candidates = [
        os.path.join(product_path, product_id + "_Datasheet.pdf"),
        os.path.join(product_path, product_id + ".pdf"),
    ]
    datasheet_path = next((path for path in candidates if os.path.exists(path)), None)
    if datasheet_path is None:
        raise FileNotFoundError(
            f"No datasheet PDF found for {product_id}; looked for: " + ", ".join(candidates)
        )

    if classification_system == "ECLASS":
        rows = metadata_df.loc[metadata_df['product_id'] == product_id]
        if rows.empty:
            # Same exception type as the former out-of-bounds .iloc[0], but with
            # a message that names the product.
            raise IndexError(f"No metadata entry for product {product_id!r}")
        # Column-wise access keeps each column's own dtype (a row Series could upcast).
        release = str(rows['Classification_System_Version'].iloc[0])
        class_id = str(rows['Class_Id'].iloc[0])
    elif classification_system == "CustomDictionary":
        release = "0.0"
        class_id = product_id
    else:
        raise ValueError(f"Unsupported classification system: {classification_system}")

    product_classification = {
        "classification_system": classification_system,
        "class_id": class_id,
        "release": release,
    }
    return datasheet_path, product_classification


In [11]:
# Short, stable fingerprint used to tell experiment configurations apart
def hash_config(config):
    """Return the first 8 hex digits of the SHA-1 of the key-sorted JSON form of `config`."""
    canonical_json = json.dumps(config, sort_keys=True)
    digest = hashlib.sha1(canonical_json.encode())
    return digest.hexdigest()[:8]

# Code version tracking: abbreviated hash of the checked-out commit
def get_git_commit_hash():
    """Return the short hash of HEAD, or "unknown" if it cannot be determined."""
    command = ['git', 'rev-parse', '--short', 'HEAD']
    try:
        raw_output = subprocess.check_output(command)
        commit = raw_output.decode('ascii').strip()
    except Exception:
        commit = "unknown"
    return commit

get_git_commit_hash()

'43d217f'

In [12]:
def update_experiment_log(record, path=None):
    """Append one record to the experiment log CSV.

    The whole file is read, the record is added as a new row (columns are the
    union of old and new keys) and the file is written back.

    Args:
        record: Mapping of column name to value for the new row.
        path: CSV file to update. Defaults to the notebook-level `log_path`.
    """
    target = log_path if path is None else path
    new_row = pd.DataFrame([record])
    # A missing or zero-byte file means "no log yet": start from the new row
    # instead of concatenating onto an empty frame (deprecated in pandas) or
    # failing in read_csv on an empty file.
    if os.path.exists(target) and os.path.getsize(target) > 0:
        existing = pd.read_csv(target)
        combined = pd.concat([existing, new_row], ignore_index=True)
    else:
        combined = new_row
    combined.to_csv(target, index=False)

In [13]:
def _extract_in_batches(extractor, datasheet_text, definitions, batch_size,
                        raw_prompts, raw_results, prompt_hint):
    """Run `extractor.extract` over `definitions` in calls of `batch_size` definitions.

    `batch_size <= 0` sends all definitions in one call, `1` sends every
    definition on its own, larger values send consecutive slices.
    """
    if batch_size <= 0:
        batches = [definitions]
    elif batch_size == 1:
        # One call per definition, passing the single definition itself.
        batches = list(definitions)
    else:
        batches = [
            definitions[start:start + batch_size]
            for start in range(0, len(definitions), batch_size)
        ]

    properties = []
    for batch in batches:
        properties.extend(
            extractor.extract(
                datasheet_text,
                property_definition=batch,
                raw_prompts=raw_prompts,
                prompt_hint=prompt_hint,
                raw_results=raw_results,
            )
        )
    return properties


def pdf_to_aas(datasheet_path, product_classification, config, product_id=None):
    """Main experiment function: extract properties from one datasheet and store the result.

    Steps: load the property definitions of the product's class, convert the
    PDF to text, run the LLM extraction (repeated up to ``retry`` times), write
    the technical-data submodel and the config to disk and append a row to the
    experiment log.

    Args:
        datasheet_path: Path of the PDF datasheet.
        product_classification: Dict with ``classification_system``
            ("ECLASS", "ETIM" or "CustomDictionary"), ``class_id`` and ``release``.
        config: Experiment configuration. Keys read here: ``model``, ``client``,
            ``use_in_prompt`` / ``property_keys_in_prompt``, ``temperature``,
            ``response_format``, ``prompt_hint``, ``prompt_degradation_intensity``,
            ``retry``, ``batch_size``.
        product_id: Product the datasheet belongs to. When omitted it is taken
            from the name of the folder that contains the datasheet, matching
            the ``<processing_path>/<product_id>/<file>.pdf`` layout produced by
            ``load_product``.

    Returns:
        The LLM client that was used, so the caller can unload models.

    Raises:
        ValueError: If the classification system is not supported.
    """
    if product_id is None:
        # Previously an implicit global set by the experiment loop.
        product_id = os.path.basename(os.path.dirname(os.path.abspath(datasheet_path)))

    classification_system = product_classification['classification_system']
    if classification_system == "ECLASS":
        dictionary = ECLASS(release=product_classification['release'])
    elif classification_system == "ETIM":
        dictionary = ETIM(release=product_classification['release'])
    elif classification_system == "CustomDictionary":
        dictionary = CustomDictionary(release=product_classification['release'])
        dictionary.load_from_file("temp/dict/CustomDictionary-0.0.json")
    else:
        raise ValueError(f"Unsupported classification system: {classification_system}")
    definitions = dictionary.get_class_properties(product_classification['class_id'])
    dictionary.save_to_file()

    # Preprocess the PDF
    prep = preprocessor.PDFium()
    preprocessed_text = prep.convert(datasheet_path)
    preprocessed_datasheet = "\n".join(preprocessed_text) if isinstance(preprocessed_text, list) else preprocessed_text

    model = config.get("model", "gpt-4o-mini")
    if config.get('client') == "ollama":
        client = CustomOllamaClient()
    else:
        client = OpenAI(
            api_key=os.environ.get('OPENAI_API_KEY'),
            base_url=os.environ.get('OPENAI_BASE_URL')
        )

    # 'use_in_prompt' is the key read originally; 'property_keys_in_prompt' is the
    # key the experiment configurations actually define. Accept both.
    # Allowed entries: any of ['definition','unit','datatype', 'values']
    use_in_prompt = config.get(
        'use_in_prompt',
        config.get('property_keys_in_prompt', ['unit', 'datatype']),
    )
    extractor = CustomPropertyLLMSearch(
        model_identifier=model,
        client=client,
        property_keys_in_prompt=use_in_prompt,
    )

    if "temperature" in config:
        extractor.temperature = config["temperature"]
    if "response_format" in config:
        extractor.response_format = response_formats[config["response_format"]]
    if "prompt_degradation_intensity" in config:
        extractor.prompt_degradation_intensity = config["prompt_degradation_intensity"]

    prompt_hint = config.get("prompt_hint", "")
    retry = config.get('retry', 20)
    # 0 (or a missing key) = all properties in one LLM call.
    batch_size = config.get('batch_size', 0)
    raw_results = []
    raw_prompts = []

    # Defined up front so that retry == 0 still produces an (empty) submodel.
    cleaned_properties = []
    for _attempt in range(retry):
        properties = _extract_in_batches(
            extractor,
            preprocessed_datasheet,
            definitions,
            batch_size,
            raw_prompts,
            raw_results,
            prompt_hint,
        )
        # Empty strings are stored as "no value" / "no reference".
        cleaned_properties = []
        for prop in properties:
            if prop.reference == "":
                prop.reference = None
            if prop.value == "":
                prop.value = None
            cleaned_properties.append(prop)
        # NOTE(review): an attempt only counts as successful with MORE than one
        # property; kept as in the original — confirm that "> 0" was not intended.
        if len(cleaned_properties) > 1:
            break

    config_hash = hash_config(config)
    date_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    commit_hash = get_git_commit_hash()
    submodel_path = os.path.join(processing_path, product_id, f"{product_id}_pdftoaas_{config_hash}_{date_string}.json")
    config_path = os.path.join(processing_path, "configs", f"config_{config_hash}_{commit_hash}.json")
    # Make sure both target folders exist before writing.
    os.makedirs(os.path.dirname(submodel_path), exist_ok=True)
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    submodel = AASSubmodelTechnicalData()
    submodel.add_classification(dictionary, product_classification['class_id'])
    submodel.add_properties(cleaned_properties)
    submodel.dump(submodel_path)
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)

    record = {
        "product_id": product_id,
        "date": date_string,
        "commit_hash": commit_hash,
        "config_hash": config_hash,
        "result_path": submodel_path,
        **config
    }

    update_experiment_log(record)
    return client

## 3.3 Experiments

The core experimental procedures are executed here.
Each configuration defines a specific degradation level, prompt style, or model setting.

### 3.3.1 Experiment Configurations

Configuration dictionaries specify the experimental conditions.

In [14]:
# Build the list of experiment configurations

# Settings shared by all runs; every sweep below overrides individual keys.
base_pdftoaas_config = {
    # e.g. "ECLASS", "ETIM", "CustomDictionary"
    "classification_system": "CustomDictionary",
    "name": "deepseek_40960_context",  # "experiments_test_sample_3"
    # e.g. "gpt-4o-mini", "llama3:8b", "llama3.1:8b", "tinyllama"
    "model": "gpt-4o-mini",
    # e.g. "openai", "ollama"
    "client": "openai",
    # e.g. ['definition','unit','datatype', 'values']
    "property_keys_in_prompt": ['unit', 'datatype'],
    "temperature": 0.0,
    "prompt_hint": "",
    # [0.0-1.0]
    "prompt_degradation_intensity": 0.0,
    # 0 = all properties in LLM call
    "batch_size": 0,
    "retry": 20,
    "response_format": "ExtractionResult",
    # "commit": get_git_commit_hash()
}


def _derive_config(**overrides):
    """Return a copy of the base configuration with `overrides` applied."""
    return {**base_pdftoaas_config, **overrides}


# Test runs

# Sweep 1: temperature
temps = np.linspace(1.2, 1.9, 8)
configs_temperature = [
    _derive_config(temperature=value, model="gpt-4o-mini")
    for value in temps.tolist()
]

# Sweep 2: prompt degradation
intensties = np.linspace(0.0, 1.0, 11)
configs_prompt_degradation = [
    _derive_config(prompt_degradation_intensity=value, model="gpt-4o-mini")
    for value in intensties.tolist()
]

# Sweep 3: self-hosted models
deepseek_models = [
    'deepseek-r1:1.5b',
    'deepseek-r1:7b',
    'deepseek-r1:8b',
    'deepseek-r1:14b',
    'deepseek-r1:32b',
    'deepseek-r1:70b',
]
qwen3_models = [
    'qwen3:0.6b',
    'qwen3:1.7b',
    'qwen3:4b',
    'qwen3:8b',
    'qwen3:14b',
    'qwen3:30b',
    'qwen3:32b',
]
models = deepseek_models + qwen3_models
configs_models = [_derive_config(model=name, client="ollama") for name in models]

configs = configs_temperature + configs_prompt_degradation + configs_models
print(len(configs))  # 32 configurations
print(configs)


32
[{'classification_system': 'CustomDictionary', 'name': 'deepseek_40960_context', 'model': 'gpt-4o-mini', 'client': 'openai', 'property_keys_in_prompt': ['unit', 'datatype'], 'temperature': 1.2, 'prompt_hint': '', 'prompt_degradation_intensity': 0.0, 'batch_size': 0, 'retry': 20, 'response_format': 'ExtractionResult'}, {'classification_system': 'CustomDictionary', 'name': 'deepseek_40960_context', 'model': 'gpt-4o-mini', 'client': 'openai', 'property_keys_in_prompt': ['unit', 'datatype'], 'temperature': 1.3, 'prompt_hint': '', 'prompt_degradation_intensity': 0.0, 'batch_size': 0, 'retry': 20, 'response_format': 'ExtractionResult'}, {'classification_system': 'CustomDictionary', 'name': 'deepseek_40960_context', 'model': 'gpt-4o-mini', 'client': 'openai', 'property_keys_in_prompt': ['unit', 'datatype'], 'temperature': 1.4, 'prompt_hint': '', 'prompt_degradation_intensity': 0.0, 'batch_size': 0, 'retry': 20, 'response_format': 'ExtractionResult'}, {'classification_system': 'CustomDictio

### 3.3.2 Experiment Loop

A main loop iterates through all experiment configurations, executes the extraction pipeline, and records the results.
Logs are continuously updated to enable later recovery or analysis.

In [None]:
metadata_df = pd.read_csv(os.path.join(processing_path, "metadata.csv"))
# Model / client kind of the most recent extraction run (None before the first run).
recent_client = None
recent_model = None

for config in configs:
    print(config)

    # Self-hosted Ollama models: unload the previous model before another one is
    # requested, to avoid out-of-memory errors.
    model_changed = config['model'] != recent_model
    if model_changed and config['client'] == 'ollama' and recent_client == 'ollama':
        print("reloading")
        client.unload_model()

    # The hash depends on the configuration only, so compute it once per config.
    config_hash = hash_config(config)

    for product_id in os.listdir(processing_path):
        # Only product folders are processed; "configs" holds stored configurations.
        if product_id == "configs":
            continue
        product_path = os.path.join(processing_path, product_id)
        if not os.path.isdir(product_path):
            continue

        # Skip products that already have a result for this configuration.
        pattern = f"{processing_path}/{product_id}/{product_id}_pdftoaas_{config_hash}_*.json"
        matching_files = glob.glob(pattern)
        if matching_files:
            continue

        print(product_id)
        datasheet_path, product_classification = load_product(
            product_id,
            metadata_df,
            processing_path,
            config['classification_system'],
        )
        client = pdf_to_aas(datasheet_path, product_classification, config)

        # Remember what ran last for the unload logic above.
        recent_model = config['model']
        recent_client = config['client']

## 3.4 Update Experiment Log from Files

This final section updates the experiment logs with existing files and configuration data.
It is used to restore experiment states or fill missing log entries after interruptions, such as runtime crashes or manually added output files.

In [2]:
# Load the experiment log CSV (the file written by update_experiment_log and
# migrate_existing_files) into a DataFrame for inspection.
log_path = os.path.join(processing_path, "experiment_log.csv")
mylogdf = pd.read_csv(log_path)

In [None]:
# Update the experiment logs with existing files and configs in case of missing entries
# (e.g. after a crash or if files were added manually)
import os
import re
import json
import pandas as pd

# Same locations as defined in the imports cell (section 3.0).
# NOTE(review): presumably repeated so this cell can run on a fresh kernel — confirm.
data_path = "/app/data/"
processing_path = os.path.join(data_path, "processed/sample")
log_path = os.path.join(processing_path, "experiment_log.csv")
configs_path = os.path.join(processing_path, "configs")

def extract_metadata_from_filename(filename):
    """Split a result file name into its parts.

    Expected form: ``<product_id>_pdftoaas_<config_hash>_<YYYY-MM-DD_HH-MM-SS>.json``.
    The whole name has to match, so names with something after ``.json``
    (e.g. ``.json.bak``) are rejected.

    Returns:
        ``(product_id, config_hash, date_string)`` or ``None`` if the name does
        not have the expected form.
    """
    match = re.fullmatch(
        r"(.*?)_pdftoaas_([a-f0-9]+)_(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})\.json",
        filename,
    )
    if match is None:
        return None
    product_id, config_hash, date_string = match.groups()
    return product_id, config_hash, date_string

def migrate_existing_files():
    """Rebuild the experiment log from the result files and stored configs on disk.

    Walks `processing_path`, parses every ``*_pdftoaas_*`` result file name,
    looks up the stored config with the same hash in `configs_path` and writes
    one log row per result file to `log_path`. The existing log is overwritten.
    Result files without a matching config are left out; their number is
    printed so the omission is visible.
    """
    # Index the config files by hash once instead of listing the directory for
    # every result file. File names look like config_<hash>_<commit>.json; the
    # first file seen for a hash wins.
    config_file_by_hash = {}
    if os.path.isdir(configs_path):
        for name in os.listdir(configs_path):
            parts = name.split("_", 2)
            if len(parts) == 3 and parts[0] == "config":
                config_file_by_hash.setdefault(parts[1], name)

    loaded_configs = {}  # config_hash -> (commit_hash, config dict)
    records = []
    skipped = 0

    for root, dirs, files in os.walk(processing_path):
        for file in files:
            if "_pdftoaas_" not in file:
                continue

            meta = extract_metadata_from_filename(file)
            if not meta:
                continue

            product_id, config_hash, date_string = meta
            result_path = os.path.join(root, file)

            # Try to find corresponding config
            config_file = config_file_by_hash.get(config_hash)
            if not config_file:
                skipped += 1
                continue

            if config_hash not in loaded_configs:
                commit_hash = config_file.split("_")[-1].replace(".json", "")
                with open(os.path.join(configs_path, config_file), 'r') as f:
                    loaded_configs[config_hash] = (commit_hash, json.load(f))
            commit_hash, config = loaded_configs[config_hash]

            records.append({
                "product_id": product_id,
                "date": date_string,
                "commit_hash": commit_hash,
                "config_hash": config_hash,
                "result_path": result_path,
                **config
            })

    df = pd.DataFrame(records)
    df.to_csv(log_path, index=False)
    print(f"Saved {len(df)} records to {log_path}")
    if skipped:
        print(f"Skipped {skipped} result files without a stored config")

migrate_existing_files()

Saved 6447 records to /app/data/processed/sample/experiment_log.csv
