## CDM ONTOLOGIES WORKFLOW

### Full workflow

In [None]:
%%time
import sys
import os
# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is run more than once.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Workflow entry points; these resolve only after scripts/ is on sys.path.
from analyze_core_ontologies import analyze_core_ontologies
import analyze_non_core_ontologies
from create_pseudo_base_ontology import create_pseudo_base_ontologies
from merge_ontologies import merge_ontologies
from create_semantic_sql_db import create_semantic_sql_db
from extract_sql_tables_to_tsv import extract_sql_tables_to_tsv

# Each entry is (banner, callable, failure message). A failure message of None
# means the step's return value is not inspected; otherwise a falsy return
# value aborts the workflow with that message.
workflow_steps = [
    ("1. Analyzing Core Ontologies...", analyze_core_ontologies, None),
    ("2. Analyzing Non-Core Ontologies...",
     analyze_non_core_ontologies.analyze_non_core_ontologies, None),
    ("3. Creating Pseudo Base Ontologies...", create_pseudo_base_ontologies, None),
    ("4. Merging Ontologies...", merge_ontologies, "Ontology merge failed"),
    ("5. Creating Semantic SQL Database...", create_semantic_sql_db,
     "Database creation failed"),
    ("6. Extracting SQL Tables to TSV...", extract_sql_tables_to_tsv,
     "TSV extraction failed"),
]

print("Starting CDM Ontologies Workflow...")

for banner, run_step, failure_message in workflow_steps:
    print(f"\n{banner}")
    step_result = run_step(repo_path)
    if failure_message is not None and not step_result:
        # RuntimeError is still caught by any `except Exception` handler.
        raise RuntimeError(failure_message)

print("\nWorkflow completed successfully!")

Starting CDM Ontologies Workflow...

1. Analyzing Core Ontologies...

Analysis Results:
Skipping download, envo.owl already present

File: envo.owl
  Has imports: No
  Ontology IRI: http://purl.obolibrary.org/obo/envo.owl
  Own terms: 4385
  External terms: 3043
  Classification: Non-Base. Base version available in OBO Foundry: http://purl.obolibrary.org/obo/envo/envo-base.owl.
  External Terms Subject of Triples? Yes
  Number of external terms that are subjects of triples: 2984
  First 5 external terms that are subject of triples:
    http://purl.obolibrary.org/obo/BFO_0000001
    http://purl.obolibrary.org/obo/BFO_0000002
    http://purl.obolibrary.org/obo/BFO_0000003
    http://purl.obolibrary.org/obo/BFO_0000004
    http://purl.obolibrary.org/obo/BFO_0000006
  First 5 own terms:
    http://purl.obolibrary.org/obo/ENVO_00000000
    http://purl.obolibrary.org/obo/ENVO_00000001
    http://purl.obolibrary.org/obo/ENVO_00000002
    http://purl.obolibrary.org/obo/ENVO_00000003
    http:/

### Workflow in separate steps

#### Analyze and Download Core Ontologies

In [9]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from analyze_core_ontologies import analyze_core_ontologies

# Kept as the last expression so any return value is displayed by the notebook.
analyze_core_ontologies(repo_path)


Analysis Results:
Downloading envo.owl...
Successfully downloaded: envo.owl

File: envo.owl
  Has imports: No
  Ontology IRI: http://purl.obolibrary.org/obo/envo.owl
  Own terms: 4514
  External terms: 2858
  Classification: Non-Base. Base version available in OBO Foundry: http://purl.obolibrary.org/obo/envo/envo-base.owl.
  External Terms Subject of Triples? Yes
  Number of external terms that are subjects of triples: 2807
  First 5 external terms that are subject of triples:
    http://purl.obolibrary.org/obo/BFO_0000001
    http://purl.obolibrary.org/obo/BFO_0000002
    http://purl.obolibrary.org/obo/BFO_0000003
    http://purl.obolibrary.org/obo/BFO_0000004
    http://purl.obolibrary.org/obo/BFO_0000006
  First 5 own terms:
    http://purl.obolibrary.org/obo/ENVO_00000000
    http://purl.obolibrary.org/obo/ENVO_00000001
    http://purl.obolibrary.org/obo/ENVO_00000002
    http://purl.obolibrary.org/obo/ENVO_00000003
    http://purl.obolibrary.org/obo/ENVO_00000004
  First 5 extern

#### Analyze and Download Non-Core Ontologies

In [10]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Imported as a module (resolves only after scripts/ is on sys.path); the
# function of the same name is reached through the module attribute below.
import analyze_non_core_ontologies

# Kept as the last expression so any return value is displayed by the notebook.
analyze_non_core_ontologies.analyze_non_core_ontologies(repo_path)


Processing external terms from core ontologies...
Downloading base version of ro...
Successfully downloaded: ro-base.owl
Downloading base version of pato...
Successfully downloaded: pato-base.owl
Downloading base version of obi...
Successfully downloaded: obi-base.owl
Downloading base version of fao...
Successfully downloaded: fao-base.owl
Downloading regular version of po...
Successfully downloaded: po.owl
Downloading base version of pco...
Successfully downloaded: pco-base.owl
Downloading regular version of iao...
Successfully downloaded: iao.owl
Downloading base version of uberon...
Successfully downloaded: uberon-base.owl
Downloading regular version of omo...
Successfully downloaded: omo.owl
Downloading regular version of bfo...
Successfully downloaded: bfo.owl
Downloading regular version of foodon...
Successfully downloaded: foodon.owl

Updating ontologies.txt...

Processing Additional OBO Foundry, PyOBO and In-house ontologies...
Downloading uo.owl...
Successfully downloaded: uo

#### Recreate pseudo base versions

In [11]:
%%time
import sys
import os
# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from create_pseudo_base_ontology import create_pseudo_base_ontologies

# Kept as the last expression so the returned status is displayed by the notebook.
create_pseudo_base_ontologies(repo_path)

Using ROBOT at: /scratch/jplfaria/install_stuff/robot/robot
Processing po.owl...
Using base IRI: http://purl.obolibrary.org/obo/PO_
Executing command:
robot remove --input /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/non-base-ontologies/po.owl --base-iri http://purl.obolibrary.org/obo/PO_ --axioms external --preserve-structure false --trim false remove --select imports --trim false --output /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/po-base.owl
Created base version for po.owl: po-base.owl
Processing iao.owl...
Using base IRI: http://purl.obolibrary.org/obo/IAO_
Executing command:
robot remove --input /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/non-base-ontologies/iao.owl --base-iri http://purl.obolibrary.org/obo/IAO_ --axioms external --preserve-structure false --trim false remove --select imports --trim false --output /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/iao-base.owl
Created base version for iao.owl: iao-base.owl
Processing om

True

#### Analyze the prefixes

In [6]:
%%time
import sys
import os
# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from analyze_prefixes import analyze_all_ontologies, generate_prefix_mapping

# Directory containing the OWL files to analyze.
input_dir = os.path.join(repo_path, 'ontology_data_owl')

# analyze_all_ontologies is called with both the input directory and the
# repository root.
print(f"Analyzing ontologies in: {input_dir}")
results = analyze_all_ontologies(input_dir, repo_path)

# Generate the prefix mapping and save it at the repository root. The encoding
# is explicit so the file does not depend on the platform's locale default.
mapping_content = generate_prefix_mapping(results)
mapping_file = os.path.join(repo_path, 'prefix_mapping.txt')
with open(mapping_file, 'w', encoding='utf-8') as f:
    f.write(mapping_content)
print(f"\nPrefix mapping file generated at: {mapping_file}")

# Per-file summary: prefixes that have IRIs in 'prefix_to_iris' but are not
# listed in 'prefixes' are reported as potentially missing declarations.
print("\nSummary of analysis:")
for filename, data in results.items():
    declared_prefixes = data['prefixes']
    prefix_to_iris = data['prefix_to_iris']
    # set(...) on both sides so the difference works for any iterable of prefixes.
    additional_prefixes = set(prefix_to_iris) - set(declared_prefixes)

    print(f"\n{filename}:")
    print(f"  Declared prefixes: {len(declared_prefixes)}")
    print(f"  Potential additional prefixes needed: {len(additional_prefixes)}")
    if additional_prefixes:
        print("  Additional prefixes:")
        for prefix in sorted(additional_prefixes):
            iris = prefix_to_iris[prefix]
            if iris:
                # Show one example IRI for the prefix.
                print(f"    - {prefix}: {next(iter(iris))}")

Analyzing ontologies in: /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl_core

Analyzing ncbitaxon.owl...



KeyboardInterrupt



#### Merge Ontologies

In [8]:
%%time
import sys
import os
import subprocess
from threading import Thread
import time
from pathlib import Path
from datetime import datetime
import logging

# Module-level counters, both starting at zero: the largest memory reading
# seen so far and the reading recorded at the most recent log entry.
peak_memory_gb, last_logged_memory = 0, 0

# Route INFO-and-above log records to merge_memory.log, timestamp first.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
    filename='merge_memory.log',
)

def parse_memory_output(mem_line):
    """Return the "used" memory figure from a `free -h` data line, in GiB.

    The third whitespace-separated field is taken as the used amount. Its unit
    suffix is honoured, so megabyte- or byte-sized readings are no longer
    misread as gigabytes. Both the IEC suffixes (``Ki``, ``Mi``, ``Gi``,
    ``Ti``, ``Pi``, ``B``) and single-letter suffixes (``K``, ``M``, ``G``,
    ``T``, ``P``) are accepted; a bare number is treated as bytes.

    Args:
        mem_line: One line of `free -h` output, e.g.
            ``"Mem:  1.0Ti  125Gi  800Gi"``.

    Returns:
        float: The used amount converted to GiB.

    Raises:
        IndexError: If the line has fewer than three fields.
        ValueError: If the numeric part cannot be parsed as a float.
        KeyError: If the unit suffix is not recognised.
    """
    used_mem = mem_line.split()[2]

    # Split e.g. "125Gi" into "125" and "Gi" by stripping trailing unit letters.
    number = used_mem.rstrip('BKMGTPi')
    unit = used_mem[len(number):]

    # Conversion factor to GiB, keyed by the first letter of the unit suffix.
    gib_per_unit = {
        '': 1 / 1024 ** 3,
        'B': 1 / 1024 ** 3,
        'K': 1 / 1024 ** 2,
        'M': 1 / 1024,
        'G': 1,
        'T': 1024,
        'P': 1024 ** 2,
    }
    return float(number) * gib_per_unit[unit[:1]]

def monitor_memory():
    """Poll `free -h` once a minute and track the peak used memory.

    Updates the module-level ``peak_memory_gb`` with the largest reading seen
    and writes a log entry whenever the current reading differs from the last
    logged one by more than 10. The loop runs until a sample fails; the failure
    is logged with its traceback and the loop then ends, so monitoring never
    stops silently. Intended to be run in a background thread.
    """
    global peak_memory_gb, last_logged_memory
    while True:
        try:
            mem_info = subprocess.check_output(['free', '-h']).decode('utf-8').split('\n')
            # The second output line is passed to the parser; it is expected
            # to be the "Mem:" row of `free`.
            current_memory_gb = parse_memory_output(mem_info[1])
            peak_memory_gb = max(peak_memory_gb, current_memory_gb)

            # Log if memory changed by more than 10GB
            if abs(current_memory_gb - last_logged_memory) > 10:
                logging.info(f"Memory: {current_memory_gb:.0f}GB (Peak: {peak_memory_gb:.0f}GB)")
                last_logged_memory = current_memory_gb

            time.sleep(60)
        except Exception:
            # Monitoring stays best-effort (it must not take the merge down),
            # but the reason it stopped is recorded. KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            logging.exception("Memory monitoring stopped after an error")
            break

# Start memory monitoring in a background daemon thread; as a daemon it does
# not keep the interpreter alive on exit.
monitor_thread = Thread(target=monitor_memory, daemon=True)
monitor_thread.start()

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from merge_ontologies import merge_ontologies

print("Starting ontology merge - Check merge_memory.log for memory usage details")
logging.info("Starting ontology merge")

merge_succeeded = merge_ontologies(
    repo_path,
    input_dir_name='ontology_data_owl',
    output_filename='ontology_data_owl.owl'
)

# A falsy result is treated as failure, so the cell does not report a
# completed merge when it did not complete.
if not merge_succeeded:
    logging.error(f"Ontology merge failed - Peak memory so far: {peak_memory_gb:.0f}GB")
    raise RuntimeError("Ontology merge failed")

# Log final statistics
logging.info(f"Merge completed - Final peak memory: {peak_memory_gb:.0f}GB")
print(f"\nMerge completed - Peak memory usage: {peak_memory_gb:.0f}GB")

Starting ontology merge - Check merge_memory.log for memory usage details
Using ROBOT at: /scratch/jplfaria/install_stuff/robot/robot
Looking for ontology files in: /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl
Found 30 ontology files:
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/kegg.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/metacyc.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/modelseed.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/envo.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/go.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/ncbitaxon.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/chebi.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/cl-base.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/pato-base.owl
  - /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl/caro-base.owl
  - /scratch/jplfaria/KBase_

#### Create Semantic SQL DB

In [5]:
# Diagnostic: report the path of the Python interpreter running this kernel.
import sys
print(f"Current Python executable: {sys.executable}")

Current Python executable: /home/jplfaria/miniconda3/bin/python


In [6]:
# Diagnostic: show which `python` a shell started from this notebook finds
# first on PATH. If this differs from sys.executable, tools invoked through
# the shell may come from a different environment than the kernel.
!which python

/usr/local/bin/python


In [1]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from create_semantic_sql_db import create_semantic_sql_db

# Build the database from a specific OWL file. Kept as the last expression so
# the returned status is displayed by the notebook.
create_semantic_sql_db(
    repo_path,
    input_owl_filename='eccode.owl'
)

Error occurred: Could not find semsql installation location
CPU times: user 24.9 ms, sys: 128 μs, total: 25.1 ms
Wall time: 398 ms


Traceback (most recent call last):
  File "/scratch/jplfaria/KBase_CDM_Ontologies/scripts/create_semantic_sql_db.py", line 24, in create_semantic_sql_db
    raise Exception("Could not find semsql installation location")
Exception: Could not find semsql installation location


False

In [11]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from create_semantic_sql_db import create_semantic_sql_db

# Build the database from the merged ontology file. Kept as the last
# expression so the returned status is displayed by the notebook.
create_semantic_sql_db(
    repo_path,
    input_owl_filename='ontology_data_owl.owl'
)

Input OWL file: ontology_data_owl.owl
Output DB file: ontology_data_owl.db
Working directory: /scratch/jplfaria/KBase_CDM_Ontologies/outputs

Executing command:

        export PATH="/scratch/jplfaria/bin:$PATH"
        source /home/jplfaria/scratch/install_stuff/setup_oak_env.sh
        /home/jplfaria/scratch/install_stuff/semsql/bin/semsql make ontology_data_owl.db
        

Output:
robot \
remove -i ontology_data_owl.owl --axioms "equivalent disjoint annotation abox type" \
filter --exclude-terms /home/jplfaria/scratch/install_stuff/semsql/semsql/builder//exclude-terms.txt \
-o ontology_data_owl-min.owl
touch ontology_data_owl-properties.txt
grep -v ^prefix, /home/jplfaria/scratch/install_stuff/semsql/semsql/builder/prefixes/prefixes.csv | grep -v ^obo, | perl -npe 's@,(.*)@: "$1"@' > ontology_data_owl-prefixes.yaml.tmp && mv ontology_data_owl-prefixes.yaml.tmp ontology_data_owl-prefixes.yaml
relation-graph --disable-owl-nothing true \
--ontology-file ontology_data_owl-min.owl \
 \


False

#### Extract SQLite tables to .tsv

In [4]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Resolves only after scripts/ is on sys.path.
from extract_sql_tables_to_tsv import extract_sql_tables_to_tsv

# Kept as the last expression so the returned status is displayed by the notebook.
extract_sql_tables_to_tsv(repo_path)

Reading database from: /cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/CDM_merged_ontologies.db
Saving TSV files to: /cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/tsv_tables
Processing table: term_association
Exported 'term_association' to '/cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/tsv_tables/term_association.tsv'
Processing table: has_oio_synonym_statement
Exported 'has_oio_synonym_statement' to '/cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/tsv_tables/has_oio_synonym_statement.tsv'
Processing table: anonymous_expression
Exported 'anonymous_expression' to '/cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/tsv_tables/anonymous_expression.tsv'
Processing table: anonymous_class_expression
Exported 'anonymous_class_expression' to '/cdm_shared_workspace/user_shared_workspace/Jose/KBase_CDM_Ontologies/outputs/tsv_tables/anonymous_class

True

In [11]:
%%time
import os
import sqlite3
import pandas as pd

# Resolve all locations from the repository root (the parent of the current
# working directory) instead of a machine-specific absolute path, so the cell
# works wherever the repository is checked out.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
outputs_dir = os.path.join(repo_path, 'outputs')
db_path = os.path.join(outputs_dir, 'ontology_data_owl.db')
output_dir = os.path.join(outputs_dir, 'parquet_tables_ontology_data_owl')

# sqlite3.connect() creates a new empty database when the file is missing,
# which would make this cell "succeed" with nothing exported. Fail early instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Connect to the SQLite database; the finally clause closes the connection
# even if one of the exports raises.
conn = sqlite3.connect(db_path)
try:
    # Get a list of all tables in the database
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

    # Extract each table and save as Parquet
    for (table_name,) in tables:
        # Quote the identifier so table names with unusual characters still work.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        df = pd.read_sql_query(f"SELECT * FROM {quoted_name}", conn)
        output_path = os.path.join(output_dir, f"{table_name}.parquet")
        df.to_parquet(output_path, index=False)
        print(f"Table '{table_name}' has been exported to '{output_path}'")
finally:
    conn.close()

print("All tables have been exported to Parquet files in the specified directory.")

Table 'term_association' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/term_association.parquet'
Table 'has_oio_synonym_statement' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/has_oio_synonym_statement.parquet'
Table 'anonymous_expression' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/anonymous_expression.parquet'
Table 'anonymous_class_expression' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/anonymous_class_expression.parquet'
Table 'anonymous_property_expression' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/anonymous_property_expression.parquet'
Table 'anonymous_individual_expression' has been exported to '/scratch/jplfaria/KBase_CDM_Ontologies/outputs/parquet_tables_ontology_data_owl/anonymous_individual_