## CDM ONTOLOGIES WORKFLOW

### Full workflow

In [ ]:
%%time
# Updated workflow using the current cdm_ontologies package structure
import os
from pathlib import Path

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

# Print workflow start
print("Starting CDM Ontologies Workflow...")
print(f"Repository path: {repo_path}")

# Run the complete workflow using the CLI
import subprocess
import sys

# Launch the CLI with the interpreter that backs this kernel instead of
# whichever "python" is first on PATH: the two can be different installations,
# in which case the child process would not see the packages installed for
# the kernel. Output is captured, so it is only printed once the run ends.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "run-all"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed.
if result.returncode == 0:
    print("\nWorkflow completed successfully!")
else:
    print(f"\nWorkflow failed with return code: {result.returncode}")

### Workflow in separate steps

#### Analyze and Download Core Ontologies

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Analyzing Core Ontologies...")
print(f"Repository path: {repo_path}")

# Run analyze-core using the CLI. sys.executable is used so the child process
# runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "analyze-core"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed, so a failed step would otherwise go unnoticed.
if result.returncode == 0:
    print("\nCore ontology analysis completed successfully!")
else:
    print(f"\nCore ontology analysis failed with return code: {result.returncode}")

#### Analyze and Download Non-Core Ontologies

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Analyzing Non-Core Ontologies...")
print(f"Repository path: {repo_path}")

# Run analyze-non-core using the CLI. sys.executable is used so the child
# process runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "analyze-non-core"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed, so a failed step would otherwise go unnoticed.
if result.returncode == 0:
    print("\nNon-core ontology analysis completed successfully!")
else:
    print(f"\nNon-core ontology analysis failed with return code: {result.returncode}")

#### Recreate pseudo base versions

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Creating Pseudo Base Ontologies...")
print(f"Repository path: {repo_path}")

# Run create-base using the CLI. sys.executable is used so the child process
# runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "create-base"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed, so a failed step would otherwise go unnoticed.
if result.returncode == 0:
    print("\nPseudo base ontology creation completed successfully!")
else:
    print(f"\nPseudo base ontology creation failed with return code: {result.returncode}")

#### Analyze the prefixes

In [6]:
%%time
import sys
import os
# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
# Guard against duplicates: notebook cells are re-run often and an
# unconditional append would add the same entry to sys.path every time.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Import the prefix analyzer (lives in scripts/, hence the sys.path tweak above)
from analyze_prefixes import analyze_all_ontologies, generate_prefix_mapping

# Define input directory path
input_dir = os.path.join(repo_path, 'ontology_data_owl')

# Run the analysis; analyze_all_ontologies is called with both the input
# directory and the repository root.
print(f"Analyzing ontologies in: {input_dir}")
results = analyze_all_ontologies(input_dir, repo_path)

# Generate and save prefix mapping. The encoding is pinned so the file content
# does not depend on the locale of the machine running the notebook.
mapping_content = generate_prefix_mapping(results)
mapping_file = os.path.join(repo_path, 'prefix_mapping.txt')
with open(mapping_file, 'w', encoding='utf-8') as f:
    f.write(mapping_content)
print(f"\nPrefix mapping file generated at: {mapping_file}")

# Print summary of analysis: per file, the number of declared prefixes and the
# prefixes that appear in 'prefix_to_iris' but not among the declared ones.
print("\nSummary of analysis:")
for filename, data in results.items():
    print(f"\n{filename}:")
    print(f"  Declared prefixes: {len(data['prefixes'])}")
    # presumably data['prefixes'] is a set (it is used as the right-hand
    # operand of a set difference) -- verify against analyze_prefixes.
    additional_prefixes = set(data['prefix_to_iris']) - data['prefixes']
    print(f"  Potential additional prefixes needed: {len(additional_prefixes)}")
    if additional_prefixes:
        print("  Additional prefixes:")
        for prefix in sorted(additional_prefixes):
            iris = data['prefix_to_iris'][prefix]
            if iris:
                # Show a single example IRI for the prefix.
                print(f"    - {prefix}: {next(iter(iris))}")

Analyzing ontologies in: /scratch/jplfaria/KBase_CDM_Ontologies/ontology_data_owl_core

Analyzing ncbitaxon.owl...



KeyboardInterrupt



#### Merge Ontologies

In [ ]:
%%time
import os
import subprocess
from pathlib import Path
from threading import Thread
import time
import logging

import sys
from threading import Event

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

# Set up logging for memory monitoring. The log directory is created first
# because the file handler opens the log file immediately and fails if the
# directory does not exist.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers configured in this kernel session.
log_dir = repo_path / 'logs'
log_dir.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    filename=log_dir / 'merge_memory.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s'
)

print("Merging Ontologies...")
print(f"Repository path: {repo_path}")
print("Memory usage will be logged to logs/merge_memory.log")

# Set once the merge has finished so the monitor thread stops logging.
stop_monitoring = Event()


def monitor_memory(stop_event=None, interval=60):
    """Log the used-memory column of ``free -h`` until asked to stop.

    Args:
        stop_event: Event that ends the loop once set. Defaults to the
            cell-level ``stop_monitoring`` event.
        interval: Seconds to wait between two samples.

    The loop also ends, after logging a warning, if ``free`` cannot be run
    or its output cannot be decoded.
    """
    if stop_event is None:
        stop_event = stop_monitoring
    while not stop_event.is_set():
        try:
            mem_info = subprocess.check_output(['free', '-h']).decode('utf-8').split('\n')
        except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as exc:
            # Monitoring is best-effort: record why it stopped and give up
            # without affecting the merge itself.
            logging.warning("Memory monitoring stopped: %s", exc)
            break
        # The second output line is split on whitespace and its third field is
        # logged as the used amount; shorter output is skipped.
        if len(mem_info) > 1:
            mem_line = mem_info[1].split()
            if len(mem_line) > 2:
                used_mem = mem_line[2]
                logging.info(f"Memory used: {used_mem}")
        # Wait on the event instead of sleeping so a stop request is honoured
        # immediately rather than after the full interval.
        stop_event.wait(interval)


# Start monitoring thread (daemon, so it can never keep the process alive)
monitor_thread = Thread(target=monitor_memory, daemon=True)
monitor_thread.start()

# Run merge using the CLI. sys.executable is used so the child process runs
# under the same interpreter as this kernel rather than whichever "python" is
# first on PATH.
try:
    result = subprocess.run(
        [sys.executable, "-m", "cdm_ontologies.cli", "merge"],
        cwd=repo_path,
        capture_output=True,
        text=True,
    )
finally:
    # Always stop the monitor, even if the cell is interrupted; otherwise the
    # thread would keep writing to the log for the lifetime of the kernel.
    stop_monitoring.set()
    monitor_thread.join(timeout=5)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

if result.returncode == 0:
    print("\nMerge completed successfully!")
else:
    print(f"\nMerge failed with return code: {result.returncode}")

#### Create Semantic SQL DB

In [5]:
import sys
# Show which interpreter backs this kernel, for comparison with the
# `which python` lookup performed in the next cell.
print(f"Current Python executable: {sys.executable}")

Current Python executable: /home/jplfaria/miniconda3/bin/python


In [6]:
!which python

/usr/local/bin/python


In [1]:
%%time
import sys
import os

# Make the repository's scripts/ directory importable. The repository root is
# taken to be the parent of the current working directory.
repo_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(repo_path, 'scripts')
# Guard against duplicates: notebook cells are re-run often and an
# unconditional append would add the same entry to sys.path every time.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Import the database creation entry point (lives in scripts/, hence the
# sys.path tweak above)
from create_semantic_sql_db import create_semantic_sql_db

# Run the database creation with a custom input filename. The call is left as
# the last expression of the cell so the notebook displays its return value.
create_semantic_sql_db(
    repo_path,
    input_owl_filename='eccode.owl'
)

Error occurred: Could not find semsql installation location
CPU times: user 24.9 ms, sys: 128 μs, total: 25.1 ms
Wall time: 398 ms


Traceback (most recent call last):
  File "/scratch/jplfaria/KBase_CDM_Ontologies/scripts/create_semantic_sql_db.py", line 24, in create_semantic_sql_db
    raise Exception("Could not find semsql installation location")
Exception: Could not find semsql installation location


False

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Creating Semantic SQL Database...")
print(f"Repository path: {repo_path}")

# Run create-db using the CLI. sys.executable is used so the child process
# runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "create-db"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed.
if result.returncode == 0:
    print("\nDatabase creation completed successfully!")
else:
    print(f"\nDatabase creation failed with return code: {result.returncode}")

#### Extract SQLite tables to .tsv

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Extracting SQL Tables to TSV...")
print(f"Repository path: {repo_path}")

# Run extract-tables using the CLI. sys.executable is used so the child
# process runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "extract-tables"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed.
if result.returncode == 0:
    print("\nTable extraction completed successfully!")
else:
    print(f"\nTable extraction failed with return code: {result.returncode}")

In [ ]:
%%time
import os
import subprocess
from pathlib import Path

import sys

# Resolve the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook kernel was started from a direct
# subdirectory of the repository -- confirm if the notebook is moved.
repo_path = Path(os.getcwd()).parent

print("Creating Parquet Files...")
print(f"Repository path: {repo_path}")

# Run create-parquet using the CLI. sys.executable is used so the child
# process runs under the same interpreter as this kernel rather than whichever
# "python" is first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "cdm_ontologies.cli", "create-parquet"],
    cwd=repo_path,
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Report the outcome explicitly; subprocess.run does not raise on failure
# unless check=True is passed.
if result.returncode == 0:
    print("\nParquet file creation completed successfully!")
else:
    print(f"\nParquet file creation failed with return code: {result.returncode}")