# NCBI Taxon ID to Avibase ID

In [6]:
# setup

import sqlite3
from pathlib import Path

# Resolve the database location relative to the directory above the
# current working directory (the notebook is expected to run from notebooks/).
project_root = Path.cwd().parent
avibase_data_dir = project_root / "data" / "avibase"
db_path = avibase_data_dir / "master_birder.db"
print(f"Database path: {db_path}")

# NOTE: sqlite3's connection context manager only scopes a transaction
# (commit on success, rollback on error). It does not close the connection,
# so `conn` and `cursor` remain usable after this block.
with sqlite3.connect(db_path) as conn:
    cursor = conn.cursor()

    # Enumerate user tables, skipping SQLite's internal sqlite_* tables.
    tables = cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    ).fetchall()
    for table in tables:
        print(f"  - {table[0]}")

Database path: /Users/ken/Documents/wk/master-birder-paper/data/avibase/master_birder.db
  - AvibaseID
  - ParentChildRelationships
  - OriginalConcepts
  - TaxanomicConcepts
  - NameConcepts
  - LifeHistory
  - GeoGraphicRange
  - OtherRelationships
  - Synonyms
  - NCBITaxonID


# Load data into local RDBMS

In [None]:
# create table
# DDL for the lookup table that maps an NCBI taxon identifier to its
# scientific name. IF NOT EXISTS makes this cell safe to re-run.
create_table_sql = """
CREATE TABLE IF NOT EXISTS NCBITaxonID (
    ncbi_taxon_id TEXT PRIMARY KEY,
    scientific_name TEXT
);
"""

# Use a dedicated connection rather than the `conn`/`cursor` globals left over
# from the setup cell's `with` block: relying on them is hidden state and only
# works because sqlite3's context manager does not close the connection.
ddl_conn = sqlite3.connect(db_path)
try:
    with ddl_conn:  # commits on success, rolls back on error
        ddl_conn.executescript(create_table_sql)
finally:
    ddl_conn.close()  # the context manager above does not close the connection

print("NCBITaxonID table created successfully!")

Avibase database schema created successfully!


In [7]:
# load data
import csv

def load_ncbi_data(db_path: str, csv_file_path: str) -> None:
    """Load NCBI taxon IDs and scientific names from a CSV into NCBITaxonID.

    The CSV must have a header row containing the columns ``taxon`` (stored
    as ``ncbi_taxon_id``) and ``taxonLabel`` (stored as ``scientific_name``).
    Rows are upserted with ``INSERT OR REPLACE``, so a later row with the same
    ``taxon`` overwrites an earlier one and re-running the load is safe.

    Args:
        db_path: Path to the SQLite database file (``str`` or path-like).
            The ``NCBITaxonID`` table must already exist.
        csv_file_path: Path to the UTF-8 encoded CSV file (``str`` or
            path-like).

    Raises:
        KeyError: If a row lacks the ``taxon`` or ``taxonLabel`` column.
        sqlite3.Error: If the insert fails (e.g. the table does not exist).
            The transaction is rolled back in that case.
    """
    insert_sql = """
                    INSERT OR REPLACE INTO NCBITaxonID (ncbi_taxon_id, scientific_name)
                    VALUES (?, ?)
                """

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            # newline='' is required by the csv module so that line breaks
            # embedded in quoted fields are passed through untouched.
            with open(csv_file_path, 'r', encoding='utf-8', newline='') as csvfile:
                reader = csv.DictReader(csvfile)
                # Stream rows lazily into a single executemany call instead
                # of issuing one execute() per row.
                rows = ((row['taxon'], row['taxonLabel']) for row in reader)
                conn.executemany(insert_sql, rows)
    finally:
        conn.close()

    print(f"Successfully loaded data from {csv_file_path}")

# Locate the taxon-id -> scientific-name CSV under data/ontology and load it.
ontology_data_dir = project_root.joinpath("data", "ontology")
csv_file_path = ontology_data_dir.joinpath("ncbitaxonid2scientificname.csv")
load_ncbi_data(db_path=db_path, csv_file_path=csv_file_path)

Successfully loaded data from /Users/ken/Documents/wk/master-birder-paper/data/ontology/ncbitaxonid2scientificname.csv


In [None]:
## generate triples

# Query pairing each Avibase record with an NCBI taxon by exact string
# equality between AvibaseID.concept_label and NCBITaxonID.scientific_name.
# It is an inner join, so records without an exact name match are dropped.
# this only accounts for 9463 species
# NOTE(review): this cell only defines the query string; nothing here executes
# it. Presumably it is run elsewhere (e.g. by generate_avibase_turtle.py) —
# TODO confirm.
sql = """
SELECT a.avibase_id, n.ncbi_taxon_id, n.scientific_name
FROM AvibaseID a
JOIN NCBITaxonID n on a.concept_label = n.scientific_name
"""


# Prep load for Fuseki
```
python generate_avibase_turtle.py data/avibase/master_birder.db sparql/avibase-instances.ttl
```

Then manually load the generated `sparql/avibase-instances.ttl` file into Fuseki.