### Load Uniformat-II codes into SQL database

In [1]:
import sqlite3
import pandas as pd
import json
import fitz # PyMuPDF
import google.generativeai as genai
import os

def setup_database(db_name="uniformat.db"):
    """Create the Uniformat SQLite database and its three tables if missing.

    Tables created (each with ``CREATE TABLE IF NOT EXISTS``, so the call is
    safe to repeat): ``uniformat_codes``, ``uniformat_inclusions`` and
    ``uniformat_exclusions``. The latter two reference ``uniformat_codes(id)``.

    Args:
        db_name: Path of the SQLite database file to create or open.
    """
    schema_statements = (
        # One row per Uniformat code, with the Level 1-4 hierarchy flattened.
        """
        CREATE TABLE IF NOT EXISTS uniformat_codes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            type TEXT,
            level1_code TEXT,
            level1_name TEXT,
            level2_code TEXT,
            level2_name TEXT,
            level3_code TEXT,
            level3_name TEXT,
            level4_code TEXT,
            level4_name TEXT,
            description TEXT,
            notes TEXT
        );
        """,
        # One row per inclusion item, linked to a uniformat_codes row.
        """
        CREATE TABLE IF NOT EXISTS uniformat_inclusions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            uniformat_code_id INTEGER,
            inclusion_text TEXT,
            FOREIGN KEY (uniformat_code_id) REFERENCES uniformat_codes(id)
        );
        """,
        # One row per exclusion item, linked to a uniformat_codes row.
        """
        CREATE TABLE IF NOT EXISTS uniformat_exclusions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            uniformat_code_id INTEGER,
            exclusion_text TEXT,
            FOREIGN KEY (uniformat_code_id) REFERENCES uniformat_codes(id)
        );
        """,
    )

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        for statement in schema_statements:
            cursor.execute(statement)
        conn.commit()
    finally:
        # Close even when a statement fails so the file handle is not leaked.
        conn.close()
    print(f"Database '{db_name}' and tables created successfully.")


def insert_excel_data_clearing_first(df, db_name="uniformat.db"):
    """Replace the contents of ``uniformat_codes`` with the rows of ``df``.

    All existing rows are deleted and the new rows inserted in one
    transaction; if anything fails, nothing is committed.

    Args:
        df: DataFrame with the columns 'Type', 'Level 1 Code', 'Level 1 Name',
            'Level 2 Code', 'Level 2 Name', 'Level 3 Code', 'Level 3 Name',
            'Level 4 Code' and 'Level 4 Name'.
        db_name: Path of the SQLite database file.

    Raises:
        KeyError: If ``df`` lacks one of the expected columns. This is raised
            before the database is opened, so the table is left untouched.
    """
    source_columns = [
        'Type',
        'Level 1 Code', 'Level 1 Name',
        'Level 2 Code', 'Level 2 Name',
        'Level 3 Code', 'Level 3 Name',
        'Level 4 Code', 'Level 4 Name',
    ]
    # Build every row first so a bad DataFrame fails before any DELETE runs.
    rows = [tuple(row[column] for column in source_columns) for _, row in df.iterrows()]

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM uniformat_codes;")
        cursor.executemany("""
            INSERT INTO uniformat_codes (type, level1_code, level1_name, level2_code, level2_name,
                                      level3_code, level3_name, level4_code, level4_name)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
        """, rows)
        conn.commit()
    finally:
        # Closing without a commit discards the pending DELETE/INSERTs.
        conn.close()
    print("Excel data inserted into 'uniformat_codes' table.")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create the database and tables, then load the CSV file and insert its rows
# into the database. `df` stays in the notebook namespace and is passed to
# insert_excel_data_clearing_first again by the main workflow cell.
setup_database()
df = pd.read_csv("uniformat-ii-codes.csv") 
insert_excel_data_clearing_first(df) 

Database 'uniformat.db' and tables created successfully.
Excel data inserted into 'uniformat_codes' table.


### Parsing Uniformat-II guide + LLMs
- Parse the official Uniformat II guide.
- Generate denormalized SQL tables for the inclusion and exclusion information found in Appendix B.
- Generate LLM-enhanced code descriptions.

In [4]:
# Path of the Uniformat II guide PDF to parse.
PDF_PATH = "pdf_parsing/uniformat-guide.pdf"
# 1-based, inclusive page range handed to extract_text_from_pdf_pages.
START_PAGE = 61
END_PAGE = 83
# SQLite database file shared by every step (previously assigned twice).
DB_NAME = "uniformat.db"
# Gemini API key read from the environment; None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [5]:
# --- Step 1: Extract text from the PDF ---
def extract_text_from_pdf_pages(pdf_path, start_page, end_page):
    """Return the concatenated text of a 1-based, inclusive page range of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        start_page: First page to read, counted from 1.
        end_page: Last page to read, counted from 1 (inclusive).

    Returns:
        The text of the requested pages joined in order, or None when the file
        cannot be opened or any other error occurs (the error is printed).
        Reading stops with a warning at the first page past the end of the
        document, and the text gathered so far is returned.
    """
    doc = None
    page_texts = []
    try:
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        # PyMuPDF pages are 0-indexed, so adjust accordingly. Clamp at 0: a
        # negative index would count from the end of the document rather
        # than fail (per PyMuPDF's load_page semantics - confirm for the
        # installed version).
        first_index = max(start_page - 1, 0)
        for page_num in range(first_index, end_page):
            if page_num >= page_count:
                print(f"Warning: Page {page_num + 1} is out of bounds.")
                break # Stop if we hit end of document
            page_texts.append(doc.load_page(page_num).get_text())
    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        return None
    except Exception as e:
        print(f"An error occurred during PDF text extraction: {e}")
        return None
    finally:
        # Test identity, not truthiness: the document object has a length, so
        # an empty document would be falsy and would never be closed.
        if doc is not None:
            doc.close()
    return "".join(page_texts)


In [None]:
# JSON schema embedded in the Part 1 prompt: an array with one object per
# Level 3 element (code, name, inclusions, exclusions; no description field).
def _string_property(description):
    """Return a schema entry for a single string value."""
    return {"type": "string", "description": description}


def _string_list_property(description):
    """Return a schema entry for an array of strings."""
    return {"type": "array", "items": {"type": "string"}, "description": description}


_LEVEL3_ELEMENT_PROPERTIES = {
    "level3_code": _string_property(
        "The Uniformat Level 3 code, e.g., 'A1010', 'B2010'."
    ),
    "level3_name": _string_property(
        "The name corresponding to the Level 3 code, e.g., 'Standard Foundations', 'Exterior Walls'."
    ),
    "inclusions": _string_list_property(
        "A list of items explicitly included in this Uniformat element, extracted from the 'Includes' section (each bullet point as a separate string)."
    ),
    "exclusions": _string_list_property(
        "A list of items explicitly excluded from this Uniformat element, extracted from the 'Excludes' section (each bullet point as a separate string), retaining cross-references."
    ),
}

UNIFORMAT_EXTRACTION_SCHEMA_NO_DESC = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": _LEVEL3_ELEMENT_PROPERTIES,
        # Every property is required, in declaration order.
        "required": list(_LEVEL3_ELEMENT_PROPERTIES),
    },
}

def get_initial_uniformat_details_from_gemini_no_desc(text_content, api_key):
    """
    Calls Gemini API to extract Level 3 code, name, inclusions, and exclusions.
    Does NOT extract a description from the PDF.

    Args:
        text_content: Text of the guide section to send to the model.
        api_key: API key passed to ``genai.configure``.

    Returns:
        The parsed JSON array (a list, expected to follow
        UNIFORMAT_EXTRACTION_SCHEMA_NO_DESC), or None when ``text_content`` is
        empty, the API call fails, or the response is not a JSON array.
    """
    if not text_content:
        print("No text content provided for Gemini API call.")
        return None

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    system_instruction = (
        "You are an expert in construction classification systems, specifically Uniformat II. "
        "Your task is to accurately extract detailed information about Uniformat Level 3 elements "
        "from the provided text, focusing solely on the element name, its code, and the explicit "
        "bulleted lists of inclusions and exclusions. Do NOT extract any general prose description "
        # The trailing space keeps this sentence from running into the next.
        "for the element, as none is provided directly preceding the lists. "
        "Provide the output strictly as a JSON array according to the specified schema."
    )

    prompt_parts = [
        {"text": f"{system_instruction}\n\n"
                 f"Extract the Uniformat II Level 3 element data from the following document section. "
                 f"For each element, identify its Level 3 code and name. "
                 f"Then, extract the separate lists of its explicit inclusions and exclusions, where each bullet point in the 'Includes' and 'Excludes' sections should be a distinct item in the respective list. "
                 f"Retain any cross-references like '(see section ...)' within the exclusion text.\n\n"
                 f"**Text Content:**\n{text_content}\n\n"
                 f"**Output Schema (JSON):**\n{json.dumps(UNIFORMAT_EXTRACTION_SCHEMA_NO_DESC, indent=2)}\n\n"
                 f"Please provide ONLY the JSON array, with no preamble, explanation, or any other surrounding text. Ensure valid JSON."}
    ]

    # Bound before the try so the error handler can tell whether a response exists.
    response = None
    try:
        print("Sending request to Gemini API for initial extraction (without description)...")
        response = model.generate_content(
            prompt_parts,
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json",
                temperature=0.0
            )
        )
        print("Gemini API call successful for initial extraction.")

        # --- DEBUG: Verify Gemini's Part 1 Raw Output ---
        print("\n--- DEBUG: Gemini Part 1 Raw Response Text (first 500 chars) ---")
        print(response.text[:500])
        print("----------------------------------------------------\n")

        parsed_json = json.loads(response.text)
        if not isinstance(parsed_json, list):
            # Callers iterate the result as a list of element dicts.
            raise ValueError(f"expected a JSON array, got {type(parsed_json).__name__}")

        # --- DEBUG: Verify Parsed JSON Structure (first 2 entries and A1030 if present) ---
        print("\n--- DEBUG: Gemini Part 1 Parsed JSON (first 2 entries + A1030 if found) ---")
        debug_entries = parsed_json[:2]
        a1030_entry = next((item for item in parsed_json if item.get('level3_code') == 'A1030'), None)
        # Skip A1030 when it is already one of the first two entries.
        if a1030_entry is not None and a1030_entry not in debug_entries:
            debug_entries.append(a1030_entry)
        print(json.dumps(debug_entries, indent=2))
        print("----------------------------------------------------\n")

        return parsed_json
    except Exception as e:
        print(f"Error calling Gemini API or parsing initial extraction response: {e}")
        raw_text = 'N/A'
        if response is not None:
            # Reading .text may be what failed above; do not let it raise
            # again from inside this handler.
            try:
                raw_text = response.text
            except Exception:
                raw_text = 'N/A (response text unavailable)'
        print(f"Raw response text (for debugging): {raw_text}")
        return None

def incorporate_initial_gemini_data_into_db_no_desc(gemini_data, db_name):
    """
    Incorporates extracted inclusions and exclusions into the database.
    Does NOT update the 'description' field in uniformat_codes.

    For each element, the lowest-id ``uniformat_codes`` row with the same
    ``level3_code`` is looked up; its existing inclusion/exclusion rows are
    replaced by the element's lists. Elements without a code, or whose code
    has no matching row, are skipped with a printed message. All changes are
    committed together at the end; nothing is committed if an error occurs.

    Args:
        gemini_data: Iterable of dicts with 'level3_code', 'inclusions' and
            'exclusions' keys (the lists may be missing or None).
        db_name: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        print("\n--- Incorporating initial Gemini output (inclusions/exclusions only) into database ---")
        for element in gemini_data:
            level3_code_gemini = element.get('level3_code')
            # `or []` also covers an explicit JSON null, which .get() returns as None.
            inclusions = element.get('inclusions') or []
            exclusions = element.get('exclusions') or []

            if not level3_code_gemini:
                print(f"Skipping entry due to missing level3_code from Gemini: {element}")
                continue

            # Clean the code from Gemini just in case (strip whitespace)
            level3_code_gemini_cleaned = level3_code_gemini.strip()

            # Several rows can share one level3_code (one per Level 4 code).
            # ORDER BY makes the choice of linking row deterministic.
            cursor.execute("""
                SELECT id FROM uniformat_codes
                WHERE level3_code = ?
                ORDER BY id
                LIMIT 1;
            """, (level3_code_gemini_cleaned,))
            uniformat_code_id_result = cursor.fetchone()

            if uniformat_code_id_result is None:
                # --- DEBUG: Detailed Warning for Mismatch ---
                print(f"Warning: No matching ID found in 'uniformat_codes' for Level 3 code '{level3_code_gemini_cleaned}' (from Gemini, len={len(level3_code_gemini_cleaned)}).")
                # Try a fuzzy match to give more info
                cursor.execute("SELECT level3_code, LENGTH(level3_code) FROM uniformat_codes WHERE level3_code LIKE ? LIMIT 5;", (f'%{level3_code_gemini_cleaned.replace(" ", "%")}%',))
                fuzzy_matches = cursor.fetchall()
                if fuzzy_matches:
                    print(f"  Possible matches in DB (code, length): {fuzzy_matches}. Mismatch is likely due to subtle differences.")
                else:
                    print(f"  '{level3_code_gemini_cleaned}' does not appear to exist in DB at all.")
                print("  Skipping enrichment for this element.")
                continue

            uniformat_code_id = uniformat_code_id_result[0]

            # Clear existing inclusions/exclusions for this ID to prevent duplicates on rerun
            cursor.execute("DELETE FROM uniformat_inclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))
            cursor.execute("DELETE FROM uniformat_exclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))

            cursor.executemany(
                "INSERT INTO uniformat_inclusions (uniformat_code_id, inclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, inc_item.strip()) for inc_item in inclusions],
            )
            cursor.executemany(
                "INSERT INTO uniformat_exclusions (uniformat_code_id, exclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, exc_item.strip()) for exc_item in exclusions],
            )

        conn.commit()
    finally:
        # Closing without a commit discards partial changes after an error.
        conn.close()
    print("\nInitial Gemini output (inclusions/exclusions) successfully incorporated into database.")

# --- Part 2: Enhanced Description Generation ---

def get_level3_data_for_enhancement(level3_code, db_name):
    """Retrieves existing Level 3 data (name, description, inclusions, exclusions) from DB.

    The lowest-id ``uniformat_codes`` row with the given ``level3_code``
    supplies the name, the current description and the id used to look up
    inclusion/exclusion rows.

    Args:
        level3_code: Level 3 code to look up (matched exactly).
        db_name: Path of the SQLite database file.

    Returns:
        A tuple ``(level3_name, current_description, inclusions, exclusions)``
        where the last two are lists of strings, or ``(None, None, None, None)``
        when no row has that code.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        # One ordered query returns the id together with the name and
        # description, so all three are guaranteed to come from the same row.
        cursor.execute("""
            SELECT
                id,
                level3_name,
                description -- This will be NULL/empty before enhancement
            FROM
                uniformat_codes
            WHERE
                level3_code = ?
            ORDER BY id
            LIMIT 1;
        """, (level3_code,))
        main_info = cursor.fetchone()

        if not main_info:
            return None, None, None, None

        uniformat_code_id, level3_name, current_description = main_info

        # ORDER BY id returns the items in insertion order.
        cursor.execute(
            "SELECT inclusion_text FROM uniformat_inclusions WHERE uniformat_code_id = ? ORDER BY id",
            (uniformat_code_id,),
        )
        inclusions = [row[0] for row in cursor.fetchall()]
        cursor.execute(
            "SELECT exclusion_text FROM uniformat_exclusions WHERE uniformat_code_id = ? ORDER BY id",
            (uniformat_code_id,),
        )
        exclusions = [row[0] for row in cursor.fetchall()]

        return level3_name, current_description, inclusions, exclusions
    finally:
        # Runs on every exit path, including the early "not found" return.
        conn.close()

def generate_enhanced_description_with_gemini(level3_code, level3_name, current_description, inclusions, exclusions, api_key):
    """
    Calls Gemini API to generate a more detailed description based on existing data.

    Args:
        level3_code: Uniformat Level 3 code of the element.
        level3_name: Name of the element.
        current_description: Existing description, or None/empty if there is none.
        inclusions: List of inclusion strings (may be empty).
        exclusions: List of exclusion strings (may be empty).
        api_key: API key passed to ``genai.configure``.

    Returns:
        The generated description with surrounding whitespace stripped, or
        None if the generation request or reading its text fails.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    # The bullet lists are built outside the f-string. The previous inline
    # '\\n- '.join(...) joined items with a literal backslash-n rather than a
    # newline, and a backslash inside an f-string expression does not compile
    # before Python 3.12. The separator indents each bullet to line up with
    # the first one inside the prompt template.
    bullet_separator = "\n    - "
    if inclusions:
        inclusions_block = '- ' + bullet_separator.join(inclusions)
    else:
        inclusions_block = 'No explicit inclusions listed.'
    if exclusions:
        exclusions_block = '- ' + bullet_separator.join(exclusions)
    else:
        exclusions_block = 'No explicit exclusions listed.'

    brief_description = current_description if current_description else 'No specific brief description found in the guide, rely on name, code, inclusions, and exclusions.'

    # Construct the prompt using all available information
    prompt = f"""
    You are an expert in Uniformat II classification for construction.
    Your task is to generate a comprehensive and highly detailed description for a Uniformat II Level 3 element.

    Here is the information about the element:
    Uniformat Code: {level3_code}
    Uniformat Name: {level3_name}
    Current Brief Description (from guide, if any): "{brief_description}"

    Items explicitly INCLUDED in this element:
    {inclusions_block}

    Items explicitly EXCLUDED from this element:
    {exclusions_block}

    Using all the provided information (code, name, brief description (if any), inclusions, and exclusions), and leveraging your expert knowledge of Uniformat II and general construction principles,
    generate a comprehensive and detailed description. The description should be professional, clear,
    and expand upon the provided context. It should be suitable for use in advanced cost estimation and project planning tools.
    Do NOT include the 'Includes' or 'Excludes' lists themselves in the output.
    Provide ONLY the enhanced description text, with no preamble or additional formatting.
    """

    try:
        print(f"Generating enhanced description for {level3_code}...")
        response = model.generate_content(
            prompt,
            generation_config=genai.GenerationConfig(
                temperature=0.7,
                max_output_tokens=500
            )
        )
        # --- DEBUG: Verify Gemini's Part 2 Raw Output ---
        print(f"\n--- DEBUG: Gemini Part 2 Raw Response Text for {level3_code} (first 500 chars) ---")
        print(response.text[:500])
        print("----------------------------------------------------\n")

        return response.text.strip()
    except Exception as e:
        print(f"Error generating enhanced description for {level3_code}: {e}")
        return None

def update_description_in_db(level3_code, new_description, db_name):
    """Store ``new_description`` on every uniformat_codes row with ``level3_code``.

    A short preview of the description is printed before the update. Any
    error is printed rather than raised, and the connection is always closed.
    """
    update_sql = "UPDATE uniformat_codes SET description = ? WHERE level3_code = ?"
    separator = "----------------------------------------------------\n"

    connection = sqlite3.connect(db_name)
    db_cursor = connection.cursor()
    try:
        # --- DEBUG: show what is about to be written ---
        print(f"--- DEBUG: Updating {level3_code} with new description (first 100 chars):")
        print(new_description[:100])
        print(separator)

        db_cursor.execute(update_sql, (new_description, level3_code))
        connection.commit()
        print(f"Enhanced description successfully updated for {level3_code}.")
    except Exception as err:
        print(f"Error updating description for {level3_code}: {err}")
    finally:
        connection.close()

# --- Main Workflow Execution ---

# End-to-end workflow: (1) create the schema and load the CSV rows, (2) extract
# inclusions/exclusions from the PDF via Gemini and store them, (3) generate and
# store an enhanced description for every distinct Level 3 code.
# Names assigned here are module-level and remain available to later cells.
if __name__ == "__main__":
    # 1. Setup Database and Load Initial Excel Data
    # `df` is the DataFrame loaded from the CSV in an earlier cell.
    setup_database(DB_NAME)
    insert_excel_data_clearing_first(df, DB_NAME)

    # 2. Part 1: Extract and Incorporate Inclusions/Exclusions from PDF
    print("\n--- Starting Part 1: Initial Inclusions/Exclusions Extraction from PDF ---")
    # Returns None on any extraction error and "" when no page yields text;
    # both are falsy and take the failure branch below.
    extracted_pdf_text = extract_text_from_pdf_pages(PDF_PATH, START_PAGE, END_PAGE)

    if extracted_pdf_text:
        initial_gemini_json = get_initial_uniformat_details_from_gemini_no_desc(extracted_pdf_text, GEMINI_API_KEY)
        if initial_gemini_json:
            incorporate_initial_gemini_data_into_db_no_desc(initial_gemini_json, DB_NAME)
            print("Part 1: Initial extraction and database incorporation complete.")
        else:
            print("Part 1: Failed to get valid JSON from Gemini for initial extraction.")
    else:
        print("Part 1: Failed to extract text from PDF.")

    # --- DEBUG: Verify Database State After Part 1 (First 5 uniformat_codes entries) ---
    print("\n--- DEBUG: Database State After Part 1 (First 5 uniformat_codes entries) ---")
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("SELECT id, level3_code, level4_code, description FROM uniformat_codes LIMIT 5;")
    for row in cursor.fetchall():
        print(row)
    conn.close()
    print("----------------------------------------------------------------\n")


    # 3. Part 2: Generate and Update Enhanced Descriptions
    # Part 2 runs even when Part 1 failed; it then works from names and codes only.
    print("\n--- Starting Part 2: Enhanced Description Generation ---")

    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    # Collect every distinct, non-NULL Level 3 code from the DB.
    cursor.execute("SELECT DISTINCT level3_code FROM uniformat_codes WHERE level3_code IS NOT NULL;")
    # Codes are stringified and stripped in Python, not in SQL.
    # NOTE(review): a code stored with surrounding whitespace would no longer
    # match the exact `level3_code = ?` lookups below - confirm the CSV is clean.
    all_level3_codes = [str(row[0]).strip() for row in cursor.fetchall()]
    conn.close()

    if not all_level3_codes:
        print("No Level 3 codes found in the database to enhance descriptions for.")
    else:
        # One Gemini request per distinct Level 3 code.
        for code in all_level3_codes:
            name, current_desc, incl, excl = get_level3_data_for_enhancement(code, DB_NAME)
            if name:
                enhanced_desc = generate_enhanced_description_with_gemini(code, name, current_desc, incl, excl, GEMINI_API_KEY)
                if enhanced_desc:
                    update_description_in_db(code, enhanced_desc, DB_NAME)
                else:
                    print(f"Skipping enhanced description update for {code} due to generation error.")
            else:
                print(f"Skipping enhanced description for {code}: data not found in DB.")

    print("Part 2: Enhanced description generation complete.")
    print("\n--- All processes finished. ---")

    # --- DEBUG: Verify Final Database State ---
    print("\n--- DEBUG: Database State After Part 2 (First 5 uniformat_codes entries) ---")
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("SELECT id, level3_code, level4_code, description FROM uniformat_codes LIMIT 5;")
    for row in cursor.fetchall():
        print(row)
    conn.close()
    print("----------------------------------------------------------------\n")

Database 'uniformat.db' and tables created successfully.
Excel data inserted into 'uniformat_codes' table.

--- Starting Part 1: Initial Inclusions/Exclusions Extraction from PDF ---
Sending request to Gemini API for initial extraction (without description)...
Gemini API call successful for initial extraction.

--- DEBUG: Gemini Part 1 Raw Response Text (first 500 chars) ---
[{"level3_code": "A1010", "level3_name": "Standard Foundations", "inclusions": ["wall & column foundations", "foundation walls up to level of top of slab on grade", "pile caps", "backfill & compaction", "footings & bases", "perimeter insulation", "perimeter drainage", "anchor plates", "dewatering"], "exclusions": ["general excavation to reduce levels (see section  G 1030, Site Earthwork)", "excavation for basements (see section A 2010, Basement Excavation)", "basement walls (see section A 2020, 
----------------------------------------------------


--- DEBUG: Gemini Part 1 Parsed JSON (first 2 entries + A1030 if f

In [7]:
# Display the value last assigned to `enhanced_desc` by the workflow loop in the
# previous cell (the result for the final Level 3 code that had a name).
enhanced_desc

In [None]:
# Path of the Uniformat II guide PDF to parse.
PDF_PATH = "pdf_parsing/uniformat-guide.pdf"
# 1-based, inclusive page range passed to extract_text_from_pdf_pages.
START_PAGE = 61 
END_PAGE = 83 
# SQLite database file used by every step of the workflow.
DB_NAME = "uniformat.db"
# Gemini API key read from the environment; None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# --- Step 1: Extract text from the PDF ---
def extract_text_from_pdf_pages(pdf_path, start_page, end_page):
    """Return the concatenated text of a 1-based, inclusive page range of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        start_page: First page to read, counted from 1.
        end_page: Last page to read, counted from 1 (inclusive).

    Returns:
        The text of the requested pages joined in order, or None when the file
        cannot be opened or any other error occurs (the error is printed).
        Reading stops with a warning at the first page past the end of the
        document, and the text gathered so far is returned.
    """
    doc = None
    page_texts = []
    try:
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        # PyMuPDF pages are 0-indexed, so adjust accordingly. Clamp at 0: a
        # negative index would count from the end of the document rather
        # than fail (per PyMuPDF's load_page semantics - confirm for the
        # installed version).
        first_index = max(start_page - 1, 0)
        for page_num in range(first_index, end_page):
            if page_num >= page_count:
                print(f"Warning: Page {page_num + 1} is out of bounds.")
                break # Stop if we hit end of document
            page_texts.append(doc.load_page(page_num).get_text())
    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        return None
    except Exception as e:
        print(f"An error occurred during PDF text extraction: {e}")
        return None
    finally:
        # Test identity, not truthiness: the document object has a length, so
        # an empty document would be falsy and would never be closed.
        if doc is not None:
            doc.close()
    return "".join(page_texts)

# JSON schema embedded in the Part 1 prompt: an array with one object per
# Level 3 element (code, name, inclusions, exclusions; no description field).
def _string_property(description):
    """Return a schema entry for a single string value."""
    return {"type": "string", "description": description}


def _string_list_property(description):
    """Return a schema entry for an array of strings."""
    return {"type": "array", "items": {"type": "string"}, "description": description}


_LEVEL3_ELEMENT_PROPERTIES = {
    "level3_code": _string_property(
        "The Uniformat Level 3 code, e.g., 'A1010', 'B2010'."
    ),
    "level3_name": _string_property(
        "The name corresponding to the Level 3 code, e.g., 'Standard Foundations', 'Exterior Walls'."
    ),
    "inclusions": _string_list_property(
        "A list of items explicitly included in this Uniformat element, extracted from the 'Includes' section (each bullet point as a separate string)."
    ),
    "exclusions": _string_list_property(
        "A list of items explicitly excluded from this Uniformat element, extracted from the 'Excludes' section (each bullet point as a separate string), retaining cross-references."
    ),
}

UNIFORMAT_EXTRACTION_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": _LEVEL3_ELEMENT_PROPERTIES,
        # Every property is required, in declaration order.
        "required": list(_LEVEL3_ELEMENT_PROPERTIES),
    },
}

def get_initial_uniformat_details_from_gemini_no_desc(text_content, api_key):
    """
    Calls Gemini API to extract Level 3 code, name, inclusions, and exclusions.
    Does NOT extract a description from the PDF.

    Args:
        text_content: Text of the guide section to send to the model.
        api_key: API key passed to ``genai.configure``.

    Returns:
        The parsed JSON array (a list, expected to follow
        UNIFORMAT_EXTRACTION_SCHEMA), or None when ``text_content`` is empty,
        the API call fails, or the response is not a JSON array.
    """
    if not text_content:
        print("No text content provided for Gemini API call.")
        return None

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    system_instruction = (
        "You are an expert in construction classification systems, specifically Uniformat II. "
        "Your task is to accurately extract detailed information about Uniformat Level 3 elements "
        "from the provided text, focusing solely on the element name, its code, and the explicit "
        "bulleted lists of inclusions and exclusions. Do NOT extract any general prose description "
        # The trailing space keeps this sentence from running into the next.
        "for the element, as none is provided directly preceding the lists. "
        "Provide the output strictly as a JSON array according to the specified schema."
    )

    prompt_parts = [
        {"text": f"{system_instruction}\n\n"
                 f"Extract the Uniformat II Level 3 element data from the following document section. "
                 f"For each element, identify its Level 3 code and name. "
                 f"Then, extract the separate lists of its explicit inclusions and exclusions, where each bullet point in the 'Includes' and 'Excludes' sections should be a distinct item in the respective list. "
                 f"Retain any cross-references like '(see section ...)' within the exclusion text.\n\n"
                 f"**Text Content:**\n{text_content}\n\n"
                 f"**Output Schema (JSON):**\n{json.dumps(UNIFORMAT_EXTRACTION_SCHEMA, indent=2)}\n\n"
                 f"Please provide ONLY the JSON array, with no preamble, explanation, or any other surrounding text. Ensure valid JSON."}
    ]

    # Bound before the try so the error handler can tell whether a response exists.
    response = None
    try:
        print("Sending request to Gemini API for initial extraction (without description)...")
        response = model.generate_content(
            prompt_parts,
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json",
                temperature=0.0
            )
        )
        print("Gemini API call successful for initial extraction.")
        parsed_json = json.loads(response.text)
        if not isinstance(parsed_json, list):
            # Callers iterate the result as a list of element dicts.
            raise ValueError(f"expected a JSON array, got {type(parsed_json).__name__}")
        return parsed_json
    except Exception as e:
        print(f"Error calling Gemini API or parsing initial extraction response: {e}")
        raw_text = 'N/A'
        if response is not None:
            # Reading .text may be what failed above; do not let it raise
            # again from inside this handler.
            try:
                raw_text = response.text
            except Exception:
                raw_text = 'N/A (response text unavailable)'
        print(f"Raw response text (for debugging): {raw_text}")
        return None

def incorporate_initial_gemini_data_into_db_no_desc(gemini_data, db_name):
    """
    Incorporates extracted inclusions and exclusions into the database.
    Does NOT update the 'description' field in uniformat_codes.

    For each element, the lowest-id ``uniformat_codes`` row with the same
    ``level3_code`` is looked up; its existing inclusion/exclusion rows are
    replaced by the element's lists. Elements without a code, or whose code
    has no matching row, are skipped with a printed message. All changes are
    committed together at the end; nothing is committed if an error occurs.

    Args:
        gemini_data: Iterable of dicts with 'level3_code', 'inclusions' and
            'exclusions' keys (the lists may be missing or None).
        db_name: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        print("\n--- Incorporating initial Gemini output (inclusions/exclusions only) into database ---")
        for element in gemini_data:
            level3_code = element.get('level3_code')
            # `or []` also covers an explicit JSON null, which .get() returns as None.
            inclusions = element.get('inclusions') or []
            exclusions = element.get('exclusions') or []

            if not level3_code:
                print(f"Skipping entry due to missing level3_code: {element}")
                continue

            # Stray whitespace around the model's code would defeat the exact match.
            level3_code = level3_code.strip()

            # Several rows can share one level3_code (one per Level 4 code).
            # ORDER BY makes the choice of linking row deterministic.
            cursor.execute("""
                SELECT id FROM uniformat_codes
                WHERE level3_code = ?
                ORDER BY id
                LIMIT 1;
            """, (level3_code,))
            uniformat_code_id_result = cursor.fetchone()

            if uniformat_code_id_result is None:
                print(f"Warning: No matching ID found in 'uniformat_codes' for Level 3 code '{level3_code}'. "
                      f"Ensure this Level 3 code exists from your Excel data. Skipping enrichment for this element.")
                continue

            uniformat_code_id = uniformat_code_id_result[0]

            # Clear existing inclusions/exclusions for this ID to prevent duplicates on rerun
            cursor.execute("DELETE FROM uniformat_inclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))
            cursor.execute("DELETE FROM uniformat_exclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))

            cursor.executemany(
                "INSERT INTO uniformat_inclusions (uniformat_code_id, inclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, inc_item.strip()) for inc_item in inclusions],
            )
            cursor.executemany(
                "INSERT INTO uniformat_exclusions (uniformat_code_id, exclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, exc_item.strip()) for exc_item in exclusions],
            )

        conn.commit()
    finally:
        # Closing without a commit discards partial changes after an error.
        conn.close()
    print("\nInitial Gemini output (inclusions/exclusions) successfully incorporated into database.")

# --- Part 2: Enhanced Description Generation ---

def get_level3_data_for_enhancement(level3_code, db_name):
    """Retrieves existing Level 3 data (name, description, inclusions, exclusions) from DB.

    The lowest-id ``uniformat_codes`` row with the given ``level3_code``
    supplies the name, the current description and the id used to look up
    inclusion/exclusion rows.

    Args:
        level3_code: Level 3 code to look up (matched exactly).
        db_name: Path of the SQLite database file.

    Returns:
        A tuple ``(level3_name, current_description, inclusions, exclusions)``
        where the last two are lists of strings, or ``(None, None, None, None)``
        when no row has that code.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        # One ordered query returns the id together with the name and
        # description, so all three are guaranteed to come from the same row.
        cursor.execute("""
            SELECT
                id,
                level3_name,
                description -- This will be NULL or empty before enhancement
            FROM
                uniformat_codes
            WHERE
                level3_code = ?
            ORDER BY id
            LIMIT 1;
        """, (level3_code,))
        main_info = cursor.fetchone()

        if not main_info:
            return None, None, None, None

        uniformat_code_id, level3_name, current_description = main_info

        # ORDER BY id returns the items in insertion order.
        cursor.execute(
            "SELECT inclusion_text FROM uniformat_inclusions WHERE uniformat_code_id = ? ORDER BY id",
            (uniformat_code_id,),
        )
        inclusions = [row[0] for row in cursor.fetchall()]
        cursor.execute(
            "SELECT exclusion_text FROM uniformat_exclusions WHERE uniformat_code_id = ? ORDER BY id",
            (uniformat_code_id,),
        )
        exclusions = [row[0] for row in cursor.fetchall()]

        return level3_name, current_description, inclusions, exclusions
    finally:
        # Runs on every exit path, including the early "not found" return.
        conn.close()

def generate_enhanced_description_with_gemini(level3_code, level3_name, current_description, inclusions, exclusions, api_key):
    """
    Calls Gemini API to generate a more detailed description based on existing data.

    Args:
        level3_code: Uniformat Level 3 code of the element.
        level3_name: Name of the element.
        current_description: Existing description, or None/empty if there is none.
        inclusions: List of inclusion strings (may be empty).
        exclusions: List of exclusion strings (may be empty).
        api_key: API key passed to ``genai.configure``.

    Returns:
        The generated description with surrounding whitespace stripped, or
        None if the generation request or reading its text fails.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    # The bullet lists are built outside the f-string. The previous inline
    # '\\n- '.join(...) joined items with a literal backslash-n rather than a
    # newline, and a backslash inside an f-string expression does not compile
    # before Python 3.12. The separator indents each bullet to line up with
    # the first one inside the prompt template.
    bullet_separator = "\n    - "
    if inclusions:
        inclusions_block = '- ' + bullet_separator.join(inclusions)
    else:
        inclusions_block = 'No explicit inclusions listed.'
    if exclusions:
        exclusions_block = '- ' + bullet_separator.join(exclusions)
    else:
        exclusions_block = 'No explicit exclusions listed.'

    brief_description = current_description if current_description else 'No specific brief description found in the guide, rely on name, code, inclusions, and exclusions.'

    # Construct the prompt using all available information
    prompt = f"""
    You are an expert in Uniformat II classification for construction.
    Your task is to generate a comprehensive and highly detailed description for a Uniformat II Level 3 element.

    Here is the information about the element:
    Uniformat Code: {level3_code}
    Uniformat Name: {level3_name}
    Current Brief Description (from guide, if any): "{brief_description}"

    Items explicitly INCLUDED in this element:
    {inclusions_block}

    Items explicitly EXCLUDED from this element:
    {exclusions_block}

    Using all the provided information (code, name, brief description (if any), inclusions, and exclusions), and leveraging your expert knowledge of Uniformat II and general construction principles,
    generate a comprehensive and detailed description. The description should be professional, clear,
    and expand upon the provided context. It should be suitable for use in advanced cost estimation and project planning tools.
    Do NOT include the 'Includes' or 'Excludes' lists themselves in the output and aim to minimize redundancy.
    Provide ONLY the enhanced description text, with no preamble or additional formatting.
    """

    try:
        print(f"Generating enhanced description for {level3_code}...")
        response = model.generate_content(
            prompt,
            generation_config=genai.GenerationConfig(
                temperature=0.7, # Higher temperature for more creative/detailed output
                max_output_tokens=500 # Limit output length if desired
            )
        )
        return response.text.strip()
    except Exception as e:
        print(f"Error generating enhanced description for {level3_code}: {e}")
        return None

def update_description_in_db(level3_code, new_description, db_name):
    """Store an enhanced description on every row of a Level 3 element.

    All rows of ``uniformat_codes`` whose ``level3_code`` equals
    *level3_code* get their ``description`` column set to *new_description*.

    Args:
        level3_code: Level 3 code whose rows should be updated.
        new_description: Text to write into the ``description`` column.
        db_name: Path of the SQLite database file.

    Returns:
        None. Database errors are reported on stdout instead of being raised,
        so one failing element does not abort a batch run.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute("UPDATE uniformat_codes SET description = ? WHERE level3_code = ?",
                       (new_description, level3_code))
        conn.commit()
        # rowcount is the number of rows the UPDATE touched; without this
        # check an unknown code would be reported as a successful update.
        if cursor.rowcount > 0:
            print(f"Enhanced description updated for {level3_code}.")
        else:
            print(f"No rows matched {level3_code}; description was not updated.")
    except sqlite3.Error as e:
        print(f"Error updating description for {level3_code}: {e}")
    finally:
        conn.close()

# --- Main Workflow Execution ---

if __name__ == "__main__":
    # Step 1: make sure the SQLite schema exists.
    setup_database(DB_NAME)

    # Step 2: seed the codes table. This is a hand-written stand-in for the
    # real Excel sheet: ten Level 4 rows, three under A1010 and seven under A1020.
    sample_excel_data = {
        'Type': ['Building'] * 10,
        'Level 1 Code': ['A'] * 10,
        'Level 1 Name': ['SUBSTRUCTURE'] * 10,
        'Level 2 Code': ['A10'] * 10,
        'Level 2 Name': ['Foundations'] * 10,
        'Level 3 Code': ['A1010'] * 3 + ['A1020'] * 7,
        'Level 3 Name': ['Standard Foundations'] * 3 + ['Special Foundations'] * 7,
        'Level 4 Code': [
            'A1011', 'A1012', 'A1013',
            'A1021', 'A1022', 'A1023', 'A1024', 'A1025', 'A1026', 'A1027',
        ],
        'Level 4 Name': [
            'Wall Foundations', 'Column Foundations & Pile Caps', 'Perimeter Drainage & Insulation',
            'Pile Foundations', 'Grade Beams', 'Caissons', 'Underpinning', 'Dewatering',
            'Raft Foundations', 'Pressure Injected Grouting',
        ],
    }
    df_excel = pd.DataFrame(sample_excel_data)
    insert_excel_data_clearing_first(df_excel, DB_NAME)

    # Step 3 (Part 1): pull inclusions/exclusions out of the PDF via Gemini.
    print("\n--- Starting Part 1: Initial Inclusions/Exclusions Extraction from PDF ---")
    extracted_pdf_text = extract_text_from_pdf_pages(PDF_PATH, START_PAGE, END_PAGE)

    if not extracted_pdf_text:
        print("Part 1: Failed to extract text from PDF.")
    else:
        initial_gemini_json = get_initial_uniformat_details_from_gemini_no_desc(extracted_pdf_text, GEMINI_API_KEY)
        if not initial_gemini_json:
            print("Part 1: Failed to get valid JSON from Gemini for initial extraction.")
        else:
            incorporate_initial_gemini_data_into_db_no_desc(initial_gemini_json, DB_NAME)
            print("Part 1: Initial extraction and database incorporation complete.")

    # Step 4 (Part 2): generate an enhanced description per Level 3 code.
    # Part 2 runs even when Part 1 failed; it works from whatever the DB holds.
    print("\n--- Starting Part 2: Enhanced Description Generation ---")

    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("SELECT DISTINCT level3_code FROM uniformat_codes WHERE level3_code IS NOT NULL;")
    all_level3_codes = [level3 for (level3,) in cursor.fetchall()]
    conn.close()

    if not all_level3_codes:
        print("No Level 3 codes found in the database to enhance descriptions for.")

    # An empty list simply skips the loop, so no else-branch is needed.
    for code in all_level3_codes:
        name, current_desc, incl, excl = get_level3_data_for_enhancement(code, DB_NAME)
        if not name:
            print(f"Skipping enhanced description for {code}: data not found in DB.")
            continue

        # A missing or empty current_desc is tolerated by the prompt builder.
        enhanced_desc = generate_enhanced_description_with_gemini(code, name, current_desc, incl, excl, GEMINI_API_KEY)
        if not enhanced_desc:
            print(f"Skipping enhanced description update for {code} due to generation error.")
            continue

        update_description_in_db(code, enhanced_desc, DB_NAME)

    print("Part 2: Enhanced description generation complete.")
    print("\n--- All processes finished. ---")

Database 'uniformat.db' and tables created successfully.
Excel data inserted into 'uniformat_codes' table.

--- Starting Part 1: Initial Inclusions/Exclusions Extraction from PDF ---
Sending request to Gemini API for initial extraction (without description)...
Gemini API call successful for initial extraction.

--- Incorporating initial Gemini output (inclusions/exclusions only) into database ---

Initial Gemini output (inclusions/exclusions) successfully incorporated into database.
Part 1: Initial extraction and database incorporation complete.

--- Starting Part 2: Enhanced Description Generation ---
Generating enhanced description for A1010...
Enhanced description updated for A1010.
Generating enhanced description for A1020...
Enhanced description updated for A1020.
Part 2: Enhanced description generation complete.

--- All processes finished. ---


In [29]:
enhanced_desc

'A1020 Special Foundations encompasses all foundation elements beyond standard spread footings, strip footings, and isolated piers. This category includes deep foundation systems such as piles (driven, bored, or helical), caissons (drilled or poured-in-place), and underpinning systems designed to support existing structures or to address challenging soil conditions.  It also incorporates complex foundation solutions like raft foundations, which distribute loads over a large area, and grade beams, which tie individual foundation elements together for improved structural stability. Dewatering systems, essential for maintaining stable excavation conditions during the construction of special foundations in saturated soils, are explicitly included.  Rock excavation is only included if it is integral to the construction of the special foundation itself; otherwise, it falls under standard foundation or excavation categories.  Pile caps, being a separate structural component atop the piles, ar

In [24]:
def insert_gemini_output_into_db(gemini_data, db_name="uniformat.db"):
    """Write Gemini-extracted Level 3 details into the SQLite database.

    For each element of *gemini_data* the function:

    1. looks up the lowest-``id`` row of ``uniformat_codes`` with the same
       ``level3_code`` and uses it as the anchor row for the element;
    2. writes the element's ``description`` to *every* row sharing that
       ``level3_code`` (the Level 3 code is repeated on each Level 4 row);
    3. replaces the anchor row's entries in ``uniformat_inclusions`` and
       ``uniformat_exclusions`` so reruns do not accumulate duplicates.

    Elements without a ``level3_code``, or whose code is not present in
    ``uniformat_codes``, are skipped with a message on stdout.

    Args:
        gemini_data: Iterable of dicts with the keys ``level3_code`` and,
            optionally, ``description``, ``inclusions`` and ``exclusions``.
        db_name: Path of the SQLite database file.

    Returns:
        None. All changes are committed in a single transaction; if any
        statement raises, nothing is committed and the exception propagates.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        print("\n--- Inserting Gemini output into database ---")
        for element in gemini_data or []:
            level3_code = element.get('level3_code') if isinstance(element, dict) else None
            if not level3_code:
                print(f"Skipping entry due to missing level3_code: {element}")
                continue

            # None means "no description supplied" and must not wipe stored text.
            description = element.get('description')
            # The model may emit null instead of an empty list.
            inclusions = element.get('inclusions') or []
            exclusions = element.get('exclusions') or []

            # ORDER BY makes the anchor row deterministic, so the DELETEs
            # below always hit the rows a previous run inserted.
            cursor.execute(
                "SELECT id FROM uniformat_codes WHERE level3_code = ? ORDER BY id LIMIT 1",
                (level3_code,),
            )
            anchor_row = cursor.fetchone()
            if anchor_row is None:
                print(f"Warning: No matching entry found in 'uniformat_codes' for Level 3 code '{level3_code}'. "
                      f"Skipping description, inclusions, and exclusions for this element. "
                      f"Ensure your Excel data fully covers Level 3 elements for proper linking.")
                continue
            uniformat_code_id = anchor_row[0]

            # 1. Description: applied to all rows that share this Level 3 code.
            if description is not None:
                cursor.execute("UPDATE uniformat_codes SET description = ? WHERE level3_code = ?",
                               (description, level3_code))
                print(f"Updated description for {level3_code}")
            else:
                print(f"No description provided for {level3_code}; existing description kept.")

            # 2. Drop the anchor row's previous inclusions/exclusions.
            cursor.execute("DELETE FROM uniformat_inclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))
            cursor.execute("DELETE FROM uniformat_exclusions WHERE uniformat_code_id = ?", (uniformat_code_id,))

            # 3. Insert inclusions.
            cursor.executemany(
                "INSERT INTO uniformat_inclusions (uniformat_code_id, inclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, inc) for inc in inclusions],
            )
            print(f"Inserted {len(inclusions)} inclusions for {level3_code}")

            # 4. Insert exclusions.
            cursor.executemany(
                "INSERT INTO uniformat_exclusions (uniformat_code_id, exclusion_text) VALUES (?, ?)",
                [(uniformat_code_id, exc) for exc in exclusions],
            )
            print(f"Inserted {len(exclusions)} exclusions for {level3_code}")

        conn.commit()
    finally:
        # Closing without a commit discards pending changes, so a failure
        # mid-batch leaves the database untouched and releases the file lock.
        conn.close()
    print("\nGemini output successfully incorporated into database.")

# --- Final Execution Flow ---
if __name__ == "__main__":
    # Prerequisite: the schema must exist and the Excel rows must already be
    # loaded, e.g.
    #   setup_database()
    #   insert_excel_data_clearing_first(your_dataframe)

    # Stage 1: raw text of the relevant PDF pages.
    extracted_pdf_text = extract_text_from_pdf_pages(PDF_PATH, START_PAGE, END_PAGE)

    if not extracted_pdf_text:
        print("Process stopped: Could not extract text from PDF.")
    else:
        # Stage 2: let Gemini turn the text into structured JSON.
        gemini_output_json = get_uniformat_data_from_gemini(extracted_pdf_text)

        if not gemini_output_json:
            print("Process stopped: No valid JSON data from Gemini.")
        else:
            # Stage 3: persist descriptions, inclusions and exclusions.
            insert_gemini_output_into_db(gemini_output_json, DB_NAME)
            print("Data integration complete.")

Sending request to Gemini API...
Gemini API call successful.

--- Inserting Gemini output into database ---
Updated description for A1010
Inserted 9 inclusions for A1010
Inserted 4 exclusions for A1010
Updated description for A1020
Inserted 7 inclusions for A1020
Inserted 2 exclusions for A1020
Updated description for A1030
Inserted 8 inclusions for A1030
Inserted 2 exclusions for A1030
Updated description for A2010
Inserted 3 inclusions for A2010
Inserted 1 exclusions for A2010
Updated description for A2020
Inserted 2 inclusions for A2020
Inserted 2 exclusions for A2020
Updated description for B1010
Inserted 9 inclusions for B1010
Inserted 4 exclusions for B1010
Updated description for B1020
Inserted 5 inclusions for B1020
Inserted 3 exclusions for B1020
Updated description for B2010
Inserted 7 inclusions for B2010
Inserted 6 exclusions for B2010
Updated description for B2020
Inserted 5 inclusions for B2020
Inserted 1 exclusions for B2020
Updated description for B2030
Inserted 4 inclu

### Use LLM to enrich SQL database
- Populate the `uniformat_inclusions` and `uniformat_exclusions` tables with the items extracted by the LLM
- Fill in the detailed `description` column of the `uniformat_codes` table

In [20]:
# Using Google AI Python SDK
import google.generativeai as genai
import json
import os

# Configure the SDK with the API key taken from the GEMINI_API_KEY environment
# variable. os.environ.get returns None when the variable is unset, so no
# error is raised on this line in that case.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Model handle used for the extraction request later in this cell.
model = genai.GenerativeModel('gemini-2.5-flash')

# Sample text in the layout of the guide's Appendix B: a Level 2 heading, a
# Level 3 heading, then bulleted "Includes" / "Excludes" lists. It is embedded
# verbatim in the prompt built further down in this cell.
# NOTE(review): the "B 20 Substructure" heading looks like a placeholder
# (it is labelled "another example") -- confirm against the actual guide.
appendix_b_snippet = """
APPENDIX B UNIFORMAT II, Level-3 ELEMENT
DESCRIPTIONS—List of Inclusions and Exclusions
...
A 10 Foundations
A 1010 Standard Foundations
Includes
• wall & column foundations
• foundation walls up to level of top of slab on grade
• pile caps
• backfill & compaction
• footings & bases
• perimeter insulation
• perimeter drainage
• anchor plates
• dewatering
Excludes
• general excavation to reduce levels (see section G 1030, Site Earthwork)
• excavation for basements (see section A 2010, Basement Excavation)
• basement walls (see section A 2020, Basement Walls)
• under-slab drainage and insulation (see section A 1030, Slab on Grade)

B 20 Substructure (another example)
B 2010 Exterior Walls
Includes
• all exterior load-bearing walls
• curtain walls
• exterior skin finishes
Excludes
• interior partitions (see C 1010)
"""

# JSON-Schema-style description of the expected model output: an array with
# one object per Level 3 element. It is serialized with json.dumps and pasted
# into the prompt text below.
# "description" is deliberately absent from "required", so downstream code
# must cope with elements that carry no description.
# (A Pydantic model would be a more robust way to define this schema.)
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "level3_code": {"type": "string", "description": "The Uniformat Level 3 code, e.g., A1010"},
            "level3_name": {"type": "string", "description": "The name of the Level 3 element, e.g., Standard Foundations"},
            "description": {"type": "string", "description": "A general description of the element."},
            "inclusions": {
                "type": "array",
                "items": {"type": "string"},
                "description": "A list of items included in this element."
            },
            "exclusions": {
                "type": "array",
                "items": {"type": "string"},
                "description": "A list of items excluded from this element, often with cross-references."
            }
        },
        "required": ["level3_code", "level3_name", "inclusions", "exclusions"]
    }
}


# Build the prompt for structured output.
# Variant for a real PDF file (kept for reference, not executed):
# prompt_parts = [
#     genai.upload_file("path/to/your/uniformat_guide.pdf"),
#     {"text": "Extract the Uniformat II Level 3 element descriptions, including their inclusions and exclusions, from this document. Format the output as a JSON array according to the following schema:"},
#     {"text": json.dumps(output_schema)}, # Provide the schema directly in the prompt
#     {"text": "Ensure that cross-references like '(see section ...)' are captured within the exclusion text."}
# ]

# Variant used here: the sample text and the serialized schema are both
# inlined into the first text part; the second part adds the instruction to
# keep "(see section ...)" cross-references inside the exclusion strings.
prompt_parts = [
    {"text": f"Extract the Uniformat II Level 3 element descriptions, including their inclusions and exclusions, from the following text. Format the output as a JSON array according to the following schema: {json.dumps(output_schema)}\n\nText:\n{appendix_b_snippet}"},
    {"text": "Ensure that cross-references like '(see section ...)' are captured within the exclusion text."}
]

# Ask the model for a JSON reply, then parse it. The request and the parsing
# are handled separately so a malformed reply can be shown for debugging
# instead of being hidden behind a generic error message.
try:
    # Request JSON output so the reply can be fed straight to json.loads.
    response = model.generate_content(
        prompt_parts,
        generation_config=genai.GenerationConfig(response_mime_type="application/json")
    )
    # Read the text inside this try: accessing it is part of the API
    # interaction and is covered by the same best-effort handler.
    raw_response_text = response.text
except Exception as e:
    print(f"An error occurred: {e}")
else:
    try:
        # Parse the JSON response
        extracted_data = json.loads(raw_response_text)
    except json.JSONDecodeError as e:
        print(f"An error occurred: {e}")
        # Show what the model actually returned so the bad JSON can be inspected.
        print(f"Raw response text:\n{raw_response_text}")
    else:
        print(json.dumps(extracted_data, indent=2))

        # extracted_data can now be iterated and written to the SQL tables
        # (e.g. with insert_gemini_output_into_db).

[
  {
    "level3_code": "A1010",
    "level3_name": "Standard Foundations",
    "inclusions": [
      "wall & column foundations",
      "foundation walls up to level of top of slab on grade",
      "pile caps",
      "backfill & compaction",
      "footings & bases",
      "perimeter insulation",
      "perimeter drainage",
      "anchor plates",
      "dewatering"
    ],
    "exclusions": [
      "general excavation to reduce levels (see section G 1030, Site Earthwork)",
      "excavation for basements (see section A 2010, Basement Excavation)",
      "basement walls (see section A 2020, Basement Walls)",
      "under-slab drainage and insulation (see section A 1030, Slab on Grade)"
    ]
  },
  {
    "level3_code": "B2010",
    "level3_name": "Exterior Walls",
    "inclusions": [
      "all exterior load-bearing walls",
      "curtain walls",
      "exterior skin finishes"
    ],
    "exclusions": [
      "interior partitions (see C 1010)"
    ]
  }
]
