In [None]:
import sqlite3
import pandas as pd
from datetime import date
from docstore.extract.input import OpenAIExtractor

# Model identifier handed to the extractor; the same value is written to the
# `model` column of every row stored further down in this notebook.
model = "gpt-4.1-nano"

# Project extractor instance used later via `extract_properties(text)`.
# `api_key_file` is a filesystem path, so no key material appears in the notebook.
openai_extract = OpenAIExtractor(
    api_key_file="/home/.openai/key",
    model=model,
)

In [None]:
# Open the SQLite database and grab a cursor; both are reused by later cells.
conn = sqlite3.connect("/home/jovyan/data/ecoi.db")
cursor = conn.cursor()

# DDL for the `page_properties` table, which holds one row of extracted
# properties per page id. `IF NOT EXISTS` makes the cell safe to re-run.
# NOTE: the column is spelled `sucessful` (sic); the INSERT in this notebook
# uses the same spelling, so the two must be changed together if ever renamed.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS page_properties (
        id INTEGER PRIMARY KEY,
        country TEXT,
        published TEXT,
        document_type TEXT,
        language TEXT,
        document_id INT,
        original_link TEXT,
        model TEXT,
        sucessful BOOL,
        requested TEXT
    )
"""

# Index on the `published` column.
published_index_sql = """
    CREATE INDEX IF NOT EXISTS idx_page_properties_published
    ON page_properties (published)
    """

# Index on the `requested` column.
requested_index_sql = """
    CREATE INDEX IF NOT EXISTS idx_page_properties_requested
    ON page_properties (requested)
    """

cursor.execute(create_table_sql)
cursor.execute(published_index_sql)
cursor.execute(requested_index_sql)

In [None]:
# Quick check: number of rows currently stored in `page_properties`.
pd.read_sql(
    sql="SELECT count(*) FROM page_properties",
    con=conn,
)

In [None]:
# Load every page flagged as existing that has no row in `page_properties`
# yet. Because already-processed ids are excluded, re-running this cell after
# an interruption resumes where the previous run stopped.
# `source` and `requested` are selected but not used by the loop below.
X = pd.read_sql(
    """
    SELECT
        p.id, source, output, requested
    FROM pages p
    WHERE p.exist AND p.id NOT IN (SELECT id FROM page_properties)
    """,
    con=conn
)

# The statement is loop-invariant, so it is built once. INSERT OR REPLACE
# keyed on the primary key `id` keeps the write idempotent per page.
insert_sql = """
    INSERT OR REPLACE INTO page_properties (
        id,
        country,
        published,
        document_type,
        language,
        document_id,
        original_link,
        model,
        sucessful,
        requested
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Walk only the two columns that are needed rather than using iterrows(),
# which builds a full Series for every row. tolist() yields native Python
# scalars, which sqlite3 can bind directly.
# The loop variable is `page_id`, not `id`, so the builtin id() is not
# overwritten in the kernel namespace for the rest of the session.
for page_id, text in zip(X["id"].tolist(), X["output"].tolist()):

    # '\r' keeps the progress display on a single, continuously updated line.
    print(f"id={page_id}. textlen={len(text)}", end='\r', flush=True)

    # Call into the extractor; any exception raised here aborts the loop.
    properties = openai_extract.extract_properties(text)

    print(f"id={page_id}. textlen={len(text)}. properties={properties}", end='\r', flush=True)

    cursor.execute(
        insert_sql,
        (
            page_id,
            properties["country"],
            properties["published"],
            properties["document_type"],
            properties["language"],
            properties["document_id"],
            properties["original_link"],
            model,
            True,  # `sucessful` is always True: failures raise before reaching the insert
            str(date.today()),  # date the extraction was requested, ISO formatted
        ),
    )

    # Commit after every row so results already obtained survive a failure
    # on a later page.
    conn.commit()

# Closes the connection opened in the setup cell; that cell has to be re-run
# before any cell using `conn` or `cursor` is executed again.
conn.close()