# EMMA Public Solicitations Data Analysis

This notebook connects to the PostgreSQL database and analyzes the EMMA Maryland public solicitations and contracts data collected by our Dagster pipeline, from the raw bronze-layer HTML snapshots through to the gold-layer tables.

In [6]:
# Import required libraries
import pandas as pd
import json
from sqlalchemy import create_engine, text
from datetime import datetime
import os
from dotenv import load_dotenv

# Load environment variables.
# load_dotenv comes from the `dotenv` package imported above; presumably it reads
# a local .env file into the process environment - TODO confirm a .env file is
# expected next to this notebook.
load_dotenv()

# Simple confirmation that the import cell ran to completion.
print("Libraries imported successfully")

Libraries imported successfully


In [7]:
# Database connection setup
# The connection string carries credentials, so it is read from the environment
# (populated by load_dotenv() in the imports cell) rather than written into the
# notebook, where it would be committed and shared along with the outputs.
# Expected form: postgresql://USER:PASSWORD@HOST:PORT/DBNAME
DB_URL = os.getenv("DATABASE_URL")
if not DB_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Define it in the environment or in a .env file, "
        "e.g. postgresql://USER:PASSWORD@localhost:5432/engineering"
    )

# Create SQLAlchemy engine (connections are opened lazily, on first use)
engine = create_engine(DB_URL)

# Test connection: fail early here rather than in a later query cell
with engine.connect() as conn:
    result = conn.execute(text("SELECT version();"))
    version = result.fetchone()[0]
    print(f"Connected to PostgreSQL: {version}")

Connected to PostgreSQL: PostgreSQL 16.9 (Ubuntu 16.9-0ubuntu0.24.04.1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0, 64-bit


In [8]:
# Fetch the most recent raw HTML snapshot of the solicitations page
from bs4 import BeautifulSoup
import re

# Newest bronze-layer record only (sorted by timestamp, one row)
query_html = """
SELECT 
    id,
    timestamp,
    value->>'raw_html' as raw_html
FROM bronze.emma_public_solicitations
ORDER BY timestamp DESC
LIMIT 1;
"""

df_html = pd.read_sql(query_html, engine)

# Fall back to placeholders when the bronze table has no rows yet
if len(df_html) > 0:
    raw_html = df_html.iloc[0]['raw_html']
    latest_timestamp = df_html.iloc[0]['timestamp']
else:
    raw_html = None
    latest_timestamp = 'None'

html_length = len(raw_html) if raw_html else 0
print(f"Retrieved HTML content of length: {html_length}")
print(f"Timestamp: {latest_timestamp}")

# Per-record metadata for every stored snapshot (no HTML payload, just its size)
query_metadata = """
SELECT 
    id,
    timestamp,
    created_at,
    length(value->>'raw_html') as html_length,
    value->>'url' as url,
    value->>'status' as status,
    value->>'body_length' as body_length
FROM bronze.emma_public_solicitations
ORDER BY timestamp DESC;
"""

df_metadata = pd.read_sql(query_metadata, engine)
record_count = len(df_metadata)
print(f"Found {record_count} records in the database")
display(df_metadata)

Retrieved HTML content of length: 558634
Timestamp: 2025-06-09 02:49:42.431903+00:00
Found 1 records in the database


Unnamed: 0,id,timestamp,created_at,html_length,url,status,body_length
0,1,2025-06-09 02:49:42.431903+00:00,2025-06-09 06:49:42.432242+00:00,558634,https://emma.maryland.gov/page.aspx/en/rfp/req...,200,558634


In [9]:
# Look closely at table 11 (index 10) of the solicitations page: header, sample rows, links
if not raw_html:
    print("No HTML content available")
else:
    soup = BeautifulSoup(raw_html, 'html.parser')
    tables = soup.find_all('table')

    if len(tables) < 11:
        print(f"Cannot access table 11 - only {len(tables)} tables found")
    else:
        table_11 = tables[10]  # Table 11 (0-indexed)
        rows = table_11.find_all('tr')

        print("=== TABLE 11 DETAILED INSPECTION ===")
        print(f"Total rows: {len(rows)}")

        # The first <tr> is treated as the header row
        if rows:
            header_cells = [
                header_cell.get_text(strip=True)
                for header_cell in rows[0].find_all(['th', 'td'])
            ]
            print(f"Header row ({len(header_cells)} columns): {header_cells}")

        # Sample of the data rows that follow the header
        print("\nFirst 5 data rows:")
        for row_no, data_row in enumerate(rows[1:6], start=1):
            cells = [
                data_cell.get_text(strip=True)
                for data_cell in data_row.find_all(['td', 'th'])
            ]
            print(f"Row {row_no} ({len(cells)} columns): {cells}")

        # Report any anchors found in the first three data rows
        print("\nChecking for links in first few rows:")
        for row_no, data_row in enumerate(rows[1:4], start=1):
            links = data_row.find_all('a')
            if not links:
                print(f"Row {row_no} has no links")
                continue
            hrefs = [anchor.get('href') for anchor in links]
            print(f"Row {row_no} has {len(links)} links: {hrefs}")

        print("\nTable 11 structure looks good for extraction!")

=== TABLE 11 DETAILED INSPECTION ===
Total rows: 26
Header row (19 columns): ['Editing column', 'ID', 'Title', 'Status', 'Due / Close Date', 'Publish Date UTC-4', 'Main Category', 'Solicitation Type', 'Issuing Agency', 'Auto opening', 'Round #', 'Award Status', 'Procurement Officer / Buyer', 'Authority', 'Sub Agency', 'Site', 'Bid Holders List', 'eMM ID', 'Orga x_id']

First 5 data rows:
Row 1 (19 columns): ['Edit Pest Control Services', 'BPM051657', 'Pest Control Services', 'Open', '7/15/2025', '6/6/2025 5:41:02 PM', 'Other', 'Public Notice', 'Maryland Transportation Authority', 'Auto opening', '1', 'In progress', 'Wheeler Akia', 'State of Maryland Government', 'Maryland Transportation Authority', '', '', '', 'auth;J07']
Row 2 (19 columns): ['Edit MAA-MC-26-002 Integrated Airport Security System (IASS) Maintenance at BWI Thurgood Marshall Airport', 'BPM051572', 'MAA-MC-26-002 Integrated Airport Security System (IASS) Maintenance at BWI Thurgood Marshall Airport', 'Open', '7/10/2025', 

# EMMA Public Contracts Data Analysis

Now let's analyze the contracts data in a similar way.

In [ ]:
# Fetch the most recent raw HTML snapshot of the contracts page
# Newest bronze-layer record only (sorted by timestamp, one row)
query_contracts_html = """
SELECT 
    id,
    timestamp,
    value->>'raw_html' as raw_html
FROM bronze.emma_public_contracts
ORDER BY timestamp DESC
LIMIT 1;
"""

df_contracts_html = pd.read_sql(query_contracts_html, engine)

# Fall back to placeholders when the bronze table has no rows yet
if len(df_contracts_html) > 0:
    contracts_raw_html = df_contracts_html.iloc[0]['raw_html']
    contracts_latest_timestamp = df_contracts_html.iloc[0]['timestamp']
else:
    contracts_raw_html = None
    contracts_latest_timestamp = 'None'

contracts_html_length = len(contracts_raw_html) if contracts_raw_html else 0
print(f"Retrieved contracts HTML content of length: {contracts_html_length}")
print(f"Timestamp: {contracts_latest_timestamp}")

# Per-record metadata for every stored contracts snapshot
query_contracts_metadata = """
SELECT 
    id,
    timestamp,
    created_at,
    length(value->>'raw_html') as html_length,
    value->>'url' as url,
    value->>'status' as status,
    value->>'body_length' as body_length
FROM bronze.emma_public_contracts
ORDER BY timestamp DESC;
"""

df_contracts_metadata = pd.read_sql(query_contracts_metadata, engine)
contracts_record_count = len(df_contracts_metadata)
print(f"\nFound {contracts_record_count} contract records in the database")
display(df_contracts_metadata)

In [ ]:
# Survey every table on the contracts page to find the one holding contract data
if not contracts_raw_html:
    print("No contracts HTML content available")
else:
    contracts_soup = BeautifulSoup(contracts_raw_html, 'html.parser')
    contracts_tables = contracts_soup.find_all('table')

    print(f"Total tables in contracts page: {len(contracts_tables)}")

    for table_number, table in enumerate(contracts_tables, start=1):
        rows = table.find_all('tr')
        if not rows:
            # Tables without any <tr> are skipped silently
            continue

        # The first row is taken as the header row
        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
        print(f"\n=== TABLE {table_number} ===")
        if len(headers) > 5:
            print(f"Rows: {len(rows)}, Headers: {headers[:5]}...")
        else:
            print(f"Rows: {len(rows)}, Headers: {headers}")

        # Heuristic: a large table whose headers mention "contract" or "id"
        mentions_contract_or_id = any(
            'contract' in header.lower() or 'id' in header.lower()
            for header in headers
        )
        if len(rows) > 10 and mentions_contract_or_id:
            print("  -> This looks like the main contracts table!")
            # More than 10 rows guarantees a first data row exists
            first_data = [cell.get_text(strip=True) for cell in rows[1].find_all(['td', 'th'])]
            print(f"  -> First data row: {first_data[:3]}...")

In [ ]:
# Extract contracts data from table 11 (the correct table)
# NOTE(review): the table position was found by manual inspection of the page;
# if the page layout changes, this index has to be re-checked.
CONTRACTS_TABLE_INDEX = 10  # Table 11 (0-indexed)

contracts_data = []

if contracts_raw_html:
    contracts_soup = BeautifulSoup(contracts_raw_html, 'html.parser')
    contracts_tables = contracts_soup.find_all('table')

    print(f"Found {len(contracts_tables)} total tables in contracts page")

    if len(contracts_tables) > CONTRACTS_TABLE_INDEX:
        target_table = contracts_tables[CONTRACTS_TABLE_INDEX]
        rows = target_table.find_all('tr')

        print(f"Table 11 has {len(rows)} rows (including header)")

        if len(rows) > 1:
            # Get headers from first row
            headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
            print(f"Headers: {headers}")

            # Extract data rows (skip header row); i is the 1-based data-row number
            for i, row in enumerate(rows[1:], 1):
                cells = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]

                # Skip rows that carry no text at all
                if not any(cell.strip() for cell in cells):
                    continue

                # Capture the real cell count BEFORE padding, otherwise this
                # field can never reveal a short row.
                original_row_length = len(cells)

                # Pad short rows so every header gets a value; cells beyond the
                # header count are dropped by zip() below.
                cells += [''] * (len(headers) - len(cells))

                record = dict(zip(headers, cells))

                # Add metadata
                record['row_number'] = i
                record['original_row_length'] = original_row_length

                contracts_data.append(record)

                # Show first 3 rows for debugging
                if i <= 3:
                    print(f"Row {i}: {cells[:3]}...")

            data_rows_processed = len(contracts_data)
            print(f"Processed {data_rows_processed} data rows")

            # Surface rows whose cell count does not match the header count
            mismatched_rows = sum(
                1 for record in contracts_data
                if record['original_row_length'] != len(headers)
            )
            if mismatched_rows:
                print(f"Warning: {mismatched_rows} rows did not have {len(headers)} cells")
        else:
            print("Table 11 has no data rows")
    else:
        print(f"Only found {len(contracts_tables)} tables, cannot access table 11")

# Create DataFrame
if contracts_data:
    df_contracts = pd.DataFrame(contracts_data)
    print(f"\nExtracted {len(df_contracts)} contract records from table 11")
    print(f"Columns: {list(df_contracts.columns)}")

    # Remove columns that are blank in every row. Scraped blanks are empty
    # strings rather than NaN, so dropna() alone would never remove them.
    empty_columns = [
        col for col in df_contracts.columns
        if (df_contracts[col].isna() | df_contracts[col].astype(str).str.strip().eq('')).all()
    ]
    if empty_columns:
        print(f"Dropping empty columns: {empty_columns}")
    df_contracts = df_contracts.drop(columns=empty_columns)

    # Clean up column names
    df_contracts.columns = [col.strip() for col in df_contracts.columns]

    # Display basic info
    print(f"\nDataFrame shape: {df_contracts.shape}")
    print(f"Cleaned columns: {list(df_contracts.columns)}")

    # Display first few records
    print("\nFirst 5 contracts:")
    display(df_contracts.head())

    # Show some statistics
    if 'Contract Type' in df_contracts.columns:
        print("\nContract Type distribution:")
        print(df_contracts['Contract Type'].value_counts())

    if 'Vendor' in df_contracts.columns:
        print("\nTop 5 vendors by contract count:")
        print(df_contracts['Vendor'].value_counts().head())

else:
    print("No contract data extracted from table 11")
    df_contracts = pd.DataFrame()

In [ ]:
# Summary - Compare Solicitations and Contracts Data
# NOTE: df_solicitations is created by the solicitations extraction cell, which
# sits after this cell in the notebook; run that cell first, otherwise this
# summary reports the solicitations as unavailable.


def date_range_bounds(date_strings):
    """Return (earliest, latest) for a column of date strings, compared as dates.

    The scraped dates are text such as '7/15/2025'. Taking min()/max() of the
    raw strings compares them character by character ('10/1/2025' sorts before
    '6/16/2025'), so the values are parsed first. Unparseable or blank entries
    become NaT and are ignored by min()/max().
    """
    parsed = pd.to_datetime(date_strings, errors='coerce')
    return parsed.min(), parsed.max()


def format_date(value):
    """Render a parsed date as YYYY-MM-DD, or 'n/a' when it is missing (NaT)."""
    return 'n/a' if pd.isna(value) else value.strftime('%Y-%m-%d')


print("=== DATA SUMMARY ===\n")

# Solicitations summary
has_solicitations = 'df_solicitations' in globals() and not df_solicitations.empty
if has_solicitations:
    print("SOLICITATIONS DATA:")
    print(f"- Total records: {len(df_solicitations)}")
    print(f"- Columns: {df_solicitations.shape[1]}")
    print("- Key fields: ID, Title, Status, Due/Close Date, Issuing Agency")
    if 'Status' in df_solicitations.columns:
        print(f"- All solicitations are: {df_solicitations['Status'].unique()}")
    if 'Due / Close Date' in df_solicitations.columns:
        earliest_due, latest_due = date_range_bounds(df_solicitations['Due / Close Date'])
        print(f"- Date range: {format_date(earliest_due)} to {format_date(latest_due)}")
else:
    print("No solicitations data available")

print("\n")

# Contracts summary
has_contracts = 'df_contracts' in globals() and not df_contracts.empty
if has_contracts:
    print("CONTRACTS DATA:")
    print(f"- Total records: {len(df_contracts)}")
    print(f"- Columns: {df_contracts.shape[1]}")
    print("- Key fields: Code, Contract Title, Vendor, Contract Type, Effective/Expiration Date")
    if 'Contract Type' in df_contracts.columns:
        print(f"- Contract types: {df_contracts['Contract Type'].unique()}")
    if 'Effective Date' in df_contracts.columns and 'Expiration Date' in df_contracts.columns:
        # Range runs from the earliest start to the latest end
        earliest_effective = date_range_bounds(df_contracts['Effective Date'])[0]
        latest_expiration = date_range_bounds(df_contracts['Expiration Date'])[1]
        print(f"- Date range: {format_date(earliest_effective)} to {format_date(latest_expiration)}")
else:
    print("No contracts data available")

# Only claim success for the datasets that were actually extracted
if has_solicitations and has_contracts:
    print("\n=== BOTH DATASETS SUCCESSFULLY EXTRACTED INTO DATAFRAMES ===")
    print("\nYou can now work with:")
    print("- df_solicitations: Contains all open solicitations")
    print("- df_contracts: Contains all current contracts")
elif has_solicitations or has_contracts:
    print("\n=== ONLY ONE DATASET IS AVAILABLE ===")
    print("\nYou can now work with:")
    if has_solicitations:
        print("- df_solicitations: Contains all open solicitations")
    if has_contracts:
        print("- df_contracts: Contains all current contracts")
else:
    print("\n=== NO DATASETS AVAILABLE - run the extraction cells first ===")

In [10]:
# Extract solicitation data from table 11 specifically (the correct table with 26 rows)
# NOTE(review): the table position was found by manual inspection of the page;
# if the page layout changes, this index has to be re-checked.
SOLICITATIONS_TABLE_INDEX = 10  # Table 11 (0-indexed)

solicitations_data = []

if raw_html:
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Get all tables
    tables = soup.find_all('table')
    print(f"Found {len(tables)} total tables")

    if len(tables) > SOLICITATIONS_TABLE_INDEX:
        target_table = tables[SOLICITATIONS_TABLE_INDEX]
        rows = target_table.find_all('tr')

        print(f"Table 11 has {len(rows)} rows (including header)")

        if len(rows) > 1:  # Has header and data rows
            # Get headers from first row
            headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
            print(f"Headers: {headers}")

            # Extract data rows (skip header row); i is the 1-based data-row number
            for i, row in enumerate(rows[1:], 1):
                cells = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]

                # Skip rows that carry no text at all
                if not any(cell.strip() for cell in cells):
                    continue

                # Capture the real cell count BEFORE padding, otherwise this
                # field can never reveal a short row.
                original_row_length = len(cells)

                # Pad short rows so every header gets a value; cells beyond the
                # header count are dropped by zip() below.
                cells += [''] * (len(headers) - len(cells))

                record = dict(zip(headers, cells))

                # Add metadata
                record['row_number'] = i
                record['original_row_length'] = original_row_length

                solicitations_data.append(record)

                # Show first 3 rows for debugging
                if i <= 3:
                    print(f"Row {i}: {cells[:3]}...")

            data_rows_processed = len(solicitations_data)
            print(f"Processed {data_rows_processed} data rows")

            # Surface rows whose cell count does not match the header count
            mismatched_rows = sum(
                1 for record in solicitations_data
                if record['original_row_length'] != len(headers)
            )
            if mismatched_rows:
                print(f"Warning: {mismatched_rows} rows did not have {len(headers)} cells")
        else:
            print("Table 11 has no data rows")
    else:
        print(f"Only found {len(tables)} tables, cannot access table 11")

# Create DataFrame
if solicitations_data:
    df_solicitations = pd.DataFrame(solicitations_data)
    print(f"\nExtracted {len(df_solicitations)} solicitation records from table 11")
    print(f"Columns: {list(df_solicitations.columns)}")

    # Remove columns that are blank in every row. Scraped blanks are empty
    # strings rather than NaN, so dropna() alone would never remove them.
    empty_columns = [
        col for col in df_solicitations.columns
        if (df_solicitations[col].isna() | df_solicitations[col].astype(str).str.strip().eq('')).all()
    ]
    if empty_columns:
        print(f"Dropping empty columns: {empty_columns}")
    df_solicitations = df_solicitations.drop(columns=empty_columns)

    # Clean up column names (remove extra spaces, etc.)
    df_solicitations.columns = [col.strip() for col in df_solicitations.columns]

    # Display basic info
    print(f"\nDataFrame shape: {df_solicitations.shape}")
    print(f"Cleaned columns: {list(df_solicitations.columns)}")

    # Display first few records
    print("\nFirst 5 solicitations:")
    display(df_solicitations.head())

    # Show some statistics
    if 'Status' in df_solicitations.columns:
        print("\nStatus distribution:")
        print(df_solicitations['Status'].value_counts())

    if 'Due / Close Date' in df_solicitations.columns:
        print("\nDue dates (first 10):")
        print(df_solicitations['Due / Close Date'].head(10))

else:
    print("No solicitation data extracted from table 11")
    df_solicitations = pd.DataFrame()

Found 11 total tables
Table 11 has 26 rows (including header)
Headers: ['Editing column', 'ID', 'Title', 'Status', 'Due / Close Date', 'Publish Date UTC-4', 'Main Category', 'Solicitation Type', 'Issuing Agency', 'Auto opening', 'Round #', 'Award Status', 'Procurement Officer / Buyer', 'Authority', 'Sub Agency', 'Site', 'Bid Holders List', 'eMM ID', 'Orga x_id']
Row 1: ['Edit Pest Control Services', 'BPM051657', 'Pest Control Services']...
Row 2: ['Edit MAA-MC-26-002 Integrated Airport Security System (IASS) Maintenance at BWI Thurgood Marshall Airport', 'BPM051572', 'MAA-MC-26-002 Integrated Airport Security System (IASS) Maintenance at BWI Thurgood Marshall Airport']...
Row 3: ['Edit M00P5605104 - RSA Renewal', 'BPM051655', 'M00P5605104 - RSA Renewal']...
Processed 25 data rows

Extracted 25 solicitation records from table 11
Columns: ['Editing column', 'ID', 'Title', 'Status', 'Due / Close Date', 'Publish Date UTC-4', 'Main Category', 'Solicitation Type', 'Issuing Agency', 'Auto ope

Unnamed: 0,Editing column,ID,Title,Status,Due / Close Date,Publish Date UTC-4,Main Category,Solicitation Type,Issuing Agency,Auto opening,...,Award Status,Procurement Officer / Buyer,Authority,Sub Agency,Site,Bid Holders List,eMM ID,Orga x_id,row_number,original_row_length
0,Edit Pest Control Services,BPM051657,Pest Control Services,Open,7/15/2025,6/6/2025 5:41:02 PM,Other,Public Notice,Maryland Transportation Authority,Auto opening,...,In progress,Wheeler Akia,State of Maryland Government,Maryland Transportation Authority,,,,auth;J07,1,19
1,Edit MAA-MC-26-002 Integrated Airport Security...,BPM051572,MAA-MC-26-002 Integrated Airport Security Syst...,Open,7/10/2025,6/6/2025 5:19:47 PM,Security and protection software,IFB: Invitation for Bid (w/ Min Quals),Maryland Aviation Administration,Auto opening,...,In progress,White Krystal,State of Maryland Government,Maryland Aviation Administration,,,,auth;J06,2,19
2,Edit M00P5605104 - RSA Renewal,BPM051655,M00P5605104 - RSA Renewal,Open,6/16/2025,6/6/2025 4:54:21 PM,Software,IFB: Invitation for Bid,Department of Health & Mental Hygiene,Auto opening,...,In progress,Mondesir Chevon,State of Maryland Government,,,View,,sact;MDM00,3,19
3,Edit BPM050845/ASE30CRS - Work Zone Automated ...,BPM050845,BPM050845/ASE30CRS - Work Zone Automated Speed...,Open,7/22/2025,6/6/2025 4:46:13 PM,Defense and Law Enforcement and Security and S...,RFP: Triple Envelope Proposal,State Highway Administration,Auto opening,...,In progress,Warner Samina,State of Maryland Government,State Highway Administration,,,,auth;J02,4,19
4,Edit Testing and Inspection Services - Pimlico...,BPM051653,Testing and Inspection Services - Pimlico Raci...,Open,7/3/2025,6/6/2025 7:46:26 PM,Engineering testing services,Public Notice,Capital Projects,Auto opening,...,In progress,Kramer Christian,State of Maryland GovernmentState of Maryland ...,Capital Projects,,,,auth;D2841110sact;MDD28,5,19



Status distribution:
Status
Open    25
Name: count, dtype: int64

Due dates (first 10):
0    7/15/2025
1    7/10/2025
2    6/16/2025
3    7/22/2025
4     7/3/2025
5    6/16/2025
6    6/23/2025
7    6/11/2025
8     7/1/2025
9    6/30/2025
Name: Due / Close Date, dtype: object


In [ ]:
# Query the Gold layer table to see the enhanced data
query_gold = """
SELECT 
    id,
    solicitation_id,
    title,
    status,
    solicitation_type,
    lot_number,
    round_number,
    main_category,
    issuing_agency,
    due_date,
    project_cost_class,
    mbe_participation_pct,
    asian_american_pct,
    hispanic_american_pct,
    women_owned_pct,
    african_american_pct,
    small_business_reserve,
    solicitation_summary,
    additional_instructions,
    attachments,
    detail_url,
    processed_at
FROM gold.emma_solicitations_gold
ORDER BY processed_at DESC;
"""


def parse_attachments(raw):
    """Return the attachments value of a gold row as a Python list.

    Accepts either a JSON string or a value that is already decoded
    (NOTE(review): a json/jsonb column may be decoded by the database driver -
    confirm the column type). Raises ValueError when the value is not valid
    JSON or does not decode to a list.
    """
    parsed = json.loads(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, list):
        raise ValueError(f"expected a list of attachments, got {type(parsed).__name__}")
    return parsed


try:
    df_gold = pd.read_sql(query_gold, engine)
    print(f"Found {len(df_gold)} records in Gold layer")

    if len(df_gold) > 0:
        print("\n=== GOLD LAYER DATA ===")

        # Only the most recently processed record is shown in detail
        row = df_gold.iloc[0]

        # Show basic fields
        print("\nBasic Information:")
        print(f"ID: {row['solicitation_id']}")
        print(f"Title: {row['title']}")
        print(f"Status: {row['status']}")
        print(f"Type: {row['solicitation_type']}")
        print(f"Lot #: {row['lot_number']}")
        print(f"Round #: {row['round_number']}")
        print(f"Agency: {row['issuing_agency']}")
        print(f"Due Date: {row['due_date']}")
        print(f"Cost Class: {row['project_cost_class']}")
        print(f"SBR: {row['small_business_reserve']}")

        print("\n\nParticipation Goals:")
        print(f"MBE Total: {row['mbe_participation_pct']}%")
        print(f"- Asian American: {row['asian_american_pct']}%")
        print(f"- Hispanic American: {row['hispanic_american_pct']}%")
        print(f"- Women Owned: {row['women_owned_pct']}%")
        print(f"- African American: {row['african_american_pct']}%")

        print("\n\nSummary:")
        if row['solicitation_summary']:
            print(row['solicitation_summary'][:300] + "...")

        print("\n\nAdditional Instructions:")
        if row['additional_instructions']:
            print(row['additional_instructions'][:200] + "...")

        # Show attachments count
        print("\n\nAttachments:")
        if row['attachments']:
            try:
                attachments = parse_attachments(row['attachments'])
            except ValueError as parse_error:
                # Report the actual cause instead of hiding it behind a bare except
                print(f"Error parsing attachments: {parse_error}")
            else:
                print(f"Total attachments: {len(attachments)}")
                # Show first 3
                for i, att in enumerate(attachments[:3], 1):
                    if isinstance(att, dict):
                        print(f"  {i}. {att.get('title', 'Unknown')} ({att.get('type', 'Unknown')})")
                    else:
                        print(f"  {i}. {att}")

        display(df_gold[['solicitation_id', 'title', 'status', 'mbe_participation_pct', 'small_business_reserve']].head())

except Exception as e:
    print(f"Error querying Gold table: {e}")

In [ ]:
# Re-check the Gold table after the selector fixes (most recent record only)
query_gold_fixed = """
SELECT 
    solicitation_id,
    title,
    status,  -- Should now show "Open" instead of "val"
    procurement_officer,  -- Should now show "Samina WARNER"
    email,  -- Should now show actual email
    solicitation_type,
    lot_number,
    round_number,
    project_cost_class,
    mbe_participation_pct,
    asian_american_pct,
    hispanic_american_pct,
    women_owned_pct,
    african_american_pct,
    small_business_reserve,
    issuing_agency,
    due_date,
    processed_at
FROM gold.emma_solicitations_gold
ORDER BY processed_at DESC
LIMIT 1;
"""

try:
    df_gold_fixed = pd.read_sql(query_gold_fixed, engine)

    if len(df_gold_fixed) == 0:
        print("No records found in Gold table")
    else:
        print("=== ENHANCED GOLD LAYER DATA ===")
        print()

        row = df_gold_fixed.iloc[0]

        # Each section is a heading followed by "  Label: value" lines
        print("🆔 IDENTIFICATION:")
        for label, column in (
            ("Solicitation ID", 'solicitation_id'),
            ("Title", 'title'),
        ):
            print(f"  {label}: {row[column]}")
        print()

        print("📋 STATUS & TYPE:")
        print(f"  Status: {row['status']} (should be 'Open' not 'val')")
        for label, column in (
            ("Type", 'solicitation_type'),
            ("Lot #", 'lot_number'),
            ("Round #", 'round_number'),
            ("Cost Class", 'project_cost_class'),
        ):
            print(f"  {label}: {row[column]}")
        print()

        print("👤 CONTACT INFO:")
        print(f"  Procurement Officer: {row['procurement_officer']} (should be 'Samina WARNER')")
        print(f"  Email: {row['email']} (should be actual email)")
        print(f"  Agency: {row['issuing_agency']}")
        print()

        print("🎯 PARTICIPATION GOALS:")
        for label, column in (
            ("MBE Total", 'mbe_participation_pct'),
            ("- Asian American", 'asian_american_pct'),
            ("- Hispanic American", 'hispanic_american_pct'),
            ("- Women Owned", 'women_owned_pct'),
            ("- African American", 'african_american_pct'),
        ):
            print(f"  {label}: {row[column]}%")
        print(f"  Small Business Reserve: {row['small_business_reserve']}")
        print()

        print("📅 TIMING:")
        print(f"  Due Date: {row['due_date']}")
        print(f"  Processed: {row['processed_at']}")

        # Pass/fail report for the three fields the selector fix targeted
        separator = "=" * 50
        print("\n" + separator)
        print("🔍 VERIFICATION OF FIXES:")

        status_value = row['status']
        if status_value == 'Open':
            print("✅ Status: Fixed - showing 'Open' instead of 'val'")
        else:
            print(f"❌ Status: Still showing '{status_value}' instead of 'Open'")

        officer_value = row['procurement_officer']
        if officer_value and 'WARNER' in str(officer_value):
            print("✅ Procurement Officer: Fixed - showing actual name")
        else:
            print(f"❌ Procurement Officer: Still showing '{officer_value}'")

        email_value = row['email']
        if email_value and '@' in str(email_value):
            print("✅ Email: Fixed - showing actual email address")
        else:
            print(f"❌ Email: Still showing '{email_value}'")

        print(separator)

except Exception as e:
    print(f"Error querying Gold table: {e}")
    import traceback
    traceback.print_exc()

In [ ]:
# Final verification - Clean Gold table structure without unused fields
query_final = """
SELECT *
FROM gold.emma_solicitations_gold
ORDER BY processed_at DESC
LIMIT 1;
"""


def is_missing(value):
    """Return True when a single cell holds no value (None, NaN or NaT).

    pd.isna() on a list/dict/array returns an array, which cannot be used in an
    `if`; such container values are therefore treated as present.
    """
    if value is None:
        return True
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def count_attachments(raw):
    """Count the entries in an attachments value.

    The value is expected to be a JSON list (as a string, or already decoded).
    Counting occurrences of the substring 'title' over-counts whenever the word
    appears inside a file name or description, so the list is parsed instead.
    Missing, unparseable or non-list values count as 0.
    """
    if is_missing(raw):
        return 0
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            return 0
    return len(raw) if isinstance(raw, (list, tuple)) else 0


try:
    df_final = pd.read_sql(query_final, engine)

    if len(df_final) > 0:
        print("=== FINAL CLEAN GOLD LAYER STRUCTURE ===")
        print()

        row = df_final.iloc[0]

        # Show all columns and their values (long strings are truncated)
        for col in df_final.columns:
            value = row[col]
            if is_missing(value):
                print(f"  {col}: None")
            elif isinstance(value, str) and len(value) > 100:
                print(f"  {col}: {value[:100]}...")
            else:
                print(f"  {col}: {value}")

        print(f"\n📊 SUMMARY:")
        print(f"  Total columns: {len(df_final.columns)}")
        non_null_count = sum(1 for col in df_final.columns if not is_missing(row[col]))
        print(f"  Non-null fields: {non_null_count}")
        print(f"  Data completeness: {non_null_count/len(df_final.columns)*100:.1f}%")

        # Check for the removed fields
        removed_fields = ['questions_due_date', 'pre_bid_date', 'pre_bid_details']
        print(f"\n✅ CLEANUP VERIFICATION:")
        for field in removed_fields:
            if field in df_final.columns:
                print(f"  ❌ {field}: Still present (should be removed)")
            else:
                print(f"  ✅ {field}: Successfully removed")

        print(f"\n🎯 KEY DATA EXTRACTED:")
        print(f"  Status: '{row['status']}' (user-friendly)")
        print(f"  Officer: '{row['procurement_officer']}' (full name)")
        print(f"  Email: '{row['email']}' (contact info)")
        print(f"  MBE Goals: {row['mbe_participation_pct']}% total")
        print(f"  Attachments: {count_attachments(row['attachments'])} files")

    else:
        print("No records found in Gold table")

except Exception as e:
    print(f"Error querying Gold table: {e}")
    import traceback
    traceback.print_exc()

In [ ]:
# Find real contract URLs from the scraped contract data
print("=== FINDING REAL CONTRACT URLS ===")

# First check if we have contracts data
if 'df_contracts' in globals() and not df_contracts.empty:
    print(f"✅ Found {len(df_contracts)} contract records")
    print(f"Columns: {list(df_contracts.columns)}")

    # Look for URL or link columns
    url_columns = [
        col for col in df_contracts.columns
        if any(marker in col.lower() for marker in ('url', 'link', 'href'))
    ]
    print(f"URL-related columns: {url_columns}")

    # Check the first few rows for any URL patterns
    print("\nFirst 3 contract records:")
    for idx, row in df_contracts.head(3).iterrows():
        contract_id = row.get('Contract Code', row.get('ID', 'Unknown'))
        contract_title = row.get('Contract Title', row.get('Title', 'Unknown'))
        print(f"\n{idx+1}. Contract: {contract_id}")
        print(f"   Title: {str(contract_title)[:50]}...")

        # Check all columns for anything that looks like a URL
        for col, val in row.items():
            if isinstance(val, str) and ('http' in val or '/page.aspx' in val or 'contract' in val.lower()):
                print(f"   {col}: {val}")

        # Looser second pass: any longer value containing a slash
        for key, value in row.items():
            if isinstance(value, str) and len(value) > 10 and '/' in value:
                print(f"   Possible URL in {key}: {value}")

else:
    print("❌ No contracts data available")
    print("Need to run the contract extraction first")

# Also check what we might have in the raw HTML
print(f"\n=== CHECKING RAW HTML FOR CONTRACT LINKS ===")

if 'contracts_raw_html' in globals() and contracts_raw_html:
    contracts_soup = BeautifulSoup(contracts_raw_html, 'html.parser')

    # Look for contract detail links in the HTML
    all_links = contracts_soup.find_all('a', href=True)
    contract_links = [link for link in all_links if 'contract' in link.get('href', '').lower() or 'ctr' in link.get('href', '')]

    print(f"Found {len(contract_links)} potential contract links:")
    for i, link in enumerate(contract_links[:5]):  # Show first 5
        href = link.get('href', '')
        # Named link_text (not `text`) so the sqlalchemy `text` function imported
        # at the top of the notebook is not overwritten for later query cells.
        link_text = link.get_text(strip=True)
        print(f"  {i+1}. {href} -> '{link_text[:30]}...'")

    # Also look for any links that might lead to contract details
    detail_links = [link for link in all_links if 'detail' in link.get('href', '').lower() or 'view' in link.get('href', '').lower()]
    print(f"\nFound {len(detail_links)} potential detail links:")
    for i, link in enumerate(detail_links[:5]):  # Show first 5
        href = link.get('href', '')
        link_text = link.get_text(strip=True)
        print(f"  {i+1}. {href} -> '{link_text[:30]}...'")

else:
    print("❌ No contracts raw HTML available")

In [ ]:
# Query the Contracts Gold layer to see the extracted data
query_contracts_gold = """
SELECT 
    contract_id,
    contract_title,
    contract_type,
    vendor_name,
    contract_amount,
    currency,
    effective_date,
    expiration_date,
    procurement_officer,
    contact_email,
    agency_org,
    commodities,
    vsbe_goal_percentage,
    linked_solicitation,
    contract_scope,
    detail_url,
    processed_at
FROM gold.emma_contracts_gold
ORDER BY processed_at DESC
LIMIT 1;
"""

# Fields that must be populated for an extraction to count as successful
CORE_CONTRACT_FIELDS = [
    'contract_id', 'contract_title', 'vendor_name', 'contract_amount',
    'effective_date', 'expiration_date', 'procurement_officer', 'contact_email', 'agency_org',
]
EXCELLENT_THRESHOLD = 7  # core fields needed for the "EXCELLENT" verdict
GOOD_THRESHOLD = 5       # core fields needed for the "GOOD" verdict


def has_value(value):
    """Return True when a cell holds real data.

    None, NaN/NaT, blank strings and the literal text 'None' count as missing.
    NaN needs an explicit check because it is truthy and str(nan) is 'nan', so
    a plain truthiness test reports it as extracted. A numeric 0 is treated as
    a real value.
    """
    if value is None:
        return False
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    as_text = str(value).strip()
    return as_text != '' and as_text != 'None'


try:
    df_contracts_gold = pd.read_sql(query_contracts_gold, engine)

    if len(df_contracts_gold) > 0:
        print("=== CONTRACTS GOLD LAYER DATA ===")
        print()

        row = df_contracts_gold.iloc[0]

        print("🆔 IDENTIFICATION:")
        print(f"  Contract ID: {row['contract_id']}")
        print(f"  Title: {row['contract_title']}")
        print(f"  Type: {row['contract_type']}")
        print()

        print("💰 FINANCIAL DETAILS:")
        print(f"  Amount: {row['contract_amount']} {row['currency']}")
        print(f"  Effective: {row['effective_date']}")
        print(f"  Expires: {row['expiration_date']}")
        print()

        print("🏢 VENDOR & AGENCY:")
        print(f"  Vendor: {row['vendor_name']}")
        print(f"  Agency: {row['agency_org']}")
        print(f"  Commodities: {row['commodities']}")
        print()

        print("👤 CONTACT INFO:")
        print(f"  Procurement Officer: {row['procurement_officer']}")
        print(f"  Email: {row['contact_email']}")
        print()

        print("🎯 PARTICIPATION GOALS:")
        print(f"  VSBE Goal: {row['vsbe_goal_percentage']}%")
        print()

        print("🔗 LINKAGE & CONTENT:")
        print(f"  Linked Solicitation: {row['linked_solicitation']}")
        if has_value(row['contract_scope']):
            print(f"  Contract Scope: {str(row['contract_scope'])[:100]}...")
        print()

        print("📅 METADATA:")
        print(f"  Detail URL: {row['detail_url']}")
        print(f"  Processed: {row['processed_at']}")

        # Verify the extraction success
        print("\n" + "="*50)
        print("🔍 EXTRACTION VERIFICATION:")

        total_fields = len(CORE_CONTRACT_FIELDS)
        extracted_fields = 0

        for col in CORE_CONTRACT_FIELDS:
            if has_value(row[col]):
                extracted_fields += 1
                print(f"  ✅ {col}: {row[col]}")
            else:
                print(f"  ❌ {col}: Empty or None")

        print(f"\n📊 SUCCESS RATE: {extracted_fields}/{total_fields} core fields ({extracted_fields/total_fields*100:.1f}%)")

        if extracted_fields >= EXCELLENT_THRESHOLD:
            print("🎉 EXCELLENT: Most fields successfully extracted!")
        elif extracted_fields >= GOOD_THRESHOLD:
            print("✅ GOOD: Core fields extracted successfully!")
        else:
            print("⚠️  NEEDS IMPROVEMENT: Some core fields missing")

        # Show the full record
        print(f"\n📋 FULL RECORD:")
        display(df_contracts_gold.head())

    else:
        print("❌ No records found in Contracts Gold table")

except Exception as e:
    print(f"❌ Error querying Contracts Gold table: {e}")
    import traceback
    traceback.print_exc()