In [1]:
import sqlite3
import logging
import os
import json
from datetime import datetime
from typing import Optional

In [4]:
class DatabaseSetup:
    """Create, populate and inspect the SQLite database holding RSO records.

    The database has two tables: ``rsos`` (one row per organisation) and
    ``rso_emails`` (messages, linked to an organisation by its listhost).
    Every method opens its own short-lived connection to ``db_path`` and
    closes it before returning.
    """

    def __init__(self, db_path: str = 'rso_emails.db'):
        """Remember the database path and make sure logging is configured.

        Args:
            db_path: Filesystem path of the SQLite database file.
        """
        self.db_path = db_path
        self.setup_logging()

    def setup_logging(self):
        """Set up logging configuration"""
        # basicConfig() ignores its arguments once the root logger has handlers,
        # but the FileHandler passed to it would still have been constructed
        # (and its file opened). Build the handlers only when they will be used.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('db_setup.log'),
                    logging.StreamHandler()
                ]
            )
        self.logger = logging.getLogger(__name__)

    def create_tables(self) -> bool:
        """Create all necessary database tables

        Creates ``rso_emails``, ``rsos`` and two indices on ``rso_emails``.
        All statements use ``IF NOT EXISTS`` so the call can be repeated.

        Returns:
            True when every statement succeeded, False when SQLite raised an
            error (the error is logged).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                # `with conn` commits on success / rolls back on error, but it
                # does not close the connection; the `finally` below does.
                with conn:
                    cursor = conn.cursor()

                    # Create RSO emails table.
                    # NOTE(review): this references rsos(listhost) before rsos is
                    # created, and no `PRAGMA foreign_keys = ON` is issued in this
                    # class, so the constraint is presumably declarative only --
                    # confirm whether enforcement is wanted.
                    cursor.execute('''
                    CREATE TABLE IF NOT EXISTS rso_emails (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        rso_listhost TEXT NOT NULL,
                        subject TEXT,
                        sender TEXT,
                        email_date TIMESTAMP,
                        content TEXT,
                        message_id TEXT UNIQUE,
                        processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (rso_listhost) REFERENCES rsos(listhost)
                    )
                    ''')

                    # Create RSOs table
                    cursor.execute('''
                    CREATE TABLE IF NOT EXISTS rsos (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        name TEXT NOT NULL,
                        listhost TEXT UNIQUE,
                        has_listhost BOOLEAN NOT NULL DEFAULT 0,
                        last_email_check TIMESTAMP,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        categories TEXT,  -- Store as JSON array
                        description TEXT
                    )
                    ''')

                    # Create indices for better performance
                    cursor.execute('''
                    CREATE INDEX IF NOT EXISTS idx_rso_emails_listhost 
                    ON rso_emails(rso_listhost)
                    ''')

                    cursor.execute('''
                    CREATE INDEX IF NOT EXISTS idx_rso_emails_date 
                    ON rso_emails(email_date)
                    ''')
            finally:
                conn.close()

            self.logger.info("Database tables created successfully")
            return True

        except sqlite3.Error as e:
            self.logger.error(f"Error creating database tables: {str(e)}")
            return False

    @staticmethod
    def _normalize_listhost(value):
        """Return a stripped listhost string, or None when it is missing/blank."""
        if isinstance(value, str):
            value = value.strip()
        return value or None

    def _upsert_rso(self, cursor, rso: dict) -> None:
        """Insert one RSO record, or update the row it already corresponds to.

        Records with a listhost are matched on ``listhost`` (UNIQUE column).
        Records without one are matched on ``name`` among the rows whose
        listhost is NULL, because NULL values never trigger ``ON CONFLICT``
        and would otherwise be inserted again on every import.

        Raises:
            sqlite3.Error: propagated to the caller, which logs and skips.
        """
        name = rso.get('name', '')

        # 'additional_info' may be absent or explicitly null in the JSON.
        additional_info = rso.get('additional_info')
        if not isinstance(additional_info, dict):
            additional_info = {}

        listhost = self._normalize_listhost(additional_info.get('RSO Listhost'))
        categories = json.dumps(rso.get('categories', []))
        description = rso.get('description_preview', '') or rso.get('full_description', '')

        # Set has_listhost flag
        has_listhost = 1 if listhost else 0

        if listhost is None and name:
            cursor.execute('''
            UPDATE rsos
            SET has_listhost = ?, categories = ?, description = ?
            WHERE name = ? AND listhost IS NULL
            ''', (has_listhost, categories, description, name))
            if cursor.rowcount > 0:
                return

        cursor.execute('''
        INSERT INTO rsos (name, listhost, has_listhost, categories, description)
        VALUES (?, ?, ?, ?, ?)
        ON CONFLICT(listhost) DO UPDATE SET
            name = excluded.name,
            has_listhost = excluded.has_listhost,
            categories = excluded.categories,
            description = excluded.description
        ''', (name, listhost, has_listhost, categories, description))

    def import_rso_data(self, json_path: str) -> Optional[int]:
        """Import RSO data from JSON file into database

        The file must contain a JSON array of objects. For each object the
        method reads ``name``, ``additional_info['RSO Listhost']``,
        ``categories`` and ``description_preview`` / ``full_description``.
        Importing the same file twice updates the existing rows instead of
        adding duplicates.

        Args:
            json_path: Path of the JSON file to read (decoded as UTF-8).

        Returns:
            Number of records written, or None when the file is missing, is
            not valid JSON, is not a JSON array, or an unexpected error
            occurred. Individual records that are malformed or rejected by
            SQLite are logged, skipped and not counted.
        """
        try:
            # Read JSON file
            with open(json_path, 'r', encoding='utf-8') as f:
                rsos = json.load(f)

            if not isinstance(rsos, list):
                self.logger.error(
                    f"Error importing RSO data: expected a JSON array, got {type(rsos).__name__}"
                )
                return None

            imported_count = 0
            conn = sqlite3.connect(self.db_path)
            try:
                # One transaction for the whole file: committed on success,
                # rolled back if an unexpected exception escapes the loop.
                with conn:
                    cursor = conn.cursor()

                    for rso in rsos:
                        if not isinstance(rso, dict):
                            self.logger.warning(f"Skipping malformed RSO entry: {rso!r}")
                            continue

                        try:
                            self._upsert_rso(cursor, rso)
                            imported_count += 1

                        except sqlite3.Error as e:
                            self.logger.warning(f"Error importing RSO {rso.get('name', 'Unknown')}: {str(e)}")
                            continue
            finally:
                conn.close()

            self.logger.info(f"Successfully imported {imported_count} RSOs")
            return imported_count

        except FileNotFoundError:
            self.logger.error(f"JSON file not found: {json_path}")
            return None
        except json.JSONDecodeError:
            self.logger.error(f"Invalid JSON format in file: {json_path}")
            return None
        except Exception as e:
            self.logger.error(f"Error importing RSO data: {str(e)}")
            return None

    def verify_database(self) -> dict:
        """Verify database setup and return statistics

        Returns:
            A dict with keys ``tables_present`` (table names from
            ``sqlite_master``), ``table_counts`` (row count per table plus the
            ``rso_stats`` breakdown and ``db_size_mb``), ``db_file`` and
            ``verification_time`` (ISO-8601 local time). ``db_size_mb`` is
            None when the file size cannot be read. An empty dict is returned
            when SQLite raises an error (for example if ``rsos`` is missing).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()

                # Check tables exist
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
                tables = [row[0] for row in cursor.fetchall()]

                # Get table counts. Identifiers cannot be bound as parameters,
                # so quote them to cope with unusual table names.
                stats = {}
                for table in tables:
                    quoted = '"' + table.replace('"', '""') + '"'
                    cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
                    stats[table] = cursor.fetchone()[0]

                # Get RSO statistics
                cursor.execute('''
                SELECT 
                    COUNT(*) as total_rsos,
                    SUM(CASE WHEN has_listhost = 1 THEN 1 ELSE 0 END) as rsos_with_listhost,
                    SUM(CASE WHEN has_listhost = 0 THEN 1 ELSE 0 END) as rsos_without_listhost
                FROM rsos
                ''')
                rso_stats = cursor.fetchone()
                # SUM() over zero rows yields NULL; report 0 instead of None.
                stats['rso_stats'] = {
                    'total_rsos': rso_stats[0],
                    'with_listhost': rso_stats[1] or 0,
                    'without_listhost': rso_stats[2] or 0
                }
            finally:
                conn.close()

            # Get database file size
            try:
                stats['db_size_mb'] = os.path.getsize(self.db_path) / (1024 * 1024)
            except OSError as e:
                self.logger.warning(f"Could not read database file size: {str(e)}")
                stats['db_size_mb'] = None

            return {
                'tables_present': tables,
                'table_counts': stats,
                'db_file': self.db_path,
                'verification_time': datetime.now().isoformat()
            }

        except sqlite3.Error as e:
            self.logger.error(f"Error verifying database: {str(e)}")
            return {}

In [5]:
def setup_database(json_path: Optional[str] = None):
    """Build the database end to end: create tables, optionally import, then verify.

    Args:
        json_path: Optional path of an RSO JSON export. When given, its records
            are imported after the tables are created.

    Returns:
        None. The function stops early (after logging) if table creation or
        the import fails; otherwise the verification report is written to the
        log, one entry per line.
    """
    db_setup = DatabaseSetup()
    log = db_setup.logger

    # Nothing else can work without the schema.
    tables_ready = db_setup.create_tables()
    if not tables_ready:
        return

    # The import step is optional and only runs when a path was supplied.
    if json_path and db_setup.import_rso_data(json_path) is None:
        log.error("Failed to import RSO data")
        return

    # Collect the report first, then log it entry by entry.
    report = db_setup.verify_database()
    log.info("Database Verification Results:")
    for field in report:
        log.info(f"{field}: {report[field]}")


In [6]:
# Location of the RSO JSON export, relative to the notebook's working directory.
json_path = "./scrape_db/rso_data_detailed.json"
setup_database(json_path=json_path)

2025-01-11 20:28:09,673 - INFO - Database tables created successfully
2025-01-11 20:28:09,690 - INFO - Successfully imported 396 RSOs
2025-01-11 20:28:09,691 - INFO - Database Verification Results:
2025-01-11 20:28:09,691 - INFO - tables_present: ['rso_emails', 'sqlite_sequence', 'rsos']
2025-01-11 20:28:09,692 - INFO - table_counts: {'rso_emails': 0, 'sqlite_sequence': 1, 'rsos': 396, 'rso_stats': {'total_rsos': 396, 'with_listhost': 0, 'without_listhost': 396}, 'db_size_mb': 0.13671875}
2025-01-11 20:28:09,692 - INFO - db_file: rso_emails.db
2025-01-11 20:28:09,692 - INFO - verification_time: 2025-01-11T20:28:09.691283
