# In [None]:
# Program 01: Certificate Feature Analysis
# Location: RAPIDS/notebooks/certificate_analysis/01_certificate_feature_analysis.ipynb

import json
import logging
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

class CertificateAnalyzer:
    """Analyzer for SSL certificate features.

    Loads database credentials from a JSON config file, reads certificate
    rows from a PostgreSQL database, derives per-certificate features
    (days until expiry, TLS protocol, cipher suite) and saves summary plots.
    """

    # Compiled once at class creation; both patterns are applied to every row.
    _PROTOCOL_RE = re.compile(r'Protocol:\s*(TLSv[\d.]+)')
    _CIPHER_RE = re.compile(r'Cipher:\s*(TLS_[A-Z0-9_]+)')

    def __init__(self, config_path: str):
        """Initialize analyzer with configuration.

        Args:
            config_path: Path to a JSON file with a top-level ``database``
                object holding at least ``user`` and ``password``.
        """
        self.setup_environment(config_path)
        self.setup_logging()

    def setup_environment(self, config_path: str) -> None:
        """Load the database configuration and create the output directory.

        Args:
            config_path: Path to the JSON configuration file.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If the file has no ``database`` section.
        """
        # Explicit encoding so parsing does not depend on the locale default.
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        # Setup directories
        self.base_dir = Path('/home/asomura/waseda/nextstep/RAPIDS')
        self.output_dir = self.base_dir / 'reports' / 'certificate_analysis'
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def setup_logging(self) -> None:
        """Configure file logging under ``<base_dir>/data/logs``."""
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        # NOTE: logging.basicConfig does nothing if the root logger already
        # has handlers (which may be the case inside a notebook kernel).
        logging.basicConfig(
            filename=log_dir / f'cert_analysis_{datetime.now():%Y%m%d}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _build_database_url(self, db_name: str) -> str:
        """Return the PostgreSQL URL for *db_name* with escaped credentials."""
        # Percent-encode user and password so characters such as '@', ':'
        # and '/' cannot be mistaken for URL delimiters.
        user = quote(str(self.config['user']), safe='')
        password = quote(str(self.config['password']), safe='')
        return f"postgresql://{user}:{password}@localhost/{db_name}"

    def get_database_engine(self, db_name: str) -> "Engine":
        """Create a database connection engine.

        Args:
            db_name: Name of the database on localhost to connect to.

        Returns:
            A new SQLAlchemy engine; the caller is responsible for
            disposing it.
        """
        return create_engine(self._build_database_url(db_name))

    def extract_expiry_days(self, expiry: str) -> Optional[int]:
        """Extract days until expiry from a certificate date.

        Args:
            expiry: Timestamp in ``YYYYMMDDHHMMSSZ`` format (UTC).

        Returns:
            Whole days from now until expiry, clamped to 0 for certificates
            that have already expired, or ``None`` if the value is missing
            or cannot be parsed.
        """
        if pd.isna(expiry):
            return None

        try:
            # The trailing 'Z' marks UTC, so attach UTC explicitly instead of
            # comparing against naive local time.
            expiry_date = datetime.strptime(
                expiry.strip().rstrip('Z'), '%Y%m%d%H%M%S'
            ).replace(tzinfo=timezone.utc)
        except (ValueError, AttributeError):
            # ValueError: malformed date; AttributeError: non-string value.
            return None

        days = (expiry_date - datetime.now(timezone.utc)).days
        return max(0, days)

    def extract_protocol_info(self, cert_info: str) -> Tuple[str, str]:
        """Extract TLS protocol and cipher information.

        Args:
            cert_info: Free-form certificate text expected to contain
                ``Protocol: TLSv...`` and ``Cipher: TLS_...`` entries.

        Returns:
            ``(protocol, cipher)``; each element is ``'UNKNOWN'`` when the
            input is missing, not a string, or has no matching entry.
        """
        # Covers None/NaN as well as unexpected types such as bytes, which
        # would otherwise make the str patterns raise TypeError.
        if not isinstance(cert_info, str):
            return ('UNKNOWN', 'UNKNOWN')

        protocol_match = self._PROTOCOL_RE.search(cert_info)
        cipher_match = self._CIPHER_RE.search(cert_info)

        return (
            protocol_match.group(1) if protocol_match else 'UNKNOWN',
            cipher_match.group(1) if cipher_match else 'UNKNOWN'
        )

    def analyze_certificates(self, db_name: str) -> Dict:
        """Analyze certificate features from database.

        Args:
            db_name: Database to read the ``website_data`` table from.

        Returns:
            ``{'df': DataFrame, 'stats': dict}`` where ``df`` carries the
            added ``expiry_days``, ``protocol`` and ``cipher`` columns and
            ``stats`` holds ``total_certs``, ``expiry_stats``,
            ``protocol_dist`` and ``cipher_dist``.

        Raises:
            Exception: Any database or processing error is logged with its
                traceback and re-raised.
        """
        self.logger.info(f"Analyzing certificates from {db_name}")

        engine = None
        try:
            # Get certificate data
            engine = self.get_database_engine(db_name)
            query = """
                SELECT 
                    domain,
                    https_certificate_expiry as expiry,
                    https_certificate_body as cert_info
                FROM website_data 
                WHERE status = 7
            """
            df = pd.read_sql_query(query, engine)

            # Extract features
            df['expiry_days'] = df['expiry'].apply(self.extract_expiry_days)

            # Extract protocol and cipher
            protocol_cipher = df['cert_info'].apply(self.extract_protocol_info)
            df['protocol'] = [p[0] for p in protocol_cipher]
            df['cipher'] = [p[1] for p in protocol_cipher]

            # Calculate statistics
            stats = {
                'total_certs': len(df),
                'expiry_stats': df['expiry_days'].describe(),
                'protocol_dist': df['protocol'].value_counts(),
                'cipher_dist': df['cipher'].value_counts()
            }

            return {'df': df, 'stats': stats}

        except Exception as e:
            # logger.exception keeps the original message and adds the traceback.
            self.logger.exception("Error analyzing %s: %s", db_name, e)
            raise
        finally:
            # A fresh engine is created per call; release its connection pool.
            if engine is not None:
                engine.dispose()

    def plot_results(self, results: Dict, db_name: str) -> None:
        """Create visualization of analysis results.

        Saves a PNG named ``cert_analysis_<db_name>_<YYYYMMDD>.png`` into
        the output directory.

        Args:
            results: Mapping returned by ``analyze_certificates``.
            db_name: Database name used in the output file name.
        """
        timestamp = datetime.now().strftime('%Y%m%d')

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
        try:
            # Protocol distribution
            results['stats']['protocol_dist'].plot(kind='bar', ax=ax1)
            ax1.set_title('TLS Protocol Distribution')
            ax1.tick_params(axis='x', rotation=45)

            # Cipher distribution
            results['stats']['cipher_dist'].plot(kind='bar', ax=ax2)
            ax2.set_title('Cipher Suite Distribution')
            ax2.tick_params(axis='x', rotation=45)

            # Expiry days distribution
            results['df']['expiry_days'].hist(bins=50, ax=ax3)
            ax3.set_title('Certificate Validity Period Distribution')
            ax3.set_xlabel('Days until expiry')

            # Only three panels are used; hide the empty fourth frame.
            ax4.set_visible(False)

            # Save plot
            fig.tight_layout()
            fig.savefig(self.output_dir / f'cert_analysis_{db_name}_{timestamp}.png')
        finally:
            # Close the figure even if plotting or saving fails.
            plt.close(fig)

def main():
    """Run the certificate analysis for each database and print a report.

    A failure while processing one database is printed and does not stop
    the remaining databases from being analyzed.
    """
    analyzer = CertificateAnalyzer(
        "/home/asomura/waseda/nextstep/RAPIDS/config/database.json"
    )

    # Heading printed before each statistics entry, in report order.
    report_sections = (
        ("\nExpiry Statistics:", 'expiry_stats'),
        ("\nProtocol Distribution:", 'protocol_dist'),
        ("\nCipher Distribution:", 'cipher_dist'),
    )

    for db_name in ('website_data', 'normal_sites'):
        try:
            print(f"\nAnalyzing {db_name}...")
            results = analyzer.analyze_certificates(db_name)

            print(f"\nAnalysis Results for {db_name}:")
            stats = results['stats']
            print(f"Total Certificates: {stats['total_certs']}")
            for heading, key in report_sections:
                print(heading)
                print(stats[key])

            analyzer.plot_results(results, db_name)

        except Exception as exc:
            print(f"Error analyzing {db_name}: {exc}")

# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()