# In [1]:
# Certificate Lifecycle Analysis
# Location: RAPIDS/notebooks/certificate_analysis/07_certificate_lifecycle_analysis.ipynb

import calendar
import json
import logging
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

class CertificateLifecycleAnalyzer:
    """Analyzer for SSL certificate lifecycle patterns between phishing and normal sites.

    The analyzer loads database credentials from a JSON configuration file,
    pulls certificate rows from the ``website_data`` table, summarises
    temporal / expiry / issuer patterns and writes PNG plots to
    ``reports/lifecycle_analysis`` under the project base directory.
    """

    # "<number> days", "<number> day" or "<number>d". The trailing word
    # boundary keeps a bare "d" from matching the first letter of an
    # unrelated word (for example "25 December").
    _DAYS_PATTERN = re.compile(r'(\d+)\s*(?:days?|d)\b', re.IGNORECASE)

    def __init__(self, config_path: str = '/home/asomura/waseda/nextstep/RAPIDS/config/database.json'):
        """
        Initialize the analyzer with configuration and setup directories

        Args:
            config_path: Path to database configuration file
        """
        # Logging is configured second because it needs self.base_dir,
        # which setup_environment() assigns.
        self.setup_environment(config_path)
        self.setup_logging()

    def setup_environment(self, config_path: str) -> None:
        """
        Setup analysis environment and load configuration

        Reads the ``database`` section of the JSON file, creates the output
        and data directories if they are missing, and applies the seaborn
        default theme.

        Args:
            config_path: Path to configuration file

        Raises:
            OSError: If the configuration file cannot be opened.
            KeyError: If the file has no top-level ``database`` key.
        """
        # Load database configuration
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        # Setup directory structure
        self.base_dir = Path('/home/asomura/waseda/nextstep/RAPIDS')
        self.output_dir = self.base_dir / 'reports' / 'lifecycle_analysis'
        self.data_dir = self.base_dir / 'data' / 'processed'

        for dir_path in (self.output_dir, self.data_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

        # Apply seaborn's default theme to all subsequent matplotlib plots.
        sns.set_theme()
        self.colors = {'phishing': '#FF6B6B', 'normal': '#4ECDC4'}

    def setup_logging(self) -> None:
        """Configure file logging under ``data/logs`` and store the run timestamp.

        ``self.timestamp`` is also used to name the plot files written by
        the ``_plot_*`` methods.
        """
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        logging.basicConfig(
            filename=log_dir / f'lifecycle_analysis_{self.timestamp}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def get_database_engine(self, db_name: str) -> "sqlalchemy.engine.Engine":
        """
        Create database connection engine

        ``website_data`` is reached on ``localhost``; every other database
        name is reached on ``192.168.1.92``.

        Args:
            db_name: Name of the database to connect

        Returns:
            SQLAlchemy engine
        """
        host = 'localhost' if db_name == 'website_data' else '192.168.1.92'
        return create_engine(
            f"postgresql://{self.config['user']}:{self.config['password']}@{host}/{db_name}"
        )

    def extract_certificate_data(self, db_name: str) -> pd.DataFrame:
        """
        Extract relevant certificate data for lifecycle analysis

        Only rows with ``status = 7`` and a non-null issuer and
        ``last_update`` are selected.

        Args:
            db_name: Database name to query

        Returns:
            DataFrame containing certificate lifecycle data
        """
        self.logger.info("Extracting certificate data from %s", db_name)

        query = """
        SELECT 
            domain,
            https_certificate_issuer,
            https_certificate_expiry,
            last_update,
            domain_registrar,
            https_certificate_domain,
            https_certificate_body
        FROM website_data 
        WHERE status = 7 
        AND https_certificate_issuer IS NOT NULL
        AND last_update IS NOT NULL
        """

        engine = self.get_database_engine(db_name)
        try:
            return pd.read_sql_query(query, engine)
        finally:
            # A fresh engine is built per call, so release its connection
            # pool once the query result has been materialised.
            engine.dispose()

    def analyze_lifecycle_patterns(self, df: pd.DataFrame) -> Dict:
        """
        Analyze certificate lifecycle patterns

        The input frame is not modified; the derived columns are added to
        a copy.

        Args:
            df: DataFrame containing certificate data. The columns
                ``last_update``, ``https_certificate_expiry`` and
                ``https_certificate_issuer`` are read.

        Returns:
            Dictionary with the keys ``temporal_patterns``,
            ``expiry_patterns`` and ``issuer_patterns``. For an empty
            frame all ratios are 0.0 and all distributions are empty.
        """
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()
        df['last_update'] = pd.to_datetime(df['last_update'])

        # Extract expiry period from certificate expiry text
        df['expiry_days'] = (
            df['https_certificate_expiry']
            .apply(self._extract_expiry_period)
            .astype('int64')
        )

        # Analyze temporal patterns
        update_time = df['last_update'].dt
        temporal_patterns = {
            'weekday_distribution': update_time.dayofweek.value_counts().to_dict(),
            'hour_distribution': update_time.hour.value_counts().to_dict(),
            'month_distribution': update_time.month.value_counts().to_dict()
        }

        # Analyze expiry patterns
        total = len(df)
        expiry_days = df['expiry_days']

        def ratio(mask: pd.Series) -> float:
            # Guard against an empty result set (would divide by zero).
            return int(mask.sum()) / total if total else 0.0

        expiry_patterns = {
            'expiry_distribution': expiry_days.value_counts().to_dict(),
            'short_term_ratio': ratio(expiry_days < 90),
            'medium_term_ratio': ratio((expiry_days >= 90) & (expiry_days < 365)),
            'long_term_ratio': ratio(expiry_days >= 365)
        }

        # Analyze issuer patterns
        issuer_patterns = self._analyze_issuer_patterns(df)

        return {
            'temporal_patterns': temporal_patterns,
            'expiry_patterns': expiry_patterns,
            'issuer_patterns': issuer_patterns
        }

    def _extract_expiry_period(self, expiry_text: str) -> int:
        """
        Extract certificate validity period in days

        Args:
            expiry_text: Text containing expiry information

        Returns:
            The first number that is followed by a day unit ("day",
            "days" or "d", case-insensitive); 0 when the value is missing
            or no such number is found.
        """
        if pd.isna(expiry_text):
            return 0

        match = self._DAYS_PATTERN.search(str(expiry_text))
        return int(match.group(1)) if match else 0

    def _analyze_issuer_patterns(self, df: pd.DataFrame) -> Dict:
        """
        Analyze patterns in certificate issuer behavior

        Args:
            df: DataFrame containing certificate data; must already hold
                an ``expiry_days`` column and a datetime ``last_update``
                column.

        Returns:
            Dictionary with ``top_issuers`` (mean / std / count of
            ``expiry_days`` for the five issuers with the most rows) and
            ``issuer_timing`` (most frequent day of week of
            ``last_update`` per issuer, Monday = 0).
        """
        if df.empty:
            return {
                'top_issuers': {'mean': {}, 'std': {}, 'count': {}},
                'issuer_timing': {}
            }

        issuer_expiry = df.groupby('https_certificate_issuer')['expiry_days'].agg(['mean', 'std', 'count'])
        top_issuers = issuer_expiry.nlargest(5, 'count')

        weekdays = df['last_update'].dt.dayofweek
        issuer_timing = {}
        for issuer, issuer_weekdays in weekdays.groupby(df['https_certificate_issuer']):
            modes = issuer_weekdays.mode()
            # mode() is empty when every timestamp of the issuer is NaT;
            # such issuers are left out instead of raising IndexError.
            if not modes.empty:
                issuer_timing[issuer] = int(modes.iloc[0])

        return {
            'top_issuers': top_issuers.to_dict(),
            'issuer_timing': issuer_timing
        }

    def visualize_patterns(self, patterns: Dict, db_name: str) -> None:
        """
        Create visualizations for lifecycle patterns

        Args:
            patterns: Dictionary containing analysis results
            db_name: Name of the database analyzed
        """
        # Create temporal pattern plots
        self._plot_temporal_patterns(patterns['temporal_patterns'], db_name)

        # Create expiry pattern plots
        self._plot_expiry_patterns(patterns['expiry_patterns'], db_name)

        # Create issuer pattern plots
        self._plot_issuer_patterns(patterns['issuer_patterns'], db_name)

    def _plot_temporal_patterns(self, temporal_patterns: Dict, db_name: str) -> None:
        """Create temporal pattern visualizations (weekday bars, hourly line).

        Args:
            temporal_patterns: The ``temporal_patterns`` analysis result.
            db_name: Name of the database analyzed (used in titles and
                in the output file name).
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Weekday distribution; weekdays missing from the data count as 0.
        weekday_distribution = temporal_patterns['weekday_distribution']
        weekdays = list(calendar.day_name)
        weekday_counts = [weekday_distribution.get(i, 0) for i in range(7)]

        ax1.bar(weekdays, weekday_counts, color=self.colors['phishing'])
        ax1.set_title(f'Certificate Issuance by Day of Week - {db_name}')
        ax1.tick_params(axis='x', rotation=45)

        # Hour distribution; hours missing from the data count as 0.
        hour_distribution = temporal_patterns['hour_distribution']
        hours = list(range(24))
        hour_counts = [hour_distribution.get(i, 0) for i in hours]

        ax2.plot(hours, hour_counts, color=self.colors['phishing'])
        ax2.set_title(f'Certificate Issuance by Hour - {db_name}')
        ax2.set_xlabel('Hour of Day')

        plt.tight_layout()
        plt.savefig(self.output_dir / f'temporal_patterns_{db_name}_{self.timestamp}.png')
        plt.close(fig)

    def _plot_expiry_patterns(self, expiry_patterns: Dict, db_name: str) -> None:
        """Create expiry pattern visualizations.

        Args:
            expiry_patterns: The ``expiry_patterns`` analysis result.
            db_name: Name of the database analyzed (used in the title and
                in the output file name).
        """
        plt.figure(figsize=(10, 6))

        # expiry_distribution maps validity days -> number of certificates.
        # Histogram the days and weight each by its count; a histogram of
        # the dict values alone would show the distribution of the counts.
        distribution = expiry_patterns['expiry_distribution']
        if distribution:
            days = [float(day) for day in distribution]
            counts = list(distribution.values())
            plt.hist(days, bins=50, weights=counts, color=self.colors['phishing'])

        plt.title(f'Certificate Validity Period Distribution - {db_name}')
        plt.xlabel('Days until Expiry')
        plt.ylabel('Count')

        plt.tight_layout()
        plt.savefig(self.output_dir / f'expiry_patterns_{db_name}_{self.timestamp}.png')
        plt.close()

    def _plot_issuer_patterns(self, issuer_patterns: Dict, db_name: str) -> None:
        """Create issuer pattern visualizations.

        Args:
            issuer_patterns: The ``issuer_patterns`` analysis result.
            db_name: Name of the database analyzed (used in the title and
                in the output file name).
        """
        plt.figure(figsize=(12, 6))

        # Plot top issuers by certificate count
        top_issuers = pd.DataFrame(issuer_patterns['top_issuers'])
        if not top_issuers.empty:
            top_issuers['count'].plot(kind='bar', color=self.colors['phishing'])

        plt.title(f'Top Certificate Issuers - {db_name}')
        plt.xlabel('Issuer')
        plt.ylabel('Certificate Count')
        plt.xticks(rotation=45, ha='right')

        plt.tight_layout()
        plt.savefig(self.output_dir / f'issuer_patterns_{db_name}_{self.timestamp}.png')
        plt.close()

def main() -> Dict[str, Dict]:
    """Run the lifecycle analysis for each configured database.

    For every database name the certificate rows are extracted and
    analyzed, the plots are written, and the pattern dictionary is saved
    as JSON in the analyzer's data directory. A failure for one database
    is reported and logged, and processing continues with the next one.

    Returns:
        Mapping of database name to its pattern dictionary, containing
        only the databases whose analysis and plotting succeeded.
    """
    analyzer = CertificateLifecycleAnalyzer()

    results = {}
    for db_name in ['website_data', 'normal_sites']:
        print(f"\nAnalyzing {db_name}...")

        try:
            # Extract and analyze data
            df = analyzer.extract_certificate_data(db_name)
            patterns = analyzer.analyze_lifecycle_patterns(df)

            # Create visualizations
            analyzer.visualize_patterns(patterns, db_name)

            # Store results
            results[db_name] = patterns

            # Save results to JSON
            output_path = analyzer.data_dir / f'lifecycle_patterns_{db_name}_{analyzer.timestamp}.json'
            with open(output_path, 'w') as f:
                json.dump(patterns, f, indent=2)

            print(f"Analysis complete for {db_name}")

        except Exception as e:
            # Top-level boundary: keep going with the next database, but
            # record the full traceback in the log file for diagnosis.
            print(f"Error analyzing {db_name}: {str(e)}")
            analyzer.logger.exception(f"Error analyzing {db_name}: {str(e)}")

    return results

# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

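# Output recorded from an earlier run (apparently raised by the former
# plt.style.use('seaborn') call; setup_environment() calls sns.set_theme() instead):
# OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)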