# インポートと初期設定

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

# アナライザークラスの全定義

In [6]:
class AdvancedCertificateAnalyzer:
    """Advanced SSL certificate analyzer with comprehensive feature extraction and analysis.

    Workflow (see ``analyze_certificates``): load certificate rows from a
    PostgreSQL database, derive basic / temporal / structural / security
    features, write them to a JSON file and render summary plots as PNG files.
    """

    # Project root used when the caller does not pass ``base_dir``.
    DEFAULT_BASE_DIR = '/home/asomura/waseda/nextstep/RAPIDS'

    # Key size parsing: prefer a number explicitly tagged as bits
    # ("2048 BIT", "256-BITS"), otherwise fall back to the first number found.
    _BITS_RE = re.compile(r'(\d+)\s*-?\s*BITS?\b')
    _FIRST_NUMBER_RE = re.compile(r'(\d+)')

    def __init__(self, config_path: str, base_dir: Optional[str] = None):
        """Initialize the analyzer with configuration and setup.

        Args:
            config_path: Path to a JSON file with a top-level ``database`` object;
                its ``user`` and ``password`` entries are used for connections.
            base_dir: Optional project root for reports, processed data and logs.
                Defaults to ``DEFAULT_BASE_DIR``.
        """
        self.setup_environment(config_path, base_dir)
        self.setup_logging()

    def setup_environment(self, config_path: str, base_dir: Optional[str] = None) -> None:
        """Load the database configuration and create the output directories.

        Args:
            config_path: Path to the JSON configuration file.
            base_dir: Optional project root; ``DEFAULT_BASE_DIR`` when omitted.

        Raises:
            OSError: If the configuration file cannot be read.
            KeyError: If the configuration has no ``database`` entry.
        """
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        self.base_dir = Path(base_dir if base_dir is not None else self.DEFAULT_BASE_DIR)
        self.output_dir = self.base_dir / 'reports' / 'certificate_analysis'
        self.data_dir = self.base_dir / 'data' / 'processed'

        for dir_path in (self.output_dir, self.data_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    def setup_logging(self) -> None:
        """Configure logging settings.

        Creates ``<base_dir>/data/logs``, stores the run timestamp in
        ``self.timestamp`` (also used in output file names) and sets
        ``self.logger``.
        """
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # NOTE: logging.basicConfig does nothing when the root logger already
        # has handlers, so a second analyzer created in the same process keeps
        # writing to the log file configured by the first one.
        logging.basicConfig(
            filename=log_dir / f'advanced_cert_analysis_{self.timestamp}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def get_database_connection(self, db_name: str) -> Any:
        """Create a SQLAlchemy engine for ``db_name``.

        Args:
            db_name: Database name; ``'website_data'`` is reached on localhost,
                every other name on 192.168.1.92.

        Returns:
            The engine object returned by ``sqlalchemy.create_engine``.
        """
        from urllib.parse import quote

        host = 'localhost' if db_name == 'website_data' else '192.168.1.92'
        # Percent-encode the credentials so characters such as '@', ':' or '/'
        # in a password cannot corrupt the connection URL.
        user = quote(str(self.config['user']), safe='')
        password = quote(str(self.config['password']), safe='')
        return create_engine(f"postgresql://{user}:{password}@{host}/{db_name}")

    def extract_certificate_data(self, db_name: str) -> pd.DataFrame:
        """Extract certificate data from database.

        Args:
            db_name: Database to query.

        Returns:
            One row per ``website_data`` record with ``status = 7`` and a
            non-null certificate issuer.
        """
        self.logger.info(f"Extracting certificate data from {db_name}")

        query = """
        SELECT
            domain,
            https_certificate_issuer,
            https_certificate_domain,
            https_certificate_expiry,
            https_certificate_public_key,
            https_certificate_signature_algorithm,
            https_certificate_extensions,
            https_certificate_body,
            domain_registrar,
            last_update,
            whois_domain,
            dig_info_a,
            dig_info_mx,
            dig_info_ns,
            ip_organization
        FROM website_data
        WHERE status = 7
        AND https_certificate_issuer IS NOT NULL
        """

        engine = self.get_database_connection(db_name)
        try:
            return pd.read_sql_query(query, engine)
        finally:
            # Release the connection pool; the engine is not reused afterwards.
            engine.dispose()

    def _extract_basic_features(self, df: pd.DataFrame) -> Dict:
        """Extract basic certificate features.

        Returns:
            Row count plus value counts of issuer, signature algorithm and
            registrar.
        """
        def distribution(column: str) -> Dict:
            return df[column].value_counts().to_dict()

        return {
            'total_certs': int(len(df)),
            'issuer_distribution': distribution('https_certificate_issuer'),
            'algorithm_distribution': distribution('https_certificate_signature_algorithm'),
            'registrar_distribution': distribution('domain_registrar')
        }

    def _extract_temporal_features(self, df: pd.DataFrame) -> Dict:
        """Extract temporal patterns from the ``last_update`` column.

        The input frame is not modified. Rows whose ``last_update`` is missing
        are ignored.

        Returns:
            ``daily_counts`` keyed by ``YYYY-MM-DD``, ``monthly_counts`` keyed by
            ``"<year>-<month>"`` (month not zero-padded) and
            ``weekday_distribution`` keyed by day of week (0 = Monday).
        """
        # Work on a local Series so the caller's DataFrame is left untouched.
        # Dropping NaT keeps year/month integer typed; with NaT present pandas
        # returns floats, which would yield keys such as "2024.0-1.0".
        timestamps = pd.to_datetime(df['last_update']).dropna()

        daily_counts = timestamps.groupby(timestamps.dt.date).size()
        monthly_counts = timestamps.groupby([
            timestamps.dt.year.rename('year'),
            timestamps.dt.month.rename('month')
        ]).size()
        weekday_counts = timestamps.dt.dayofweek.value_counts()

        return {
            'daily_counts': {str(day): int(n) for day, n in daily_counts.items()},
            'monthly_counts': {
                f"{int(year)}-{int(month)}": int(n)
                for (year, month), n in monthly_counts.items()
            },
            'weekday_distribution': {int(day): int(n) for day, n in weekday_counts.items()}
        }

    def _extract_structural_features(self, df: pd.DataFrame) -> Dict:
        """Extract structural certificate features.

        Returns:
            ``domain_levels``: counts of the number of dot-separated labels in
            ``domain``; ``has_wildcard``: counts of whether the certificate
            domain contains ``*``; ``cert_domain_match``: counts of whether the
            lower-cased domain is a substring of the lower-cased certificate
            domain (False when the certificate domain is missing).
        """
        domains = df['domain']
        cert_domains = df['https_certificate_domain']

        domain_levels = domains.str.count(r'\.') + 1
        has_wildcard = cert_domains.str.contains('*', regex=False, na=False)

        # NOTE(review): plain substring test, so "www.example.com" does not
        # match a "*.example.com" certificate - confirm this is intended.
        matches = [
            bool(pd.notnull(cert)) and str(domain).lower() in str(cert).lower()
            for domain, cert in zip(domains, cert_domains)
        ]
        cert_domain_match = pd.Series(matches, dtype=bool)

        return {
            'domain_levels': domain_levels.value_counts().to_dict(),
            'has_wildcard': has_wildcard.value_counts().to_dict(),
            'cert_domain_match': cert_domain_match.value_counts().to_dict()
        }

    def _extract_security_features(self, df: pd.DataFrame) -> Dict:
        """Extract security-related features.

        Returns:
            ``key_strength``: one ``_analyze_key_strength`` result per row;
            ``is_self_signed``: counts of issuers containing "self signed";
            ``uses_sha1``: counts of signature algorithms containing "sha1".
            Both text checks are case-insensitive and treat missing values as
            False.
        """
        key_strength = [
            self._analyze_key_strength(key) for key in df['https_certificate_public_key']
        ]
        is_self_signed = df['https_certificate_issuer'].str.contains(
            'self signed', case=False, na=False
        )
        uses_sha1 = df['https_certificate_signature_algorithm'].str.contains(
            'sha1', case=False, na=False
        )
        return {
            'key_strength': key_strength,
            'is_self_signed': is_self_signed.value_counts().to_dict(),
            'uses_sha1': uses_sha1.value_counts().to_dict()
        }

    def _analyze_key_strength(self, key_info: str) -> Dict:
        """Analyze public key strength.

        Args:
            key_info: Free-text public key description, or a missing value.

        Returns:
            ``{'type': 'RSA' | 'EC' | 'unknown', 'strength': <int>}``. The
            strength is the number tagged as bits if there is one, otherwise
            the first number in the text, otherwise 0.
        """
        unknown = {'type': 'unknown', 'strength': 0}
        if pd.isna(key_info):
            return unknown

        text = str(key_info).upper()
        if 'RSA' in text:
            key_type = 'RSA'
        elif 'EC' in text:  # also covers 'ECDSA'
            key_type = 'EC'
        else:
            return unknown

        match = self._BITS_RE.search(text) or self._FIRST_NUMBER_RE.search(text)
        return {
            'type': key_type,
            'strength': int(match.group(1)) if match else 0
        }

    def analyze_certificates(self, db_name: str) -> Dict:
        """Perform comprehensive certificate analysis.

        Loads the data, extracts all feature groups, saves them as JSON and
        generates the plots.

        Returns:
            Dict with the keys ``basic``, ``temporal``, ``structural`` and
            ``security``.

        Raises:
            Exception: Any error from the steps above is logged and re-raised.
        """
        self.logger.info(f"Starting analysis for {db_name}")

        try:
            df = self.extract_certificate_data(db_name)

            features = {
                'basic': self._extract_basic_features(df),
                'temporal': self._extract_temporal_features(df),
                'structural': self._extract_structural_features(df),
                'security': self._extract_security_features(df)
            }

            self._save_processed_data(features, db_name)
            self._generate_visualizations(features, db_name)

            return features
        except Exception:
            # logger.exception records the traceback together with the message.
            self.logger.exception(f"Error analyzing certificates for {db_name}")
            raise

    def _process_for_serialization(self, obj: Any) -> Any:
        """Recursively convert ``obj`` into JSON-friendly Python values.

        Dict keys become strings; pandas objects become (processed) dicts;
        NumPy scalars become the matching Python ``bool``/``int``/``float``;
        dates become strings; lists, tuples and arrays become lists. Anything
        else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {str(k): self._process_for_serialization(v) for k, v in obj.items()}
        if isinstance(obj, (pd.Series, pd.DataFrame)):
            # Recurse so that non-string keys and NumPy values are normalised too.
            return self._process_for_serialization(obj.to_dict())
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            # Keep counts integral instead of turning them into floats.
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return str(obj)
        if isinstance(obj, np.ndarray):
            return [self._process_for_serialization(x) for x in obj.tolist()]
        if isinstance(obj, (list, tuple)):
            return [self._process_for_serialization(x) for x in obj]
        return obj

    def _save_processed_data(self, features: Dict, db_name: str) -> None:
        """Save processed features to ``cert_features_<db>_<timestamp>.json``.

        Raises:
            Exception: Any serialization or I/O error is logged and re-raised.
        """
        try:
            output_path = self.data_dir / f'cert_features_{db_name}_{self.timestamp}.json'
            processed_features = self._process_for_serialization(features)

            with open(output_path, 'w', encoding='utf-8') as f:
                # default=str is a last-resort fallback for unhandled types.
                json.dump(processed_features, f, indent=2, default=str)

            self.logger.info(f"Saved processed data to {output_path}")
        except Exception as e:
            self.logger.error(f"Error saving processed data: {str(e)}")
            raise

    def _generate_visualizations(self, features: Dict, db_name: str) -> None:
        """Generate and save the temporal, security and structural plots.

        Raises:
            Exception: Any plotting error is logged and re-raised.
        """
        try:
            self._plot_temporal_patterns(features['temporal'], db_name)
            self._plot_security_patterns(features['security'], db_name)
            self._plot_structural_patterns(features['structural'], db_name)
        except Exception as e:
            self.logger.error(f"Error generating visualizations: {str(e)}")
            raise

    def _plot_temporal_patterns(self, temporal_features: Dict, db_name: str) -> None:
        """Plot monthly certificate counts in chronological order and save as PNG."""
        monthly_data = {
            datetime.strptime(key, "%Y-%m"): count
            for key, count in temporal_features['monthly_counts'].items()
        }
        counts = [monthly_data[month] for month in sorted(monthly_data)]

        fig, ax = plt.subplots(figsize=(15, 5))
        try:
            ax.plot(range(len(counts)), counts)
            ax.set_title(f'Monthly Certificate Counts - {db_name}')
            ax.set_xlabel('Month Index')
            ax.set_ylabel('Number of Certificates')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'temporal_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

    def _plot_security_patterns(self, security_features: Dict, db_name: str) -> None:
        """Plot a histogram of the positive key strengths and save as PNG.

        An empty figure is saved when no positive key strength is available.
        """
        key_strengths = [
            entry['strength']
            for entry in security_features['key_strength']
            if entry['strength'] > 0
        ]

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            if key_strengths:
                ax.hist(key_strengths, bins=20, edgecolor='black')
                ax.set_title(f'Key Strength Distribution - {db_name}')
                ax.set_xlabel('Key Strength (bits)')
                ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'security_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

    def _plot_structural_patterns(self, structural_features: Dict, db_name: str) -> None:
        """Plot the domain level distribution as a bar chart and save as PNG."""
        domain_levels = structural_features['domain_levels']
        levels = list(domain_levels.keys())
        counts = list(domain_levels.values())

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.bar(levels, counts)
            ax.set_title(f'Domain Level Distribution - {db_name}')
            ax.set_xlabel('Domain Levels')
            ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'structural_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [3]:
class AdvancedCertificateAnalyzer:
    """Advanced SSL certificate analyzer with comprehensive feature extraction and analysis.

    Workflow (see ``analyze_certificates``): load certificate rows from a
    PostgreSQL database, derive basic / temporal / structural / security
    features, write them to a JSON file and render summary plots as PNG files.
    """

    # Project root used when the caller does not pass ``base_dir``.
    DEFAULT_BASE_DIR = '/home/asomura/waseda/nextstep/RAPIDS'

    # Key size parsing: prefer a number explicitly tagged as bits
    # ("2048 BIT", "256-BITS"), otherwise fall back to the first number found.
    _BITS_RE = re.compile(r'(\d+)\s*-?\s*BITS?\b')
    _FIRST_NUMBER_RE = re.compile(r'(\d+)')

    def __init__(self, config_path: str, base_dir: Optional[str] = None):
        """Initialize the analyzer with configuration and setup.

        Args:
            config_path: Path to a JSON file with a top-level ``database`` object;
                its ``user`` and ``password`` entries are used for connections.
            base_dir: Optional project root for reports, processed data and logs.
                Defaults to ``DEFAULT_BASE_DIR``.
        """
        self.setup_environment(config_path, base_dir)
        self.setup_logging()

    def setup_environment(self, config_path: str, base_dir: Optional[str] = None) -> None:
        """Load the database configuration and create the output directories.

        Args:
            config_path: Path to the JSON configuration file.
            base_dir: Optional project root; ``DEFAULT_BASE_DIR`` when omitted.

        Raises:
            OSError: If the configuration file cannot be read.
            KeyError: If the configuration has no ``database`` entry.
        """
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        self.base_dir = Path(base_dir if base_dir is not None else self.DEFAULT_BASE_DIR)
        self.output_dir = self.base_dir / 'reports' / 'certificate_analysis'
        self.data_dir = self.base_dir / 'data' / 'processed'

        for dir_path in (self.output_dir, self.data_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    def setup_logging(self) -> None:
        """Configure logging settings.

        Creates ``<base_dir>/data/logs``, stores the run timestamp in
        ``self.timestamp`` (also used in output file names) and sets
        ``self.logger``.
        """
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # NOTE: logging.basicConfig does nothing when the root logger already
        # has handlers, so a second analyzer created in the same process keeps
        # writing to the log file configured by the first one.
        logging.basicConfig(
            filename=log_dir / f'advanced_cert_analysis_{self.timestamp}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def get_database_connection(self, db_name: str) -> Any:
        """Create a SQLAlchemy engine for ``db_name``.

        Args:
            db_name: Database name; ``'website_data'`` is reached on localhost,
                every other name on 192.168.1.92.

        Returns:
            The engine object returned by ``sqlalchemy.create_engine``.
        """
        from urllib.parse import quote

        host = 'localhost' if db_name == 'website_data' else '192.168.1.92'
        # Percent-encode the credentials so characters such as '@', ':' or '/'
        # in a password cannot corrupt the connection URL.
        user = quote(str(self.config['user']), safe='')
        password = quote(str(self.config['password']), safe='')
        return create_engine(f"postgresql://{user}:{password}@{host}/{db_name}")

    def extract_certificate_data(self, db_name: str) -> pd.DataFrame:
        """Extract certificate data from database.

        Args:
            db_name: Database to query.

        Returns:
            One row per ``website_data`` record with ``status = 7`` and a
            non-null certificate issuer.
        """
        self.logger.info(f"Extracting certificate data from {db_name}")

        query = """
        SELECT
            domain,
            https_certificate_issuer,
            https_certificate_domain,
            https_certificate_expiry,
            https_certificate_public_key,
            https_certificate_signature_algorithm,
            https_certificate_extensions,
            https_certificate_body,
            domain_registrar,
            last_update,
            whois_domain,
            dig_info_a,
            dig_info_mx,
            dig_info_ns,
            ip_organization
        FROM website_data
        WHERE status = 7
        AND https_certificate_issuer IS NOT NULL
        """

        engine = self.get_database_connection(db_name)
        try:
            return pd.read_sql_query(query, engine)
        finally:
            # Release the connection pool; the engine is not reused afterwards.
            engine.dispose()

    def _extract_basic_features(self, df: pd.DataFrame) -> Dict:
        """Extract basic certificate features.

        Returns:
            Row count plus value counts of issuer, signature algorithm and
            registrar.
        """
        def distribution(column: str) -> Dict:
            return df[column].value_counts().to_dict()

        return {
            'total_certs': int(len(df)),
            'issuer_distribution': distribution('https_certificate_issuer'),
            'algorithm_distribution': distribution('https_certificate_signature_algorithm'),
            'registrar_distribution': distribution('domain_registrar')
        }

    def _extract_temporal_features(self, df: pd.DataFrame) -> Dict:
        """Extract temporal patterns from the ``last_update`` column.

        The input frame is not modified. Rows whose ``last_update`` is missing
        are ignored.

        Returns:
            ``daily_counts`` keyed by ``YYYY-MM-DD``, ``monthly_counts`` keyed by
            ``"<year>-<month>"`` (month not zero-padded) and
            ``weekday_distribution`` keyed by day of week (0 = Monday).
        """
        # Work on a local Series so the caller's DataFrame is left untouched.
        # Dropping NaT keeps year/month integer typed; with NaT present pandas
        # returns floats, which would yield keys such as "2024.0-1.0".
        timestamps = pd.to_datetime(df['last_update']).dropna()

        daily_counts = timestamps.groupby(timestamps.dt.date).size()
        monthly_counts = timestamps.groupby([
            timestamps.dt.year.rename('year'),
            timestamps.dt.month.rename('month')
        ]).size()
        weekday_counts = timestamps.dt.dayofweek.value_counts()

        return {
            'daily_counts': {str(day): int(n) for day, n in daily_counts.items()},
            'monthly_counts': {
                f"{int(year)}-{int(month)}": int(n)
                for (year, month), n in monthly_counts.items()
            },
            'weekday_distribution': {int(day): int(n) for day, n in weekday_counts.items()}
        }

    def _extract_structural_features(self, df: pd.DataFrame) -> Dict:
        """Extract structural certificate features.

        Returns:
            ``domain_levels``: counts of the number of dot-separated labels in
            ``domain``; ``has_wildcard``: counts of whether the certificate
            domain contains ``*``; ``cert_domain_match``: counts of whether the
            lower-cased domain is a substring of the lower-cased certificate
            domain (False when the certificate domain is missing).
        """
        domains = df['domain']
        cert_domains = df['https_certificate_domain']

        domain_levels = domains.str.count(r'\.') + 1
        has_wildcard = cert_domains.str.contains('*', regex=False, na=False)

        # NOTE(review): plain substring test, so "www.example.com" does not
        # match a "*.example.com" certificate - confirm this is intended.
        matches = [
            bool(pd.notnull(cert)) and str(domain).lower() in str(cert).lower()
            for domain, cert in zip(domains, cert_domains)
        ]
        cert_domain_match = pd.Series(matches, dtype=bool)

        return {
            'domain_levels': domain_levels.value_counts().to_dict(),
            'has_wildcard': has_wildcard.value_counts().to_dict(),
            'cert_domain_match': cert_domain_match.value_counts().to_dict()
        }

    def _extract_security_features(self, df: pd.DataFrame) -> Dict:
        """Extract security-related features.

        Returns:
            ``key_strength``: one ``_analyze_key_strength`` result per row;
            ``is_self_signed``: counts of issuers containing "self signed";
            ``uses_sha1``: counts of signature algorithms containing "sha1".
            Both text checks are case-insensitive and treat missing values as
            False.
        """
        key_strength = [
            self._analyze_key_strength(key) for key in df['https_certificate_public_key']
        ]
        is_self_signed = df['https_certificate_issuer'].str.contains(
            'self signed', case=False, na=False
        )
        uses_sha1 = df['https_certificate_signature_algorithm'].str.contains(
            'sha1', case=False, na=False
        )
        return {
            'key_strength': key_strength,
            'is_self_signed': is_self_signed.value_counts().to_dict(),
            'uses_sha1': uses_sha1.value_counts().to_dict()
        }

    def _analyze_key_strength(self, key_info: str) -> Dict:
        """Analyze public key strength.

        Args:
            key_info: Free-text public key description, or a missing value.

        Returns:
            ``{'type': 'RSA' | 'EC' | 'unknown', 'strength': <int>}``. The
            strength is the number tagged as bits if there is one, otherwise
            the first number in the text, otherwise 0.
        """
        unknown = {'type': 'unknown', 'strength': 0}
        if pd.isna(key_info):
            return unknown

        text = str(key_info).upper()
        if 'RSA' in text:
            key_type = 'RSA'
        elif 'EC' in text:  # also covers 'ECDSA'
            key_type = 'EC'
        else:
            return unknown

        match = self._BITS_RE.search(text) or self._FIRST_NUMBER_RE.search(text)
        return {
            'type': key_type,
            'strength': int(match.group(1)) if match else 0
        }

    def analyze_certificates(self, db_name: str) -> Dict:
        """Perform comprehensive certificate analysis.

        Loads the data, extracts all feature groups, saves them as JSON and
        generates the plots.

        Returns:
            Dict with the keys ``basic``, ``temporal``, ``structural`` and
            ``security``.

        Raises:
            Exception: Any error from the steps above is logged and re-raised.
        """
        self.logger.info(f"Starting analysis for {db_name}")

        try:
            df = self.extract_certificate_data(db_name)

            features = {
                'basic': self._extract_basic_features(df),
                'temporal': self._extract_temporal_features(df),
                'structural': self._extract_structural_features(df),
                'security': self._extract_security_features(df)
            }

            self._save_processed_data(features, db_name)
            self._generate_visualizations(features, db_name)

            return features
        except Exception:
            # logger.exception records the traceback together with the message.
            self.logger.exception(f"Error analyzing certificates for {db_name}")
            raise

    def _process_for_serialization(self, obj: Any) -> Any:
        """Recursively convert ``obj`` into JSON-friendly Python values.

        Dict keys become strings; pandas objects become (processed) dicts;
        NumPy scalars become the matching Python ``bool``/``int``/``float``;
        dates become strings; lists, tuples and arrays become lists. Anything
        else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {str(k): self._process_for_serialization(v) for k, v in obj.items()}
        if isinstance(obj, (pd.Series, pd.DataFrame)):
            # Recurse so that non-string keys and NumPy values are normalised too.
            return self._process_for_serialization(obj.to_dict())
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            # Keep counts integral instead of turning them into floats.
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return str(obj)
        if isinstance(obj, np.ndarray):
            return [self._process_for_serialization(x) for x in obj.tolist()]
        if isinstance(obj, (list, tuple)):
            return [self._process_for_serialization(x) for x in obj]
        return obj

    def _save_processed_data(self, features: Dict, db_name: str) -> None:
        """Save processed features to ``cert_features_<db>_<timestamp>.json``.

        Raises:
            Exception: Any serialization or I/O error is logged and re-raised.
        """
        try:
            output_path = self.data_dir / f'cert_features_{db_name}_{self.timestamp}.json'
            processed_features = self._process_for_serialization(features)

            with open(output_path, 'w', encoding='utf-8') as f:
                # default=str is a last-resort fallback for unhandled types.
                json.dump(processed_features, f, indent=2, default=str)

            self.logger.info(f"Saved processed data to {output_path}")
        except Exception as e:
            self.logger.error(f"Error saving processed data: {str(e)}")
            raise

    def _generate_visualizations(self, features: Dict, db_name: str) -> None:
        """Generate and save the temporal, security and structural plots.

        Raises:
            Exception: Any plotting error is logged and re-raised.
        """
        try:
            self._plot_temporal_patterns(features['temporal'], db_name)
            self._plot_security_patterns(features['security'], db_name)
            self._plot_structural_patterns(features['structural'], db_name)
        except Exception as e:
            self.logger.error(f"Error generating visualizations: {str(e)}")
            raise

    def _plot_temporal_patterns(self, temporal_features: Dict, db_name: str) -> None:
        """Plot monthly certificate counts in chronological order and save as PNG."""
        monthly_data = {
            datetime.strptime(key, "%Y-%m"): count
            for key, count in temporal_features['monthly_counts'].items()
        }
        counts = [monthly_data[month] for month in sorted(monthly_data)]

        fig, ax = plt.subplots(figsize=(15, 5))
        try:
            ax.plot(range(len(counts)), counts)
            ax.set_title(f'Monthly Certificate Counts - {db_name}')
            ax.set_xlabel('Month Index')
            ax.set_ylabel('Number of Certificates')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'temporal_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

    def _plot_security_patterns(self, security_features: Dict, db_name: str) -> None:
        """Plot a histogram of the positive key strengths and save as PNG.

        An empty figure is saved when no positive key strength is available.
        """
        key_strengths = [
            entry['strength']
            for entry in security_features['key_strength']
            if entry['strength'] > 0
        ]

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            if key_strengths:
                ax.hist(key_strengths, bins=20, edgecolor='black')
                ax.set_title(f'Key Strength Distribution - {db_name}')
                ax.set_xlabel('Key Strength (bits)')
                ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'security_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

    def _plot_structural_patterns(self, structural_features: Dict, db_name: str) -> None:
        """Plot the domain level distribution as a bar chart and save as PNG."""
        domain_levels = structural_features['domain_levels']
        levels = list(domain_levels.keys())
        counts = list(domain_levels.values())

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.bar(levels, counts)
            ax.set_title(f'Domain Level Distribution - {db_name}')
            ax.set_xlabel('Domain Levels')
            ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'structural_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Apply matplotlib's built-in 'default' style sheet to all subsequent plots
plt.style.use('default')

In [3]:
class AdvancedCertificateAnalyzer:
    """Advanced SSL certificate analyzer with comprehensive feature extraction and analysis.

    Workflow (see ``analyze_certificates``): load certificate rows from a
    PostgreSQL database, derive basic / temporal / structural / security
    features, write them to a JSON file and render summary plots as PNG files.
    """

    # Project root used when the caller does not pass ``base_dir``.
    DEFAULT_BASE_DIR = '/home/asomura/waseda/nextstep/RAPIDS'

    # Key size parsing: prefer a number explicitly tagged as bits
    # ("2048 BIT", "256-BITS"), otherwise fall back to the first number found.
    _BITS_RE = re.compile(r'(\d+)\s*-?\s*BITS?\b')
    _FIRST_NUMBER_RE = re.compile(r'(\d+)')

    def __init__(self, config_path: str, base_dir: Optional[str] = None):
        """Initialize the analyzer with configuration and setup.

        Args:
            config_path: Path to a JSON file with a top-level ``database`` object;
                its ``user`` and ``password`` entries are used for connections.
            base_dir: Optional project root for reports, processed data and logs.
                Defaults to ``DEFAULT_BASE_DIR``.
        """
        self.setup_environment(config_path, base_dir)
        self.setup_logging()

    def setup_environment(self, config_path: str, base_dir: Optional[str] = None) -> None:
        """Load the database configuration and create the output directories.

        Args:
            config_path: Path to the JSON configuration file.
            base_dir: Optional project root; ``DEFAULT_BASE_DIR`` when omitted.

        Raises:
            OSError: If the configuration file cannot be read.
            KeyError: If the configuration has no ``database`` entry.
        """
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        self.base_dir = Path(base_dir if base_dir is not None else self.DEFAULT_BASE_DIR)
        self.output_dir = self.base_dir / 'reports' / 'certificate_analysis'
        self.data_dir = self.base_dir / 'data' / 'processed'

        for dir_path in (self.output_dir, self.data_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    def setup_logging(self) -> None:
        """Configure logging settings.

        Creates ``<base_dir>/data/logs``, stores the run timestamp in
        ``self.timestamp`` (also used in output file names) and sets
        ``self.logger``.
        """
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # NOTE: logging.basicConfig does nothing when the root logger already
        # has handlers, so a second analyzer created in the same process keeps
        # writing to the log file configured by the first one.
        logging.basicConfig(
            filename=log_dir / f'advanced_cert_analysis_{self.timestamp}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def get_database_connection(self, db_name: str) -> Any:
        """Create a SQLAlchemy engine for ``db_name``.

        Args:
            db_name: Database name; ``'website_data'`` is reached on localhost,
                every other name on 192.168.1.92.

        Returns:
            The engine object returned by ``sqlalchemy.create_engine``.
        """
        from urllib.parse import quote

        host = 'localhost' if db_name == 'website_data' else '192.168.1.92'
        # Percent-encode the credentials so characters such as '@', ':' or '/'
        # in a password cannot corrupt the connection URL.
        user = quote(str(self.config['user']), safe='')
        password = quote(str(self.config['password']), safe='')
        return create_engine(f"postgresql://{user}:{password}@{host}/{db_name}")

    def extract_certificate_data(self, db_name: str) -> pd.DataFrame:
        """Extract certificate data from database.

        Args:
            db_name: Database to query.

        Returns:
            One row per ``website_data`` record with ``status = 7`` and a
            non-null certificate issuer.
        """
        self.logger.info(f"Extracting certificate data from {db_name}")

        query = """
        SELECT
            domain,
            https_certificate_issuer,
            https_certificate_domain,
            https_certificate_expiry,
            https_certificate_public_key,
            https_certificate_signature_algorithm,
            https_certificate_extensions,
            https_certificate_body,
            domain_registrar,
            last_update,
            whois_domain,
            dig_info_a,
            dig_info_mx,
            dig_info_ns,
            ip_organization
        FROM website_data
        WHERE status = 7
        AND https_certificate_issuer IS NOT NULL
        """

        engine = self.get_database_connection(db_name)
        try:
            return pd.read_sql_query(query, engine)
        finally:
            # Release the connection pool; the engine is not reused afterwards.
            engine.dispose()

    def _extract_basic_features(self, df: pd.DataFrame) -> Dict:
        """Extract basic certificate features.

        Returns:
            Row count plus value counts of issuer, signature algorithm and
            registrar.
        """
        def distribution(column: str) -> Dict:
            return df[column].value_counts().to_dict()

        return {
            'total_certs': int(len(df)),
            'issuer_distribution': distribution('https_certificate_issuer'),
            'algorithm_distribution': distribution('https_certificate_signature_algorithm'),
            'registrar_distribution': distribution('domain_registrar')
        }

    def _extract_temporal_features(self, df: pd.DataFrame) -> Dict:
        """Extract temporal patterns from the ``last_update`` column.

        The input frame is not modified. Rows whose ``last_update`` is missing
        are ignored.

        Returns:
            ``daily_counts`` keyed by ``YYYY-MM-DD``, ``monthly_counts`` keyed by
            ``"<year>-<month>"`` (month not zero-padded) and
            ``weekday_distribution`` keyed by day of week (0 = Monday).
        """
        # Work on a local Series so the caller's DataFrame is left untouched.
        # Dropping NaT keeps year/month integer typed; with NaT present pandas
        # returns floats, which would yield keys such as "2024.0-1.0".
        timestamps = pd.to_datetime(df['last_update']).dropna()

        daily_counts = timestamps.groupby(timestamps.dt.date).size()
        monthly_counts = timestamps.groupby([
            timestamps.dt.year.rename('year'),
            timestamps.dt.month.rename('month')
        ]).size()
        weekday_counts = timestamps.dt.dayofweek.value_counts()

        return {
            'daily_counts': {str(day): int(n) for day, n in daily_counts.items()},
            'monthly_counts': {
                f"{int(year)}-{int(month)}": int(n)
                for (year, month), n in monthly_counts.items()
            },
            'weekday_distribution': {int(day): int(n) for day, n in weekday_counts.items()}
        }

    def _extract_structural_features(self, df: pd.DataFrame) -> Dict:
        """Extract structural certificate features.

        Returns:
            ``domain_levels``: counts of the number of dot-separated labels in
            ``domain``; ``has_wildcard``: counts of whether the certificate
            domain contains ``*``; ``cert_domain_match``: counts of whether the
            lower-cased domain is a substring of the lower-cased certificate
            domain (False when the certificate domain is missing).
        """
        domains = df['domain']
        cert_domains = df['https_certificate_domain']

        domain_levels = domains.str.count(r'\.') + 1
        has_wildcard = cert_domains.str.contains('*', regex=False, na=False)

        # NOTE(review): plain substring test, so "www.example.com" does not
        # match a "*.example.com" certificate - confirm this is intended.
        matches = [
            bool(pd.notnull(cert)) and str(domain).lower() in str(cert).lower()
            for domain, cert in zip(domains, cert_domains)
        ]
        cert_domain_match = pd.Series(matches, dtype=bool)

        return {
            'domain_levels': domain_levels.value_counts().to_dict(),
            'has_wildcard': has_wildcard.value_counts().to_dict(),
            'cert_domain_match': cert_domain_match.value_counts().to_dict()
        }

    def _extract_security_features(self, df: pd.DataFrame) -> Dict:
        """Extract security-related features.

        Returns:
            ``key_strength``: one ``_analyze_key_strength`` result per row;
            ``is_self_signed``: counts of issuers containing "self signed";
            ``uses_sha1``: counts of signature algorithms containing "sha1".
            Both text checks are case-insensitive and treat missing values as
            False.
        """
        key_strength = [
            self._analyze_key_strength(key) for key in df['https_certificate_public_key']
        ]
        is_self_signed = df['https_certificate_issuer'].str.contains(
            'self signed', case=False, na=False
        )
        uses_sha1 = df['https_certificate_signature_algorithm'].str.contains(
            'sha1', case=False, na=False
        )
        return {
            'key_strength': key_strength,
            'is_self_signed': is_self_signed.value_counts().to_dict(),
            'uses_sha1': uses_sha1.value_counts().to_dict()
        }

    def _analyze_key_strength(self, key_info: str) -> Dict:
        """Analyze public key strength.

        Args:
            key_info: Free-text public key description, or a missing value.

        Returns:
            ``{'type': 'RSA' | 'EC' | 'unknown', 'strength': <int>}``. The
            strength is the number tagged as bits if there is one, otherwise
            the first number in the text, otherwise 0.
        """
        unknown = {'type': 'unknown', 'strength': 0}
        if pd.isna(key_info):
            return unknown

        text = str(key_info).upper()
        if 'RSA' in text:
            key_type = 'RSA'
        elif 'EC' in text:  # also covers 'ECDSA'
            key_type = 'EC'
        else:
            return unknown

        match = self._BITS_RE.search(text) or self._FIRST_NUMBER_RE.search(text)
        return {
            'type': key_type,
            'strength': int(match.group(1)) if match else 0
        }

    def analyze_certificates(self, db_name: str) -> Dict:
        """Perform comprehensive certificate analysis.

        Loads the data, extracts all feature groups, saves them as JSON and
        generates the plots.

        Returns:
            Dict with the keys ``basic``, ``temporal``, ``structural`` and
            ``security``.

        Raises:
            Exception: Any error from the steps above is logged and re-raised.
        """
        self.logger.info(f"Starting analysis for {db_name}")

        try:
            df = self.extract_certificate_data(db_name)

            features = {
                'basic': self._extract_basic_features(df),
                'temporal': self._extract_temporal_features(df),
                'structural': self._extract_structural_features(df),
                'security': self._extract_security_features(df)
            }

            self._save_processed_data(features, db_name)
            self._generate_visualizations(features, db_name)

            return features
        except Exception:
            # logger.exception records the traceback together with the message.
            self.logger.exception(f"Error analyzing certificates for {db_name}")
            raise

    def _process_for_serialization(self, obj: Any) -> Any:
        """Recursively convert ``obj`` into JSON-friendly Python values.

        Dict keys become strings; pandas objects become (processed) dicts;
        NumPy scalars become the matching Python ``bool``/``int``/``float``;
        dates become strings; lists, tuples and arrays become lists. Anything
        else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {str(k): self._process_for_serialization(v) for k, v in obj.items()}
        if isinstance(obj, (pd.Series, pd.DataFrame)):
            # Recurse so that non-string keys and NumPy values are normalised too.
            return self._process_for_serialization(obj.to_dict())
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            # Keep counts integral instead of turning them into floats.
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return str(obj)
        if isinstance(obj, np.ndarray):
            return [self._process_for_serialization(x) for x in obj.tolist()]
        if isinstance(obj, (list, tuple)):
            return [self._process_for_serialization(x) for x in obj]
        return obj

    def _save_processed_data(self, features: Dict, db_name: str) -> None:
        """Save processed features to ``cert_features_<db>_<timestamp>.json``.

        Raises:
            Exception: Any serialization or I/O error is logged and re-raised.
        """
        try:
            output_path = self.data_dir / f'cert_features_{db_name}_{self.timestamp}.json'
            processed_features = self._process_for_serialization(features)

            with open(output_path, 'w', encoding='utf-8') as f:
                # default=str is a last-resort fallback for unhandled types.
                json.dump(processed_features, f, indent=2, default=str)

            self.logger.info(f"Saved processed data to {output_path}")
        except Exception as e:
            self.logger.error(f"Error saving processed data: {str(e)}")
            raise

    def _generate_visualizations(self, features: Dict, db_name: str) -> None:
        """Generate and save the temporal, security and structural plots.

        Raises:
            Exception: Any plotting error is logged and re-raised.
        """
        try:
            self._plot_temporal_patterns(features['temporal'], db_name)
            self._plot_security_patterns(features['security'], db_name)
            self._plot_structural_patterns(features['structural'], db_name)
        except Exception as e:
            self.logger.error(f"Error generating visualizations: {str(e)}")
            raise

    def _plot_temporal_patterns(self, temporal_features: Dict, db_name: str) -> None:
        """Plot monthly certificate counts in chronological order and save as PNG."""
        monthly_data = {
            datetime.strptime(key, "%Y-%m"): count
            for key, count in temporal_features['monthly_counts'].items()
        }
        counts = [monthly_data[month] for month in sorted(monthly_data)]

        fig, ax = plt.subplots(figsize=(15, 5))
        try:
            ax.plot(range(len(counts)), counts)
            ax.set_title(f'Monthly Certificate Counts - {db_name}')
            ax.set_xlabel('Month Index')
            ax.set_ylabel('Number of Certificates')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'temporal_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

    def _plot_security_patterns(self, security_features: Dict, db_name: str) -> None:
        """Plot a histogram of the positive key strengths and save as PNG.

        An empty figure is saved when no positive key strength is available.
        """
        key_strengths = [
            entry['strength']
            for entry in security_features['key_strength']
            if entry['strength'] > 0
        ]

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            if key_strengths:
                ax.hist(key_strengths, bins=20, edgecolor='black')
                ax.set_title(f'Key Strength Distribution - {db_name}')
                ax.set_xlabel('Key Strength (bits)')
                ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'security_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

    def _plot_structural_patterns(self, structural_features: Dict, db_name: str) -> None:
        """Plot the domain level distribution as a bar chart and save as PNG."""
        domain_levels = structural_features['domain_levels']
        levels = list(domain_levels.keys())
        counts = list(domain_levels.values())

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.bar(levels, counts)
            ax.set_title(f'Domain Level Distribution - {db_name}')
            ax.set_xlabel('Domain Levels')
            ax.set_ylabel('Count')

            fig.tight_layout()
            fig.savefig(self.output_dir / f'structural_analysis_{db_name}_{self.timestamp}.png')
        finally:
            plt.close(fig)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Set up default plotting style
plt.style.use('default')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Set up default plotting style
plt.style.use('default')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging
from datetime import datetime, date
import re
from typing import Dict, List, Optional, Tuple, Any

# Set up default plotting style
plt.style.use('default')

In [3]:
class AdvancedCertificateAnalyzer:
    """Extract, summarise and plot HTTPS certificate features from a database.

    For each database the analyzer reads certificate rows from the
    ``website_data`` table, derives basic, temporal, structural and security
    features, writes them to a JSON file and renders one PNG per plot.
    """

    def __init__(self, config_path: str):
        """Load the configuration and prepare output directories and logging.

        Args:
            config_path: Path to a JSON file with a top-level ``'database'``
                object (its ``'user'`` and ``'password'`` entries are used to
                build the connection URL).
        """
        # Order matters: setup_logging() relies on self.base_dir.
        self.setup_environment(config_path)
        self.setup_logging()

    def setup_environment(self, config_path: str) -> None:
        """Read the database configuration and create the output directories.

        Args:
            config_path: Path to the JSON configuration file.
        """
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        self.base_dir = Path('/home/asomura/waseda/nextstep/RAPIDS')
        self.output_dir = self.base_dir / 'reports' / 'certificate_analysis'
        self.data_dir = self.base_dir / 'data' / 'processed'

        for dir_path in (self.output_dir, self.data_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    def setup_logging(self) -> None:
        """Configure file logging and set the run timestamp.

        Sets ``self.timestamp`` (used in every output file name) and
        ``self.logger``.
        """
        log_dir = self.base_dir / 'data' / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # NOTE: logging.basicConfig() does nothing when the root logger
        # already has handlers, e.g. when a second analyzer is created in
        # the same process.
        logging.basicConfig(
            filename=log_dir / f'advanced_cert_analysis_{self.timestamp}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def get_database_connection(self, db_name: str) -> Any:
        """Create a SQLAlchemy engine for ``db_name``.

        Args:
            db_name: Database name. ``'website_data'`` is reached on
                ``localhost``; every other name on ``192.168.1.92``.

        Returns:
            The object returned by ``sqlalchemy.create_engine``. The
            annotation is ``Any`` because ``create_engine`` itself is a
            factory function, not a type.
        """
        host = 'localhost' if db_name == 'website_data' else '192.168.1.92'
        return create_engine(
            f"postgresql://{self.config['user']}:{self.config['password']}@{host}/{db_name}"
        )

    def extract_certificate_data(self, db_name: str) -> pd.DataFrame:
        """Load the certificate rows of ``db_name`` into a DataFrame.

        Only rows with ``status = 7`` and a non-null certificate issuer are
        selected.

        Args:
            db_name: Database to query.

        Returns:
            One row per selected ``website_data`` record.
        """
        self.logger.info(f"Extracting certificate data from {db_name}")

        engine = self.get_database_connection(db_name)
        query = """
        SELECT 
            domain,
            https_certificate_issuer,
            https_certificate_domain,
            https_certificate_expiry,
            https_certificate_public_key,
            https_certificate_signature_algorithm,
            https_certificate_extensions,
            https_certificate_body,
            domain_registrar,
            last_update,
            whois_domain,
            dig_info_a,
            dig_info_mx,
            dig_info_ns,
            ip_organization
        FROM website_data 
        WHERE status = 7 
        AND https_certificate_issuer IS NOT NULL
        """

        try:
            return pd.read_sql_query(query, engine)
        finally:
            # A new engine is created per call; release its connection pool
            # instead of leaving it open.
            engine.dispose()

    def _extract_basic_features(self, df: pd.DataFrame) -> Dict:
        """Return the row count and the issuer/algorithm/registrar frequencies.

        Args:
            df: Certificate rows as returned by ``extract_certificate_data``.
        """
        return {
            'total_certs': int(len(df)),
            'issuer_distribution': df['https_certificate_issuer'].value_counts().to_dict(),
            'algorithm_distribution': df['https_certificate_signature_algorithm'].value_counts().to_dict(),
            'registrar_distribution': df['domain_registrar'].value_counts().to_dict()
        }

    def _extract_temporal_features(self, df: pd.DataFrame) -> Dict:
        """Count certificates per day, per month and per weekday of ``last_update``.

        The input frame is not modified. Rows whose ``last_update`` is
        missing are left out of all three counts.

        Args:
            df: Frame with a ``last_update`` column parseable by
                ``pd.to_datetime``.

        Returns:
            ``daily_counts`` keyed by ``str(date)``, ``monthly_counts`` keyed
            by ``"<year>-<month>"`` (month not zero-padded) and
            ``weekday_distribution`` keyed by ``dt.dayofweek`` integers.
        """
        # Work on a local Series so the caller's DataFrame stays untouched.
        timestamps = pd.to_datetime(df['last_update'])

        daily_counts = timestamps.groupby(timestamps.dt.date.rename('day')).size()
        monthly_counts = timestamps.groupby([
            timestamps.dt.year.rename('year'),
            timestamps.dt.month.rename('month')
        ]).size()

        return {
            'daily_counts': {str(day): int(n) for day, n in daily_counts.items()},
            # A single missing timestamp turns year/month into floats; cast
            # back to int so the key stays "2024-1" rather than "2024.0-1.0",
            # which the "%Y-%m" parsing in the temporal plot cannot read.
            'monthly_counts': {
                f"{int(year)}-{int(month)}": int(n)
                for (year, month), n in monthly_counts.items()
            },
            'weekday_distribution': (
                timestamps.dt.dayofweek.dropna().astype(int).value_counts().to_dict()
            )
        }

    def _extract_structural_features(self, df: pd.DataFrame) -> Dict:
        """Summarise domain depth, wildcard usage and domain/certificate match.

        Args:
            df: Frame with ``domain`` and ``https_certificate_domain`` columns.

        Returns:
            ``domain_levels`` (number of dots + 1 -> count), ``has_wildcard``
            (whether the certificate domain contains ``*``) and
            ``cert_domain_match`` (whether the domain occurs, case-insensitively,
            as a substring of the certificate domain), each as a
            ``value -> count`` dict.
        """
        domain_levels = df['domain'].str.count(r'\.') + 1
        # na=False maps missing certificate domains to False directly and
        # avoids the pandas downcasting warning raised by .fillna(False).
        has_wildcard = df['https_certificate_domain'].str.contains(r'\*', regex=True, na=False)

        # A plain pass over the two columns replaces the row-wise
        # DataFrame.apply; a missing certificate domain never matches.
        cert_domain_match = pd.Series(
            [
                bool(pd.notnull(cert_domain))
                and str(domain).lower() in str(cert_domain).lower()
                for domain, cert_domain in zip(df['domain'], df['https_certificate_domain'])
            ],
            dtype=bool
        )

        return {
            'domain_levels': domain_levels.value_counts().to_dict(),
            'has_wildcard': has_wildcard.value_counts().to_dict(),
            'cert_domain_match': cert_domain_match.value_counts().to_dict()
        }

    def _extract_security_features(self, df: pd.DataFrame) -> Dict:
        """Derive key strength, self-signed and SHA-1 indicators.

        Args:
            df: Frame with ``https_certificate_public_key``,
                ``https_certificate_issuer`` and
                ``https_certificate_signature_algorithm`` columns.

        Returns:
            ``key_strength`` (one ``_analyze_key_strength`` dict per row, in
            row order), ``is_self_signed`` and ``uses_sha1`` (each a
            ``bool -> count`` dict; missing values count as False).
        """
        key_strength = df['https_certificate_public_key'].apply(self._analyze_key_strength)
        return {
            'key_strength': list(key_strength),
            'is_self_signed': df['https_certificate_issuer'].str.contains(
                'self signed', case=False, na=False
            ).value_counts().to_dict(),
            'uses_sha1': df['https_certificate_signature_algorithm'].str.contains(
                'sha1', case=False, na=False
            ).value_counts().to_dict()
        }

    def _analyze_key_strength(self, key_info: str) -> Dict:
        """Classify a public-key description and pull out its size.

        Args:
            key_info: Free-text key description; may be missing.

        Returns:
            ``{'type': 'RSA' | 'EC' | 'unknown', 'strength': int}``. The
            strength is the first run of digits in the text, or 0 when the
            type is unknown or no digits are present.
        """
        if pd.isna(key_info):
            return {'type': 'unknown', 'strength': 0}

        key_info = str(key_info).upper()

        # 'RSA' is checked first; 'EC' also covers 'ECDSA'.
        if 'RSA' in key_info:
            key_type = 'RSA'
        elif 'EC' in key_info:
            key_type = 'EC'
        else:
            return {'type': 'unknown', 'strength': 0}

        match = re.search(r'\d+', key_info)
        return {
            'type': key_type,
            'strength': int(match.group()) if match else 0
        }

    def analyze_certificates(self, db_name: str) -> Dict:
        """Run the full pipeline for one database.

        Loads the data, extracts all feature groups, saves them as JSON and
        renders the plots.

        Args:
            db_name: Database to analyse.

        Returns:
            Dict with the ``'basic'``, ``'temporal'``, ``'structural'`` and
            ``'security'`` feature groups.

        Raises:
            Exception: Any error from the steps above is logged and re-raised.
        """
        self.logger.info(f"Starting analysis for {db_name}")

        try:
            df = self.extract_certificate_data(db_name)

            features = {
                'basic': self._extract_basic_features(df),
                'temporal': self._extract_temporal_features(df),
                'structural': self._extract_structural_features(df),
                'security': self._extract_security_features(df)
            }

            self._save_processed_data(features, db_name)
            self._generate_visualizations(features, db_name)

            return features
        except Exception as e:
            self.logger.error(f"Error analyzing certificates for {db_name}: {str(e)}")
            raise

    def _process_for_serialization(self, obj: Any) -> Any:
        """Recursively convert ``obj`` into JSON-friendly built-in types.

        Dict keys become strings; pandas objects become (processed) dicts;
        NumPy scalars become the matching ``bool``/``int``/``float``; dates
        become strings; lists and tuples become lists. Anything else is
        returned unchanged.
        """
        if isinstance(obj, dict):
            return {str(k): self._process_for_serialization(v) for k, v in obj.items()}
        if isinstance(obj, (pd.Series, pd.DataFrame)):
            # Recurse so nested keys and NumPy scalars are normalised too.
            return self._process_for_serialization(obj.to_dict())
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            # Keep counts integral instead of turning them into floats.
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, (date, datetime)):
            return str(obj)
        if isinstance(obj, (list, tuple)):
            return [self._process_for_serialization(x) for x in obj]
        return obj

    def _save_processed_data(self, features: Dict, db_name: str) -> None:
        """Write the feature dict to a timestamped JSON file in ``self.data_dir``.

        Args:
            features: Feature groups produced by ``analyze_certificates``.
            db_name: Database name, used in the file name.

        Raises:
            Exception: Any error while serialising or writing is logged and
                re-raised.
        """
        try:
            output_path = self.data_dir / f'cert_features_{db_name}_{self.timestamp}.json'
            processed_features = self._process_for_serialization(features)

            with open(output_path, 'w', encoding='utf-8') as f:
                # default=str is a last resort for types not handled above.
                json.dump(processed_features, f, indent=2, default=str)

            self.logger.info(f"Saved processed data to {output_path}")
        except Exception as e:
            self.logger.error(f"Error saving processed data: {str(e)}")
            raise

    def _generate_visualizations(self, features: Dict, db_name: str) -> None:
        """Render the temporal, security and structural plots.

        Args:
            features: Feature groups produced by ``analyze_certificates``.
            db_name: Database name, used in titles and file names.

        Raises:
            Exception: Any plotting error is logged and re-raised.
        """
        try:
            self._plot_temporal_patterns(features['temporal'], db_name)
            self._plot_security_patterns(features['security'], db_name)
            self._plot_structural_patterns(features['structural'], db_name)

        except Exception as e:
            self.logger.error(f"Error generating visualizations: {str(e)}")
            raise

    def _plot_temporal_patterns(self, temporal_features: Dict, db_name: str) -> None:
        """Plot monthly certificate counts as a line chart and save it as a PNG.

        Args:
            temporal_features: Mapping with a ``'monthly_counts'`` dict keyed
                by ``"%Y-%m"``-parseable strings.
            db_name: Database name, used in the title and the file name.
        """
        fig = plt.figure(figsize=(15, 5))
        try:
            # Parse the labels so months sort chronologically, not lexically.
            monthly_data = {
                datetime.strptime(label, "%Y-%m"): total
                for label, total in temporal_features['monthly_counts'].items()
            }
            counts = [monthly_data[month] for month in sorted(monthly_data)]

            plt.plot(range(len(counts)), counts)
            plt.title(f'Monthly Certificate Counts - {db_name}')
            plt.xlabel('Month Index')
            plt.ylabel('Number of Certificates')

            plt.tight_layout()
            plt.savefig(self.output_dir / f'temporal_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Close the figure even when parsing or saving fails.
            plt.close(fig)

    def _plot_security_patterns(self, security_features: Dict, db_name: str) -> None:
        """Plot a histogram of positive key strengths and save it as a PNG.

        Args:
            security_features: Mapping with a ``'key_strength'`` list of dicts
                that each carry a ``'strength'`` entry.
            db_name: Database name, used in the title and the file name.

        When no positive strength exists, an empty figure is saved.
        """
        fig = plt.figure(figsize=(12, 6))
        try:
            key_strengths = [
                entry['strength']
                for entry in security_features['key_strength']
                if entry['strength'] > 0
            ]
            if key_strengths:
                plt.hist(key_strengths, bins=20, edgecolor='black')
                plt.title(f'Key Strength Distribution - {db_name}')
                plt.xlabel('Key Strength (bits)')
                plt.ylabel('Count')

            plt.tight_layout()
            plt.savefig(self.output_dir / f'security_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Close the figure even when saving fails.
            plt.close(fig)

    def _plot_structural_patterns(self, structural_features: Dict, db_name: str) -> None:
        """Plot the domain-level distribution as a bar chart and save it as a PNG.

        Args:
            structural_features: Mapping with a ``'domain_levels'`` dict of
                ``level -> count``.
            db_name: Database name, used in the title and the file name.
        """
        fig = plt.figure(figsize=(10, 6))
        try:
            level_counts = structural_features['domain_levels']

            plt.bar(list(level_counts.keys()), list(level_counts.values()))
            plt.title(f'Domain Level Distribution - {db_name}')
            plt.xlabel('Domain Levels')
            plt.ylabel('Count')

            plt.tight_layout()
            plt.savefig(self.output_dir / f'structural_analysis_{db_name}_{self.timestamp}.png')
        finally:
            # Close the figure even when saving fails.
            plt.close(fig)

# メイン実行部分

In [9]:
# Build the analyzer from the project's database configuration file.
config_path = "/home/asomura/waseda/nextstep/RAPIDS/config/database.json"
analyzer = AdvancedCertificateAnalyzer(config_path)

# Run the analysis for each database; a failure in one database is reported
# and does not stop the next one.
results = {}
for db_name in ('website_data', 'normal_sites'):
    try:
        print(f"\nAnalyzing {db_name}...")
        results[db_name] = analyzer.analyze_certificates(db_name)
        basic = results[db_name]['basic']

        # Summary: total number of certificates and the five most common issuers.
        print(f"\nAnalysis Results for {db_name}:")
        print(f"Total certificates: {basic['total_certs']}")
        print("\nTop certificate issuers:")
        ranked_issuers = sorted(
            basic['issuer_distribution'].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for issuer, count in ranked_issuers[:5]:
            print(f"- {issuer}: {count}")

    except Exception as e:
        print(f"Error analyzing {db_name}: {str(e)}")


Analyzing website_data...

Analysis Results for website_data:
Total certificates: 10974

Top certificate issuers:
- R11: 2923
- R10: 2526
- R3: 1026
- GTS CA 1D4: 805
- E5: 547

Analyzing normal_sites...


  has_wildcard = df['https_certificate_domain'].str.contains(r'\*', regex=True).fillna(False)



Analysis Results for normal_sites:
Total certificates: 9591

Top certificate issuers:
- WE1: 1789
- R11: 958
- R10: 931
- Amazon RSA 2048 M02: 478
- Amazon RSA 2048 M03: 443
