In [14]:
# Program 01: Certificate Analysis
# Location: RAPIDS/notebooks/certificate_analysis/01_certificate_analysis.ipynb

import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

# Project locations, all derived from the RAPIDS root directory
BASE_DIR = Path('/home/asomura/waseda/nextstep/RAPIDS')
CONFIG_PATH = BASE_DIR.joinpath('config/database.json')
OUTPUT_DIR = BASE_DIR.joinpath('reports/certificate_analysis')
LOG_DIR = BASE_DIR.joinpath('data/logs')

# Make sure the report and log folders exist before anything writes to them
for dir_path in (OUTPUT_DIR, LOG_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

# File logging; the log file name carries the current date (YYYYMMDD)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=LOG_DIR / f'certificate_analysis_{datetime.now():%Y%m%d}.log',
)
logger = logging.getLogger(__name__)

# Matplotlib defaults shared by every figure in this notebook
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

class CertificateAnalyzer:
    """Certificate analysis for phishing detection.

    Loads HTTPS certificate columns from a PostgreSQL database, derives the
    number of days until each certificate expires, and renders a 2x2
    comparison figure for two result sets (phishing vs. normal).
    """

    # Right-closed day buckets for the expiry chart: [0, 30], (30, 90], ...
    EXPIRY_BIN_EDGES = (0, 30, 90, 180, 365, float('inf'))
    EXPIRY_BIN_LABELS = ('0-30', '31-90', '91-180', '181-365', '365+')

    def __init__(self, config_path: Path):
        """
        Initialize the analyzer with database configuration

        Args:
            config_path: JSON file whose top-level 'database' object provides
                at least the 'user' and 'password' keys
        """
        with open(config_path, encoding='utf-8') as f:
            self.config = json.load(f)['database']

        self.colors = {
            'phishing': '#FF6B6B',
            'normal': '#4ECDC4',
            'grid': '#DDDDDD'
        }

    def get_database_connection(self, db_name: str) -> "Engine":
        """
        Create database connection

        Args:
            db_name: Name of the PostgreSQL database on localhost

        Returns:
            SQLAlchemy engine for the requested database
        """
        # Percent-encode the credentials so characters such as '@', ':' or '/'
        # in a password cannot corrupt the connection URL.
        user = quote(str(self.config['user']), safe='')
        password = quote(str(self.config['password']), safe='')
        return create_engine(
            f"postgresql://{user}:{password}@localhost/{db_name}"
        )

    def extract_expiry_info(self, expiry_text: str) -> Optional[int]:
        """
        Extract days until expiry from certificate expiry timestamp

        Args:
            expiry_text: Certificate expiry text in format YYYYMMDDHHMMSSZ

        Returns:
            Days until expiry (0 for already expired certificates) or None
            if the value is missing or has an invalid format
        """
        if pd.isna(expiry_text):
            return None

        try:
            # The trailing 'Z' marks the timestamp as UTC, so make it aware.
            expiry_date = datetime.strptime(
                expiry_text.strip(), '%Y%m%d%H%M%SZ'
            ).replace(tzinfo=timezone.utc)
        except (ValueError, AttributeError) as e:
            logger.warning(f"Error parsing expiry date: {expiry_text} - {str(e)}")
            return None

        # Compare against the current time in UTC as well; using local time
        # would shift the result by the host's UTC offset.
        days = (expiry_date - datetime.now(timezone.utc)).days
        return max(0, days)

    def analyze_certificates(self, db_name: str) -> Dict:
        """
        Analyze certificates from database

        Args:
            db_name: Name of the database to analyze

        Returns:
            Dictionary with 'data' (the queried DataFrame plus an
            'expiry_days' column) and 'stats' (total count, expiry
            describe() values, issuer counts, signature algorithm counts)

        Raises:
            Exception: Any database or processing error is logged and
                re-raised unchanged
        """
        logger.info(f"Analyzing certificates from {db_name}")

        engine = None
        try:
            # Extract certificate data
            engine = self.get_database_connection(db_name)
            query = """
            SELECT 
                domain,
                https_certificate_expiry,
                https_certificate_issuer,
                https_certificate_signature_algorithm
            FROM website_data 
            WHERE status = 7 
            AND https_certificate_issuer IS NOT NULL
            AND https_certificate_expiry IS NOT NULL
            """

            df = pd.read_sql_query(query, engine)

            # Extract expiry days. Force a float column so that describe()
            # yields numeric statistics even when the frame is empty or every
            # value failed to parse (which would otherwise be object dtype).
            df['expiry_days'] = (
                df['https_certificate_expiry']
                .apply(self.extract_expiry_info)
                .astype('float64')
            )

            # Calculate statistics
            stats = {
                'total_certificates': len(df),
                'expiry_stats': df['expiry_days'].describe().to_dict(),
                'issuers': df['https_certificate_issuer'].value_counts().to_dict(),
                'algorithms': df['https_certificate_signature_algorithm'].value_counts().to_dict()
            }

            return {
                'data': df,
                'stats': stats
            }

        except Exception as e:
            logger.error(f"Error analyzing {db_name}: {str(e)}")
            raise
        finally:
            # Release pooled connections; the engine is not reused.
            if engine is not None:
                engine.dispose()

    def plot_analysis(self, phishing_results: Dict, normal_results: Dict):
        """
        Create comparative visualizations

        The figure is saved to OUTPUT_DIR as
        certificate_analysis_<YYYYMMDD>.png and then closed.

        Args:
            phishing_results: Analysis results for phishing sites
            normal_results: Analysis results for normal sites
        """
        # Create figure with subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        try:
            # 1. Expiry Distribution
            self._plot_expiry_distribution(
                phishing_results['data']['expiry_days'],
                normal_results['data']['expiry_days'],
                ax1
            )

            # 2. Issuer Distribution
            self._plot_issuer_distribution(
                phishing_results['stats']['issuers'],
                normal_results['stats']['issuers'],
                ax2
            )

            # 3. Algorithm Distribution
            self._plot_algorithm_distribution(
                phishing_results['stats']['algorithms'],
                normal_results['stats']['algorithms'],
                ax3
            )

            # 4. Certificate Count Summary
            self._plot_count_summary(
                phishing_results['stats']['total_certificates'],
                normal_results['stats']['total_certificates'],
                ax4
            )

            fig.tight_layout()
            fig.savefig(OUTPUT_DIR / f'certificate_analysis_{datetime.now():%Y%m%d}.png')
        finally:
            # Always release the figure, even if a sub-plot or the save fails.
            plt.close(fig)

    @classmethod
    def _bucket_expiry_days(cls, expiry_days: pd.Series) -> list:
        """Count values per expiry bucket, in EXPIRY_BIN_LABELS order.

        Missing values and values outside the bin edges are ignored.
        """
        values = pd.Series(expiry_days).dropna().astype('float64')
        bucket_count = len(cls.EXPIRY_BIN_LABELS)
        if values.empty:
            return [0] * bucket_count

        # labels=False yields the integer bucket index for every value;
        # include_lowest=True puts a value of exactly 0 into the first bucket.
        codes = pd.cut(
            values,
            bins=list(cls.EXPIRY_BIN_EDGES),
            labels=False,
            include_lowest=True
        )
        codes = codes.dropna().astype(int).to_numpy()
        return np.bincount(codes, minlength=bucket_count).tolist()

    def _plot_grouped_bars(self, ax, categories, phishing_values, normal_values,
                           title: str, rotation: int = 45, alpha=None):
        """Draw side-by-side phishing/normal bars for each category on ax."""
        x = np.arange(len(categories))
        width = 0.35

        ax.bar(
            x - width/2,
            phishing_values,
            width,
            label='Phishing',
            color=self.colors['phishing'],
            alpha=alpha
        )
        ax.bar(
            x + width/2,
            normal_values,
            width,
            label='Normal',
            color=self.colors['normal'],
            alpha=alpha
        )

        ax.set_title(title)
        ax.set_xticks(x)
        ax.set_xticklabels(
            categories,
            rotation=rotation,
            ha='right' if rotation else 'center'
        )
        ax.legend()
        ax.grid(True, alpha=0.3)

    def _plot_expiry_distribution(self, phishing_expiry: pd.Series, normal_expiry: pd.Series, ax):
        """Plot certificate expiry distribution"""
        # Bucket the values ourselves and draw labelled bars: handing an
        # infinite last bin edge to Axes.hist produces non-finite bar geometry.
        self._plot_grouped_bars(
            ax,
            list(self.EXPIRY_BIN_LABELS),
            self._bucket_expiry_days(phishing_expiry),
            self._bucket_expiry_days(normal_expiry),
            'Certificate Expiry Distribution',
            rotation=0,
            alpha=0.7
        )
        ax.set_xlabel('Days until expiry')
        ax.set_ylabel('Number of certificates')

    def _plot_issuer_distribution(self, phishing_issuers: Dict, normal_issuers: Dict, ax):
        """Plot certificate issuer distribution"""
        issuers = sorted(set(phishing_issuers) | set(normal_issuers))
        self._plot_grouped_bars(
            ax,
            issuers,
            [phishing_issuers.get(i, 0) for i in issuers],
            [normal_issuers.get(i, 0) for i in issuers],
            'Certificate Issuer Distribution'
        )

    def _plot_algorithm_distribution(self, phishing_algs: Dict, normal_algs: Dict, ax):
        """Plot signature algorithm distribution"""
        algorithms = sorted(set(phishing_algs) | set(normal_algs))
        self._plot_grouped_bars(
            ax,
            algorithms,
            [phishing_algs.get(a, 0) for a in algorithms],
            [normal_algs.get(a, 0) for a in algorithms],
            'Signature Algorithm Distribution'
        )

    def _plot_count_summary(self, phishing_count: int, normal_count: int, ax):
        """Plot certificate count summary"""
        x = ['Phishing Sites', 'Normal Sites']
        counts = [phishing_count, normal_count]

        ax.bar(
            x,
            counts,
            color=[self.colors['phishing'], self.colors['normal']]
        )
        ax.set_title('Total Certificates Analyzed')
        ax.set_ylabel('Number of certificates')

        # Add value labels on top of bars
        for i, count in enumerate(counts):
            ax.text(i, count, str(count), ha='center', va='bottom')

        ax.grid(True, alpha=0.3)

def main():
    """Analyze both databases, print their statistics and plot the comparison.

    A failure while analyzing one database is reported on stdout and does not
    stop the other; the comparison figure is only produced when both
    analyses succeeded.
    """
    analyzer = CertificateAnalyzer(CONFIG_PATH)
    results = {}

    # Heading printed before each statistics entry, paired with its stats key
    sections = (
        ("\nExpiry Statistics (days):", 'expiry_stats'),
        ("\nIssuer Distribution:", 'issuers'),
        ("\nSignature Algorithm Distribution:", 'algorithms'),
    )

    for db_name in ('website_data', 'normal_sites'):
        try:
            print(f"\nAnalyzing {db_name}...")
            analysis = analyzer.analyze_certificates(db_name)
            results[db_name] = analysis

            # Summary statistics for this database
            stats = analysis['stats']
            print(f"\nTotal Certificates: {stats['total_certificates']}")
            for heading, key in sections:
                print(heading)
                print(pd.Series(stats[key]))

        except Exception as e:
            print(f"Error analyzing {db_name}: {str(e)}")

    # The comparison needs both result sets
    if len(results) != 2:
        return

    analyzer.plot_analysis(
        results['website_data'],
        results['normal_sites']
    )


if __name__ == "__main__":
    main()


Analyzing website_data...

Total Certificates: 10974

Expiry Statistics (days):
count    10974.000000
mean        70.981137
std        562.528852
min          0.000000
25%          0.000000
50%          0.000000
75%         21.000000
max      33509.000000
dtype: float64

Issuer Distribution:
R11                                                 2923
R10                                                 2526
R3                                                  1026
GTS CA 1D4                                           805
E5                                                   547
                                                    ... 
GlobalSign Atlas R3 DV TLS CA 2024 Q3                  1
Sectigo RSA Extended Validation Secure Server CA       1
GlobalSign GCC R6 AlphaSSL CA 2023                     1
GoGetSSL RSA DV CA                                     1
Network Solutions RSA DV SSL CA 3                      1
Length: 136, dtype: int64

Signature Algorithm Distribution:
sha256WithRSAEncry

  boffset += 0.5 * totwidth
