# In [None]:
# Certificate Pattern Analysis
# Location: RAPIDS/notebooks/certificate_analysis/06_certificate_pattern_analysis.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime
import re
from typing import Dict, List, Tuple
from collections import Counter

class CertificatePatternAnalyzer:
    """Compare certificate patterns between phishing and normal sites.

    Reads ``cert_features_*.json`` files from ``<base_dir>/data/processed`` and
    writes comparison plots plus a JSON summary to
    ``<base_dir>/reports/pattern_analysis``.
    """

    def __init__(self, base_dir: str = '/home/asomura/waseda/nextstep/RAPIDS'):
        """Set up the input/output directories and the plot colours.

        Args:
            base_dir: Project root. ``data/processed`` beneath it is read and
                ``reports/pattern_analysis`` beneath it is created if missing.
        """
        self.base_dir = Path(base_dir)
        self.data_dir = self.base_dir / 'data' / 'processed'
        self.output_dir = self.base_dir / 'reports' / 'pattern_analysis'
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set plotting style
        plt.style.use('default')
        self.colors = {'phishing': '#FF6B6B', 'normal': '#4ECDC4'}

    def load_data(self) -> Tuple[Dict, Dict]:
        """Load the newest feature file of each dataset.

        A file whose name contains ``website_data`` is treated as the phishing
        dataset; every other ``cert_features_*.json`` file is treated as the
        normal-site dataset.

        Returns:
            ``(phishing_data, normal_data)``. A dataset for which no file is
            found is returned as an empty dict.
        """
        newest_first = sorted(
            self.data_dir.glob('cert_features_*.json'),
            key=lambda path: path.stat().st_mtime,
            reverse=True,
        )

        results: Dict[str, Dict] = {}
        for file_path in newest_first:
            db_name = 'website_data' if 'website_data' in file_path.name else 'normal_sites'
            if db_name in results:
                # A newer file for this dataset is already loaded. Skipping
                # here keeps an older export from overwriting it and lets the
                # search continue until the other dataset is found.
                continue
            with open(file_path, encoding='utf-8') as f:
                results[db_name] = json.load(f)
            if len(results) == 2:
                break

        return results.get('website_data', {}), results.get('normal_sites', {})

    def analyze_certificate_patterns(self):
        """Run every analysis step and write the plots and summary report.

        Prints an error and returns without producing output when either
        dataset could not be loaded (or loaded as an empty document).
        """
        phishing_data, normal_data = self.load_data()
        if not phishing_data or not normal_data:
            print("Error: Could not load required data")
            return

        # One timestamp is shared by all artefacts of a single run.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # 1. Certificate Issuer Comparison
        self._analyze_issuers(phishing_data, normal_data, timestamp)

        # 2. Temporal Pattern Analysis
        self._analyze_temporal_patterns(phishing_data, normal_data, timestamp)

        # 3. Security Characteristics
        self._analyze_security_features(phishing_data, normal_data, timestamp)

        # 4. Domain Structure Analysis
        self._analyze_domain_structure(phishing_data, normal_data, timestamp)

        # Save summary report
        self._generate_summary_report(phishing_data, normal_data, timestamp)

    def _analyze_issuers(self, phishing_data: Dict, normal_data: Dict, timestamp: str):
        """Plot the top certificate issuers of both datasets side by side.

        Takes the ten most frequent issuers of each dataset (from
        ``data['basic']['issuer_distribution']``), merges them, and saves a
        grouped bar chart as ``issuer_comparison_<timestamp>.png``.

        Args:
            phishing_data: Feature document of the phishing dataset.
            normal_data: Feature document of the normal-site dataset.
            timestamp: Suffix used in the output file name.
        """
        phish_issuers = phishing_data['basic']['issuer_distribution']
        normal_issuers = normal_data['basic']['issuer_distribution']

        # Union of each dataset's top 10 issuers
        top_issuers = set()
        for issuers in (phish_issuers, normal_issuers):
            ranked = sorted(issuers.items(), key=lambda item: item[1], reverse=True)[:10]
            top_issuers.update(name for name, _ in ranked)

        if not top_issuers:
            print("Warning: no issuer data available; skipping issuer comparison")
            return

        # Set iteration order is arbitrary; order by combined count (then
        # name) so the chart looks the same on every run.
        ordered = sorted(
            top_issuers,
            key=lambda name: (-(phish_issuers.get(name, 0) + normal_issuers.get(name, 0)), name),
        )
        df_issuers = pd.DataFrame({
            'Issuer': ordered,
            'Phishing Sites': [phish_issuers.get(name, 0) for name in ordered],
            'Normal Sites': [normal_issuers.get(name, 0) for name in ordered],
        })

        # Pass the axes explicitly: DataFrame.plot() without ``ax`` opens its
        # own figure, which would ignore figsize and leave a figure unclosed.
        fig, ax = plt.subplots(figsize=(15, 8))
        df_issuers.plot(
            x='Issuer',
            y=['Phishing Sites', 'Normal Sites'],
            kind='bar',
            color=[self.colors['phishing'], self.colors['normal']],
            ax=ax,
        )
        ax.set_title('Top Certificate Issuers Comparison')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        fig.savefig(self.output_dir / f'issuer_comparison_{timestamp}.png')
        plt.close(fig)

    def _analyze_temporal_patterns(self, phishing_data: Dict, normal_data: Dict, timestamp: str):
        """Plot monthly certificate counts of both datasets as line charts.

        Reads ``data['temporal']['monthly_counts']`` and saves the figure as
        ``temporal_patterns_<timestamp>.png``.

        Args:
            phishing_data: Feature document of the phishing dataset.
            normal_data: Feature document of the normal-site dataset.
            timestamp: Suffix used in the output file name.
        """

        def process_temporal_data(data):
            """Return a month-sorted frame with ``month`` and ``count`` columns."""
            monthly = data['temporal']['monthly_counts']
            df = pd.DataFrame(list(monthly.items()), columns=['month', 'count'])
            # Keys are presumably 'YYYY-MM' (verify against the exporter);
            # append a day so they parse as full dates.
            df['month'] = pd.to_datetime(df['month'].str.replace('-', '/') + '/01')
            return df.sort_values('month')

        phish_temporal = process_temporal_data(phishing_data)
        normal_temporal = process_temporal_data(normal_data)

        fig, ax = plt.subplots(figsize=(15, 6))
        ax.plot(phish_temporal['month'], phish_temporal['count'],
                label='Phishing Sites', color=self.colors['phishing'])
        ax.plot(normal_temporal['month'], normal_temporal['count'],
                label='Normal Sites', color=self.colors['normal'])

        ax.set_title('Monthly Certificate Issuance Patterns')
        ax.set_xlabel('Month')
        ax.set_ylabel('Number of Certificates')
        ax.legend()
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        fig.savefig(self.output_dir / f'temporal_patterns_{timestamp}.png')
        plt.close(fig)

    def _analyze_security_features(self, phishing_data: Dict, normal_data: Dict, timestamp: str):
        """Plot key-strength histograms and the top signature algorithms.

        Reads ``data['security']['key_strength']`` (entries with a positive
        ``strength``) and ``data['basic']['algorithm_distribution']``; saves
        the two-panel figure as ``security_features_<timestamp>.png``.

        Args:
            phishing_data: Feature document of the phishing dataset.
            normal_data: Feature document of the normal-site dataset.
            timestamp: Suffix used in the output file name.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        def get_key_strengths(data):
            """Return the positive key strengths recorded for a dataset."""
            return [x['strength'] for x in data['security']['key_strength'] if x['strength'] > 0]

        phish_strengths = get_key_strengths(phishing_data)
        normal_strengths = get_key_strengths(normal_data)

        # Plot key strength distribution
        ax1.hist([phish_strengths, normal_strengths], label=['Phishing', 'Normal'],
                 color=[self.colors['phishing'], self.colors['normal']], bins=20, alpha=0.7)
        ax1.set_title('Key Strength Distribution')
        ax1.set_xlabel('Key Strength (bits)')
        ax1.set_ylabel('Count')
        ax1.legend()

        def get_algorithm_usage(data):
            """Return algorithm counts as a numeric Series (empty-safe)."""
            return pd.Series(data['basic']['algorithm_distribution'], dtype='float64')

        phish_algs = get_algorithm_usage(phishing_data)
        normal_algs = get_algorithm_usage(normal_data)

        # Five algorithms with the highest combined count
        combined = pd.concat([phish_algs, normal_algs])
        top_algs = combined.groupby(level=0).sum().nlargest(5).index

        # reindex (not []-indexing): an algorithm present in only one dataset
        # must become NaN -> 0 rather than raise KeyError.
        alg_data = pd.DataFrame({
            'Phishing': phish_algs.reindex(top_algs),
            'Normal': normal_algs.reindex(top_algs),
        }).fillna(0)

        if not alg_data.empty:
            alg_data.plot(kind='bar', ax=ax2,
                          color=[self.colors['phishing'], self.colors['normal']])
        ax2.set_title('Top Signature Algorithms')
        ax2.set_xlabel('Algorithm')
        ax2.set_ylabel('Count')
        plt.setp(ax2.get_xticklabels(), rotation=45, ha='right')

        fig.tight_layout()
        fig.savefig(self.output_dir / f'security_features_{timestamp}.png')
        plt.close(fig)

    def _analyze_domain_structure(self, phishing_data: Dict, normal_data: Dict, timestamp: str):
        """Plot the percentage distribution of domain levels for both datasets.

        Reads ``data['structural']['domain_levels']`` and saves the grouped
        bar chart as ``domain_structure_<timestamp>.png``.

        Args:
            phishing_data: Feature document of the phishing dataset.
            normal_data: Feature document of the normal-site dataset.
            timestamp: Suffix used in the output file name.
        """
        phish_levels = pd.Series(phishing_data['structural']['domain_levels'], dtype='float64')
        normal_levels = pd.Series(normal_data['structural']['domain_levels'], dtype='float64')

        # Convert to percentages; an all-zero dataset yields NaN, zeroed below.
        domain_df = pd.DataFrame({
            'Phishing': phish_levels / phish_levels.sum() * 100,
            'Normal': normal_levels / normal_levels.sum() * 100,
        }).fillna(0)

        if domain_df.empty:
            print("Warning: no domain level data available; skipping domain structure plot")
            return

        def level_key(label):
            """Sort numeric labels by value, anything else afterwards by text."""
            text = str(label)
            return (0, int(text), '') if text.isdigit() else (1, 0, text)

        # JSON object keys are strings, so default ordering would put '10'
        # before '2'; order the bars numerically instead.
        domain_df = domain_df.reindex(sorted(domain_df.index, key=level_key))

        # Pass the axes explicitly: DataFrame.plot() without ``ax`` opens its
        # own figure, which would ignore figsize and leave a figure unclosed.
        fig, ax = plt.subplots(figsize=(12, 6))
        domain_df.plot(kind='bar', ax=ax,
                       color=[self.colors['phishing'], self.colors['normal']])
        ax.set_title('Domain Level Distribution Comparison')
        ax.set_xlabel('Number of Domain Levels')
        ax.set_ylabel('Percentage of Sites')
        ax.legend()
        fig.tight_layout()
        fig.savefig(self.output_dir / f'domain_structure_{timestamp}.png')
        plt.close(fig)

    def _generate_summary_report(self, phishing_data: Dict, normal_data: Dict, timestamp: str):
        """Write the JSON summary and print it to stdout.

        The report holds the total certificate counts, the number of distinct
        issuers, and the summed ``is_self_signed`` / ``uses_sha1`` values of
        each dataset. It is saved as ``analysis_summary_<timestamp>.json``.

        Args:
            phishing_data: Feature document of the phishing dataset.
            normal_data: Feature document of the normal-site dataset.
            timestamp: Run identifier stored in the report and its file name.
        """

        def security_metrics(data):
            """Sum the per-category security counters of one dataset."""
            security = data['security']
            return {
                'self_signed': sum(security['is_self_signed'].values()),
                'uses_sha1': sum(security['uses_sha1'].values()),
            }

        report = {
            'timestamp': timestamp,
            'total_certificates': {
                'phishing': phishing_data['basic']['total_certs'],
                'normal': normal_data['basic']['total_certs'],
            },
            'key_findings': {
                'issuer_diversity': {
                    'phishing': len(phishing_data['basic']['issuer_distribution']),
                    'normal': len(normal_data['basic']['issuer_distribution']),
                },
                'security_metrics': {
                    'phishing': security_metrics(phishing_data),
                    'normal': security_metrics(normal_data),
                },
            },
        }

        # Save report
        report_path = self.output_dir / f'analysis_summary_{timestamp}.json'
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

        # Print summary
        totals = report['total_certificates']
        diversity = report['key_findings']['issuer_diversity']
        metrics = report['key_findings']['security_metrics']

        print("\nAnalysis Summary Report")
        print("=" * 50)
        print("\nTotal Certificates Analyzed:")
        print(f"Phishing Sites: {totals['phishing']}")
        print(f"Normal Sites: {totals['normal']}")

        print("\nKey Findings:")
        print("Unique Certificate Issuers:")
        print(f"- Phishing Sites: {diversity['phishing']}")
        print(f"- Normal Sites: {diversity['normal']}")

        print("\nSecurity Metrics:")
        print("Phishing Sites:")
        print(f"- Self-signed Certificates: {metrics['phishing']['self_signed']}")
        print(f"- SHA1 Usage: {metrics['phishing']['uses_sha1']}")
        print("Normal Sites:")
        print(f"- Self-signed Certificates: {metrics['normal']['self_signed']}")
        print(f"- SHA1 Usage: {metrics['normal']['uses_sha1']}")

# Example usage: when this file is executed directly, build an analyzer with
# the default base directory and run the complete analysis.
if __name__ == "__main__":
    analyzer = CertificatePatternAnalyzer()
    analyzer.analyze_certificate_patterns()