In [1]:
# SSL Certificate Analysis for Phishing Detection
# Location: RAPIDS/notebooks/certificate_analysis/01_certificate_data_exploration.ipynb

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import json
from pathlib import Path
import logging
from datetime import datetime

# Set up logging
# One log file per calendar day, written under the project's data/logs directory.
log_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/data/logs')
log_dir.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    filename=log_dir / f'certificate_analysis_{datetime.now():%Y%m%d}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    # basicConfig() does nothing when the root logger already has handlers
    # (e.g. this cell was run before, or a library installed one). force=True
    # replaces them so the file handler is always attached on (re-)execution.
    force=True,
)

# Load database configuration
# Only the 'database' section of the JSON file is kept; get_engine() reads
# its 'user' and 'password' entries.
config_path = Path('/home/asomura/waseda/nextstep/RAPIDS/config/database.json')
# JSON is UTF-8 by specification; do not depend on the locale's default encoding.
with open(config_path, encoding='utf-8') as f:
    config = json.load(f)['database']

def get_engine(db_name):
    """Create database engine for the specified database.

    Parameters
    ----------
    db_name : str
        Name of the PostgreSQL database to connect to. ``'website_data'`` is
        reached on ``localhost``; any other name is reached on ``192.168.1.92``.

    Returns
    -------
    sqlalchemy.engine.Engine
        A new engine on every call. The user name and password are read from
        the module-level ``config`` mapping (``'user'`` and ``'password'`` keys).
    """
    # Imported here so this function carries its own dependency; the module
    # header only imports create_engine and text from sqlalchemy.
    from sqlalchemy.engine import URL

    host = 'localhost' if db_name == 'website_data' else '192.168.1.92'

    # Build the URL from components instead of string interpolation: a
    # password containing '@', '/', ':' or '%' would otherwise be misparsed.
    # str() keeps the stringification the previous f-string performed.
    url = URL.create(
        "postgresql",
        username=str(config['user']),
        password=str(config['password']),
        host=host,
        database=db_name,
    )
    return create_engine(url)

# Query certificate data from both databases
def get_certificate_data(db_name):
    """Retrieve certificate-related data from database.

    Parameters
    ----------
    db_name : str
        Database name, forwarded to ``get_engine``.

    Returns
    -------
    pandas.DataFrame
        One row per ``website_data`` record with ``status = 7`` and a non-NULL
        ``https_certificate_issuer``. Columns are exactly those listed in the
        SELECT clause below.
    """
    engine = get_engine(db_name)
    query = """
    SELECT 
        domain,
        https_certificate_issuer,
        https_certificate_domain,
        https_certificate_expiry,
        https_certificate_public_key,
        https_certificate_signature_algorithm,
        https_certificate_extensions,
        domain_registrar,
        last_update
    FROM website_data 
    WHERE status = 7 
    AND https_certificate_issuer IS NOT NULL
    """
    try:
        return pd.read_sql_query(query, engine)
    finally:
        # get_engine() builds a fresh engine (and connection pool) per call;
        # dispose it so pooled connections are closed instead of lingering.
        engine.dispose()



In [2]:
# Analyze certificate data for both phishing and normal sites
# For each database: print summary statistics, write the issuer / signature
# algorithm distributions to CSV, and save a bar chart of the top 10 issuers.

# The report directory is the same for every database, so define it once.
output_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis')

for db_name in ['website_data', 'normal_sites']:
    print(f"\nAnalyzing {db_name}...")

    try:
        # Get certificate data
        df = get_certificate_data(db_name)
        print(f"Retrieved {len(df)} records with certificate data")

        # Basic statistics
        print("\nBasic Statistics:")
        print("Number of unique certificate issuers:", df['https_certificate_issuer'].nunique())
        print("Number of unique signature algorithms:", df['https_certificate_signature_algorithm'].nunique())

        # Certificate issuer distribution
        issuer_dist = df['https_certificate_issuer'].value_counts()
        print("\nTop 10 Certificate Issuers:")
        print(issuer_dist.head(10))

        # Signature algorithm distribution
        algo_dist = df['https_certificate_signature_algorithm'].value_counts()
        print("\nSignature Algorithm Distribution:")
        print(algo_dist)

        # Save the analysis results
        # mkdir stays inside the try block so a filesystem problem is reported
        # per database like any other failure, after the statistics are shown.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save distributions to CSV
        issuer_dist.to_csv(output_dir / f'{db_name}_issuer_distribution.csv')
        algo_dist.to_csv(output_dir / f'{db_name}_algorithm_distribution.csv')

        # With no rows there is nothing to draw; skip the chart explicitly.
        if issuer_dist.empty:
            print(f"No certificate issuers found for {db_name}; skipping plot")
            logging.warning(f"No certificate issuers found for {db_name}; skipping plot")
            continue

        # Create visualizations
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            issuer_dist.head(10).plot(kind='bar', ax=ax, rot=45)
            ax.set_title(f'Top 10 Certificate Issuers - {db_name}')
            fig.tight_layout()
            fig.savefig(output_dir / f'{db_name}_issuer_distribution.png')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

    except Exception as e:
        # Best-effort per database: report the failure and move on to the next
        # one. logging.exception records the traceback along with the message.
        print(f"Error analyzing {db_name}: {e}")
        logging.exception(f"Error analyzing {db_name}: {e}")


Analyzing website_data...
Retrieved 10974 records with certificate data

Basic Statistics:
Number of unique certificate issuers: 136
Number of unique signature algorithms: 6

Top 10 Certificate Issuers:
https_certificate_issuer
R11                                               2923
R10                                               2526
R3                                                1026
GTS CA 1D4                                         805
E5                                                 547
E6                                                 507
E1                                                 347
Sectigo RSA Domain Validation Secure Server CA     251
invalid2.invalid                                   251
GlobalSign Atlas R3 DV TLS CA 2023 Q4              184
Name: count, dtype: int64

Signature Algorithm Distribution:
https_certificate_signature_algorithm
sha256WithRSAEncryption    9232
ecdsa-with-SHA384          1412
ecdsa-with-SHA256           235
sha384WithRSAEncryption   