In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import json
from pathlib import Path
import logging
from datetime import datetime
import re

# Logging: INFO and above go to one dated log file inside the project's log
# directory (created on first use).
log_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/data/logs')
log_dir.mkdir(exist_ok=True, parents=True)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_dir / f'certificate_analysis_{datetime.now():%Y%m%d}.log',
)

# Database settings: only the "database" section of the JSON config is kept;
# get_engine() reads its 'user' and 'password' entries.
config_path = Path('/home/asomura/waseda/nextstep/RAPIDS/config/database.json')
with config_path.open() as f:
    config = json.load(f)['database']

def get_engine(db_name):
    """Create a database engine for the specified PostgreSQL database.

    The ``website_data`` database is reached on ``localhost``; any other
    database name is reached on ``192.168.1.92``. Credentials are read from
    the module-level ``config`` mapping (keys ``user`` and ``password``).

    Args:
        db_name: Name of the database to connect to.

    Returns:
        The engine produced by ``create_engine`` for that database.
    """
    # Function-scope import so this definition carries its own dependency.
    from urllib.parse import quote

    host = 'localhost' if db_name == 'website_data' else '192.168.1.92'

    # Credentials are embedded in a URL, so reserved characters such as
    # '@', ':', '/' or '%' must be percent-encoded or the URL is parsed
    # incorrectly. safe='' also encodes '/', and quote() (unlike
    # quote_plus()) never turns a space into '+'.
    user = quote(str(config['user']), safe='')
    password = quote(str(config['password']), safe='')

    return create_engine(f"postgresql://{user}:{password}@{host}/{db_name}")

def get_detailed_certificate_data(db_name):
    """Retrieve detailed certificate data from the given database.

    Selects ``domain``, ``https_certificate_expiry``,
    ``https_certificate_all`` and ``https_certificate_signature_algorithm``
    from the ``website_data`` table, restricted to rows with ``status = 7``
    and a non-null ``https_certificate_all``.

    Args:
        db_name: Name of the database to query (passed to ``get_engine``).

    Returns:
        A pandas DataFrame with one row per matching record.
    """
    # text() marks the statement as textual SQL explicitly instead of
    # relying on a bare string being accepted by the connectable.
    query = text("""
    SELECT 
        domain,
        https_certificate_expiry,
        https_certificate_all,
        https_certificate_signature_algorithm
    FROM website_data 
    WHERE status = 7 
    AND https_certificate_all IS NOT NULL
    """)
    engine = get_engine(db_name)
    try:
        # The connection is returned to the pool when the with-block exits.
        with engine.connect() as connection:
            return pd.read_sql_query(query, connection)
    finally:
        # A new engine is built on every call; dispose of it so its
        # connection pool does not stay open after the query.
        engine.dispose()

def parse_certificate_expiry(df, now=None):
    """Parse certificate expiry timestamps and compute the days remaining.

    The ``https_certificate_expiry`` column is expected to hold strings in
    the form ``YYYYMMDDHHMMSSZ`` (the trailing ``Z`` denotes UTC). The column
    is replaced in place by naive datetimes expressed in UTC, and a new
    ``expiry_days_remaining`` column is added. Values that cannot be parsed
    become ``NaT`` and yield ``NaN`` days remaining.

    Args:
        df: DataFrame containing an ``https_certificate_expiry`` column.
            It is modified in place.
        now: Optional reference time. A timezone-aware value is converted to
            UTC; a naive value is taken to be UTC already. Defaults to the
            current UTC time.

    Returns:
        The same DataFrame object, with the parsed and added columns.
    """
    # Only the first 14 characters carry the timestamp; the 'Z' is dropped.
    df['https_certificate_expiry'] = pd.to_datetime(
        df['https_certificate_expiry'].str[:14], format='%Y%m%d%H%M%S', errors='coerce'
    )

    # The parsed values are UTC wall-clock times, so the reference instant
    # must be UTC too; local time would skew the result by the host's offset.
    if now is None:
        now = pd.Timestamp.now(tz='UTC')
    now = pd.Timestamp(now)
    if now.tzinfo is not None:
        # Drop the tz info after conversion so it can be subtracted from the
        # naive (UTC) expiry column.
        now = now.tz_convert('UTC').tz_localize(None)

    # Whole days between the reference time and expiry (negative if expired).
    df['expiry_days_remaining'] = (df['https_certificate_expiry'] - now).dt.days
    return df

def analyze_signature_algorithm(df):
    """Count how often each certificate signature algorithm occurs.

    Returns the ``value_counts()`` of the
    ``https_certificate_signature_algorithm`` column: a Series indexed by
    algorithm name, most frequent first.
    """
    algorithm_column = df['https_certificate_signature_algorithm']
    usage_counts = algorithm_column.value_counts()
    return usage_counts

def visualize_expiry_distribution(df, output_dir, db_name):
    """Plot the distribution of remaining certificate validity and save it.

    Draws a 50-bin histogram with a KDE overlay of the
    ``expiry_days_remaining`` column and writes it as
    ``<db_name>_expiry_distribution_<YYYYMMDD>.png`` inside ``output_dir``.

    Args:
        df: DataFrame containing an ``expiry_days_remaining`` column.
        output_dir: Directory to write the PNG into (``Path`` or string).
        db_name: Database name, used in the plot title and the file name.
    """
    # An explicit Figure/Axes pair avoids relying on pyplot's implicit
    # "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(data=df, x='expiry_days_remaining', bins=50, kde=True, ax=ax)
        ax.set_title(f'Certificate Expiry Distribution - {db_name}')
        ax.set_xlabel('Days Remaining')
        ax.set_ylabel('Frequency')
        fig.tight_layout()
        fig.savefig(
            Path(output_dir) / f'{db_name}_expiry_distribution_{datetime.now():%Y%m%d}.png'
        )
    finally:
        # Always release the figure: the caller catches exceptions and keeps
        # looping, so a figure left open after a failure would accumulate.
        plt.close(fig)

# Main analysis process: for each database, fetch the certificate records,
# derive expiry information, print signature-algorithm counts, then write a
# histogram PNG and a CSV of the data into the report directory.
output_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Per-database summary, keyed by database name (previously left empty).
results = {}
for db_name in ['website_data', 'normal_sites']:
    print(f"\nAnalyzing {db_name}...")

    try:
        # Get certificate data
        df = get_detailed_certificate_data(db_name)
        print(f"Retrieved {len(df)} records with certificate data")

        # Parse expiry dates
        df = parse_certificate_expiry(df)

        # Analyze signature algorithms
        signature_stats = analyze_signature_algorithm(df)
        print("\nSignature Algorithm Statistics:")
        print(signature_stats)

        # Visualize expiry distribution
        visualize_expiry_distribution(df, output_dir, db_name)

        # Save numerical results
        timestamp = datetime.now().strftime('%Y%m%d')
        df.to_csv(output_dir / f'{db_name}_detailed_analysis_{timestamp}.csv', index=False)

        results[db_name] = {
            'record_count': len(df),
            'signature_stats': signature_stats,
        }

    except Exception as e:
        # Deliberately broad: a failure for one database must not prevent
        # the remaining databases from being analyzed.
        print(f"Error analyzing {db_name}: {str(e)}")
        # logging.exception records the same message plus the traceback,
        # which logging.error with a formatted string discarded.
        logging.exception("Error analyzing %s: %s", db_name, e)



Analyzing website_data...
Retrieved 11096 records with certificate data

Signature Algorithm Statistics:
https_certificate_signature_algorithm
sha256WithRSAEncryption    9298
ecdsa-with-SHA384          1412
ecdsa-with-SHA256           235
sha384WithRSAEncryption      88
md5WithRSAEncryption         55
sha1WithRSAEncryption         7
sha512WithRSAEncryption       1
Name: count, dtype: int64

Analyzing normal_sites...
Retrieved 9591 records with certificate data

Signature Algorithm Statistics:
https_certificate_signature_algorithm
sha256WithRSAEncryption    6393
ecdsa-with-SHA256          1985
ecdsa-with-SHA384           779
sha384WithRSAEncryption     427
sha512WithRSAEncryption       7
Name: count, dtype: int64
