# In [1]:
# Registrar Distribution Analysis and Certificate Analysis with Debug Logging
# Location: RAPIDS/notebooks/exploration/02_registrar_distribution_analysis.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import json
from pathlib import Path
from datetime import datetime
import matplotlib
# Alternative font (disabled): matplotlib.rc('font', family='IPAGothic')  # use the IPA Gothic font
matplotlib.rc('font', family='Noto Sans CJK JP')

# Basic plotting defaults
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Load the database settings (only the 'database' section of the JSON file is kept)
config_path = Path('/home/asomura/waseda/nextstep/RAPIDS/config/database.json')
with config_path.open() as f:
    config = json.load(f)['database']

def get_engine(db_name: str) -> "sqlalchemy.engine.Engine":
    """Create a SQLAlchemy engine for the given PostgreSQL database.

    The user name and password come from the module-level ``config`` mapping
    and are percent-encoded before being placed in the URL, so characters
    such as ``@``, ``/`` or ``:`` in a credential cannot break URL parsing.

    Args:
        db_name: Name of the database to connect to.

    Returns:
        The engine returned by ``create_engine`` for that database.
    """
    # Local import keeps the new dependency inside this block.
    from urllib.parse import quote

    # NOTE(review): the host literal ends with a dot and looks truncated or
    # redacted -- confirm the full address before running.
    host = '192.168.1.'
    # safe='' also encodes '/', which would otherwise end the credential part.
    user = quote(str(config["user"]), safe='')
    password = quote(str(config["password"]), safe='')
    return create_engine(
        f'postgresql://{user}:{password}@{host}/{db_name}'
    )

def get_record_count(db_name: str) -> int:
    """Count the rows of ``website_data`` that match the analysis filter.

    The filter is ``status = 7`` with a non-NULL, non-empty
    ``domain_registrar``.

    Args:
        db_name: Name of the database to query.

    Returns:
        The number of matching rows.
    """
    engine = get_engine(db_name)
    query = """
    SELECT COUNT(*) as total FROM website_data
    WHERE status = 7 AND domain_registrar IS NOT NULL AND domain_registrar != ''
    """
    try:
        df = pd.read_sql_query(query, engine)
    finally:
        # A fresh engine is created per call; dispose of it so its pooled
        # connection does not stay open after the query.
        engine.dispose()
    return int(df['total'].iloc[0])

def analyze_certificates_from_dir(cert_dir: Path) -> "tuple[pd.DataFrame, int]":
    """Parse every certificate file under a directory tree into a DataFrame.

    The directory is searched recursively. Each regular file is passed to
    ``load_certificate``; a file that cannot be processed is reported with a
    printed error message and skipped.

    Args:
        cert_dir: Root directory to search.

    Returns:
        A ``(DataFrame, count)`` tuple. The DataFrame has one row per
        certificate with the columns ``file``, ``subject``, ``issuer``,
        ``not_before`` and ``not_after``; the columns are present even when
        no certificate was found. ``count`` is the total number of
        certificates returned by ``load_certificate``.
    """
    columns = ["file", "subject", "issuer", "not_before", "not_after"]
    records = []
    cert_count = 0

    # Keep regular files only, so the reported totals do not include
    # directories and is_file() is evaluated once per path.
    files = [path for path in cert_dir.glob("**/*") if path.is_file()]
    if not files:
        print(f"DEBUG: No files found in directory: {cert_dir}")
    else:
        print(f"DEBUG: Found {len(files)} files in directory (including subdirectories): {cert_dir}")
        # List the file names (debugging aid)
        for f in files:
            print(f"DEBUG: Found file: {f}")

    for file in files:
        try:
            certs = load_certificate(file)
            cert_count += len(certs)
            for cert in certs:
                # NOTE(review): recent cryptography releases deprecate
                # not_valid_before/not_valid_after in favour of the *_utc
                # variants -- confirm against the installed version.
                records.append({
                    "file": file.name,
                    "subject": cert.subject.rfc4514_string(),
                    "issuer": cert.issuer.rfc4514_string(),
                    "not_before": cert.not_valid_before,
                    "not_after": cert.not_valid_after
                })
        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f"Error processing {file}: {e}")

    file_count = len(files)
    print(f"DEBUG: Processed {file_count} certificate files from {cert_dir}")
    print(f"DEBUG: Extracted total {cert_count} certificates from {cert_dir}")
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns), cert_count

# Usage example taken from the main processing (excerpt)
# NOTE(review): this loop appears again, statement for statement, in the
# main-processing section further down in this file. Within this cell it runs
# before `output_dir` and `load_certificate` are assigned, so:
#   - every load_certificate call made by analyze_certificates_from_dir raises
#     NameError, which that function catches and prints per file;
#   - the `output_dir / ...` expression below raises NameError as soon as a
#     certificate directory exists.
# Confirm whether this excerpt is meant to execute at all.

# Certificate directory for each database
certificates_dirs = {
    'website_data': Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis/website_data'),
    'normal_sites': Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis/normal_sites')
}

for db_name in ['website_data', 'normal_sites']:
    cert_dir = certificates_dirs.get(db_name)
    if cert_dir and cert_dir.exists():
        print(f"\nAnalyzing certificates for {db_name} in directory: {cert_dir}")
        certs_df, cert_count = analyze_certificates_from_dir(cert_dir)
        print(f"DEBUG: {db_name} certificate count: {cert_count}")
        
        # Compare the DB record count with the certificate count (a mismatch is a debugging hint)
        db_record_count = get_record_count(db_name)
        print(f"DEBUG: {db_name} table record count (再取得): {db_record_count}")
        if cert_count < db_record_count:
            print(f"WARNING: {db_name} の証明書数 ({cert_count}) がテーブルレコード数 ({db_record_count}) より少ない")
        else:
            # NOTE(review): this branch also runs when cert_count is greater than
            # db_record_count, yet the message reports the counts as matching.
            print(f"INFO: {db_name} の証明書数はテーブルレコード数と一致しています")
        
        # Write the parsed certificates to a timestamped CSV file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        cert_output_path = output_dir / f'certificate_analysis_{db_name}_{timestamp}.csv'
        certs_df.to_csv(cert_output_path, index=False)
        print(f"Certificate analysis results saved to: {cert_output_path}")
    else:
        print(f"\nCertificate directory for {db_name} does not exist: {cert_dir}. Skipping certificate analysis.")


def plot_registrar_distribution(
    df: pd.DataFrame, db_name: str, output_dir: Path, top_n: int = 10
) -> None:
    """Draw and save a horizontal bar chart of the leading registrars.

    The first ``top_n`` rows of ``df`` are plotted in the order given; the
    PNG is written to ``output_dir`` with a timestamped file name.

    Args:
        df: Registrar data with ``domain_registrar`` and ``percentage``
            columns.
        db_name: Database name, used in the title and the file name.
        output_dir: Directory that receives the PNG file.
        top_n: Maximum number of rows to plot (default 10).

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    top = df.head(top_n)
    if top.empty:
        # Nothing to draw; avoid producing an empty chart file.
        print(f"No registrar data to plot for {db_name}; skipping plot.")
        return

    # Size everything from the rows actually present: a frame with fewer than
    # top_n rows would otherwise mismatch the bar positions and raise.
    positions = range(len(top))
    # Sample the colormap at top_n points so a given rank keeps the same
    # colour regardless of how many rows are available.
    colors = plt.cm.viridis(np.linspace(0, 1, top_n))[:len(top)]

    plt.figure(figsize=(15, 8))
    try:
        plt.barh(positions, top['percentage'], color=colors)
        plt.yticks(positions, top['domain_registrar'])

        plt.title(f'Top {top_n} Registrars Distribution - {db_name}')
        plt.xlabel('Percentage (%)')
        plt.ylabel('Registrar')

        plt.grid(True, axis='x', linestyle='--', alpha=0.7)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = output_dir / f'registrar_distribution_{db_name}_{timestamp}.png'
        plt.savefig(output_path, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close()

def save_results(df: pd.DataFrame, db_name: str, output_dir: Path) -> None:
    """Write the analysis results to a timestamped CSV file.

    Args:
        df: DataFrame holding the analysis results.
        db_name: Database name, embedded in the output file name.
        output_dir: Directory that receives the CSV file.
    """
    # Second-resolution timestamp, embedded in the file name.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = output_dir.joinpath(
        f'registrar_distribution_{db_name}_{timestamp}.csv'
    )
    df.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

# ----- 以下、証明書データ解析用の追加処理 -----

from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.serialization import pkcs7

def load_certificate(file_path: Path) -> list:
    """Load every X.509 certificate contained in a file.

    Formats are tried in this order:

    1. PEM -- every ``-----BEGIN CERTIFICATE-----`` block is parsed; blocks
       that fail to parse are skipped.
    2. A single DER certificate (only when no PEM marker is present).
    3. A PKCS#7 bundle (PEM when the PKCS7 marker is present, DER otherwise),
       tried only when the steps above produced nothing.

    Args:
        file_path: Path of the certificate file.

    Returns:
        A list of ``x509.Certificate`` objects (never empty).

    Raises:
        ValueError: If no certificate could be loaded. The message lists the
            reason reported by each parser that was tried.
    """
    pem_marker = b"-----BEGIN CERTIFICATE-----"
    data = file_path.read_bytes()
    certificates = []
    # Reasons for each failed attempt, kept so the final error explains why
    # the file was rejected instead of discarding that information.
    failures = []

    if pem_marker in data:
        # split() drops the marker, so it is re-attached to each block.
        for index, block in enumerate(data.split(pem_marker)[1:], start=1):
            try:
                certificates.append(
                    x509.load_pem_x509_certificate(pem_marker + block, default_backend())
                )
            except Exception as exc:
                failures.append(f"PEM block {index}: {exc}")
    else:
        # No PEM marker: try the file as one DER certificate.
        try:
            certificates.append(
                x509.load_der_x509_certificate(data, default_backend())
            )
        except Exception as exc:
            failures.append(f"DER: {exc}")

    # Neither PEM nor DER produced a certificate: try PKCS#7.
    if not certificates:
        try:
            if b'-----BEGIN PKCS7-----' in data:
                certs = pkcs7.load_pem_pkcs7_certificates(data)
            else:
                certs = pkcs7.load_der_pkcs7_certificates(data)
            certificates.extend(certs)
        except Exception as exc:
            failures.append(f"PKCS#7: {exc}")

    if not certificates:
        message = f"Unable to load certificate(s) from file: {file_path}"
        if failures:
            message += " (" + "; ".join(failures) + ")"
        raise ValueError(message)
    return certificates

def analyze_certificates_from_dir(cert_dir: Path) -> "tuple[pd.DataFrame, int]":
    """Parse the certificate files directly inside a directory into a DataFrame.

    Only the direct children of ``cert_dir`` are examined. Each regular file
    is passed to ``load_certificate``; a file that cannot be processed is
    reported with a printed error message and skipped.

    NOTE(review): an earlier definition with the same name in this file
    searches subdirectories recursively. This later definition replaces it
    and does not recurse -- confirm which behaviour is intended.

    Args:
        cert_dir: Directory containing the certificate files.

    Returns:
        A ``(DataFrame, count)`` tuple. The DataFrame has one row per
        certificate with the columns ``file``, ``subject``, ``issuer``,
        ``not_before`` and ``not_after``; the columns are present even when
        no certificate was found. ``count`` is the total number of
        certificates returned by ``load_certificate``.
    """
    columns = ["file", "subject", "issuer", "not_before", "not_after"]
    records = []
    file_count = 0
    cert_count = 0
    for file in cert_dir.glob("*"):
        if not file.is_file():
            continue
        file_count += 1
        try:
            certs = load_certificate(file)
            cert_count += len(certs)
            for cert in certs:
                records.append({
                    "file": file.name,
                    "subject": cert.subject.rfc4514_string(),
                    "issuer": cert.issuer.rfc4514_string(),
                    "not_before": cert.not_valid_before,
                    "not_after": cert.not_valid_after
                })
        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f"Error processing {file}: {e}")
    # Debug summary
    print(f"DEBUG: Processed {file_count} certificate files from {cert_dir}")
    print(f"DEBUG: Extracted total {cert_count} certificates from {cert_dir}")
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns), cert_count

# ----- Main processing -----

# Directory that receives every report written below; created on demand.
output_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/reports/database_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Certificate directories are kept separately for each database (example layout)
certificates_dirs = dict(
    website_data=Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis/website_data'),
    normal_sites=Path('/home/asomura/waseda/nextstep/RAPIDS/reports/certificate_analysis/normal_sites'),
)

# Analyse both databases; per-database results are collected in `results`
results = {}
for db_name in ('website_data', 'normal_sites'):
    print(f"\nAnalyzing {db_name} database...")
    try:
        # Row count under the filter status=7 with a non-empty domain_registrar
        record_count = get_record_count(db_name)
        print(f"DEBUG: {db_name} table record count: {record_count}")

        # Registrar distribution analysis
        # NOTE(review): analyze_registrar_distribution is not defined in the
        # visible part of this file -- confirm it is provided elsewhere;
        # otherwise the NameError is caught by the handler below.
        df = results[db_name] = analyze_registrar_distribution(db_name)

        print(f"\nTop 10 Registrars for {db_name}:")
        print(df.head(10))

        # Persist the chart and the raw table
        plot_registrar_distribution(df, db_name, output_dir)
        save_results(df, db_name, output_dir)

        print(f"Analysis completed for {db_name}")

    except Exception as e:
        # Report the failure and move on to the next database.
        print(f"Error analyzing {db_name}: {str(e)}")

# Certificate analysis for each database (only when its directory exists)
for db_name in ['website_data', 'normal_sites']:
    cert_dir = certificates_dirs.get(db_name)
    if cert_dir and cert_dir.exists():
        print(f"\nAnalyzing certificates for {db_name} in directory: {cert_dir}")
        certs_df, cert_count = analyze_certificates_from_dir(cert_dir)
        print(f"DEBUG: {db_name} certificate count: {cert_count}")

        # Compare the DB record count with the certificate count (a mismatch
        # is a debugging hint). A database failure is reported but must not
        # prevent the parsed certificates from being written below.
        try:
            db_record_count = get_record_count(db_name)
        except Exception as e:
            print(f"Error retrieving record count for {db_name}: {e}")
        else:
            print(f"DEBUG: {db_name} table record count (再取得): {db_record_count}")
            if cert_count < db_record_count:
                print(f"WARNING: {db_name} の証明書数 ({cert_count}) がテーブルレコード数 ({db_record_count}) より少ない")
            elif cert_count > db_record_count:
                # More certificates than records is also a mismatch; it was
                # previously reported as a match.
                print(f"WARNING: {db_name} の証明書数 ({cert_count}) がテーブルレコード数 ({db_record_count}) より多い")
            else:
                print(f"INFO: {db_name} の証明書数はテーブルレコード数と一致しています")

        # Write the parsed certificates to a timestamped CSV file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        cert_output_path = output_dir / f'certificate_analysis_{db_name}_{timestamp}.csv'
        certs_df.to_csv(cert_output_path, index=False)
        print(f"Certificate analysis results saved to: {cert_output_path}")
    else:
        print(f"\nCertificate directory for {db_name} does not exist: {cert_dir}. Skipping certificate analysis.")

# Comparative analysis: registrar distribution of the two databases side by side
if len(results) == 2:
    print("\nComparison of top registrars between phishing and normal sites:")
    # Outer join on the registrar name; a registrar missing from one side
    # gets 0, and rows are ordered by the phishing-side share.
    comparison_df = (
        results['website_data']
        .rename(columns={'percentage': 'phishing_percentage'})
        .merge(
            results['normal_sites'].rename(columns={'percentage': 'normal_percentage'}),
            on='domain_registrar',
            how='outer',
        )
        .fillna(0)
        .sort_values('phishing_percentage', ascending=False)
    )

    print("\nTop 10 registrars by phishing percentage:")
    shown_columns = ['domain_registrar', 'phishing_percentage', 'normal_percentage']
    print(comparison_df[shown_columns].head(10))

    # Save the full comparison table under a timestamped name
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    comparison_path = output_dir / f'registrar_comparison_{timestamp}.csv'
    comparison_df.to_csv(comparison_path, index=False)
    print(f"\nComparison results saved to: {comparison_path}")


# Recorded cell output: NameError: name 'Path' is not defined