In [1]:
# Registrar Distribution Analysis
# Location: RAPIDS/notebooks/exploration/02_registrar_distribution_analysis.ipynb

import json
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
# Select a CJK-capable font family together with the basic settings so that
# CJK text in labels can be rendered. 'IPAGothic' is an alternative family
# that was tried previously.
import matplotlib

# Basic settings
matplotlib.rcParams.update({
    'font.family': 'Noto Sans CJK JP',
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Load the database settings (the 'database' section of the JSON config).
config_path = Path('/home/asomura/waseda/nextstep/RAPIDS/config/database.json')
# JSON text is UTF-8; pin the encoding instead of relying on the locale
# default so non-ASCII credentials are decoded the same way everywhere.
with open(config_path, encoding='utf-8') as f:
    config = json.load(f)['database']

def get_engine(db_name: str) -> "Engine":
    """Create a SQLAlchemy engine for the specified PostgreSQL database.

    The user name and password are read from the module-level ``config``
    mapping (keys ``"user"`` and ``"password"``) and are percent-encoded
    before being placed in the connection URL, so credentials containing
    characters such as ``@``, ``/``, ``:`` or ``%`` do not corrupt the URL.
    The config values are therefore expected to be the raw (unencoded)
    credentials.

    Args:
        db_name: Name of the database to connect to.

    Returns:
        An engine for ``db_name`` on ``localhost``.
    """
    # All databases are served from the same host; the previous per-database
    # conditional selected 'localhost' on both branches.
    host = 'localhost'

    # safe='' also encodes '/', and quote() (unlike quote_plus()) encodes a
    # space as %20, which decodes correctly with a plain percent-unquote.
    user = quote(str(config["user"]), safe='')
    password = quote(str(config["password"]), safe='')

    return create_engine(f'postgresql://{user}:{password}@{host}/{db_name}')

def analyze_registrar_distribution(db_name: str) -> pd.DataFrame:
    """
    Analyze registrar distribution for the specified database

    Queries the ``website_data`` table of ``db_name`` for rows with
    ``status = 7`` and a non-empty ``domain_registrar``, groups them by
    registrar and returns the 20 most frequent registrars.

    Args:
        db_name: Name of the database to analyze

    Returns:
        DataFrame with columns ``domain_registrar``, ``count`` and
        ``percentage``, sorted by ``count`` descending. ``percentage`` is the
        registrar's share of all rows matching the filter (the window sum runs
        over every group, not only the 20 returned), rounded to 2 decimals.
    """
    engine = get_engine(db_name)

    # Fetch the registrar distribution (empty values excluded).
    # The secondary sort key makes the top-20 cut deterministic when several
    # registrars have the same count.
    query = """
    SELECT 
        domain_registrar,
        COUNT(*) as count,
        CAST((COUNT(*)::float * 100 / SUM(COUNT(*)) OVER()) as numeric(10,2)) as percentage
    FROM website_data
    WHERE status = 7 
        AND domain_registrar IS NOT NULL 
        AND domain_registrar != ''
    GROUP BY domain_registrar
    ORDER BY count DESC, domain_registrar ASC
    LIMIT 20
    """

    try:
        return pd.read_sql_query(query, engine)
    finally:
        # A fresh engine is created on every call; release its connection
        # pool so repeated runs do not accumulate open connections.
        engine.dispose()

def plot_registrar_distribution(
    df: pd.DataFrame, db_name: str, output_dir: Path, top_n: int = 10
) -> None:
    """
    Create and save visualization of registrar distribution

    Draws a horizontal bar chart of the ``percentage`` column for the first
    ``top_n`` rows of ``df`` and saves it as
    ``registrar_distribution_<db_name>_<YYYYmmdd_HHMMSS>.png`` in
    ``output_dir``. When ``df`` has fewer than ``top_n`` rows only the
    available rows are drawn; when there is nothing to draw, a message is
    printed and no file is written.

    Args:
        df: DataFrame containing registrar data; must provide the
            ``domain_registrar`` and ``percentage`` columns
        db_name: Name of the database
        output_dir: Directory to save the plot
        top_n: Maximum number of registrars to show (default 10)
    """
    top = df.head(top_n)
    n_bars = len(top)
    if n_bars == 0:
        print(f"No registrar data to plot for {db_name}")
        return

    # Size positions and colours from the actual row count; a fixed 10 raises
    # a shape mismatch when the query returns fewer rows.
    positions = range(n_bars)
    fig = plt.figure(figsize=(15, 8))
    try:
        # Visualize the top registrars
        colors = plt.cm.viridis(np.linspace(0, 1, n_bars))
        plt.barh(positions, top['percentage'], color=colors)
        plt.yticks(positions, top['domain_registrar'])

        plt.title(f'Top {n_bars} Registrars Distribution - {db_name}')
        plt.xlabel('Percentage (%)')
        plt.ylabel('Registrar')

        # Add grid lines along the value axis
        plt.grid(True, axis='x', linestyle='--', alpha=0.7)

        # Save plot
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = output_dir / f'registrar_distribution_{db_name}_{timestamp}.png'
        plt.savefig(output_path, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

def save_results(df: pd.DataFrame, db_name: str, output_dir: Path) -> None:
    """
    Write the analysis results to a timestamped CSV file

    The file is named ``registrar_distribution_<db_name>_<YYYYmmdd_HHMMSS>.csv``
    and is written without the DataFrame index; its path is printed afterwards.

    Args:
        df: DataFrame containing analysis results
        db_name: Name of the database
        output_dir: Directory to save results
    """
    timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    output_path = output_dir.joinpath(
        f'registrar_distribution_{db_name}_{timestamp}.csv'
    )
    df.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

# Main processing
output_dir = Path('/home/asomura/waseda/nextstep/RAPIDS/reports/database_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Analyze both databases. A failure for one database is reported and does not
# prevent the other from being processed.
results = {}
for db_name in ('website_data', 'normal_sites'):
    print(f"\nAnalyzing {db_name}...")

    try:
        # Registrar distribution analysis; the result is recorded as soon as
        # the query succeeds, before plotting/saving is attempted.
        df = results[db_name] = analyze_registrar_distribution(db_name)

        # Show the results
        print(f"\nTop 10 Registrars for {db_name}:")
        print(df.head(10))

        # Visualize and save
        plot_registrar_distribution(df, db_name, output_dir)
        save_results(df, db_name, output_dir)

        print(f"Analysis completed for {db_name}")

    except Exception as e:
        print(f"Error analyzing {db_name}: {str(e)}")

# Comparison analysis (only when both databases were analyzed successfully)
if len(results) == 2:
    print("\nComparison of top registrars between phishing and normal sites:")

    # Outer-join the two distributions on the registrar name. Each input only
    # holds the rows returned by its query, so a 0 produced by fillna means
    # "not present in the other result set", not necessarily a true 0% share.
    # The shared 'count' column receives pandas' default _x/_y suffixes.
    phishing = results['website_data'].rename(columns={'percentage': 'phishing_percentage'})
    normal = results['normal_sites'].rename(columns={'percentage': 'normal_percentage'})
    comparison_df = (
        phishing.merge(normal, on='domain_registrar', how='outer')
        .fillna(0)
        .sort_values('phishing_percentage', ascending=False)
    )

    print("\nTop 10 registrars by phishing percentage:")
    print(comparison_df[['domain_registrar', 'phishing_percentage', 'normal_percentage']].head(10))

    # Save the comparison results
    timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    comparison_path = output_dir.joinpath(f'registrar_comparison_{timestamp}.csv')
    comparison_df.to_csv(comparison_path, index=False)
    print(f"\nComparison results saved to: {comparison_path}")


Analyzing website_data...

Top 10 Registrars for website_data:
                          domain_registrar  count  percentage
0                                Gandi SAS   3375       31.97
1                         MarkMonitor Inc.   1407       13.33
2                            阿里云计算有限公司（万网）   1105       10.47
3                         GoDaddy.com, LLC    834        7.90
4                      Gname.com Pte. Ltd.    565        5.35
5                         Cloudflare, Inc.    343        3.25
6                        MarkMonitor, Inc.    330        3.13
7                       Tucows Domains Inc    209        1.98
8                             TUCOWS, INC.    135        1.28
9  PDR Ltd. d/b/a PublicDomainRegistry.com    114        1.08
Results saved to: /home/asomura/waseda/nextstep/RAPIDS/reports/database_analysis/registrar_distribution_website_data_20250119_153624.csv
Analysis completed for website_data

Analyzing normal_sites...

Top 10 Registrars for normal_sites:
                 