# Edge Transit Log Analysis

Fab별 반송 효율 분석을 위한 노트북

## 분석 가능 항목
- Throughput (시간당 처리량)
- Transit Time (Edge 통과 시간)
- Vehicle 활용률
- Edge별 혼잡도
- Fab간 비교 분석
- 여러 실험 결과 비교

## 1. Setup & Load Data

In [None]:
import struct
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path

# Global plotting defaults for the notebook (Colab).
# NOTE(review): the original comment labelled this as Korean font setup, but
# the family selected here is 'DejaVu Sans' -- confirm it renders Hangul if
# Korean labels are ever used in figures.
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'figure.figsize': (12, 6),
})
sns.set_style('whitegrid')

print('Setup complete!')

In [None]:
# Mount Google Drive (Colab only) so the log directory becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

# Directory that holds the binary log files -- edit if needed.
LOG_DIR = Path('/content/drive/MyDrive/vps_logs')
print(f'Log directory: {LOG_DIR}')

# Always define `log_files`, even when the directory is missing, so later
# cells can test it instead of failing with NameError.
log_files = []

if LOG_DIR.exists():
    # glob() yields files in arbitrary order; sort so "first file" is stable.
    log_files = sorted(LOG_DIR.glob('*.bin'))
    print(f'Found {len(log_files)} log files:')
    for f in log_files:
        size_kb = f.stat().st_size / 1024
        print(f'  - {f.name} ({size_kb:.1f} KB)')
else:
    print(f'Directory not found: {LOG_DIR}')
    print('Create the directory and upload your .bin files')

In [None]:
# Binary record layout (little-endian, fixed size):
#   uint32 timestamp, uint8 worker_id, uint8 fab_id, uint16 edge_id,
#   uint32 veh_id, uint32 enter_time, uint32 exit_time,
#   float32 edge_length, uint8 edge_type, 3 padding bytes
RECORD_FORMAT = '<I BB H I I I f B 3x'  # little-endian
# Derived from the format so the two can never drift apart (28 bytes).
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

# Column names, in the order the fields appear in RECORD_FORMAT.
_RECORD_FIELDS = (
    'timestamp', 'worker_id', 'fab_id', 'edge_id', 'veh_id',
    'enter_time', 'exit_time', 'edge_length', 'edge_type',
)

# Numeric edge_type code -> readable name.
EDGE_TYPES = {
    0: 'LINEAR',
    1: 'CURVE_90',
    2: 'CURVE_180',
    3: 'CURVE_CSC',
    4: 'S_CURVE',
    5: 'LEFT_CURVE',
    6: 'RIGHT_CURVE',
}

def parse_log_file(filepath):
    """Parse a binary edge-transit log into a DataFrame.

    The file is read as a sequence of fixed-size records described by
    RECORD_FORMAT. Trailing bytes that do not form a complete record are
    ignored.

    Args:
        filepath: Path of the binary log file.

    Returns:
        DataFrame with one row per record. Besides the raw fields it holds
        ``transit_time`` (exit_time - enter_time), ``speed``
        (edge_length / (transit_time / 1000); infinite values become NaN),
        ``edge_type_name`` (NaN for codes missing from EDGE_TYPES) and
        ``*_sec`` copies of the time columns divided by 1000.
        The full set of columns is present even when the file has no
        complete record, so downstream column lookups do not fail.
    """
    with open(filepath, 'rb') as f:
        data = f.read()

    record = struct.Struct(RECORD_FORMAT)
    # iter_unpack requires a buffer that is a whole number of records.
    usable = len(data) - len(data) % record.size
    rows = list(record.iter_unpack(memoryview(data)[:usable]))

    df = pd.DataFrame(rows, columns=list(_RECORD_FIELDS))
    if df.empty:
        # Without rows pandas cannot infer dtypes; pin them so the derived
        # columns below stay numeric.
        df = df.astype({
            col: ('float64' if col == 'edge_length' else 'int64')
            for col in _RECORD_FIELDS
        })

    # Derived columns
    df['transit_time'] = df['exit_time'] - df['enter_time']
    df['speed'] = df['edge_length'] / (df['transit_time'] / 1000.0)
    # A zero transit time produces +/-inf; treat it as "unknown speed".
    df['speed'] = df['speed'].replace([np.inf, -np.inf], np.nan)
    df['edge_type_name'] = df['edge_type'].map(EDGE_TYPES)

    # Time columns converted by a factor of 1000 (ms -> s as used throughout
    # this notebook).
    df['timestamp_sec'] = df['timestamp'] / 1000.0
    df['enter_time_sec'] = df['enter_time'] / 1000.0
    df['exit_time_sec'] = df['exit_time'] / 1000.0

    return df

print('Parser ready!')

In [None]:
# Load a log file (edit the file name as needed)
# LOG_FILE = LOG_DIR / 'edge_transit_sim_xxx.bin'
LOG_FILE = log_files[0] if log_files else None  # default: first file found

if LOG_FILE:
    df = parse_log_file(LOG_FILE)
    print(f'Loaded: {LOG_FILE.name}')
    print(f'Total records: {len(df):,}')
    print(f'\nDataFrame shape: {df.shape}')
    print(f'\nColumns: {list(df.columns)}')
    # A bare expression nested inside `if` is not auto-rendered by the
    # notebook (only the cell's last top-level expression is), so show the
    # preview explicitly.
    display(df.head(10))
else:
    # `df` stays undefined in this case; say so instead of letting the next
    # cell fail with an unexplained NameError.
    print('No log file selected: set LOG_FILE to a .bin file before running the analysis cells.')

## 2. Basic Statistics

In [None]:
def print_summary(df, name='Dataset'):
    """Print a text summary of a parsed transit log.

    Args:
        df: DataFrame with the columns ``veh_id``, ``edge_id``, ``fab_id``,
            ``worker_id``, ``timestamp``, ``transit_time`` and ``speed``.
        name: Label used in the summary header.

    Prints record/entity counts, the time span covered by ``timestamp``
    (divided by 1000 and reported as seconds), throughput, and transit-time
    and speed statistics. An empty frame prints only the header and record
    count; a zero-length time span reports throughput as n/a instead of inf.
    """
    banner = '=' * 60
    print(f'\n{banner}')
    print(f'{name} Summary')
    print(banner)

    print(f'\nTotal Records: {len(df):,}')
    if len(df) == 0:
        # Nothing to aggregate; min()/max() below would be undefined.
        print('No records to summarize.')
        return

    print(f'Unique Vehicles: {df["veh_id"].nunique()}')
    print(f'Unique Edges: {df["edge_id"].nunique()}')
    print(f'Unique Fabs: {df["fab_id"].nunique()}')
    print(f'Unique Workers: {df["worker_id"].nunique()}')

    duration = (df['timestamp'].max() - df['timestamp'].min()) / 1000
    print(f'\nSimulation Duration: {duration:.1f} seconds')
    if duration > 0:
        print(f'Throughput: {len(df) / duration:.1f} transits/sec')
    else:
        # All records share one timestamp; a rate cannot be computed.
        print('Throughput: n/a (zero duration)')

    transit = df['transit_time']
    print(f'\nTransit Time (ms):')
    print(f'  Mean: {transit.mean():.1f}')
    print(f'  Median: {transit.median():.1f}')
    print(f'  Std: {transit.std():.1f}')
    print(f'  Min: {transit.min()}')
    print(f'  Max: {transit.max()}')

    valid_speeds = df['speed'].dropna()
    print(f'\nSpeed (m/s):')
    print(f'  Mean: {valid_speeds.mean():.2f}')
    print(f'  Median: {valid_speeds.median():.2f}')

# Summarize the log loaded into `df` (uses the default 'Dataset' label).
print_summary(df)

In [None]:
# Per-fab statistics: one row per fab_id.
# Named aggregation yields the flat '<column>_<func>' names directly.
fab_stats = (
    df.groupby('fab_id')
    .agg(
        veh_id_nunique=('veh_id', 'nunique'),
        edge_id_nunique=('edge_id', 'nunique'),
        timestamp_count=('timestamp', 'count'),
        timestamp_min=('timestamp', 'min'),
        timestamp_max=('timestamp', 'max'),
        transit_time_mean=('transit_time', 'mean'),
        transit_time_median=('transit_time', 'median'),
        transit_time_std=('transit_time', 'std'),
        speed_mean=('speed', 'mean'),
    )
    .round(2)
)

# Time span covered by each fab's records, then records per second.
span_ms = fab_stats['timestamp_max'] - fab_stats['timestamp_min']
fab_stats['duration_sec'] = span_ms / 1000
fab_stats['throughput'] = fab_stats['timestamp_count'] / fab_stats['duration_sec']

print('\nFab-level Statistics:')
fab_stats

## 3. Throughput Analysis

In [None]:
def plot_throughput_over_time(df, window_sec=1.0, by_fab=True):
    """Plot throughput (records per second) over time.

    Records are bucketed into consecutive windows of ``window_sec`` seconds,
    measured from the earliest ``timestamp_sec``. Windows without any record
    count as zero, so idle periods appear in the curve and in the mean.

    Args:
        df: DataFrame with ``timestamp_sec`` and ``fab_id`` columns.
        window_sec: Width of each time window in seconds.
        by_fab: When True and more than one fab is present, the lower panel
            shows one curve per fab.

    Returns:
        None. Shows a two-panel figure; does nothing but print a notice when
        ``df`` has no rows.
    """
    if len(df) == 0:
        print('No records to plot.')
        return

    fig, axes = plt.subplots(2, 1, figsize=(14, 8))

    start_sec = df['timestamp_sec'].min()
    bin_ids = ((df['timestamp_sec'] - start_sec) / window_sec).astype(int)
    # Every window from the first to the last record, including empty ones.
    all_bins = np.arange(bin_ids.max() + 1)

    def _rate(ids):
        """Return transits/sec per window, indexed by window start (seconds)."""
        counts = ids.value_counts().reindex(all_bins, fill_value=0)
        counts.index = counts.index * window_sec
        return counts / window_sec

    # Overall throughput
    overall = _rate(bin_ids)
    mean_rate = overall.mean()
    axes[0].plot(overall.index, overall.values, 'b-', linewidth=1)
    axes[0].fill_between(overall.index, overall.values, alpha=0.3)
    axes[0].set_xlabel('Time (seconds)')
    axes[0].set_ylabel('Throughput (transits/sec)')
    axes[0].set_title(f'Overall Throughput (window={window_sec}s)')
    axes[0].axhline(y=mean_rate, color='r', linestyle='--', label=f'Mean: {mean_rate:.1f}')
    axes[0].legend()

    # Throughput per fab
    if by_fab and df['fab_id'].nunique() > 1:
        for fab_id in sorted(df['fab_id'].unique()):
            fab_rate = _rate(bin_ids[df['fab_id'] == fab_id])
            axes[1].plot(fab_rate.index, fab_rate.values,
                         label=f'Fab {fab_id}', linewidth=1, alpha=0.8)

        axes[1].set_xlabel('Time (seconds)')
        axes[1].set_ylabel('Throughput (transits/sec)')
        axes[1].set_title('Throughput by Fab')
        axes[1].legend()
    else:
        axes[1].text(0.5, 0.5, 'Single Fab - No comparison', ha='center', va='center')

    plt.tight_layout()
    plt.show()

# Plot throughput for `df` using 1-second windows.
plot_throughput_over_time(df, window_sec=1.0)

## 4. Transit Time Analysis

In [None]:
def plot_transit_time_distribution(df):
    """Plot transit-time and speed distributions in a 2x2 figure.

    Panels: transit-time histogram with mean/median markers, transit-time box
    plots per edge type, transit-time box plots per fab (only when more than
    one fab is present), and a histogram of the non-NaN speeds.

    Args:
        df: DataFrame with ``transit_time``, ``edge_type_name``, ``fab_id``
            and ``speed`` columns.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Overall distribution (histogram)
    transit = df['transit_time']
    ax = axes[0, 0]
    ax.hist(transit, bins=50, edgecolor='black', alpha=0.7)
    ax.axvline(transit.mean(), color='r', linestyle='--', label=f'Mean: {transit.mean():.1f}ms')
    ax.axvline(transit.median(), color='g', linestyle='--', label=f'Median: {transit.median():.1f}ms')
    ax.set_xlabel('Transit Time (ms)')
    ax.set_ylabel('Count')
    ax.set_title('Transit Time Distribution')
    ax.legend()

    # Distribution per edge type. groupby skips rows whose type name is NaN;
    # sort=False keeps the order of first appearance.
    by_type = {
        type_name: group.values
        for type_name, group in df.groupby('edge_type_name', sort=False)['transit_time']
    }
    ax = axes[0, 1]
    if by_type:
        ax.boxplot(list(by_type.values()))
        # Labels are set on the axis rather than through boxplot(labels=...),
        # which newer Matplotlib deprecates in favour of tick_labels; this
        # form works on both old and new releases.
        ax.set_xticklabels(list(by_type))
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.text(0.5, 0.5, 'No known edge types', ha='center', va='center')
    ax.set_xlabel('Edge Type')
    ax.set_ylabel('Transit Time (ms)')
    ax.set_title('Transit Time by Edge Type')

    # Distribution per fab
    ax = axes[1, 0]
    if df['fab_id'].nunique() > 1:
        by_fab = {
            fab_id: group.values
            for fab_id, group in df.groupby('fab_id')['transit_time']
        }
        ax.boxplot(list(by_fab.values()))
        ax.set_xticklabels([f'Fab {fab_id}' for fab_id in by_fab])
        ax.set_xlabel('Fab')
        ax.set_ylabel('Transit Time (ms)')
        ax.set_title('Transit Time by Fab')
    else:
        ax.text(0.5, 0.5, 'Single Fab', ha='center', va='center')

    # Speed distribution
    valid_speeds = df['speed'].dropna()
    ax = axes[1, 1]
    ax.hist(valid_speeds, bins=50, edgecolor='black', alpha=0.7)
    ax.axvline(valid_speeds.mean(), color='r', linestyle='--', label=f'Mean: {valid_speeds.mean():.2f}m/s')
    ax.set_xlabel('Speed (m/s)')
    ax.set_ylabel('Count')
    ax.set_title('Speed Distribution')
    ax.legend()

    plt.tight_layout()
    plt.show()

# Draw the transit-time / speed distribution panels for `df`.
plot_transit_time_distribution(df)

## 5. Edge Congestion Analysis

In [None]:
def analyze_edge_congestion(df, top_n=20, ref_speed=3.0):
    """Rank edges by a congestion score and plot the result.

    The score (higher means more congested) is
    ``transit_count * (mean transit time / expected transit time)``, where
    the expected time is ``edge_length / ref_speed * 1000``.

    Args:
        df: DataFrame with ``edge_id``, ``veh_id``, ``transit_time``,
            ``speed``, ``edge_length`` and ``edge_type_name`` columns.
        top_n: Number of edges to list and chart.
        ref_speed: Reference speed used for the expected transit time
            (default 3.0, the value previously hard-coded as "3 m/s").

    Returns:
        Per-edge statistics sorted by ``congestion_score`` descending. Edges
        with zero length have no expected time; their score is NaN and they
        sort last instead of appearing as infinitely congested.
    """
    edge_stats = df.groupby('edge_id').agg({
        'veh_id': 'count',  # number of transits
        'transit_time': ['mean', 'median', 'std', 'max'],
        'speed': 'mean',
        'edge_length': 'first',
        'edge_type_name': 'first',
    }).round(2)

    edge_stats.columns = ['transit_count', 'transit_mean', 'transit_median',
                          'transit_std', 'transit_max', 'speed_mean',
                          'edge_length', 'edge_type']

    # Expected time at the reference speed; zero-length edges become NaN so
    # the ratio below does not blow up to inf.
    expected_time = (edge_stats['edge_length'] / ref_speed * 1000).replace(0, np.nan)
    edge_stats['congestion_score'] = (
        edge_stats['transit_count'] * (edge_stats['transit_mean'] / expected_time)
    )

    edge_stats_sorted = edge_stats.sort_values('congestion_score', ascending=False)

    print(f'\nTop {top_n} Congested Edges:')
    display(edge_stats_sorted.head(top_n))

    # Charts
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Distribution of transit counts
    axes[0].hist(edge_stats['transit_count'], bins=30, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('Transit Count')
    axes[0].set_ylabel('Number of Edges')
    axes[0].set_title('Edge Usage Distribution')

    # Top N congested edges (only those with a defined score)
    top_edges = edge_stats_sorted.dropna(subset=['congestion_score']).head(top_n)
    positions = range(len(top_edges))
    axes[1].barh(positions, top_edges['congestion_score'])
    axes[1].set_yticks(positions)
    axes[1].set_yticklabels([f'Edge {i}' for i in top_edges.index])
    axes[1].set_xlabel('Congestion Score')
    axes[1].set_title(f'Top {top_n} Congested Edges')
    axes[1].invert_yaxis()

    plt.tight_layout()
    plt.show()

    return edge_stats_sorted

# Keep the ranked per-edge table for later use (e.g. the CSV export cell).
edge_congestion = analyze_edge_congestion(df)

## 6. Vehicle Utilization

In [None]:
def analyze_vehicle_utilization(df):
    """Compute and plot per-vehicle utilization.

    Utilization is the sum of a vehicle's transit times divided by the span
    between its first and last ``timestamp``, as a percentage clipped to
    0-100. Vehicles whose span is zero (e.g. a single record) get NaN rather
    than a misleading 100%, and are left out of the summary statistics.

    Args:
        df: DataFrame with ``veh_id``, ``timestamp``, ``transit_time``,
            ``edge_length`` and ``fab_id`` columns.

    Returns:
        DataFrame indexed by ``veh_id`` with transit counts, first/last
        timestamp, total/average transit time, total distance, fab id
        (taken from the vehicle's first record), active duration and
        utilization.
    """
    veh_stats = df.groupby('veh_id').agg({
        'timestamp': ['count', 'min', 'max'],
        'transit_time': ['sum', 'mean'],
        'edge_length': 'sum',
        'fab_id': 'first',
    }).round(2)

    veh_stats.columns = ['transit_count', 'first_time', 'last_time',
                         'total_transit_time', 'avg_transit_time',
                         'total_distance', 'fab_id']

    veh_stats['active_duration'] = veh_stats['last_time'] - veh_stats['first_time']
    # Mask zero-length windows: dividing by them yields inf, which clip()
    # would silently turn into 100%.
    active = veh_stats['active_duration'].where(veh_stats['active_duration'] > 0)
    veh_stats['utilization'] = (veh_stats['total_transit_time'] / active * 100).clip(0, 100)
    utilization = veh_stats['utilization'].dropna()

    print('\nVehicle Utilization Summary:')
    print(f'  Mean Utilization: {utilization.mean():.1f}%')
    print(f'  Median Utilization: {utilization.median():.1f}%')
    print(f'  Total Distance (all vehicles): {veh_stats["total_distance"].sum():.1f}m')

    # Charts
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Utilization distribution
    axes[0, 0].hist(utilization, bins=30, edgecolor='black', alpha=0.7)
    axes[0, 0].axvline(utilization.mean(), color='r', linestyle='--')
    axes[0, 0].set_xlabel('Utilization (%)')
    axes[0, 0].set_ylabel('Number of Vehicles')
    axes[0, 0].set_title('Vehicle Utilization Distribution')

    # Transit count distribution
    axes[0, 1].hist(veh_stats['transit_count'], bins=30, edgecolor='black', alpha=0.7)
    axes[0, 1].set_xlabel('Transit Count')
    axes[0, 1].set_ylabel('Number of Vehicles')
    axes[0, 1].set_title('Transits per Vehicle')

    # Mean utilization per fab
    if df['fab_id'].nunique() > 1:
        fab_util = veh_stats.groupby('fab_id')['utilization'].mean()
        axes[1, 0].bar(fab_util.index, fab_util.values)
        axes[1, 0].set_xlabel('Fab ID')
        axes[1, 0].set_ylabel('Mean Utilization (%)')
        axes[1, 0].set_title('Vehicle Utilization by Fab')
    else:
        axes[1, 0].text(0.5, 0.5, 'Single Fab', ha='center', va='center')

    # Total distance distribution
    axes[1, 1].hist(veh_stats['total_distance'], bins=30, edgecolor='black', alpha=0.7)
    axes[1, 1].set_xlabel('Total Distance (m)')
    axes[1, 1].set_ylabel('Number of Vehicles')
    axes[1, 1].set_title('Total Distance per Vehicle')

    plt.tight_layout()
    plt.show()

    return veh_stats

# Keep the per-vehicle table for later use (e.g. the CSV export cell).
veh_utilization = analyze_vehicle_utilization(df)

## 7. Compare Multiple Experiments

In [None]:
def load_multiple_experiments(file_paths, names=None):
    """Load several experiment logs.

    Args:
        file_paths: Iterable of log file paths.
        names: Optional list of experiment names, matched to ``file_paths``
            by position. When omitted (or empty) each file's stem is used.

    Returns:
        Dict mapping experiment name to the DataFrame produced by
        ``parse_log_file``. A repeated name overwrites the earlier entry.

    Raises:
        ValueError: If ``names`` is given but has fewer entries than
            ``file_paths``.
    """
    file_paths = list(file_paths)
    # Check up front so a short `names` list fails before any file is parsed,
    # with a message that says what is wrong.
    if names and len(names) < len(file_paths):
        raise ValueError(
            f'names has {len(names)} entries but {len(file_paths)} file paths were given'
        )

    experiments = {}

    for i, path in enumerate(file_paths):
        path = Path(path)
        name = names[i] if names else path.stem

        df = parse_log_file(path)
        experiments[name] = df
        print(f'Loaded {name}: {len(df):,} records')

    return experiments

def compare_experiments(experiments):
    """Compare several experiments side by side.

    Args:
        experiments: Dict mapping experiment name to a parsed-log DataFrame
            with ``timestamp``, ``transit_time``, ``speed``, ``veh_id`` and
            ``edge_id`` columns.

    Returns:
        DataFrame with one row per experiment (record count, duration,
        throughput, transit-time mean/median, mean speed, unique vehicle and
        edge counts), rounded to 2 decimals. Throughput is NaN when an
        experiment's timestamps span zero time.
    """
    comparison = []

    for name, df in experiments.items():
        duration = (df['timestamp'].max() - df['timestamp'].min()) / 1000
        # A zero (or undefined) span has no meaningful rate; report NaN
        # rather than inf.
        throughput = len(df) / duration if duration > 0 else float('nan')
        comparison.append({
            'experiment': name,
            'records': len(df),
            'duration_sec': duration,
            'throughput': throughput,
            'transit_mean': df['transit_time'].mean(),
            'transit_median': df['transit_time'].median(),
            'speed_mean': df['speed'].mean(),
            'unique_vehicles': df['veh_id'].nunique(),
            'unique_edges': df['edge_id'].nunique(),
        })

    comp_df = pd.DataFrame(comparison).round(2)
    print('\nExperiment Comparison:')
    display(comp_df)

    # Charts
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    exp_names = list(experiments.keys())

    # Throughput comparison
    axes[0, 0].bar(exp_names, comp_df['throughput'])
    axes[0, 0].set_ylabel('Throughput (transits/sec)')
    axes[0, 0].set_title('Throughput Comparison')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # Transit time comparison (mean and median bars next to each other)
    x = np.arange(len(exp_names))
    width = 0.35
    axes[0, 1].bar(x - width/2, comp_df['transit_mean'], width, label='Mean')
    axes[0, 1].bar(x + width/2, comp_df['transit_median'], width, label='Median')
    axes[0, 1].set_xticks(x)
    axes[0, 1].set_xticklabels(exp_names, rotation=45)
    axes[0, 1].set_ylabel('Transit Time (ms)')
    axes[0, 1].set_title('Transit Time Comparison')
    axes[0, 1].legend()

    # Speed comparison
    axes[1, 0].bar(exp_names, comp_df['speed_mean'])
    axes[1, 0].set_ylabel('Speed (m/s)')
    axes[1, 0].set_title('Average Speed Comparison')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Transit time distribution comparison (box plot)
    transit_data = [experiments[name]['transit_time'].values for name in exp_names]
    axes[1, 1].boxplot(transit_data)
    # Labels are set on the axis rather than through boxplot(labels=...),
    # which newer Matplotlib deprecates in favour of tick_labels.
    axes[1, 1].set_xticklabels(exp_names)
    axes[1, 1].set_ylabel('Transit Time (ms)')
    axes[1, 1].set_title('Transit Time Distribution Comparison')
    axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return comp_df

# 예시: 여러 실험 비교
# experiments = load_multiple_experiments([
#     LOG_DIR / 'experiment_baseline.bin',
#     LOG_DIR / 'experiment_new_routing.bin',
#     LOG_DIR / 'experiment_optimized.bin',
# ], names=['Baseline', 'New Routing', 'Optimized'])
# compare_experiments(experiments)

## 8. Export Results

In [None]:
# Export to CSV
OUTPUT_DIR = Path('/content/drive/MyDrive/vps_logs/output')
# parents=True: also create the log directory itself if it does not exist
# yet, instead of failing with FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Full data
# df.to_csv(OUTPUT_DIR / 'full_data.csv', index=False)

# Summary statistics
# fab_stats.to_csv(OUTPUT_DIR / 'fab_stats.csv')
# edge_congestion.to_csv(OUTPUT_DIR / 'edge_congestion.csv')
# veh_utilization.to_csv(OUTPUT_DIR / 'vehicle_utilization.csv')

print('Export examples (uncomment to use):')
print('  df.to_csv(OUTPUT_DIR / "full_data.csv", index=False)')
print('  fab_stats.to_csv(OUTPUT_DIR / "fab_stats.csv")')

## 9. Custom Analysis

자유롭게 분석 코드를 추가하세요.

In [None]:
# 여기에 커스텀 분석 코드 작성
# 예시:
#
# # 특정 Fab만 필터링
# fab0_df = df[df['fab_id'] == 0]
#
# # 특정 시간대만 필터링
# start_time = 10000  # 10초 이후
# end_time = 60000    # 60초까지
# time_filtered = df[(df['timestamp'] >= start_time) & (df['timestamp'] <= end_time)]
#
# # Edge Type별 평균 속도
# df.groupby('edge_type_name')['speed'].mean()