In [10]:
import re
import pandas as pd

def parse_log_to_dataframe(filename: str) -> pd.DataFrame:
    """
    Parse the specified log file and return a DataFrame with extracted statistics,
    handling cases where the log file is incomplete.

    Parameters:
    - filename: str : The name of the log file to parse.

    Returns:
    - pd.DataFrame: DataFrame containing parsed statistics.
    """
    # Read log file
    with open(filename, "r") as file:
        log_text = file.read()

    print(log_text)

    # Regular expression patterns
    trial_pattern = r"\[I \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d+\] Trial (\d+) finished with value: ([\d.]+) and parameters: \{([^}]+)\}."
    mean_stats_pattern = r"INFO - Candidate Mean Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"
    std_stats_pattern = r"INFO - Candidate Std Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"
    top_mean_pattern = r"INFO - Top 10 Mean Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"
    top_std_pattern = r"INFO - Top 10 Std Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"
    bottom_mean_pattern = r"INFO - Bottom 10 Mean Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"
    bottom_std_pattern = r"INFO - Bottom 10 Std Stats: \{'Max': ([\d.e+-]+), 'Min': ([\d.e+-]+), 'Mean': ([\d.e+-]+), 'Std': ([\d.e+-]+)\}"

    # Extract matches
    trials = re.findall(trial_pattern, log_text)
    mean_stats = re.findall(mean_stats_pattern, log_text)
    std_stats = re.findall(std_stats_pattern, log_text)
    top_mean_stats = re.findall(top_mean_pattern, log_text)
    top_std_stats = re.findall(top_std_pattern, log_text)
    bottom_mean_stats = re.findall(bottom_mean_pattern, log_text)
    bottom_std_stats = re.findall(bottom_std_pattern, log_text)

    # Determine the minimum length across all extracted lists
    min_length = min(
        len(trials), len(mean_stats), len(std_stats), 
        len(top_mean_stats), len(top_std_stats), 
        len(bottom_mean_stats), len(bottom_std_stats)
    )

    # Truncate lists to the minimum length
    trials = trials[:min_length]
    mean_stats = mean_stats[:min_length]
    std_stats = std_stats[:min_length]
    top_mean_stats = top_mean_stats[:min_length]
    top_std_stats = top_std_stats[:min_length]
    bottom_mean_stats = bottom_mean_stats[:min_length]
    bottom_std_stats = bottom_std_stats[:min_length]

    # Organize data into lists for each column
    data = {
        "Trial": [int(t[0]) for t in trials],
        "Value": [float(t[1]) for t in trials],
        "Parameters": [t[2] for t in trials],
        "Candidate Mean Max": [float(m[0]) for m in mean_stats],
        "Candidate Mean Min": [float(m[1]) for m in mean_stats],
        "Candidate Mean Mean": [float(m[2]) for m in mean_stats],
        "Candidate Mean Std": [float(m[3]) for m in mean_stats],
        "Candidate Std Max": [float(s[0]) for s in std_stats],
        "Candidate Std Min": [float(s[1]) for s in std_stats],
        "Candidate Std Mean": [float(s[2]) for s in std_stats],
        "Candidate Std Std": [float(s[3]) for s in std_stats],
        "Top 10 Mean Max": [float(t[0]) for t in top_mean_stats],
        "Top 10 Mean Min": [float(t[1]) for t in top_mean_stats],
        "Top 10 Mean Mean": [float(t[2]) for t in top_mean_stats],
        "Top 10 Mean Std": [float(t[3]) for t in top_mean_stats],
        "Top 10 Std Max": [float(t[0]) for t in top_std_stats],
        "Top 10 Std Min": [float(t[1]) for t in top_std_stats],
        "Top 10 Std Mean": [float(t[2]) for t in top_std_stats],
        "Top 10 Std Std": [float(t[3]) for t in top_std_stats],
        "Bottom 10 Mean Max": [float(b[0]) for b in bottom_mean_stats],
        "Bottom 10 Mean Min": [float(b[1]) for b in bottom_mean_stats],
        "Bottom 10 Mean Mean": [float(b[2]) for b in bottom_mean_stats],
        "Bottom 10 Mean Std": [float(b[3]) for b in bottom_mean_stats],
        "Bottom 10 Std Max": [float(b[0]) for b in bottom_std_stats],
        "Bottom 10 Std Min": [float(b[1]) for b in bottom_std_stats],
        "Bottom 10 Std Mean": [float(b[2]) for b in bottom_std_stats],
        "Bottom 10 Std Std": [float(b[3]) for b in bottom_std_stats],
    }

    # Create DataFrame
    df = pd.DataFrame(data)
    return df


In [11]:
import os

files = sorted([file for file in os.listdir() if "output_" in file])
files

['output_0.log',
 'output_1.log',
 'output_2.log',
 'output_3.log',
 'output_4.log']

In [12]:
df = parse_log_to_dataframe(files[0])
df

2024-11-09 16:06:57,756 - INFO - Experiment settings: {'name': 'bo_parafac_dim5_rank2_mask0.3_tradeoff1.0_seed0', 'seed': 0, 'dimensions': 5, 'iter_bo': 300, 'storage': 'sqlite:////Users/keisukeonoue/ws/constrained_BO_v2/results/dbs/2024-11-09_16-06-57_bo_parafac_dim5_rank2_mask0.3_tradeoff1.0_seed0.db', 'unique_sampling': False, 'cp_settings': {'rank': 2, 'als_iterations': 100, 'mask_ratio': 0.3, 'random_dist_type': 'uniform'}, 'acqf_settings': {'trade_off_param': 1.0, 'maximize': False}}
[I 2024-11-09 16:06:58,182] A new study created in RDB with name: bo_parafac_dim5_rank2_mask0.3_tradeoff1.0_seed0
2024-11-09 16:06:58,183 - INFO - Created new study 'bo_parafac_dim5_rank2_mask0.3_tradeoff1.0_seed0' in sqlite:////Users/keisukeonoue/ws/constrained_BO_v2/results/dbs/2024-11-09_16-06-57_bo_parafac_dim5_rank2_mask0.3_tradeoff1.0_seed0.db
2024-11-09 16:06:58,199 - INFO - Using sample_independent for sampling.
2024-11-09 16:06:58,203 - INFO - Using sample_independent for sampling.
2024-11-0

Unnamed: 0,Trial,Value,Parameters,Candidate Mean Max,Candidate Mean Min,Candidate Mean Mean,Candidate Mean Std,Candidate Std Max,Candidate Std Min,Candidate Std Mean,...,Top 10 Std Mean,Top 10 Std Std,Bottom 10 Mean Max,Bottom 10 Mean Min,Bottom 10 Mean Mean,Bottom 10 Mean Std,Bottom 10 Std Max,Bottom 10 Std Min,Bottom 10 Std Mean,Bottom 10 Std Std
