In [2]:
import pandas as pd
import re
from tabulate import tabulate

# Read in the HTML file as a string
with open('./ppg014_data.html', "r") as f:
    html = f.read()

# Each table sits between "///" marker lines: the text after the opening
# marker is captured as the title, everything up to the next "///" line is
# captured as the (whitespace-delimited) table body.
pattern = r"///\s*(.*?)\s*\n([\s\S]*?)///\s*\n"

# A cell made up solely of dashes belongs to a separator row, not to the data.
separator_cell = re.compile(r"-+")

# The first FIG1_TABLE_COUNT tables are written as fig1_p<N>, every later
# table as fig2_p<N>, with N restarting at 1.
FIG1_TABLE_COUNT = 13

# Columns that are removed from every table before it is exported.
columns_to_remove = ["epT", "eNpi0", "eNpi0_sys", "eR_AA(%)", "eNorm(%)"]

tables = []
table_data = []
title = None
file_counter = 0
fig_counter = 1

for title, table_content in re.findall(pattern, html, re.DOTALL):
    # Split the body into rows of whitespace-separated cells.  Blank lines and
    # dash-only separator rows are discarded here: if they reached the
    # DataFrame they would be padded with None and break the '\t'.join below.
    table_data = [
        cells
        for cells in (line.split() for line in table_content.strip().split("\n"))
        if cells and not all(separator_cell.fullmatch(cell) for cell in cells)
    ]

    # A block without even a header row has nothing to export.
    if not table_data:
        continue

    # First row is the header, the remaining rows are the data (still strings)
    df = pd.DataFrame(table_data[1:], columns=table_data[0])

    # Remove the unwanted columns that are actually present in this table
    columns_to_drop = [col for col in columns_to_remove if col in df.columns]
    df = df.drop(columns=columns_to_drop)

    # Convert the DataFrame back to a list of lists (header row first)
    table_data = [df.columns.tolist()] + df.values.tolist()

    # Add the table data to the tables list, along with the title
    tables.append((title, table_data))

    # Choose the output names from the table's position in the file
    if file_counter < FIG1_TABLE_COUNT:
        figure, part = 1, file_counter + 1
    else:
        figure, part = 2, file_counter + 1 - FIG1_TABLE_COUNT
    filename = f"fig{figure}_p{part}.txt"
    filename_bin = f"fig{figure}_p{part}_binned.txt"

    # Write table to a tab-separated file
    with open(filename, 'w') as f:
        for row in table_data:
            f.write('\t'.join(row))
            f.write('\n')

    # Convert the 'pT' column to float (unparsable entries become NaN)
    df['pT'] = pd.to_numeric(df['pT'], errors='coerce')

    # Bin width is taken as the mean spacing between consecutive pT values.
    # NOTE(review): this assumes uniformly spaced pT points; with uneven
    # spacing the edges below are only approximate, and with a single row the
    # width (and therefore both edges) is NaN.
    bin_width = df['pT'].diff().mean()

    # Bin edges: half a bin width below / above each pT value
    df['pT_low'] = df['pT'] - bin_width/2
    df['pT_high'] = df['pT'] + bin_width/2

    # Put the bin edges first and leave out the original pT column
    remaining = [col for col in df.columns[:-2] if col != 'pT']
    df = df[['pT_low', 'pT_high'] + remaining]

    # Save updated dataframe to output file
    df.to_csv(filename_bin, index=False, header=True, sep ='\t')

    file_counter += 1
    if file_counter == FIG1_TABLE_COUNT:
        fig_counter = 2

    # Print each table using tabulate.  Separator rows were already removed
    # above; a table with no data rows is not printed.
    if len(table_data) > 1:
        print(title)
        print(tabulate(table_data, headers="firstrow", tablefmt='plain'))
        print()


0-92%  (minimum bias)
   pT       Npi0    eNpi0_stat    eNpi0_ccuncorr    eNpi0_pTcorr.
 1.25  0.8824        0.00994           0.07742          0.03416
 1.75  0.1698        0.00165           0.0149           0.007103
 2.25  0.03497       0.0004103         0.00306          0.00156
 2.75  0.008222      0.0001327         0.0007318        0.000384
 3.25  0.002219      4.723e-05         0.0001996        0.0001076
 3.75  0.0007308     9.735e-06         6.685e-05        3.686e-05
 4.25  0.0002631     4.445e-06         2.367e-05        1.381e-05
 4.75  0.0001071     2.383e-06         9.555e-06        5.805e-06
 5.25  4.585e-05     1.306e-06         4.076e-06        2.557e-06
 5.75  2.024e-05     7.684e-07         1.831e-06        1.147e-06
 6.25  1.039e-05     4.934e-07         9.518e-07        6.017e-07
 6.75  5.523e-06     3.432e-07         5.209e-07        3.172e-07
 7.25  2.954e-06     2.332e-07         2.879e-07        1.699e-07
 7.75  1.85e-06      1.891e-07         1.903e-07        1.01