In [None]:
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from collections import defaultdict

# global matplotlib defaults applied to every figure created after this cell
# (individual figures may still override figsize when they are created)
plt.rcParams.update({
    'figure.dpi': 400,
    'figure.figsize': [12, 10],
    'font.size': 14,
})

# ---- analysis configuration ----
# root folder holding the fit logs and every derived table / plot
base_dir = '/Users/leodrake/Documents/MIT/ss433/HRC_2024/2Dfits'
# pixel coordinate every observation is recentred onto
center_pixel = 80.5
# label of the reference component (the one placed at center_pixel)
g1_component = 'core'

# ---- dynamic file configuration ----
# these values only select which fit-results file is read and how outputs are named
num_comps = 4
sigma_val = 1
bin_size = 0.25

# signifiers: add or remove strings here (e.g., 'jittercorr', 'empPSF', 'mcmc');
# they are joined with dashes, in list order, into the file identifier.
signifiers = ['empPSF', 'mcmc']

# ---- automatic filename construction ----
# bin string: the decimal point becomes 'p' (0.25 -> "0p25", int 1 -> "1").
# note: a float such as 1.0 stringifies as "1.0" and therefore yields "1p0".
bin_str = 'p'.join(str(bin_size).split('.'))

# signifier string (e.g., "jittercorr-empPSF")
signifiers_str = "-".join(signifiers)

# common suffix shared by all input/output files
# format:  {n}comp-{n}sigma-{signifiers}-bin{n}
# example: 4comp-1sigma-jittercorr-empPSF-bin0p25
file_identifier = "-".join([
    f"{num_comps}comp",
    f"{sigma_val}sigma",
    signifiers_str,
    f"bin{bin_str}",
])

# arcsec per (binned) image pixel; 0.13175 is presumably the unbinned
# detector pixel scale in arcsec -- TODO confirm
pixscale_arcsec = 0.13175 * bin_size

print(f"file id set to: {file_identifier}")

In [None]:
# numeric token shared by every value pattern below. it accepts '+' as well as
# '-', so explicitly signed numbers ("+0.02") and positive exponents
# ("1.5e+03") are parsed instead of silently dropping the whole line.
_NUM = r"[-+\d.eE]+"

# block metadata: observation id, and date + exposure time (same line)
_OBS_ID_RE = re.compile(r"Observation:\s*(\d+)")
_DATE_RE = re.compile(r"Date:\s*([\d\.]+).*?Exptime:\s*([\d\.]+)")

# standard confidence bounds -- columns: name | val | low | up
# a bound printed as '-------' is kept as text and becomes NaN during cleanup
_CONF_RE = re.compile(
    rf"^\s*(g\d+|c\d+)\.(?P<param>[a-z0-9]+)\s+(?P<val>{_NUM})"
    rf"\s+(?P<low>{_NUM}|-------)\s+(?P<up>{_NUM}|-------)",
    re.M,
)

# mcmc (emcee) results -- columns: name | val | median | low | up
# val, low and up are captured; the median column is skipped
_MCMC_RE = re.compile(
    rf"^\s*(g\d+|c\d+)\.(?P<param>[a-z0-9]+)\s+(?P<val>{_NUM})"
    rf"\s+(?:{_NUM})\s+(?P<low>{_NUM})\s+(?P<up>{_NUM})",
    re.M,
)

# count rates -- format: "name : val (low / up)"
_RATE_RE = re.compile(
    rf"^\s*(g\d+|c\d+)\s*:\s*(?P<val>{_NUM})"
    rf"\s*\((?P<low>{_NUM})\s*/\s*(?P<up>{_NUM})\)",
    re.M,
)


def _name_components(comps):
    """
    map raw component ids (g1, g2, ..., c1, ...) to physical names.

    comps maps component id -> {param: (value, low_text, up_text)}.
    returns an ordered dict id -> name (core, east, west, extra_N, bkg),
    or an empty dict when the block holds no gaussian ('g') component.
    a parameter that was not parsed counts as 0 for the comparisons.
    """
    def value_of(comp_id, param):
        return comps[comp_id].get(param, (0, 0, 0))[0]

    def xpos_of(comp_id):
        return value_of(comp_id, 'xpos')

    g_ids = [cid for cid in comps if cid.startswith('g')]
    if not g_ids:
        return {}

    # core = gaussian with the largest amplitude
    core_id = max(g_ids, key=lambda cid: value_of(cid, 'ampl'))
    core_x = xpos_of(core_id)
    mapping = {core_id: 'core'}

    # remaining gaussians, split by which side of the core they sit on and
    # ordered nearest-to-core first on each side
    others = [cid for cid in g_ids if cid != core_id]
    left = sorted((c for c in others if xpos_of(c) < core_x), key=xpos_of, reverse=True)
    right = sorted((c for c in others if xpos_of(c) >= core_x), key=xpos_of)

    # nearest component on each side gets the named slot; the rest are extras
    extras = []
    if left:
        mapping[left[0]] = 'east'
        extras.extend(left[1:])
    if right:
        mapping[right[0]] = 'west'
        extras.extend(right[1:])

    # extras are numbered in order of increasing xpos
    for i, extra_id in enumerate(sorted(extras, key=xpos_of), start=1):
        mapping[extra_id] = f'extra_{i}'

    # every 'c' component is labelled as background
    for cid in comps:
        if cid.startswith('c'):
            mapping[cid] = 'bkg'

    return mapping


def _parse_rates(block):
    """return {component id: (rate, minus_err, plus_err)} with absolute errors."""
    rates = {}
    for match in _RATE_RE.finditer(block):
        low_val = float(match.group('low'))
        up_val = float(match.group('up'))
        # the negative bound is the minus error; if the first bound is not
        # negative the two are taken to be listed in the opposite order
        if low_val < 0:
            minus_err, plus_err = abs(low_val), abs(up_val)
        else:
            minus_err, plus_err = abs(up_val), abs(low_val)
        rates[match.group(1)] = (float(match.group('val')), minus_err, plus_err)
    return rates


def _rows_from_block(block):
    """turn one 'Observation:' block into a list of per-component row dicts."""
    if not block.strip():
        return []

    # 1. metadata -- a block without an observation id is ignored
    obs_match = _OBS_ID_RE.search(block)
    if not obs_match:
        return []
    obs_id = int(obs_match.group(1))

    date_match = _DATE_RE.search(block)
    if date_match:
        mjd, exptime = float(date_match.group(1)), float(date_match.group(2))
    else:
        mjd, exptime = (np.nan, np.nan)

    # 2. component parameters; the table layout depends on the fit type
    param_re = _MCMC_RE if "emcee Results" in block else _CONF_RE
    comps = defaultdict(dict)
    for match in param_re.finditer(block):
        # bounds stay as text here ('-------' is possible); converted in cleanup
        comps[match.group(1)][match.group('param')] = (
            float(match.group('val')),
            match.group('low'),
            match.group('up'),
        )

    # 3. count rates
    rates = _parse_rates(block)

    # 4. physical names; blocks without any gaussian component are skipped
    mapping = _name_components(comps)
    if not mapping:
        return []

    # 5. one flat row per named component
    missing_rate = (np.nan, np.nan, np.nan)
    rows = []
    for old_id, new_name in mapping.items():
        row = {
            'obs_id': obs_id,
            'mjd': mjd,
            'exptime': exptime,
            'component': new_name,
        }
        for param, (val, low_s, up_s) in comps[old_id].items():
            row[param] = val
            row[f'{param}_minus'] = low_s
            row[f'{param}_plus'] = up_s

        r_val, r_min, r_plus = rates.get(old_id, missing_rate)
        row['nominal'] = r_val
        row['minus_err'] = r_min
        row['plus_err'] = r_plus
        rows.append(row)
    return rows


def load_sherpa_log_to_dataframe(filename):
    """
    read a raw sherpa fit log and return one dataframe row per component
    per observation.

    the log is split at every 'Observation:' marker. within each block the
    gaussian components are renamed from their position relative to the
    brightest one: 'core' (largest ampl), 'east' (nearest component with a
    smaller xpos), 'west' (nearest with an equal or larger xpos), 'extra_N'
    for any others, and 'bkg' for 'c' components. blocks containing the text
    'emcee Results' are read with the mcmc table layout, all others with the
    standard confidence-bound layout.

    parameters
    ----------
    filename : path of the log file.

    returns
    -------
    pandas.DataFrame with columns obs_id, mjd, exptime, component, then
    <param>, <param>_minus, <param>_plus for every parsed parameter (errors
    as absolute values, NaN where no bound was given), and nominal,
    minus_err, plus_err for the count rate (NaN when the log has no rate
    for that component). an empty dataframe is returned when the file does
    not exist (a message is printed) or nothing could be parsed. if no
    block carries a date, mjd is replaced by a per-observation integer code.
    """
    try:
        with open(filename, 'r') as f:
            raw_text = f.read()
    except FileNotFoundError:
        print(f"error: file not found at {filename}")
        return pd.DataFrame()

    # the lookahead keeps each 'Observation:' marker at the start of its block
    rows = []
    for block in re.split(r'(?=Observation:)', raw_text):
        rows.extend(_rows_from_block(block))

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    # 6. cleanup types: parameter bounds were stored as text; anything
    # non-numeric (e.g. '-------') becomes NaN, and errors are made absolute
    err_cols = [c for c in df.columns if c.endswith(('_minus', '_plus'))]
    for col in err_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').abs()

    # without any dates, fall back to an integer code per observation id so
    # downstream code still has a usable time-like axis
    if df['mjd'].isna().all() and 'obs_id' in df.columns:
        df['mjd'] = df['obs_id'].astype('category').cat.codes

    return df

In [None]:
# ---- load: parse the raw fit log selected by the configuration cell ----
raw_filename = f'multi-comp-fit-results-{file_identifier}.txt'
raw_file_path = os.path.join(base_dir, 'multi comp fit results', raw_filename)

print(f"loading and processing: {raw_filename}")
df = load_sherpa_log_to_dataframe(raw_file_path)

# the loader returns an empty frame for a missing file or an unparseable log
if df.empty:
    raise ValueError("dataframe is empty. check if the file exists and has correct format.")

# ---- save 1: component tracker table (positions exactly as fitted) ----
# destination: 2dfits/comp tracker tables/
tracker_dir = os.path.join(base_dir, 'comp tracker tables')
tracker_filename = f'comp-tracker-table-{file_identifier}.csv'
tracker_file_path = os.path.join(tracker_dir, tracker_filename)

os.makedirs(tracker_dir, exist_ok=True)
df.to_csv(tracker_file_path, index=False)
print(f"tracker table (raw) saved to: {tracker_file_path}")

# ---- recenter every observation on its reference (core) component ----
# per-observation position of the reference component
is_reference = df['component'] == g1_component
ref_df = df.loc[is_reference, ['obs_id', 'xpos', 'ypos']].rename(
    columns={'xpos': 'ref_x', 'ypos': 'ref_y'}
)

# attach the reference position to every row of the same observation
df = df.merge(ref_df, on='obs_id', how='left')

# shift each axis so the reference lands on center_pixel; observations
# without a reference component get a zero shift
for pos_col, ref_col in (('xpos', 'ref_x'), ('ypos', 'ref_y')):
    shift = df[ref_col].fillna(center_pixel) - center_pixel
    df[pos_col] -= shift

# the helper columns are not part of the output tables
df = df.drop(columns=['ref_x', 'ref_y'])

# ---- derived quantities: offsets, position angle, radius (+ errors) ----
df['xoff'] = df['xpos'] - center_pixel
df['yoff'] = df['ypos'] - center_pixel

# angle measured from +y, increasing toward -x
pa_rad = np.arctan2(-df['xoff'], df['yoff'])
df['PA'] = np.degrees(pa_rad)
df['pa_rad'] = pa_rad

# partial derivatives of the angle w.r.t. x and y; left as NaN where the
# offset is exactly zero and the angle is undefined
d2 = df['xoff']**2 + df['yoff']**2
has_offset = d2 != 0
dpa_dx = np.divide(-df['yoff'], d2, out=np.full_like(d2, np.nan), where=has_offset)
dpa_dy = np.divide(df['xoff'], d2, out=np.full_like(d2, np.nan), where=has_offset)

# first-order error propagation, separately for the upper and lower errors
for side in ('plus', 'minus'):
    x_term = dpa_dx * df[f'xpos_{side}']
    y_term = dpa_dy * df[f'ypos_{side}']
    df[f'PA_err_{side}'] = np.degrees(np.sqrt(x_term**2 + y_term**2))

# radial distance from the center, converted from pixels to arcsec
df['radius'] = np.hypot(df['xoff'], df['yoff']) * pixscale_arcsec

r_pix = df['radius'] / pixscale_arcsec
is_zero = np.isclose(r_pix, 0)
for side in ('plus', 'minus'):
    sig_x = df[f'xpos_{side}']
    sig_y = df[f'ypos_{side}']
    # at zero radius the propagated form divides by zero, so the quadrature
    # sum of the position errors is used there instead
    at_center = np.hypot(sig_x, sig_y)
    off_center = np.sqrt((df['xoff'] * sig_x)**2 + (df['yoff'] * sig_y)**2) / r_pix
    df[f'radius_{side}_err'] = np.where(is_zero, at_center, off_center) * pixscale_arcsec

# chronological order; every row starts out flagged as clean
df.sort_values('mjd', inplace=True)
df['flag'] = 'clean'

# ---- save 2: data table (recentred positions + derived quantities) ----
# destination: 2dfits/data tables/
data_dir = os.path.join(base_dir, 'data tables')
data_filename = f'data-table-{file_identifier}.csv'
data_file_path = os.path.join(data_dir, data_filename)

os.makedirs(data_dir, exist_ok=True)
df.to_csv(data_file_path, index=False)
print(f"data table (processed) saved to: {data_file_path}")

# ---- pivoted views for plotting: rows = mjd, columns = component ----
pivoted = df.pivot_table(index='mjd', columns='component', values=['nominal', 'plus_err', 'minus_err'])
df_nom = pivoted['nominal'].sort_index()
df_plus = pivoted['plus_err'].sort_index()
df_minus = pivoted['minus_err'].sort_index()

In [None]:
# per-component row groups, used to pull each component's time series
grouped_by_comp = df.groupby('component')

# components to plot: everything with count-rate data except the reference (core)
comps = [name for name in df_nom.columns if name != g1_component]

# one sequential colormap per component type
# east -> blues, west -> reds, extra_* -> greens, anything else -> purples
comp_cmaps = {
    c: (
        plt.cm.Blues if c == 'east'
        else plt.cm.Reds if c == 'west'
        else plt.cm.Greens if c.startswith('extra')
        else plt.cm.Purples
    )
    for c in comps
}

# normalisation mapping the observed mjd span onto 0..1
time_min = df['mjd'].min()
time_max = df['mjd'].max()
time_norm = plt.Normalize(vmin=time_min, vmax=time_max)

# small per-component x shifts, symmetric about zero, so overlapping
# error bars in the count-rate panels stay distinguishable
n = len(comps)
delta = 0.02
mid_index = (n - 1) / 2
offsets = {c: (i - mid_index) * delta for i, c in enumerate(comps)}

# output pdf: 2dfits/comp tracker plots/comp-tracker-plots-{file_identifier}.pdf
plots_dir = os.path.join(base_dir, 'comp tracker plots')
pdf_name = f'comp-tracker-plots-{file_identifier}.pdf'
pdf_filename = os.path.join(plots_dir, pdf_name)
os.makedirs(plots_dir, exist_ok=True)

# write both figures into a single multi-page pdf (one page per savefig call)
with PdfPages(pdf_filename) as pdf:
    # first figure: pa vs time (left) + stacked count-rate panels (right)
    # discrete colour per component for the standard plots; colours repeat
    # once there are more components than entries in the list
    discrete_colors = ['dodgerblue', 'mediumseagreen', 'mediumslateblue', 'lightcoral']
    comp_discrete_map = {comp: discrete_colors[i % len(discrete_colors)] for i, comp in enumerate(comps)}

    # grid of n rows x 2 columns: the left column is merged into one axis,
    # the right column holds one count-rate panel per component
    # NOTE(review): n is len(comps); presumably GridSpec cannot be built when
    # comps is empty (only the core present) -- confirm and guard if needed
    fig = plt.figure(figsize=(12, 6))
    gs  = GridSpec(n, 2, figure=fig, width_ratios=[1,1], hspace=0, wspace=0.3)

    # left: pa vs time, one line per component with asymmetric error bars
    ax_pa = fig.add_subplot(gs[:,0])
    for comp, color in comp_discrete_map.items():
        if comp in grouped_by_comp.groups:
            grp = grouped_by_comp.get_group(comp)

            ax_pa.errorbar(
                grp['mjd'], grp['PA'],
                yerr=[grp['PA_err_minus'], grp['PA_err_plus']],
                marker='.', linestyle='-', capsize=3, color=color, label=comp
            )

    ax_pa.set_ylabel('Position Angle (°)')
    ax_pa.set_xlabel('MJD')
    ax_pa.set_title('Position Angle vs Time')
    ax_pa.set_ylim(-180,180)
    ax_pa.grid(True)
    ax_pa.legend()

    # right: stacked count-rate panels
    # iterate bottom-up so the bottom axis exists first and every panel
    # above it can share its x axis
    ax_bottom = None
    for i_comp, focus in reversed(list(enumerate(comps))):
        if i_comp == n - 1:
            # bottom panel: the only one that shows x tick labels
            ax = fig.add_subplot(gs[i_comp, 1])
            ax.set_xlabel('MJD')
            ax_bottom = ax
        else:
            ax = fig.add_subplot(gs[i_comp, 1], sharex=ax_bottom)
            ax.tick_params(labelbottom=False)
        ax.grid(True, zorder=0)

        # every panel draws all components; only the panel's own (focus)
        # component is highlighted
        for comp_name, current_color in comp_discrete_map.items():
            if comp_name not in df_nom.columns: continue

            # shift x slightly per component so error bars do not overlap
            x_val = df_nom.index + offsets[comp_name]
            y_val = df_nom[comp_name]
            y_err_val = [df_minus[comp_name], df_plus[comp_name]]

            # focus component: opaque, connected, labelled, drawn on top;
            # all others: faded, markers only, no legend entry
            alpha_val, line_style, label_text, z_order = (1.0, '-', comp_name, 10) if comp_name == focus else (0.3, '', None, 1)

            ax.errorbar(
                x_val, y_val, yerr=y_err_val, color=current_color,
                marker='.', linestyle=line_style, capsize=3,
                alpha=alpha_val, label=label_text, zorder=z_order
            )

        # fixed y ticks, identical on every panel
        ax.set_yticks([0.1,0.3])
        if ax.has_data():
             ax.legend(loc='upper left')

    # shared title and y label for the right-hand column, placed in figure
    # coordinates because the panels are separate axes
    fig.text(0.73, 0.885, 'Component Count Rates', ha='center', va='bottom', fontsize=17)
    fig.text(0.495, 0.5, 'Count rate (counts/s)', va='center', rotation='vertical')

    pdf.savefig(fig)
    plt.close(fig)

    # second figure: polar plot of component positions (angle = pa, r = radius)
    fig_polar = plt.figure(figsize=(10, 8))
    ax_polar = fig_polar.add_subplot(111, projection='polar')
    # zero angle at the top ('N'); direction 1 = angles increase counter-clockwise
    ax_polar.set_theta_zero_location('N')
    ax_polar.set_theta_direction(1)
    ax_polar.set_thetamin(-180)
    ax_polar.set_thetamax(180)
    ax_polar.set_rlabel_position(135)

    # plot loop: one error-bar point per row so each can get its own colour
    for comp in comps:
        if comp in grouped_by_comp.groups:
            grp = grouped_by_comp.get_group(comp)
            cmap = comp_cmaps.get(comp, plt.cm.Greys)

            # iterate row by row to color each point by its time
            for _, row in grp.iterrows():
                # map mjd to range 0.4-1.0 of the component colormap to ensure visibility
                normalized_t = time_norm(row['mjd'])
                color_idx = 0.4 + 0.6 * normalized_t
                t_color = cmap(color_idx)

                # pa errors are stored in degrees and converted to radians
                # for the angular axis
                ax_polar.errorbar(
                    row['pa_rad'], row['radius'],
                    xerr=[[np.deg2rad(row['PA_err_minus'])], [np.deg2rad(row['PA_err_plus'])]],
                    yerr=[[row['radius_minus_err']], [row['radius_plus_err']]],
                    marker='.', linestyle='', color=t_color, capsize=2, markersize=8
                )

    # add colorbar
    # NOTE(review): the bar uses the full Greys map, while points use the
    # 0.4-1.0 range of per-component maps, so it conveys the time ordering
    # only, not the exact point colours
    sm = plt.cm.ScalarMappable(cmap=plt.cm.Greys, norm=time_norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax_polar, pad=0.1)
    cbar.set_label('MJD (Light=Early, Dark=Late)')

    # add legend for component colors (one representative shade per colormap)
    legend_elements = []
    for c in comps:
        cm = comp_cmaps.get(c, plt.cm.Greys)
        c_color = cm(0.7)
        legend_elements.append(
            Line2D([0], [0], marker='.', color=c_color, label=c, markerfacecolor=c_color, markersize=8, linestyle='None')
        )

    ax_polar.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.3, 1.1), title="Components")

    # angular grid every 30 degrees from -150 to 180 inclusive
    angles = np.arange(-150, 180, 30)
    angles = np.append(angles, 180)
    ax_polar.set_rmin(0)
    ax_polar.set_thetagrids(angles, [f"{int(a)}°" for a in angles])
    ax_polar.set_title('On Sky Component Positions (arcsec)')

    pdf.savefig(fig_polar)
    plt.close(fig_polar)

print(f"plots saved to: {pdf_filename}")