In [None]:
import os
import numpy as np
from astropy.io import fits
from tqdm.auto import tqdm
from multiprocess import Pool

In [None]:
# --- Core Parameters (User Configuration) ---

# The percentage of events to scramble (e.g., 80 for 80%). Must lie in [0, 100].
SCRAMBLE_PERCENT = 50

# Fail fast on an impossible percentage: a value above 100 would otherwise only
# surface later as one error per file inside the worker processes.
if not 0 <= SCRAMBLE_PERCENT <= 100:
    raise ValueError(f"SCRAMBLE_PERCENT must be between 0 and 100, got {SCRAMBLE_PERCENT}")

# The base path where the input simulation data is stored.
# Built relative to the current user's home directory so the notebook is not
# tied to one account name.
user_home = os.path.expanduser("~")
BASE_INPUT_PATH = os.path.join(user_home, 'Library/CloudStorage/Box-Box/IXPE_rmfs/sim_data_mit')


# The base path for the output scrambled data. A subdirectory will be created.
BASE_OUTPUT_PATH = os.path.join(user_home, 'Library/CloudStorage/Box-Box/IXPE_rmfs')

# The column to scramble. Choose 'DETPHI1' or 'DETPHI2' based on your analysis pipeline.
# We'll use DETPHI2 as it's common in recent processing, but verify for your use case.
TARGET_COLUMN = 'DETPHI2'

# The FITS extension containing the event data
EVENTS_EXTENSION = 'EVENTS'

# --- Derived Parameters (Do not change) ---

# Create the full output path including the percentage directory
output_dir_name = f'scrambled_sim_data_{SCRAMBLE_PERCENT}percent'
FULL_OUTPUT_PATH = os.path.join(BASE_OUTPUT_PATH, output_dir_name)

# List of detector units (each is a subdirectory of the input and output paths)
detector_units = ['du1', 'du2', 'du3']

# List of simulation number prefixes: 1000 to 10000 inclusive, in steps of 100
sim_numbers = range(1000, 10001, 100)

print("--- Configuration ---")
print(f"Scrambling Percentage: {SCRAMBLE_PERCENT}%")
print(f"Input Data Path:       {BASE_INPUT_PATH}")
print(f"Output Data Path:      {FULL_OUTPUT_PATH}")
print(f"Target Column:         {TARGET_COLUMN}")
print("--------------------")

# Create the main output directory if it doesn't exist
os.makedirs(FULL_OUTPUT_PATH, exist_ok=True)
print(f"Output directory '{FULL_OUTPUT_PATH}' is ready.")

In [None]:
def scramble_fits_file_fast(input_path, output_path, scramble_percent, column_name):
    """
    Scramble a random subset of one column and write the result to a new file.

    A fraction ``scramble_percent`` of the rows in the event table is chosen
    at random (without replacement) and the value of ``column_name`` in those
    rows is replaced by a float32 drawn uniformly from [-pi, pi). The modified
    HDU list is written to ``output_path``, overwriting any existing file.

    The input file is opened read-only, so the edits exist only in memory and
    the original file on disk is never altered. (Opening it with
    ``mode='update'`` would flush the scrambled values back into the input
    file when the ``with`` block closes.)

    Parameters
    ----------
    input_path : str
        Path of the FITS file to read.
    output_path : str
        Path of the FITS file to create.
    scramble_percent : float
        Percentage (0-100) of events whose column value is replaced. The
        number of rows is truncated to an integer.
    column_name : str
        Name of the table column to scramble.

    Returns
    -------
    str
        A status line starting with ``"Success:"`` or ``"ERROR"``. Errors are
        reported through the return value instead of being raised so that one
        bad file does not abort a parallel run.

    Notes
    -----
    The event table is looked up via the module-level ``EVENTS_EXTENSION``.
    """
    try:
        # Default (read-only) mode: in-memory changes are never written back
        # to input_path; only the explicit writeto() below touches the disk.
        with fits.open(input_path) as hdul:
            event_data = hdul[EVENTS_EXTENSION].data
            n_events = len(event_data)
            n_to_scramble = int(n_events * (scramble_percent / 100.0))

            if n_events == 0:
                status = f"Success: {os.path.basename(input_path)} (copied empty)"
            elif n_to_scramble == 0:
                status = f"Success: {os.path.basename(input_path)} (copied original)"
            else:
                # A fresh generator per call is seeded from OS entropy, so
                # worker processes that were forked from the same parent do
                # not replay one shared global random state.
                rng = np.random.default_rng()
                scramble_indices = rng.choice(n_events, size=n_to_scramble, replace=False)
                random_phis = rng.uniform(-np.pi, np.pi, n_to_scramble).astype(np.float32)

                event_data[column_name][scramble_indices] = random_phis
                status = f"Success: {os.path.basename(input_path)}"

            # Every branch produces an output file, scrambled or not.
            hdul.writeto(output_path, overwrite=True)
            return status

    except FileNotFoundError:
        return f"ERROR: File not found at {input_path}"
    except Exception as e:
        return f"ERROR processing {os.path.basename(input_path)}: {e}"

# --- MAIN PARALLEL PROCESSING LOOP ---

# Step 1: Create a list of all tasks to be done.
# Each task is the positional-argument tuple for scramble_fits_file_fast:
# (input_path, output_path, scramble_percent, column_name).
tasks = []
print("Preparing list of files to process...")
for du in detector_units:
    # One output subdirectory per detector unit.
    du_output_path = os.path.join(FULL_OUTPUT_PATH, du)
    os.makedirs(du_output_path, exist_ok=True)

    for num in sim_numbers:
        filename_base = f"sim_{num:05d}_pol_recon"
        input_filename = f"{filename_base}.fits"
        output_filename = f"{filename_base}_scrambled_{SCRAMBLE_PERCENT}.fits"

        input_filepath = os.path.join(BASE_INPUT_PATH, du, input_filename)
        output_filepath = os.path.join(du_output_path, output_filename)

        tasks.append((input_filepath, output_filepath, SCRAMBLE_PERCENT, TARGET_COLUMN))

# Step 2: Run the tasks in parallel.
# This guard is still essential for safety, especially on macOS and Windows.
if __name__ == '__main__':
    print(f"Starting parallel processing of {len(tasks)} files using dill backend...")

    # Pool comes from the 'multiprocess' library so that a function defined in
    # a notebook cell can be shipped to the worker processes.
    with Pool() as pool:
        # Submit everything up front, then collect inside the progress bar.
        # pool.starmap() would block until ALL tasks are done, so wrapping its
        # return value in tqdm only shows a bar that jumps straight to 100%.
        pending = [pool.apply_async(scramble_fits_file_fast, task) for task in tasks]
        # Results are collected in submission order, so `results` lines up
        # with `tasks`. get() must run while the pool is still alive.
        results = [job.get() for job in tqdm(pending, total=len(pending))]

    print("\n--- Processing Complete ---")
    # Step 3: Report any errors.
    # The worker returns status strings; failures are prefixed with "ERROR".
    errors = [res for res in results if res.startswith("ERROR")]
    if errors:
        print(f"\nEncountered {len(errors)} errors during processing:")
        for err in errors:
            print(f"- {err}")
    else:
        print("\n✅ All files processed successfully with no errors.")

In [None]:
import os
import glob
import numpy as np
from astropy.io import fits
from tqdm.auto import tqdm

# ===================================================================
# --- 1. CONFIGURATION ---
# ===================================================================

# The number of photons to randomly sample from EACH energy file.
# A larger number gives better statistics but results in a larger final file.
# 5000 is a reasonable starting point.
N_PHOTONS_PER_FILE = 5000

# The base path where your data folders are located
BASE_PATH = os.path.join(os.path.expanduser("~"), 'Library/CloudStorage/Box-Box/IXPE_rmfs')

# Scramble percentage of the data set to sample from. Both the input directory
# name and the per-file name suffix are derived from this single value so they
# cannot drift apart.
# NOTE(review): this is set independently of the scrambling step's own
# percentage setting; make sure it names a data set that actually exists.
INPUT_SCRAMBLE_PERCENT = 80

# The specific input directory for the scrambled data
INPUT_DIR = f"scrambled_sim_data_{INPUT_SCRAMBLE_PERCENT}percent"

# Where to save the final combined FITS files
OUTPUT_DIR = os.path.join(BASE_PATH, "combined_events_from_scrambled")

# Define the energy ranges (in simulation units, e.g., 1000 for 1.0 keV)
# for each detector unit. The upper bound is exclusive.
ENERGY_RANGES = {
    'du1': range(1000, 10001, 100), # 1.0 to 10.0 keV
    'du2': range(1000, 10001, 100), # 1.0 to 10.0 keV
    'du3': range(1000, 5001, 100),  # 1.0 to 5.0 keV
}

# The FITS extension containing the event data
EVENTS_EXTENSION = 'EVENTS'

# --- End of Configuration ---

# Automatically create the output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f" Configuration set. Final files will be saved in: {OUTPUT_DIR}")


# ===================================================================
# --- 2. MAIN PROCESSING LOOP ---
# ===================================================================

# Loop over each detector unit defined in the energy ranges
for du_name in ENERGY_RANGES.keys():
    print(f"\n--- Processing {du_name} ---")

    # List to hold the sampled data from all files for this DU
    all_sampled_events = []
    template_header = None # To store the header from the first file
    missing_files = []     # Expected files that were not found on disk

    # Construct the path to the current DU's data
    du_path = os.path.join(BASE_PATH, INPUT_DIR, du_name)
    if not os.path.isdir(du_path):
        print(f" Warning: Directory not found, skipping: {du_path}")
        continue

    # Loop through each energy file for the current DU
    energy_range = ENERGY_RANGES[du_name]
    for sim_num in tqdm(energy_range, desc=f"Sampling {du_name} files"):

        # Construct the expected filename
        filename = f"sim_{sim_num:05d}_pol_recon_scrambled_{INPUT_SCRAMBLE_PERCENT}.fits"
        filepath = os.path.join(du_path, filename)

        if not os.path.exists(filepath):
            # Remember the gap so it can be reported after the loop instead of
            # silently thinning out the combined sample.
            missing_files.append(filename)
            continue

        try:
            # Open the FITS file
            with fits.open(filepath) as hdul:
                event_data = hdul[EVENTS_EXTENSION].data

                # If this is the first valid file, grab its header as a template.
                # Copy it so the template does not hold on to an HDU of a file
                # that is about to be closed.
                if template_header is None:
                    template_header = hdul[EVENTS_EXTENSION].header.copy()

                # Determine how many events to sample
                n_events_in_file = len(event_data)
                if n_events_in_file == 0:
                    continue # Skip empty files

                # Files with fewer events than requested contribute all of them.
                n_to_sample = min(n_events_in_file, N_PHOTONS_PER_FILE)

                # Randomly select row indices without replacement
                sample_indices = np.random.choice(n_events_in_file, size=n_to_sample, replace=False)

                # Append the sampled event rows to our master list
                all_sampled_events.append(event_data[sample_indices])

        except Exception as e:
            print(f" Error processing file {filename}: {e}")

    if missing_files:
        print(f" Warning: {len(missing_files)} of {len(energy_range)} expected files "
              f"were missing for {du_name} (first: {missing_files[0]})")

    # After checking all files for the DU, combine the sampled events
    if not all_sampled_events:
        print(f" No events were sampled for {du_name}. No output file will be created.")
        continue

    # Concatenate all the data chunks into a single large table
    print(f"Concatenating sampled events for {du_name}...")
    final_event_table = np.concatenate(all_sampled_events)
    total_events = len(final_event_table)
    print(f"Total events in combined file for {du_name}: {total_events:,}")

    # Create a new FITS file for the combined data
    primary_hdu = fits.PrimaryHDU() # A minimal primary HDU
    events_hdu = fits.BinTableHDU(data=final_event_table, header=template_header)

    hdul_out = fits.HDUList([primary_hdu, events_hdu])

    # Define the output path and save the file
    output_filename = f"{du_name}_combined_sampled_events.fits"
    output_filepath = os.path.join(OUTPUT_DIR, output_filename)

    hdul_out.writeto(output_filepath, overwrite=True)
    hdul_out.close()

    print(f" Successfully created: {output_filepath}")

print("\n--- All processing complete! ---")

In [None]:
import numpy as np
from astropy.io import fits
from tqdm.auto import tqdm
import os

# ===================================================================
# --- 3. ADD PI COLUMN TO COMBINED FILES ---
# ===================================================================

# This script uses the OUTPUT_DIR and ENERGY_RANGES defined in the previous cells.
print(f"Adding 'PI' column to combined files in: {OUTPUT_DIR}")

# Conversion factor from PHA (ADC counts) to PI channels.
# PI = PHA / (3000 ADC/keV * 0.04 keV/PI_channel) = PHA / 120
pha2pi = 1.0 / (3000.0 * 0.04)

# Loop over each detector unit we've processed
for du_name in tqdm(ENERGY_RANGES.keys(), desc="Processing DUs"):

    # Construct the path to the combined file for this DU
    combined_filename = f"{du_name}_combined_sampled_events.fits"
    filepath = os.path.join(OUTPUT_DIR, combined_filename)

    if not os.path.exists(filepath):
        print(f"⚠️ Warning: File not found, skipping: {filepath}")
        continue

    try:
        # Open the FITS file in 'update' mode to allow for changes
        with fits.open(filepath, mode='update') as hdul:
            events_hdu = hdul[EVENTS_EXTENSION]

            # Get the original table columns
            original_cols = events_hdu.columns

            # Make the cell safe to re-run: if the file already carries a PI
            # column, appending another one would create a duplicate name.
            existing_names = {name.upper() for name in original_cols.names}
            if 'PI' in existing_names:
                print(f"ℹ️ 'PI' column already present in {combined_filename}, skipping.")
                continue

            # --- 1. Perform Calculation ---
            pha_data = events_hdu.data['PHA'].astype(float)
            pi_data = pha_data * pha2pi

            # --- 2. Add the New Column ---
            # Create a new FITS Column object for our PI data
            # Format 'E' is for a standard 32-bit floating-point number
            pi_col = fits.Column(name='PI', format='E', array=pi_data)

            # Create a new column definition object by adding our new column
            new_cols = original_cols + pi_col

            # --- 3. Replace the Old Table with the New One ---
            # Create a new table HDU from the new columns, preserving the original header
            new_hdu = fits.BinTableHDU.from_columns(new_cols, header=events_hdu.header)

            # Replace the old EVENTS HDU in the file with our new one
            hdul[EVENTS_EXTENSION] = new_hdu

        # The changes are written when the 'with' block closes, so success is
        # only reported once that has happened without raising.
        print(f"✅ Successfully added 'PI' column to {combined_filename}")

    except Exception as e:
        print(f"❌ ERROR processing {combined_filename}: {e}")

print("\n--- PI column processing complete! ---")

In [None]:
import numpy as np
from astropy.io import fits

# --- 1. Configuration ---
# Detector unit whose combined event file is converted by this cell.
DU_NAME = 'du1'

# Directory holding the combined event files. Built from the current user's
# home directory (like the other cells) instead of a hard-coded account path.
COMBINED_EVENTS_DIR = os.path.join(
    os.path.expanduser("~"),
    'Library/CloudStorage/Box-Box/IXPE_rmfs/combined_events_from_scrambled',
)
INPUT_FITS_FILE = os.path.join(COMBINED_EVENTS_DIR, f'{DU_NAME}_combined_sampled_events.fits')
OUTPUT_FITS_FILE = os.path.join(COMBINED_EVENTS_DIR, f'{DU_NAME}_streamlined_events.fits')

# IXPE detector pixel size in mm
MM_PER_PIXEL = 0.050

# The FITS extension containing the event data
EVENTS_EXTENSION = 'EVENTS'

print(f"Reading input file: {INPUT_FITS_FILE}")

# --- 2. Read Source Data ---
try:
    with fits.open(INPUT_FITS_FILE) as hdul:
        source_data = hdul[EVENTS_EXTENSION].data
except FileNotFoundError:
    print(f"❌ ERROR: Input file not found at '{INPUT_FITS_FILE}'. Please check the path.")
    # Exit gracefully if in a script, or just stop if in a notebook
    # import sys; sys.exit()
except (KeyError, IndexError):
    print(f"❌ ERROR: Could not find the '{EVENTS_EXTENSION}' extension in the FITS file.")
    # import sys; sys.exit()
else:
    # Only reached when the table was read successfully.
    # --- 3. Create New Column Data from Source ---
    print("Processing columns...")

    # Direct copies
    trg_id_new = source_data['TRG_ID']
    time_new = source_data['TIME']
    status_new = source_data['STATUS']
    status2_new = source_data['STATUS2']

    # Copy and typecast PI to integer (the cast drops the fractional part)
    pi_new = source_data['PI'].astype('int32')

    # Calculate W_MOM = (M2L - M2T) / (M2L + M2T), handling potential division by zero
    m2l = source_data['TRK_M2L']
    m2t = source_data['TRK_M2T']
    denominator = m2l + m2t
    # Use np.divide to safely handle division by zero, filling with 0.0:
    # entries where the denominator is 0 keep the value from the zeroed `out`.
    w_mom_new = np.divide(m2l - m2t, denominator, out=np.zeros_like(denominator, dtype=float), where=denominator!=0)

    # Calculate POS(X,Y) in pixels and stack into a 2D array
    pos_x_new = source_data['ABSX'] / MM_PER_PIXEL
    pos_y_new = source_data['ABSY'] / MM_PER_PIXEL
    pos_new = np.column_stack((pos_x_new, pos_y_new))

    # --- 4. Define the Columns for the New FITS Table ---
    # This list defines the structure of the new file.
    # FITS format codes: J=Int32, D=Real8, 2X=2-element Bit Array, E=Real4, 2E=2-element Real4 vector
    new_columns = [
        fits.Column(name='TRG_ID',    format='J',  array=trg_id_new),
        fits.Column(name='TIME',      format='D',  unit='s', array=time_new),
        fits.Column(name='STATUS',    format='2X', array=status_new),
        fits.Column(name='STATUS2',   format='2X', array=status2_new),
        fits.Column(name='PI',        format='J',  unit='chan', array=pi_new),
        fits.Column(name='W_MOM',     format='E',  array=w_mom_new),
        fits.Column(name='POS',       format='2E', unit='pixel', array=pos_new)
    ]

    # --- 5. Create and Save the New FITS File ---
    # Create the new binary table HDU from our defined columns, named after
    # the same extension constant that was used to read the input.
    new_table_hdu = fits.BinTableHDU.from_columns(new_columns, name=EVENTS_EXTENSION)

    # Create a minimal primary HDU
    primary_hdu = fits.PrimaryHDU()

    # Combine into a final HDU list
    final_hdul = fits.HDUList([primary_hdu, new_table_hdu])

    # Write to disk, overwriting if the file already exists
    final_hdul.writeto(OUTPUT_FITS_FILE, overwrite=True)
    final_hdul.close()

    print(f"\n✨ Successfully created streamlined FITS file: {OUTPUT_FITS_FILE}")
    print(f"   - Total Events: {len(trg_id_new):,}")