# 01 - Data Preparation

This notebook loads and preprocesses all data files used in the star formation history (SFH) analysis.

## Data Sources:
1. **Iyer et al. 2020 SFH Data** - Star formation histories from multiple simulations
2. **Autoencoder Results** - Predictions from multimodal autoencoder analysis

## Processing Steps:
1. Load raw simulation data
2. Apply mass cuts
3. Remove zero SFHs
4. Normalize SFHs
5. Prepare data for UMAP analysis

In [None]:
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np

# Add src to path
sys.path.append('../src')

from utils.data_processing import (
    load_iyer_data,
    load_autoencoder_data,
    apply_mass_cuts,
    remove_zero_sfhs,
    normalize_sfh,
    prepare_umap_data
)
from utils.analysis import sim_name, times

%matplotlib inline

## Load Raw Data

In [None]:
# Read the Iyer et al. 2020 simulation data via the project loader.
print("Loading Iyer et al. 2020 simulation data...")
sim_data_raw = load_iyer_data()

# Report the galaxy count per simulation; simulations whose entry is None
# are reported as having no data.
print("\nSimulation summary:")
for sim in sim_name:
    sim_entry = sim_data_raw[sim]
    if sim_entry is None:
        print(f"{sim:>15s}: No data")
        continue
    ngal = sim_entry['ngal']
    print(f"{sim:>15s}: {ngal:>8,d} galaxies")

In [None]:
# Read the autoencoder results via the project loader.
print("Loading autoencoder results...")
ae_data = load_autoencoder_data()

# List each entry together with its `.shape`
# (assumes every value is array-like -- TODO confirm against the loader).
print("\nAutoencoder data summary:")
for entry_name, entry_values in ae_data.items():
    print(f"{entry_name:>20s}: {entry_values.shape}")

## Apply Data Processing Steps

In [None]:
# Apply mass cuts to the raw simulation data.
# apply_mass_cuts is called with its default thresholds, which per the
# original note are 10^10 for Mufasa and Simba and 10^9 for everything else
# (NOTE(review): confirm values and units in utils.data_processing).
print("Applying mass cuts...")
sim_data_cut = apply_mass_cuts(sim_data_raw)

In [None]:
# Remove zero SFHs from the mass-cut sample.
# The result is stored under a new name so the mass-cut data stays available.
print("\nRemoving zero star formation histories...")
sim_data_clean = remove_zero_sfhs(sim_data_cut)

In [None]:
# Build the combined, normalized SFH array and the matching label array
# for the UMAP analysis.
print("\nPreparing data for UMAP analysis...")
umap_options = {
    'normalize': True,
    'remove_zeros': False,  # zero SFHs were already removed in the previous step
}
combined_sfh, combined_labels = prepare_umap_data(sim_data_clean, **umap_options)

n_galaxies = len(combined_sfh)
n_time_bins = combined_sfh.shape[1]
print(f"\nFinal dataset: {n_galaxies:,} galaxies with {n_time_bins} time bins")

## Summarize and Save Processed Data

Report the final galaxy counts per simulation, then save the processed data for use in subsequent notebooks.

In [None]:
# Per-simulation galaxy counts in the final combined dataset.
divider = "-" * 40
n_total = len(combined_labels)

print("Final galaxy counts by simulation:")
print(divider)

for sim_idx, sim in enumerate(sim_name):
    # Simulations without data contribute no rows, so skip them.
    if sim_data_clean[sim] is None:
        continue
    # A galaxy is attributed to this simulation when its label equals the
    # simulation's position in sim_name.
    count = np.sum(combined_labels == sim_idx)
    percentage = 100 * count / n_total
    print(f"{sim:>15s}: {count:>8,d} galaxies ({percentage:5.1f}%)")

print(divider)
print(f"{'Total':>15s}: {n_total:>8,d} galaxies")

In [None]:
# Save the processed outputs for use in subsequent notebooks.
# `pickle` is imported with the other imports at the top of the notebook.

# Make sure the output directory exists so the writes below cannot fail with
# FileNotFoundError on a fresh checkout.
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

sim_data_path = f'{data_dir}/sim_data_processed.pkl'
combined_sfh_path = f'{data_dir}/combined_sfh.npy'
combined_labels_path = f'{data_dir}/combined_labels.npy'

# Save processed simulation data.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open(sim_data_path, 'wb') as f:
    pickle.dump(sim_data_clean, f)

# Save combined arrays for UMAP analysis
np.save(combined_sfh_path, combined_sfh)
np.save(combined_labels_path, combined_labels)

# Build the report from the paths actually written so it cannot drift from
# what this cell saves.
print("Processed data saved successfully!")
print("\nFiles created:")
for saved_path in (sim_data_path, combined_sfh_path, combined_labels_path):
    print(f"- {saved_path}")

## Create Output Directories and Re-save Processed Data

Ensure the figures directory exists and save the processed data for use in subsequent notebooks.

In [None]:
# NOTE(review): the preceding save cell writes these same three files; this
# cell repeats the save after making sure the output directories exist.
# Consider merging the two cells.
# `pickle` is imported with the other imports at the top of the notebook.

# Create the figures and data directories if they don't exist
figures_dir = '../figures'
data_dir = '../data'
os.makedirs(figures_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

sim_data_path = f'{data_dir}/sim_data_processed.pkl'
combined_sfh_path = f'{data_dir}/combined_sfh.npy'
combined_labels_path = f'{data_dir}/combined_labels.npy'

# Save processed simulation data.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open(sim_data_path, 'wb') as f:
    pickle.dump(sim_data_clean, f)

# Save combined arrays for UMAP analysis
np.save(combined_sfh_path, combined_sfh)
np.save(combined_labels_path, combined_labels)

# Report only the files this cell actually wrote. The previous version also
# listed ../figures/sample_sfhs_normalized.png, but no cell in this notebook
# creates that figure, so the line was misleading and has been dropped.
print("Processed data saved successfully!")
print("\nFiles created:")
for saved_path in (sim_data_path, combined_sfh_path, combined_labels_path):
    print(f"- {saved_path}")