# 00 - Data Preparation

This notebook loads and preprocesses all data files used in the SFH analysis.

In [None]:
import sys
import numpy as np
import os

# Add src to path
sys.path.append('../src')

from utils.data_processing import (
    load_iyer_data, 
    load_autoencoder_data,
    apply_mass_cuts,
    remove_zero_sfhs,
    prepare_umap_data
)
from utils.analysis import sim_name

## Load Raw Data

## Data Sources:
1. **Iyer et al. 2020 SFH Data** - Star formation histories from multiple simulations
2. **Autoencoder Results** - Predictions from multimodal autoencoder analysis

In [2]:
# Load Iyer et al. 2020 simulation data
print("Loading Iyer et al. 2020 simulation data...")
sim_data_raw = load_iyer_data()

print("\nSimulation summary:")
for sim in sim_name:
    if sim_data_raw[sim] is not None:
        ngal = sim_data_raw[sim]['ngal']
        print(f"{sim:>15s}: {ngal:>8,d} galaxies")
    else:
        print(f"{sim:>15s}: No data")

Loading Iyer et al. 2020 simulation data...
Loaded EAGLE data: 16710 galaxies
Loaded Illustris data: 19354 galaxies
Loaded IllustrisTNG data: 12220 galaxies
Loaded Mufasa data: 4574 galaxies
Loaded Simba data: 4938 galaxies
Loaded SC-SAM data: 18722 galaxies
Loaded UniverseMachine data: 75696 galaxies

Simulation summary:
          EAGLE:   16,710 galaxies
      Illustris:   19,354 galaxies
   IllustrisTNG:   12,220 galaxies
         Mufasa:    4,574 galaxies
          Simba:    4,938 galaxies
         SC-SAM:   18,722 galaxies
UniverseMachine:   75,696 galaxies


In [3]:
# Load and process zoom simulation data  
print("Loading zoom simulation data...")
zoom_data_raw = load_iyer_data(sim_name_list=['FIRE-2', 'g14', 'Marvel_JL'])

Loading zoom simulation data...
Loaded FIRE-2 data: 22 galaxies
Loaded g14 data: 8 galaxies
Loaded Marvel_JL data: 5 galaxies


In [4]:
# Load autoencoder results
print("Loading autoencoder results...")
ae_data = load_autoencoder_data()

print("\nAutoencoder data summary:")
for key, data in ae_data.items():
    print(f"{key:>20s}: {data.shape}")

Loading autoencoder results...
Loaded predictions_sfh.npy: shape (63083, 139)
Loaded predictions_sfh_w.npy: shape (63083, 139)
Loaded predictions_sfr.npy: shape (63083, 2)
Loaded predictions_sfr_w.npy: shape (63083, 2)
Loaded predictions_sim.npy: shape (63083, 7)
Loaded predictions_sim_w.npy: shape (63083, 7)

Autoencoder data summary:
     predictions_sfh: (63083, 139)
   predictions_sfh_w: (63083, 139)
     predictions_sfr: (63083, 2)
   predictions_sfr_w: (63083, 2)
     predictions_sim: (63083, 7)
   predictions_sim_w: (63083, 7)


## Apply Data Processing Steps

1. Apply mass cuts
2. Remove zero SFHs
3. Normalize/interpolate SFHs (prepare data for UMAP analysis)

In [5]:
# Step 1: Apply mass cuts
# Default: 10^10 for Mufasa, Simba; 10^9 for everything else
print("Applying mass cuts...")
sim_data_cut = apply_mass_cuts(sim_data_raw)

Applying mass cuts...
EAGLE: 7482/16710 galaxies after M* > 1e+09 cut
Illustris: 19354/19354 galaxies after M* > 1e+09 cut
IllustrisTNG: 12220/12220 galaxies after M* > 1e+09 cut
Mufasa: 1900/4574 galaxies after M* > 1e+10 cut
Simba: 1982/4938 galaxies after M* > 1e+10 cut
SC-SAM: 12821/18722 galaxies after M* > 1e+09 cut
UniverseMachine: 7361/75696 galaxies after M* > 1e+09 cut


In [6]:
# Step 2: Remove zero SFHs
print("\nRemoving zero star formation histories...")
sim_data_clean = remove_zero_sfhs(sim_data_cut)

# Process zoom data (no mass cuts for zooms)
zoom_data_clean = remove_zero_sfhs(zoom_data_raw)


Removing zero star formation histories...
EAGLE: 7445/7482 galaxies after removing zero SFHs
Illustris: 19354/19354 galaxies after removing zero SFHs
IllustrisTNG: 12220/12220 galaxies after removing zero SFHs
Mufasa: 1900/1900 galaxies after removing zero SFHs
Simba: 1982/1982 galaxies after removing zero SFHs
SC-SAM: 12821/12821 galaxies after removing zero SFHs
UniverseMachine: 7361/7361 galaxies after removing zero SFHs
FIRE-2: 22/22 galaxies after removing zero SFHs
g14: 8/8 galaxies after removing zero SFHs
Marvel_JL: 5/5 galaxies after removing zero SFHs


In [7]:
# Step 3: Prepare normalized data for UMAP analysis (includes interpolation)
print("\nPreparing data for UMAP analysis...")
combined_sfh, combined_labels = prepare_umap_data(sim_data_clean, 
                                                  normalize=True, 
                                                  remove_zeros=False)  # Already removed


Preparing data for UMAP analysis...
Combined data: 63083 total galaxies with 136 time bins


In [8]:
# Prepare zoom SFH data for UMAP transform using same function
zoom_sfh, zoom_labels = prepare_umap_data(zoom_data_clean, 
                                          normalize=True, 
                                          remove_zeros=False)  # Already removed

if len(zoom_sfh) > 0:
    pass
else:
    print("No zoom simulation data loaded")
    zoom_sfh = None

Combined data: 35 total galaxies with 136 time bins


## UMAP Analysis

Generate UMAP embedding for reproducible figure generation.

In [9]:
zoom_data_raw

{'FIRE-2': {'sfh_raw': array([[0.00000000e+00, 0.00000000e+00, 5.02304718e-02, ...,
          1.24570131e+01, 1.38750256e+01, 1.25251132e+01],
         [0.00000000e+00, 0.00000000e+00, 2.63193534e-02, ...,
          1.23768194e+01, 1.35059587e+01, 1.18102306e+01],
         [0.00000000e+00, 0.00000000e+00, 7.30348821e-04, ...,
          3.44641474e-01, 2.51629826e-01, 8.84574390e-03],
         ...,
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          1.86151800e-04, 1.71702150e-05, 7.92247623e-05],
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          1.15560574e-02, 3.06208655e-04, 2.64865217e-04],
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          7.14430027e-04, 2.90104003e-03, 2.42536889e-03]], shape=(22, 137)),
  'times': array([[ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,
           1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,
           2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  

In [10]:
# Generate UMAP embedding
import umap
from sklearn.preprocessing import MinMaxScaler
import pickle

print("Generating UMAP embedding...")

# Create UMAP reducer (with combined data from all simulations + zooms)
reducer = umap.UMAP(random_state=13)
embedding_unscaled = reducer.fit_transform(np.vstack([combined_sfh, zoom_sfh]) 
                                           if zoom_sfh is not None else combined_sfh)

# Normalize to [0, 10] range as in original
scaler = MinMaxScaler(feature_range=(0, 10))
embedding = scaler.fit_transform(embedding_unscaled)

print(f"UMAP embedding shape: {embedding.shape}")

# Split up into main sims and zooms
if zoom_sfh is not None:
    zoom_embedding = embedding[combined_sfh.shape[0]:]
    embedding = embedding[:combined_sfh.shape[0]]

Generating UMAP embedding...


  warn(


UMAP embedding shape: (63118, 2)


## Save All Processed Data

Save the processed data and UMAP embeddings for use in subsequent notebooks.

In [11]:
# Create figures directory if it doesn't exist
os.makedirs('../figures', exist_ok=True)

# Save simulation data
with open('../data/sim_data_raw.pkl', 'wb') as f:
    pickle.dump(sim_data_raw, f)
with open('../data/sim_data_processed.pkl', 'wb') as f:
    pickle.dump(sim_data_clean, f)

# Save zoom simulation data
with open('../data/zoom_data_raw.pkl', 'wb') as f:
    pickle.dump(zoom_data_raw, f)
with open('../data/zoom_data_processed.pkl', 'wb') as f:
    pickle.dump(zoom_data_clean, f)

# Save combined arrays for UMAP analysis
np.save('../data/combined_sfh.npy', combined_sfh)
np.save('../data/combined_labels.npy', combined_labels)

# Save UMAP embeddings and models for reproducibility
np.save('../data/umap_embedding.npy', embedding)
if zoom_embedding is not None:
    np.save('../data/zoom_umap_embedding.npy', zoom_embedding)

# Save UMAP parameters and models
umap_data = {
    'reducer': reducer,
    'scaler': scaler,
    'random_state': 13,
    'feature_range': (0, 10)
}
with open('../data/umap_models.pkl', 'wb') as f:
    pickle.dump(umap_data, f)

print("All processed data saved successfully!")
print("\nFiles created:")
print("- ../data/sim_data_processed.pkl")
print("- ../data/zoom_data_processed.pkl") 
print("- ../data/combined_sfh.npy")
print("- ../data/combined_labels.npy")
print("- ../data/umap_embedding.npy")
if zoom_embedding is not None:
    print("- ../data/zoom_umap_embedding.npy")
print("- ../data/umap_models.pkl")

All processed data saved successfully!

Files created:
- ../data/sim_data_processed.pkl
- ../data/zoom_data_processed.pkl
- ../data/combined_sfh.npy
- ../data/combined_labels.npy
- ../data/umap_embedding.npy
- ../data/zoom_umap_embedding.npy
- ../data/umap_models.pkl


In [12]:
# Summary statistics
print("Final galaxy counts by simulation:")
print("-" * 40)

for sim_idx, sim in enumerate(sim_name):
    if sim_data_clean[sim] is not None:
        sim_mask = combined_labels == sim_idx
        count = np.sum(sim_mask)
        percentage = 100 * count / len(combined_labels)
        print(f"{sim:>15s}: {count:>8,d} galaxies ({percentage:5.1f}%)")
    
print("-" * 40)
print(f"{'Total':>15s}: {len(combined_labels):>8,d} galaxies")

if zoom_embedding is not None:
    print(f"\nZoom simulations: {len(zoom_embedding):,} galaxies")

Final galaxy counts by simulation:
----------------------------------------
          EAGLE:    7,445 galaxies ( 11.8%)
      Illustris:   19,354 galaxies ( 30.7%)
   IllustrisTNG:   12,220 galaxies ( 19.4%)
         Mufasa:    1,900 galaxies (  3.0%)
          Simba:    1,982 galaxies (  3.1%)
         SC-SAM:   12,821 galaxies ( 20.3%)
UniverseMachine:    7,361 galaxies ( 11.7%)
----------------------------------------
          Total:   63,083 galaxies

Zoom simulations: 35 galaxies
