In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.special import gammainc, gammaincinv, gammaincc, gamma
from scipy.interpolate import interp1d
from pyutils import *
import scipy.integrate as spi

# Input/output locations: the raw halo list lives in `directory`, and the
# HDF5 catalog derived from it is written next to it.
directory = '/mount/sirocco1/imw2293/GROUP_CAT/DATA/POPMOCK/'
input_filename = 'hlist_0.83530.list'
output = f'{directory}smdpl_0.83530.h5'

In [None]:
#scale(0) id(1) desc_scale(2) desc_id(3) num_prog(4) pid(5) upid(6) desc_pid(7) phantom(8) sam_mvir(9) mvir(10) rvir(11) rs(12) vrms(13) mmp?(14) scale_of_last_MM(15) vmax(16) x(17) y(18) z(19) vx(20) vy(21) vz(22) Jx(23) Jy(24) Jz(25) Spin(26) Breadth_first_ID(27) Depth_first_ID(28) Tree_root_ID(29) Orig_halo_ID(30) Snap_num(31) Next_coprogenitor_depthfirst_ID(32) Last_progenitor_depthfirst_ID(33) Last_mainleaf_depthfirst_ID(34) Tidal_Force(35) Tidal_ID(36) Rs_Klypin(37) Mmvir_all(38) M200b(39) M200c(40) M500c(41) M2500c(42) Xoff(43) Voff(44) Spin_Bullock(45) b_to_a(46) c_to_a(47) A[x](48) A[y](49) A[z](50) b_to_a(500c)(51) c_to_a(500c)(52) A[x](500c)(53) A[y](500c)(54) A[z](500c)(55) T/|U|(56) M_pe_Behroozi(57) M_pe_Diemer(58) Macc(59) Mpeak(60) Vacc(61) Vpeak(62) Halfmass_Scale(63) Acc_Rate_Inst(64) Acc_Rate_100Myr(65) Acc_Rate_1*Tdyn(66) Acc_Rate_2*Tdyn(67) Acc_Rate_Mpeak(68) Acc_Log_Vmax_Inst(69) Acc_Log_Vmax_1*Tdyn(70) Mpeak_Scale(71) Acc_Scale(72) First_Acc_Scale(73) First_Acc_Mvir(74) First_Acc_Vmax(75) Vmax\@Mpeak(76) Tidal_Force_Tdyn(77) Log_(Vmax/Vmax_max(Tdyn;Tmpeak))(78) Time_to_future_merger(79) Future_merger_MMP_ID(80) Spin_at_Mpeak_Scale(81)
# Names assigned to the columns of the halo list, in file order.
# The "columns a-b" markers give the zero-based positions of each group.
column_names = [
    # columns 0-8
    'scale', 'ID', 'desc_scale', 'desc_id', 'num_prog',
    'pid', 'upid', 'desc_pid', 'phantom',
    # columns 9-16
    'sam_mvir', 'mvir', 'rvir', 'rs', 'vrms',
    'mmp?', 'scale_of_last_MM', 'vmax',
    # columns 17-26
    'x', 'y', 'z',
    'vx', 'vy', 'vz',
    'Jx', 'Jy', 'Jz',
    'Spin',
    # columns 27-36
    'Breadth_first_ID', 'Depth_first_ID', 'Tree_root_ID',
    'Orig_halo_ID', 'Snap_num',
    'Next_coprogenitor_depthfirst_ID',
    'Last_progenitor_depthfirst_ID',
    'Last_mainleaf_depthfirst_ID',
    'Tidal_Force', 'Tidal_ID',
    # columns 37-42
    'Rs_Klypin', 'Mmvir_all',
    'M200b', 'M200c', 'M500c', 'M2500c',
    # columns 43-50
    'Xoff', 'Voff', 'Spin_Bullock',
    'b_to_a', 'c_to_a',
    'A[x]', 'A[y]', 'A[z]',
    # columns 51-55
    'b_to_a(500c)', 'c_to_a(500c)',
    'A[x](500c)', 'A[y](500c)', 'A[z](500c)',
    # columns 56-62
    'T/|U|', 'M_pe_Behroozi', 'M_pe_Diemer',
    'Macc', 'Mpeak', 'Vacc', 'Vpeak',
    # columns 63-70
    'Halfmass_Scale',
    'Acc_Rate_Inst', 'Acc_Rate_100Myr',
    'Acc_Rate_1tTdyn', 'Acc_Rate_2tTdyn', 'Acc_Rate_Mpeak',
    'Acc_Log_Vmax_Inst', 'Acc_Log_Vmax_1tTdyn',
    # columns 71-78
    'Mpeak_Scale', 'Acc_Scale', 'First_Acc_Scale',
    'First_Acc_Mvir', 'First_Acc_Vmax',
    'Vmax@Mpeak', 'Tidal_Force_Tdyn',
    'Log_(Vmax/Vmax_max(Tdyn;Tmpeak))',
    # columns 79-81
    'Time_to_future_merger', 'Future_merger_MMP_ID', 'Spin_at_Mpeak_Scale',
]

# Columns retained in the final catalog; everything else is dropped to keep
# memory usage and file size down.
columns_to_keep = [
    'ID', 'upid',
    'M200b', 'Mpeak', 'mvir',
    'rvir', 'rs', 'vmax',
    'x', 'y', 'z',
    'vx', 'vy', 'vz',
]

# Since the file is very large, we process it in chunks.
chunk_size = 1000000  # Process 1 million rows at a time.

print(f"Starting to process {input_filename}...")
print(f"Output will be saved to {output}")

# Remove the output file if it already exists to start fresh.
# `output` already includes `directory`, so it must be tested as-is; checking
# `directory+output` never matches and the appends below would then stack
# duplicate rows onto the catalog left behind by a previous run.
if os.path.exists(output):
    os.remove(output)

# Create a reader object to iterate over the file in chunks.
# Lines starting with '#' (the header) are skipped and fields are split on
# runs of whitespace. `usecols` makes the parser materialise only the columns
# that end up in the catalog instead of all of `column_names`.
reader = pd.read_csv(
    directory+input_filename,
    comment='#',
    sep=r'\s+',  # Matches one or more whitespace characters
    header=None,
    names=column_names,
    usecols=columns_to_keep,
    chunksize=chunk_size
)

# HDF5 is a binary format, efficient for storage and fast to read.
# The store is opened once for the whole run rather than once per chunk.
with pd.HDFStore(output, mode='a') as store:
    for i, chunk in enumerate(reader):
        print(f"Processing chunk {i+1}...")

        # `usecols` returns columns in file order; indexing with
        # `columns_to_keep` puts them in the intended catalog order.
        final_chunk = chunk[columns_to_keep]

        # 'format=table' allows for appending and querying; listing every
        # column in `data_columns` makes each of them queryable on disk.
        if not final_chunk.empty:
            store.append(
                'halos',
                final_chunk,
                format='table',
                data_columns=columns_to_keep
            )

print("\nProcessing complete.")
print(f"Filtered halo catalog saved to {output}")

In [None]:
# Load the complete 'halos' table from the HDF5 catalog into memory.
full_df = pd.read_hdf(output, key='halos')

# Keep only the halos whose M200b is at least 1e10.
filtered_df = full_df.loc[full_df['M200b'] >= 1e10]

# Write the reduced catalog to its own HDF5 file, again in table format with
# every column registered as a data column.
small_output = directory + 'smdpl_z0.19717.M1E10.h5'
filtered_df.to_hdf(
    small_output,
    key='halos',
    format='table',
    data_columns=filtered_df.columns,
)

print(f"Filtered catalog saved to {small_output}")

In [None]:
# --- 1. Load the Halo Catalog ---
# The path is spelled out in full, so this cell can be run without the
# catalog-building cells having been executed in the same session.
halo_file = '/mount/sirocco1/imw2293/GROUP_CAT/DATA/POPMOCK/smdpl_z0.19717.M1E10.h5'
halos_df = pd.read_hdf(halo_file, 'halos')
print(f"Loaded {len(halos_df)} halos from {halo_file}")

In [None]:
# --- Cosmology and box volume ---
box_volume = (400.0)**3  # Mpc^3 / h^3

# --- Blanton et al. 2003 LF parameters ---
# This is for k-corrected to z=0.1 and some e-correction applied
M_star = -20.44 # -5log(h)
L_star = abs_mag_r_to_solar_L(M_star)
alpha = -1.05 # faint-end slope
phi_star = 0.0149  # h^3 Mpc^-3 normalization

# `u` is imported here explicitly so that this cell does not depend on the
# star import from pyutils happening to expose astropy.units under that name.
import astropy.units as u
from astropy.modeling.models import Schechter1D
blanton_model = Schechter1D(phi_star * u.Mpc**-3, M_star, alpha)


def _schechter_density(mag):
    """Evaluate `blanton_model` at `mag` and return a plain float.

    The model is built with a unit-carrying normalisation, so its output may
    be a Quantity; scipy's quad needs a bare float, hence the `.value` strip.
    """
    phi = blanton_model(mag)
    return float(getattr(phi, 'value', phi))


def schechter_integral(mag, bright_limit=None):
    """Cumulative number density of galaxies brighter than `mag`.

    Integrates the luminosity function from `bright_limit` up to `mag`.
    Brighter means more negative magnitude, so the cumulative density that
    pairs with "halos more massive than M" runs from the bright end upwards.
    Integrating from `mag` towards +inf instead counts the galaxies fainter
    than `mag`, which does not converge for a faint-end slope alpha < -1.

    Parameters
    ----------
    mag : float
        Upper (faint) integration limit.
    bright_limit : float, optional
        Lower (bright) integration limit. Defaults to ``M_star - 10``, where
        the exponential cut-off makes the integrand negligible.

    Returns
    -------
    float
        Value of the integral, in the units of `phi_star`.
    """
    if bright_limit is None:
        bright_limit = M_star - 10.0
    # epsabs=0 forces a purely relative tolerance: the densities at the bright
    # end are far below quad's default absolute tolerance of ~1.5e-8.
    return spi.quad(
        _schechter_density, bright_limit, mag,
        epsabs=0.0, epsrel=1e-8, limit=200,
    )[0]


vectorized_schechter_integral = np.vectorize(schechter_integral, otypes=[float])

# --- Abundance matching ---
# 1. Sort halos by Mpeak (descending)
halos_df = halos_df.sort_values('Mpeak', ascending=False).reset_index(drop=True)

# 2. Compute cumulative number density for halos: the k-th halo in the sorted
#    list has k halos at or above its Mpeak.
n_halos = (np.arange(len(halos_df)) + 1) / box_volume

# 3. Compute cumulative number density for galaxies (LF grid); this increases
#    monotonically from the bright end of the grid to the faint end.
mag_grid = np.linspace(-24, -14, 500)
n_gal_grid = vectorized_schechter_integral(mag_grid)

# 4. Interpolator: n_gal -> magnitude. Densities below the grid are clamped to
#    the brightest grid magnitude, densities above it to the faintest one.
n_to_mag = interp1d(
    n_gal_grid, mag_grid,
    bounds_error=False,
    fill_value=(mag_grid[0], mag_grid[-1]),
)

# 5. Assign magnitudes to halos (lowest cumulative density -> brightest)
halos_df['M_r'] = n_to_mag(n_halos)

# --- Result ---
print(halos_df[['Mpeak', 'M_r']].head())

In [None]:
# --- 5. Validation Plot: Check the resulting Luminosity Function ---
fig, ax = plt.subplots(figsize=(8, 6))

# Plot the input LF. The model output may carry units; plot the bare numbers
# so both curves share a plain numeric axis.
input_phi = blanton_model(mag_grid)
input_phi = np.asarray(getattr(input_phi, 'value', input_phi))
ax.plot(mag_grid, input_phi, 'r-', label='Input LF (Blanton 2003)')

# Differential LF of the mock: halos per unit volume per magnitude, binned
# over the same magnitude range as the input curve. Halos sitting exactly on
# the faint grid edge were clamped there by the interpolator, so they are
# left out rather than piled into the last bin.
mock_mags = halos_df['M_r'].to_numpy()
mock_mags = mock_mags[mock_mags < mag_grid[-1]]
bin_edges = np.linspace(mag_grid[0], mag_grid[-1], 41)
counts, _ = np.histogram(mock_mags, bins=bin_edges)
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
mock_phi = counts / (box_volume * np.diff(bin_edges))
populated = counts > 0  # empty bins cannot be drawn on a log axis
ax.plot(bin_centers[populated], mock_phi[populated], 'b--', label='Mock Catalog LF')

ax.set_yscale('log')
ax.set_xlabel('$M_r$ -5log(h)')
# Raw string: the label contains LaTeX backslashes, not Python escapes.
ax.set_ylabel(r'$\Phi(M_r^{0.1} -5\log(h))$')
ax.set_title('Luminosity Function: Input vs. Mock Catalog')
ax.legend()
ax.grid(True)
plt.show()