In [1]:
import numpy as np
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u


In [11]:
def main_sequence_logSFR(logm, z):
    """Calculate main sequence SFR"""
    a = 0.84 - 0.026*z
    b = (0.11*z - 6.51) + 0.7
    return a*(logm-10.5) + b


def select_galaxy_pairs(catalog_file, z_max=3.0, sep_max=50, delta_v_max=500, ms_tolerance=0.5):
    """
    Select physically connected galaxy pairs and their control sample
    
    Parameters:
    -----------
    catalog_file : str
        Path to the input galaxy catalog
    z_max : float
        Maximum redshift to consider (updated for COSMOS-Web sample)
    sep_max : float
        Maximum projected separation in kpc
    delta_v_max : float
        Maximum velocity difference in km/s
    """
    # Read the galaxy catalog
    df = pd.read_csv(catalog_file)
    # df.info()
    
    # Calculate distance from main sequence
    print('main sequence calculation')
    df['logSFR'] = np.log10(df['sfr'])
    df['logm'] = df['logm']  # Already in log
    df['MS_SFR'] = main_sequence_logSFR(df['logm'], df['z'])
    df['delta_MS'] = df['logSFR'] - df['MS_SFR']

    print('main sequence calculation done')
    
    # Filter for main sequence galaxies
    # ms_mask = np.abs(df['delta_MS']) < ms_tolerance
    # df = df[ms_mask].copy()
    # df.info()
    # Rename columns to match the function expectations

    print('KKK')

    # Convert log mass to linear mass
    #df['mass'] = 10**df['logm']
    
    # Convert coordinates to SkyCoord with explicit units
    try:
        coords = SkyCoord(ra=df['ra'].values*u.degree, 
                         dec=df['dec'].values*u.degree, 
                         frame='icrs')
    except KeyError:
        # Try alternative column names
        coords = SkyCoord(ra=df['RA_deg'].values*u.degree, 
                         dec=df['Dec_deg'].values*u.degree, 
                         frame='icrs')
     
    # Initialize lists to store pair indices
    paired_galaxies = []
    
    # Find pairs
    for i in range(len(df)):
        # Skip if outside redshift range
        if df.iloc[i]['z'] > z_max:
            continue
            
        # Calculate angular separations to all other galaxies
        seps = coords[i].separation(coords)
        
        # Convert angular separation to projected distance at galaxy redshift
        # Assuming Planck18 cosmology
        kpc_per_arcsec = 8.558 * df.iloc[i]['z']  # Approximate conversion at z~2
        proj_sep_kpc = seps.arcsec * kpc_per_arcsec
        
        # Calculate velocity differences
        v1 = df.iloc[i]['z'] * 299792.458  # Convert redshift to velocity
        v2 = df['z'] * 299792.458
        delta_v = np.abs(v1 - v2)
        
        # Apply selection criteria
        mask = (proj_sep_kpc < sep_max) & (delta_v < delta_v_max) & (df['z'] < z_max)
        
        # Store valid pairs
        pairs = np.where(mask)[0]
        for j in pairs:
            if i < j:  # Avoid duplicate pairs
                paired_galaxies.append((i, j))
    
    # Select first 10 pairs
    selected_pairs = paired_galaxies[:10]
    
    # Select control sample
    control_sample = []
    paired_indices = set([i for pair in selected_pairs for i in pair])
    
    for pair in selected_pairs:
        for galaxy_idx in pair:
            galaxy_mass = df.iloc[galaxy_idx]['logm']
            galaxy_z = df.iloc[galaxy_idx]['z']
            
            # Find galaxies with similar mass and redshift that are not in pairs
            mass_diff = np.abs(df['logm'] - galaxy_mass)
            z_diff = np.abs(df['z'] - galaxy_z)
            mask = (mass_diff < 0.3 * galaxy_mass) & (z_diff < 0.1 * galaxy_z) & (~df.index.isin(paired_indices))
            
            control_indices = df.index[mask].tolist()
            if control_indices:
                control_idx = min(control_indices, 
                                key=lambda x: np.sqrt((df.iloc[x]['logm'] - galaxy_mass)**2 + 
                                                    (df.iloc[x]['z'] - galaxy_z)**2))
                control_sample.append((galaxy_idx, control_idx))
    # Create summary tables
    pair_info = []
    control_info = []
    
    for pair_idx, (i, j) in enumerate(selected_pairs):
        # Pair information
        pair_info.append({
            'pair_id': f'P{pair_idx+1}',
            'gal1_id': df.iloc[i]['id'],
            'gal2_id': df.iloc[j]['id'],
            'gal1_ra': df.iloc[i]['ra'],
            'gal1_dec': df.iloc[i]['dec'],
            'gal2_ra': df.iloc[j]['ra'],
            'gal2_dec': df.iloc[j]['dec'],
            'gal1_z': df.iloc[i]['z'],
            'gal2_z': df.iloc[j]['z'],
            'gal1_mass': df.iloc[i]['logm'],
            'gal2_mass': df.iloc[j]['logm'],
            'gal1_sfr': df.iloc[i]['logSFR'],
            'gal2_sfr': df.iloc[j]['logSFR'],
            'separation_kpc': proj_sep_kpc[j]
        })
        
        # Control information
        for control_pair in control_sample:
            if control_pair[0] in [i, j]:
                control_info.append({
                    'pair_id': f'P{pair_idx+1}',
                    'target_id': df.iloc[control_pair[0]]['id'],
                    'control_id': df.iloc[control_pair[1]]['id'],
                    'target_mass': df.iloc[control_pair[0]]['logm'],
                    'control_mass': df.iloc[control_pair[1]]['logm'],
                    'target_sfr': df.iloc[control_pair[0]]['logSFR'],
                    'control_sfr': df.iloc[control_pair[1]]['logSFR'],
                    'target_z': df.iloc[control_pair[0]]['z'],
                    'control_z': df.iloc[control_pair[1]]['z']
                })
    
    # Convert to DataFrames
    pair_df = pd.DataFrame(pair_info)
    control_df = pd.DataFrame(control_info)
    
    # Save tables
    pair_df.to_csv('galaxy_pairs.csv', index=False)
    control_df.to_csv('control_sample.csv', index=False)
    
    return selected_pairs, control_sample, pair_df, control_df


In [12]:
# Test and print results
catalog = 'selection_criteria_data.csv'
pairs, controls, pair_table, control_table = select_galaxy_pairs(catalog)

print("\nGalaxy Pairs Summary:")
print(pair_table.to_string())
print("\nControl Sample Summary:")
print(control_table.to_string())

main sequence calculation
main sequence calculation done
KKK

Galaxy Pairs Summary:
  pair_id  gal1_id  gal2_id     gal1_ra  gal1_dec     gal2_ra  gal2_dec  gal1_z  gal2_z  gal1_mass  gal2_mass  gal1_sfr  gal2_sfr  separation_kpc
0      P1    21853    21854  149.906161  2.240751  149.905652  2.241033  1.6433  1.6449   9.916833  10.509070  2.472120  1.340809    53576.524299
1      P2    27376   577102  149.821541  2.322134  149.821527  2.322101  2.0703  2.0703  10.498873  10.498873  1.117638  1.117638    61850.083919
2      P3    44542    44544  149.981820  2.102386  149.982423  2.102467  1.8737  1.8726  10.709138  10.234291  0.244283 -0.754638    47902.387679
3      P4    64576    64636  150.036103  2.218026  150.036857  2.217819  1.7625  1.7626  10.162339  10.746480 -0.519339 -1.955011    41608.935720
4      P5   284846   324600  150.106079  1.916448  150.106086  1.916488  1.5059  1.5059   9.935256   9.935256 -0.046783 -0.046783    45179.878693
5      P6   325949   325950  150.131684 

In [None]:

# Test the function
try:
    catalog = 'selection_criteria_data.csv'
    df_test = pd.read_csv(catalog)
    required_cols = ['dec', 'ra', 'z', 'logm']
    missing_cols = [col for col in required_cols if col not in df_test.columns]
    
    if missing_cols:
        print(f"Error: Missing required columns: {missing_cols}")
    else:
        pairs, controls = select_galaxy_pairs(catalog)
        print(f"Successfully processed {len(pairs)} galaxy pairs and {len(controls)} control samples")
        
        # Print some information about the pairs
        df_test = pd.read_csv(catalog)
        for i, (idx1, idx2) in enumerate(pairs):
            print(f"\nPair {i+1}:")
            print(f"Galaxy 1: z={df_test.iloc[idx1]['z']:.2f}, mass={df_test.iloc[idx1]['logm']:.2f}")
            print(f"Galaxy 2: z={df_test.iloc[idx2]['z']:.2f}, mass={df_test.iloc[idx2]['logm']:.2f}")
            
except FileNotFoundError:
    print(f"Error: Could not find catalog file '{catalog}'")
except pd.errors.EmptyDataError:
    print("Error: The catalog file is empty")
except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")

In [6]:
# Example usage
catalog = 'selection_criteria_data.csv' 
pairs, controls = select_galaxy_pairs(catalog)
print(f"Found {len(pairs)} galaxy pairs")

main sequence calculation


KeyError: 'logm'

In [4]:
pairs

[(529, np.int64(530)),
 (649, np.int64(8681)),
 (815, np.int64(817)),
 (1273, np.int64(1277)),
 (4579, np.int64(5236)),
 (5258, np.int64(5259)),
 (5482, np.int64(5485)),
 (5578, np.int64(5579)),
 (6467, np.int64(6469)),
 (7061, np.int64(10790))]