## Making a mega-table from the results tables

This notebook compiles the fitting results from all clusters into a single large table (a "megatable"), which makes comparison and visualisation far easier. It also identifies cases where extraction apertures overlap, so that two catalogue entries yield effectively the same (degenerate) spectrum, and keeps only one spectrum from each overlapping pair: the one with the higher combined absorption-line SNR.

In [1]:
from astropy.io import fits, ascii
import astropy.table as aptb
import numpy as np
import gc
from contextlib import contextmanager
import pandas as pd

from astro_utils import io as auio
from astro_utils import constants as auconst
from astro_utils import catalogue_operations as aucat

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


No Bottleneck unit testing available.


In [2]:
# Variables that define which results to load.
# SPEC_TYPE is interpolated into the fit-results filenames
# ("{cluster}_{SPEC_TYPE}_lines.fits" and "{cluster}_{SPEC_TYPE}_lya.fits") and into the
# names of the megatables written by this notebook.
# For aperture spectra (any value containing 'fwhm') the FIRST character is parsed with int()
# as the aperture size in units of the PSF FWHM when searching for overlapping apertures,
# so such values must start with a digit.
SPEC_TYPE   = '1fwhm_opt' # can be '1fwhm', '2fwhm_opt', etc for APER, 'weight_skysub' etc for R21

In [3]:
@contextmanager
def load_results(cluster, spec_type):
    """Context manager yielding the fit results and the R21 catalogue for one cluster.

    Opens the fit results generated by the S01 spectral extraction and fitting script along
    with the Richard et al. 2021 catalogue for a given cluster and spectrum type, converts
    the first table extension of each file to a pandas DataFrame, and closes all three FITS
    files again when the ``with`` block exits.

    Parameters
    ----------
    cluster : str
        Cluster name to load results for (e.g. 'A2744').
    spec_type : str
        What subtype of spectrum to load (aperture size for APER, or weighting/sky subtraction for R21).
        For aperture spectra, can be '1fwhm', '2fwhm', etc.
        For Richard et al. 2021 spectra, can be 'weight_skysub', etc.

    Yields
    ------
    tuple of pandas.DataFrame
        A tuple containing three tables:
        - The main fit results table (from ``{cluster}_{spec_type}_lines.fits``).
        - The Lyman-alpha fit results table (from ``{cluster}_{spec_type}_lya.fits``).
        - The Richard et al. 2021 catalogue table (from ``{cluster}_v?.?.fits``).

    Raises
    ------
    FileNotFoundError
        If no Richard et al. 2021 catalogue matching ``{cluster}_v?.?.fits`` is found.

    Notes
    -----
    This function assumes a specific directory structure for the fit results files (specified in
    astro_utils documentation).
    """
    # Local import so this cell does not depend on edits to the notebook's import cell.
    from contextlib import ExitStack

    fn = f"{cluster}_{spec_type}_lines.fits"
    fn_lya = f"{cluster}_{spec_type}_lya.fits"
    fn_r21 = f"{cluster}_v?.?.fits"

    data_dir = auio.get_data_dir()
    filepath = data_dir / cluster / "catalogs" / "fit_results" / fn
    filepath_lya = data_dir / cluster / "catalogs" / "fit_results" / fn_lya

    # A bare next() on an empty glob would raise StopIteration inside this generator, which
    # Python re-raises as an opaque RuntimeError; fail with a descriptive error instead.
    r21_dir = data_dir / cluster / "catalogs" / "R21"
    r21_candidates = sorted(r21_dir.glob(fn_r21))
    if not r21_candidates:
        raise FileNotFoundError(
            f"No R21 catalogue matching '{fn_r21}' found in {r21_dir}"
        )
    # Sorting makes the choice deterministic when several versions are present; the last
    # name in sorted order is taken (presumably the highest version number -- confirm).
    filepath_r21 = r21_candidates[-1]

    # ExitStack closes every file opened so far even if a later fits.open() fails.
    with ExitStack() as stack:
        hdul = stack.enter_context(fits.open(filepath))
        hdul_lya = stack.enter_context(fits.open(filepath_lya))
        hdul_r21 = stack.enter_context(fits.open(filepath_r21))

        # Convert to pandas while the files are still open.
        yield (
            aptb.Table(hdul[1].data).to_pandas(),
            aptb.Table(hdul_lya[1].data).to_pandas(),
            aptb.Table(hdul_r21[1].data).to_pandas(),
        )

In [4]:
# Discover clusters: every entry of the data folder whose catalogs/fit_results
# sub-directory holds at least one FITS file counts as a cluster.
# (Path.iterdir() yields entries in filesystem order, so the list order is not sorted.)
data_dir = auio.get_data_dir()
cluster_dirs = []
for candidate in data_dir.iterdir():
    fit_results_dir = candidate / "catalogs" / "fit_results"
    has_fit_results = next(fit_results_dir.glob("*.fits"), None) is not None
    if has_fit_results:
        cluster_dirs.append(candidate)
cluster_names = [path.name for path in cluster_dirs]

In [5]:
# Constants
c = 299792.458  # speed of light in km/s
wavedict = auconst.wavedict  # Dictionary of common wavelengths in Angstroms
muse_start = 4749.8310546875 # minimum wavelength of MUSE in Angstroms
muse_end = muse_start + 3681. * 1.25 # maximum wavelength of MUSE in Angstroms
# Redshift range over which Lyman-alpha falls inside the MUSE wavelength coverage
minz = muse_start / wavedict['LYALPHA'] - 1.
maxz = muse_end / wavedict['LYALPHA'] - 1.

# First pass: read each cluster's fit results ONCE and record every line that is reported
# at |SNR| > 5 anywhere. (Re-opening all FITS files for every candidate line is equivalent
# but repeats the same I/O once per line.)
significant_lines = set()
for cluster in cluster_names:
    with load_results(cluster, SPEC_TYPE) as (restab, _, _):
        is_significant = np.abs(restab['SNR']) > 5  # NaN SNRs compare False
        significant_lines.update(restab.loc[is_significant, 'LINE'].unique())

# Second pass: keep the lines that fall within the MUSE wavelength range at the redshifts of
# our Lya sources AND are significantly detected at least once. Order follows wavedict.
good_lines = []
for line, restwave in wavedict.items():
    if line == 'DUST':  # Skip DUST pseudo-line
        continue
    minlam = restwave * (1. + minz)
    maxlam = restwave * (1. + maxz)
    in_muse_range = (muse_start < minlam < muse_end) or (muse_start < maxlam < muse_end)
    if in_muse_range and line in significant_lines:
        good_lines.append(line)

print(f"{len(good_lines)} lines within MUSE wavelength range at Lya source redshifts:")
for line in good_lines:
    print(f" - {line}")

gc.collect()

29 lines within MUSE wavelength range at Lya source redshifts:
 - AlII1671
 - AlIII1854
 - AlIII1862
 - CII1334
 - CII2329
 - CIII1907
 - CIII1909
 - CIV1548
 - CIV1551
 - FeII1608
 - FeII1611
 - FeII2383
 - HeII1640
 - NIII1750
 - NIV1483
 - NIV1487
 - NV1238
 - NV1243
 - OI1302
 - OIII1660
 - OIII1666
 - OVI1032
 - OVI1038
 - SiII1260
 - SiII1304
 - SiIII1883
 - SiIII1892
 - SiIV1394
 - SiIV1403


0

In [6]:
# Loop over clusters, creating megatables for each one.
# For every cluster: clean the three input tables, pivot the per-line fit results into wide
# format (one set of "<column>_<line>" columns per line), attach them to the Lya results,
# join everything onto the R21 catalogue and save the result as a CSV.
jtab_paths = {}
for cluster in cluster_names:
    with load_results(cluster, SPEC_TYPE) as (linetab, lyatab, r21tab):
        print(f"Processing cluster {cluster}...")

        # Get rid of MU == 1.0 rows as they are possibly bugged. The explicit .copy() makes
        # each filtered table an independent frame, so the column assignments below cannot
        # hit pandas' chained-assignment (SettingWithCopy) pitfalls.
        r21tab = r21tab[r21tab.MU != 1.0].copy()
        lyatab = lyatab[lyatab.MU != 1.0].copy()
        linetab = linetab[linetab.MU != 1.0].copy()

        # Update the naming convention for the catalogue sources to match the fit results:
        # first character of `idfrom` (with 'E' mapped to 'X') followed by the stripped `iden`.
        r21tab['iden'] = [
            row.idfrom[0].replace('E', 'X') + str(row.iden).strip()
            for row in r21tab.itertuples()
        ]

        # Check to make sure that the IDs match those in the fit results and eliminate any
        # sources in the catalogue that are not in the fit results (to save memory later on)
        fit_ids = linetab.iden
        r21tab = r21tab[np.isin(r21tab.iden, fit_ids)]
        r21_ids = r21tab.iden

        # Check to make sure the IDs now match
        assert set(fit_ids) == set(r21_ids), "Mismatch between fit results IDs and Richard et al. 2021 catalogue IDs"

        # Raise an error if there are duplicate IDs in the R21 or lya results
        if r21tab['iden'].duplicated().any():
            raise ValueError(f"Duplicate IDs found in R21 catalogue for cluster {cluster}")
        if lyatab['iden'].duplicated().any():
            raise ValueError(f"Duplicate IDs found in Lya results for cluster {cluster}")

        print(f"Processing results tables for cluster {cluster}...")

        # Filtering and renaming columns in the Lya results table
        lyatab = (
            lyatab
            .drop(columns=['LINE', 'LBDA_REST', 'MU', 'MU_ERR'])  # These would be duplicated by the joins
            .rename(columns={'Z': 'Z_LYA', 'Z_ERR': 'Z_LYA_ERR'})  # Rename for clarity
            .assign(CLUSTER=cluster)  # Column holding the cluster name
        )

        # Unique lines present in this cluster's results, restricted to the good lines list
        clus_linelist = [line for line in np.unique(linetab.LINE) if line in good_lines]
        for i, line in enumerate(clus_linelist):
            t = (
                linetab[linetab.LINE == line]  # Get only rows for this line
                .drop_duplicates(subset='iden', keep='first')  # Remove duplicate entries for same source
                .drop(columns=['LINE', 'LBDA_REST', 'MU', 'MU_ERR', 'Z', 'Z_ERR'])  # Remove unnecessary columns
            )
            # Suffix every column except the join key with the line name. Selecting by name
            # (rather than "all but the first column") keeps this correct wherever 'iden' sits.
            t = t.rename(columns={name: name + '_' + line for name in t.columns if name != 'iden'})

            # Merge this line's results into the Lya megatable. Both sides have unique IDs at
            # this point, so validate= only guards against silent row duplication.
            lyatab = pd.merge(lyatab, t, on='iden', how='left', validate='one_to_one')
            print(f"Line megatab has {len(lyatab)} rows and {len(lyatab.columns)} columns"
                  f" after adding line {i+1}/{len(clus_linelist)}: {line}")
            gc.collect()

        # Join the original catalogue and the Lya results
        print(f"Generating joined table for cluster {cluster}...")
        jtab = pd.merge(r21tab, lyatab, on='iden', how='left', validate='one_to_one')

        # The left join leaves CLUSTER empty for catalogue rows without a Lya result;
        # every row of this table belongs to `cluster`, so fill the column completely.
        jtab['CLUSTER'] = cluster

        # Put the cluster name as the second column
        jcolnames = jtab.columns.tolist()
        jcolnames.remove('CLUSTER')
        jcolnames.insert(1, 'CLUSTER')
        jtab = jtab[jcolnames]

        # Save the joined table to the disc
        print(f"Saving joined table for cluster {cluster}...")
        pathname = auio.get_data_dir() / cluster / 'catalogs' / 'fit_results' / f'megatable_{SPEC_TYPE}.csv'
        jtab.to_csv(pathname, index=False)
        jtab_paths[cluster] = pathname

        gc.collect()
        
        

Processing cluster MACS0257...
Processing results tables for cluster MACS0257...
Line megatab has 79 rows and 54 columns after adding line 1/28: AlII1671
Line megatab has 79 rows and 67 columns after adding line 2/28: AlIII1854
Line megatab has 79 rows and 80 columns after adding line 3/28: AlIII1862
Line megatab has 79 rows and 93 columns after adding line 4/28: CII1334
Line megatab has 79 rows and 106 columns after adding line 5/28: CII2329
Line megatab has 79 rows and 119 columns after adding line 6/28: CIII1907
Line megatab has 79 rows and 132 columns after adding line 7/28: CIII1909
Line megatab has 79 rows and 145 columns after adding line 8/28: CIV1548
Line megatab has 79 rows and 158 columns after adding line 9/28: CIV1551
Line megatab has 79 rows and 171 columns after adding line 10/28: FeII1608
Line megatab has 79 rows and 184 columns after adding line 11/28: FeII1611
Line megatab has 79 rows and 197 columns after adding line 12/28: HeII1640
Line megatab has 79 rows and 210 c

In [7]:
# Read the per-cluster megatables back from disc and stack them into one big megatable
def _read_cluster_megatable(cluster, path):
    """Announce which cluster megatable is being read, then load its CSV as a DataFrame."""
    print(f"Loading megatable for cluster {cluster} from {path}...")
    return pd.read_csv(path)


all_tabs = [
    _read_cluster_megatable(name, csv_path) for name, csv_path in jtab_paths.items()
]

# Row-wise concatenation with a fresh 0..N-1 index
big_megatable = pd.concat(all_tabs, ignore_index=True)
n_rows, n_cols = big_megatable.shape
print(f"Combined megatable has {n_rows} rows and {n_cols} columns.")

Loading megatable for cluster MACS0257 from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/MACS0257/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster A2744 from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/A2744/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster A370 from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/A370/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster BULLET from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/BULLET/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster MACS0329 from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/MACS0329/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster MACS0416NE from /media/james/63C4C5633F1EAE9F/phd/lya_outflows/muse_data/MACS0416NE/catalogs/fit_results/megatable_1fwhm_opt.csv...
Loading megatable for cluster MACS0416S from /media/james/63C4C563

In [25]:
# Switch from pandas to an Astropy Table for the remaining steps
megatab = aptb.Table.from_pandas(big_megatable)

# In every floating point column, overwrite exact zeros with NaN (modifies the column in place)
float_colnames = [
    name for name in megatab.colnames
    if np.issubdtype(megatab[name].dtype, np.floating)
]
for name in float_colnames:
    print(f"Replacing zeros with NaN in column {name}...")
    column = megatab[name]
    column[column == 0.0] = np.nan

Replacing zeros with NaN in column z...
Replacing zeros with NaN in column RA...
Replacing zeros with NaN in column DEC...
Replacing zeros with NaN in column A_WORLD...
Replacing zeros with NaN in column B_WORLD...
Replacing zeros with NaN in column THETA_J2000...
Replacing zeros with NaN in column KRON_RADIUS...
Replacing zeros with NaN in column THRESHOLD...
Replacing zeros with NaN in column MU_MAX...
Replacing zeros with NaN in column CLASS_STAR...
Replacing zeros with NaN in column FWHM_IMAGE...
Replacing zeros with NaN in column FLUX_RADIUS...
Replacing zeros with NaN in column MAG_ISO_HST_F606W...
Replacing zeros with NaN in column MAGERR_ISO_HST_F606W...
Replacing zeros with NaN in column MAG_AUTO_HST_F606W...
Replacing zeros with NaN in column MAGERR_AUTO_HST_F606W...
Replacing zeros with NaN in column MAG_ISO_HST_F814W...
Replacing zeros with NaN in column MAGERR_ISO_HST_F814W...
Replacing zeros with NaN in column MAG_AUTO_HST_F814W...
Replacing zeros with NaN in column MAGER

In [None]:
import astropy.units as u
from astropy.coordinates import SkyCoord


def _filled_float(column):
    """Return ``column`` as a plain float ndarray with masked entries replaced by NaN."""
    values = np.array(np.ma.getdata(column), dtype=float)
    values[np.ma.getmaskarray(column)] = np.nan
    return values


# Load FWHM table
fwhmtb = ascii.read('../muse_data/muse_fwhms.txt')

# CLEANING OF TABLE
# Row masks are built from NaN-filled plain arrays with explicit boolean operators, so a
# missing (masked or NaN) measurement always compares False instead of depending on how
# masked arrays behave when used as an index.
# Remove rows with no significant Lyman alpha (neither red nor blue peak above 3 sigma)
snr_red = _filled_float(megatab['SNRR'])
snr_blue = _filled_float(megatab['SNRB'])
megatab = megatab[(snr_red > 3.0) | (snr_blue > 3.0)]
# Remove rows with large FWHM uncertainties (inspected visually, all due to contamination)
fwhm_red_err = _filled_float(megatab['FWHMR_ERR'])
fwhm_red = _filled_float(megatab['FWHMR'])
megatab = megatab[(fwhm_red_err < 5.0) & (fwhm_red < 20.0)]

# Check for rows with overlapping apertures (effectively duplicates)
megatab.sort(keys='SNRR', reverse=True)
# Now sort by the total combined SNR of all absorption lines: only negative SNRs (absorption)
# contribute, added in quadrature; missing values contribute nothing.
snr_colnames = [f"SNR_{line}" for line in good_lines if f"SNR_{line}" in megatab.colnames]
if snr_colnames:
    line_snrs = np.column_stack([_filled_float(megatab[name]) for name in snr_colnames])
    # np.fmin ignores NaN, so a missing SNR becomes 0 exactly like nanmin([snr, 0])
    abs_snr = np.sqrt(np.nansum(np.fmin(line_snrs, 0.0) ** 2, axis=1))
else:
    abs_snr = np.zeros(len(megatab))
# Some SNRs may be extremely high due to fitting issues, so set any above 1000 to zero and any above 100 to 100
abs_snr[abs_snr > 1000] = 0
abs_snr[abs_snr > 100] = 100
megatab['ABS_SNR'] = abs_snr
megatab.sort(keys='ABS_SNR', reverse=True)

# repeat[k] == 1 marks row k as a copy of an earlier row (one with higher or equal ABS_SNR)
repeat = np.zeros(len(megatab))

if 'fwhm' in SPEC_TYPE and len(megatab) > 1:
    nfwhm = int(SPEC_TYPE[0])  # aperture size in units of the PSF FWHM

    # PSF FWHM per cluster; the first table entry wins if a cluster is listed twice
    psf_fwhm = {}
    for name, value in zip(fwhmtb['CLUSTER'], fwhmtb['PSF_FWHM']):
        psf_fwhm.setdefault(str(name), float(value))

    clusters = [str(name) for name in megatab['CLUSTER']]
    missing = sorted(set(clusters) - set(psf_fwhm))
    if missing:
        raise KeyError(f"No PSF FWHM listed for cluster(s): {missing}")

    # Pointings of the same field share a name once S/N/E characters are stripped
    # (e.g. 'MACS0416NE' and 'MACS0416S'). Note str.strip acts on both ends of the name.
    fields = np.array([name.strip('SNE') for name in clusters])
    idens = megatab['iden']
    coords = SkyCoord(_filled_float(megatab['RA']), _filled_float(megatab['DEC']), unit='deg')

    # Lya redshift from the red peak, falling back to the blue peak where the red one is
    # missing. (A `== np.nan` comparison is always False, so NaN must be tested with isnan.)
    z_red = _filled_float(megatab['LPEAKR']) / 1215.67 - 1.
    z_blue = _filled_float(megatab['LPEAKB']) / 1215.67 - 1.
    zlya = np.where(np.isnan(z_red), z_blue, z_red)

    for i in range(len(megatab) - 1):
        later = slice(i + 1, None)
        same_field = fields[later] == fields[i]
        if not same_field.any():
            continue
        # Separation and redshift difference between row i and all later rows at once
        sep = coords[i].separation(coords[later]).to(u.arcsec).value
        dz = np.abs(zlya[i] - zlya[later])
        is_copy = same_field & (sep < nfwhm * psf_fwhm[clusters[i]]) & (dz < 0.05)
        for k in np.flatnonzero(is_copy):
            j = i + 1 + k
            print(f'Gotcha! In {clusters[i]}, {idens[j]} is a copy of {idens[i]}')
            print(f"Separated by {sep[k]:.2f} arcsec"
                  f" with dz = {dz[k]:.4f}.")
            print(f"ABS_SNR of {abs_snr[i]:.2f} for {idens[i]} and {abs_snr[j]:.2f} for {idens[j]}\n")
            if abs_snr[i] < 5 and abs_snr[j] >= 5:
                print("WARNING! GETTING RID OF AN ABSORBER!")
            repeat[j] = 1

print(f"Filtered megatab contains {len(megatab)} sources.")
# The flagged copies are still in megatab at this point; they are only excluded via `repeat`.
print(f"{int(repeat.sum())} of these are flagged as overlapping-aperture copies.")

  a = np.asanyarray(a)


Gotcha! In MACS0416NE, P6907 is a copy of P1255
Separated by 0.02 arcsec with dz = 0.0002.
ABS_SNR of 6.02 for P1255 and 5.64 for P6907
Gotcha! In MACS0416S, P5175 is a copy of P5278
Separated by 0.51 arcsec with dz = 0.0000.
ABS_SNR of 4.03 for P5278 and 3.64 for P5175
Gotcha! In MACS0416S, P5061 is a copy of P5114
Separated by 0.23 arcsec with dz = 0.0000.
ABS_SNR of 3.47 for P5114 and 3.42 for P5061
Filtered megatab contains 954 sources.


In [30]:
# Sanity check on the combined absorption-line SNR. ABS_SNR is clamped to at most 100 when it
# is built, so a mean above 100 means this cell was run against stale kernel state (re-run the
# cleaning cell first). Must be run before the final cleaning cell, which removes ABS_SNR.
megatab['ABS_SNR'].mean()

4.904833456414233e+17

In [None]:
# Local import so this cell does not depend on edits to the notebook's import cell.
from pathlib import Path

# Some final cleaning of columns (helper/unused columns are dropped if present)
remove_cols = ['ABS_SNR', 'zbalmer', 'zbalmer_err']
for col in remove_cols:
    if col in megatab.colnames:
        megatab.remove_column(name=col)
megatab['MUL'] = megatab['MUL'].astype(str)

# `repeat` flags overlapping-aperture copies row-by-row, so it is only valid for the exact
# megatab it was computed from; fail clearly if the two have gone out of sync.
if len(repeat) != len(megatab):
    raise ValueError(
        f"`repeat` has {len(repeat)} entries but megatab has {len(megatab)} rows; "
        "re-run the overlap-detection cell before writing."
    )

# Write every row that is NOT flagged as a copy, creating the output folder if needed
out_path = Path(f'./megatables/lae_megatab_{SPEC_TYPE}.fits')
out_path.parent.mkdir(parents=True, exist_ok=True)
megatab[~repeat.astype(bool)].write(str(out_path), overwrite=True)