In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm.notebook import tqdm
import ast
from astropy import units as u
from astropy.coordinates import SkyCoord
from scipy.spatial import cKDTree
from astropy.io import fits
from astropy.wcs import WCS
from sklearn.neighbors import NearestNeighbors

# Paths

In [8]:
data_dir = '/arc/projects/unions/ssl/data/raw/tiles/dwarforge'
table_dir = '/arc/home/heestersnick/dwarforge/tables'
dwarf_cat_file = 'all_known_dwarfs_v2_processed.csv'
dwarf_cat = pd.read_csv(os.path.join(table_dir, dwarf_cat_file))

FileNotFoundError: [Errno 2] No such file or directory: '/arc/home/heestersnick/dwarforge/tables/all_known_dwarfs_v2_processed.csv'

# Functions

In [3]:
def zfill_tile(tile):
    return f'{str(tile[0]).zfill(3)}_{str(tile[1]).zfill(3)}'

def labels_to_df(parent_folder, tile_list, dwarf_df):
    k = 0
    unmatched_dwarf_counter = 0
    unmatched_tile_counter = 0
    additional_dwarf_counter = 0
    for tile in tqdm(tile_list):
        # Convert tile tuple to folder name format
        folder_name = zfill_tile(tile)
        
        # Construct the full path to the parquet file
        tile_nums_zfill = folder_name.split('_')
        file_path = os.path.join(parent_folder, folder_name, "cfis_lsb-r", f"CFIS_LSB.{tile_nums_zfill[0]}.{tile_nums_zfill[1]}.r_rebin_det_params.parquet")
        fits_name = f'CFIS_LSB.{tile_nums_zfill[0]}.{tile_nums_zfill[1]}.r_rebin_seg.fits'
        fits_path = os.path.join(parent_folder, folder_name, "cfis_lsb-r", fits_name)
        
        if os.path.exists(file_path):
            try:
                # Attempt to read the parquet file
                det_df = pd.read_parquet(file_path)
                det_df_updated = det_df.copy()
                dwarfs_in_tile = dwarf_df[dwarf_df['tile'] == str(tile)].reset_index(drop=True)
                _, header = open_fits(fits_path, fits_ext=0)
                additional_dwarfs = check_objects_in_neighboring_tiles(str(tile), dwarf_df, header)

                if not additional_dwarfs.empty:
                    dwarfs_in_tile = pd.concat([dwarfs_in_tile, additional_dwarfs]).reset_index(drop=True)

                det_idx_lsb, lsb_matches, lsb_unmatches, _ = match_cats(det_df_updated, dwarfs_in_tile, tile, header, max_sep=15.0)

                # add lsb labels to detections dataframe
                det_df_updated['lsb'] = np.nan
                det_df_updated['ID_known'] = np.nan

                if len(det_idx_lsb) > 0:
                    print(f'Found {len(det_idx_lsb)} lsb detections for tile {tile}.')
                    det_df_updated.loc[det_idx_lsb, 'lsb'] = 1
                    # Initialize the column to accept strings
                    det_df_updated['ID_known'] = det_df_updated['ID_known'].astype(object)
                    det_df_updated.loc[det_idx_lsb, 'ID_known'] = lsb_matches['ID'].values
                    # print(
                    #     f'Added {np.count_nonzero(~np.isnan(det_df_updated["lsb"]))} LSB labels to the detection dataframe for tile {tile}.'
                    # )
                    k += 1
                    additional_dwarf_counter += len(additional_dwarfs)

                if len(lsb_unmatches) > 0:
                    print(f'Found {len(lsb_unmatches)} unmatched dwarf for tile: {tile}.')
                    unmatched_tile_counter += 1
                    unmatched_dwarf_counter += len(lsb_unmatches)

                # Save updated dataframe
                det_df_updated.to_parquet(file_path, index=False)
            except Exception as e:
                print(f'Something went wrong for tile {tile}: {e}')
    print(f'Was able to match {k}/{len(tile_list)} tiles.')
    print(f'There were {unmatched_dwarf_counter} unmatched dwarfs in {unmatched_tile_counter} tiles.')
    print(f'{additional_dwarf_counter} dwarfs are in multiple tiles.')

def open_fits(file_path, fits_ext):
    """
    Open fits file and return data and header.

    Args:
        file_path (str): name of the fits file
        fits_ext (int): extension of the fits file

    Returns:
        data (numpy.ndarray): image data
        header (fits header): header of the fits file
    """
    # logger.debug(f'Opening fits file {os.path.basename(file_path)}..')
    with fits.open(file_path, memmap=True) as hdul:
        data = hdul[fits_ext].data.astype(np.float32)  # type: ignore
        header = hdul[fits_ext].header  # type: ignore
    # logger.debug(f'Fits file {os.path.basename(file_path)} opened.')
    return data, header
                
def check_objects_in_neighboring_tiles(tile, dwarfs_df, header):
    wcs = WCS(header)
    # Get neighboring tile numbers
    neighboring_tiles = get_neighboring_tile_numbers(tile)

    # Filter dwarfs in neighboring tiles
    neighboring_dwarfs = dwarfs_df[dwarfs_df['tile'].isin(neighboring_tiles)]

    # Check which of these dwarfs are actually within the current tile's boundaries
    dwarfs_in_current_tile = neighboring_dwarfs[
        neighboring_dwarfs.apply(
            lambda row: wcs.footprint_contains(
                SkyCoord(row['ra'], row['dec'], unit='deg', frame='icrs')
            ),
            axis=1,
        )
    ]

    return dwarfs_in_current_tile

def get_neighboring_tile_numbers(tile):
    tile = ast.literal_eval(tile)
    x, y = map(int, tile)
    neighbors = [
        (x - 1, y - 1),
        (x - 1, y),
        (x - 1, y + 1),
        (x, y - 1),
        (x, y + 1),
        (x + 1, y - 1),
        (x + 1, y),
        (x + 1, y + 1),
    ]
    return [f'({nx:03d}, {ny:03d})' for nx, ny in neighbors if 0 <= nx < 1000 and 0 <= ny < 1000]

def dwarfs_to_df(parent_folder):
    # Pattern to match all relevant parquet files
    pattern = os.path.join(parent_folder, "*_*", "cfis_lsb-r", "CFIS_LSB.*.r_rebin_det_params.parquet")
    
    # List to store filtered dataframes
    filtered_dfs = []
    
    # Iterate through all matching files
    for file in tqdm(glob(pattern)):
        try:
            # Attempt to read the parquet file
            df = pd.read_parquet(file)
            # Check if 'label' column exists
            if 'lsb' in df.columns:
                # Filter rows where label is 1
                df_filtered = df[df['lsb'] == 1]
                
                if not df_filtered.empty:
                    filtered_dfs.append(df_filtered)
            # If 'label' column doesn't exist, we skip this file
        
        except Exception as e:
            print(f"Error processing file {file}: {str(e)}")
            continue
        
        # The file is automatically closed after reading
    
    # Combine all dataframes
    if filtered_dfs:
        final_df = pd.concat(filtered_dfs, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()  # Return an empty dataframe if no data found

def gather_training_data(parent_folder, band='cfis_lsb-r', n_neighbors=1):
    pattern = os.path.join(parent_folder, "*_*", band, "CFIS_LSB.*.r_rebin_det_params.parquet")
    
    all_examples = []
    
    for file in tqdm(glob(pattern)):
        try:
            filename = os.path.basename(file)
            tile_numbers = filename.split('.')[1:3]
            tile_id = f"{tile_numbers[0]}.{tile_numbers[1]}"
            
            df = pd.read_parquet(file)
            
            if 'lsb' in df.columns:
                positive_examples = df[df['lsb'] == 1].copy()
                potential_negatives = df[df['lsb'].isna()].copy()
                
                if not positive_examples.empty and not potential_negatives.empty:
                    nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
                    nn.fit(potential_negatives[['ra', 'dec']])
                    
                    all_file_examples = []
                    
                    for idx, lsb_obj in positive_examples.iterrows():
                        # Create a DataFrame for the single LSB object
                        lsb_df = pd.DataFrame({'ra': [lsb_obj['ra']], 'dec': [lsb_obj['dec']]})
                        
                        distances, indices = nn.kneighbors(lsb_df)
                        
                        nearest_neighbors = potential_negatives.iloc[indices[0]].copy()
                        
                        lsb_obj['example_id'] = f"{tile_id}.{lsb_obj['ID']}"
                        
                        nearest_neighbors['example_id'] = f"{tile_id}.{lsb_obj['ID']}"
                        nearest_neighbors['lsb'] = 0  # Set to 0 for negative examples
                        nearest_neighbors['associated_lsb_ra'] = lsb_obj['ra']
                        nearest_neighbors['associated_lsb_dec'] = lsb_obj['dec']
                        
                        all_file_examples.append(pd.concat([lsb_obj.to_frame().T, nearest_neighbors]))
                    
                    if all_file_examples:
                        all_examples.append(pd.concat(all_file_examples))
            
        except Exception as e:
            print(f"Error processing file {file}: {str(e)}")
            continue
    
    if all_examples:
        final_df = pd.concat(all_examples, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()

def match_cats(df_det, df_label, tile, header, max_sep=15.0):
    """
    Match detections to known objects preferring larger, lsb objects

    Args:
        df_det (dataframe): detections dataframe
        df_label (dataframe): dataframe of objects with labels
        tile (tuple): tile numbers
        header (header): fits header
        max_sep (float): base maximum separation tolerance in arcseconds

    Returns:
        det_matching_idx (list): indices of detections for which labels are available
        label_matches (dataframe): known objects that were detected
        label_unmatches (dataframe): known objects that were not detected
        det_matches (dataframe): detections that are known objects
        matches (list): list of (known_idx, detection_idx) pairs
    """
    tree = cKDTree(np.column_stack((df_det['ra'], df_det['dec'])))
    matches = []
    potential_matches_df = pd.DataFrame()

    for idx, known in df_label.iterrows():
        known_coords = SkyCoord(known['ra'], known['dec'], unit='deg')

        # Adaptive search radius (keep using re for this, but we'll be more cautious with it later)
        if (
            're' in known
            and known['re'] is not None
            and not np.isnan(known['re'])
            and known['re'] > 0
        ):
            search_radius = max(max_sep, known['re'] * 3) / 3600
        else:
            search_radius = max_sep / 3600

        potential_match_indices = tree.query_ball_point([known['ra'], known['dec']], search_radius)
        potential_matches = df_det.iloc[potential_match_indices]

        # print(f'potential matches for {known["ID"]}: {len(potential_matches)}')

        potential_matches_df = pd.concat([potential_matches_df, potential_matches])

        if len(potential_matches) > 0:
            potential_matches_coords = SkyCoord(
                potential_matches['ra'], potential_matches['dec'], unit='deg'
            )
            distances = known_coords.separation(potential_matches_coords).arcsec
            max_n_pix = potential_matches['n_pix'].max()
            max_mu = potential_matches['mu'].max()

            scores = []
            for i, det in potential_matches.iterrows():
                # Size comparison score (now using n_pix)
                size_score = np.log1p(det['n_pix']) / np.log1p(max_n_pix)

                # LSB characteristics score (now incorporating n_pix)
                lsb_score = det['mu'] / max_mu

                # Distance score
                distance_score = 1 / (1 + distances[potential_matches.index.get_loc(i)])

                # Combined score (adjust weights as needed)
                score = lsb_score * 0.2 + size_score * 0.4 + distance_score * 0.4
                # print(f'object: {det["ID"]}; lsb_score: {lsb_score}')
                # print(f'object: {det["ID"]}; size_score: {size_score}')
                # print(f'object: {det["ID"]}; distance_score: {distance_score}')
                # print(f'object: {det["ID"]}; total score: {score}')
                scores.append((i, score))

            best_match = max(scores, key=lambda x: x[1])
            matches.append((idx, best_match[0]))

    if matches:
        label_match_idx, det_match_idx = zip(*matches)
    else:
        label_match_idx, det_match_idx = [], []

    label_matches = df_label.loc[list(label_match_idx)].reset_index(drop=True)
    label_unmatches = df_label.drop(list(label_match_idx)).reset_index(drop=True)
    det_matches = df_det.loc[list(det_match_idx)].reset_index(drop=True)

    return list(det_match_idx), label_matches, label_unmatches, det_matches

def get_tile_list(dwarf_cat):
    tiles = dwarf_cat['tile'].values
    non_nan_tiles = [x for x in tiles if x is not np.nan]
    str_to_tuple = [ast.literal_eval(item) for item in non_nan_tiles]
    unique_tiles = list(set(str_to_tuple))
    return unique_tiles

def check_bands(bands_str, to_check):
    if isinstance(bands_str, str):
        try:
            bands_list = ast.literal_eval(bands_str)
            return all(band in bands_list for band in to_check)
        except:
            return False
    return False  # Return False for NaN values

def check_availability(dwarf_cat, check_for_bands):
    df_select = dwarf_cat.loc[(~dwarf_cat['tile'].isna()) & (dwarf_cat['bands'].apply(lambda x: check_bands(x, check_for_bands)))].reset_index(drop=True)
    return df_select, len(df_select)

# Analysis

In [203]:
tile_list = get_tile_list(dwarf_cat)

In [195]:
labels_to_df(data_dir, tile_list, dwarf_cat)

  0%|          | 0/936 [00:00<?, ?it/s]

Found 2 lsb detections for tile (320, 239).
Found 1 lsb detections for tile (269, 264).
Found 1 lsb detections for tile (289, 260).
Found 1 lsb detections for tile (270, 256).
Found 1 lsb detections for tile (185, 279).
Found 4 lsb detections for tile (154, 310).
Found 1 lsb detections for tile (270, 265).
Found 1 lsb detections for tile (601, 239).
Found 3 lsb detections for tile (347, 252).
Found 1 unmatched dwarf for tile: (347, 252).
Found 2 lsb detections for tile (319, 242).
Found 1 lsb detections for tile (234, 293).
Found 1 lsb detections for tile (297, 251).
Found 4 lsb detections for tile (105, 328).
Found 1 lsb detections for tile (300, 247).
Found 1 lsb detections for tile (285, 258).
Found 3 lsb detections for tile (254, 289).
Found 1 unmatched dwarf for tile: (328, 238).
Found 4 unmatched dwarf for tile: (339, 238).
Found 1 lsb detections for tile (231, 296).
Found 2 lsb detections for tile (285, 276).
Found 1 unmatched dwarf for tile: (285, 276).
Found 1 lsb detections f

In [205]:
dwarf_df = dwarfs_to_df(data_dir)

  0%|          | 0/20789 [00:00<?, ?it/s]

In [206]:
dwarf_df

Unnamed: 0,ID,X,Y,A,B,theta,total_flux,mu_max,mu_median,mu_mean,...,r_fwhm_arcsec,r_10_arcsec,r_90_arcsec,A_arcsec,B_arcsec,axis_ratio,mag,mu,lsb,ID_known
0,10324,1492.31000,1924.23250,7.664771,3.326518,1.538039,4117.464000,68.501020,2.167104,5.340420,...,1.966384,1.185774,7.876701,5.695490,2.471848,0.434001,20.963425,24.807643,1.0,091427+790820
1,12185,1797.62870,2260.27510,1.427872,0.622746,-1.489665,32.298973,4.126541,1.149031,1.699946,...,1.109190,0.000000,1.452271,1.061014,0.462747,0.436136,26.227028,26.938393,1.0,091306+791228
2,6159,701.79584,813.22760,5.905326,3.497783,1.009584,239.241090,9.595763,0.761840,0.934535,...,0.419235,0.838469,4.889072,4.388093,2.599111,0.592310,24.052911,27.816933,1.0,MATLAS-1108
3,15580,2205.67600,2281.36740,12.851192,7.406388,-0.183935,27841.102000,361.939100,2.030642,8.283577,...,1.728548,1.325736,14.975535,9.549384,5.503493,0.576319,18.888284,23.770149,1.0,111939+374245
4,6571,1316.95890,976.33276,12.043360,7.564031,-0.195644,1672.414600,5.057849,0.773654,1.005057,...,5.082948,2.651472,12.386942,8.949105,5.620633,0.628067,21.941640,27.765191,1.0,140344+535636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,3235,1181.35390,519.84076,10.302202,9.257186,-0.627057,6481.537000,28.104645,1.532776,2.809509,...,2.934641,2.515407,15.460604,7.655296,6.878772,0.898564,20.470805,26.764682,1.0,111502+472057
1029,9672,92.45068,1389.13710,4.198488,2.557978,1.426979,1676.860500,71.276890,1.189012,4.021248,...,1.325736,0.592887,5.466149,3.119786,1.900766,0.609262,21.938758,24.912246,1.0,MATLAS-943
1030,7845,2480.97700,1416.98120,9.738636,8.705543,0.675312,833.898800,11.644283,0.766850,0.864144,...,0.838469,1.511571,9.831921,7.236525,6.468861,0.893918,22.697217,28.467289,1.0,123810+740157
1031,8564,2098.17190,1553.05920,11.087471,9.908681,-0.730187,2880.073200,6.459387,1.199072,1.522237,...,6.089733,3.165151,14.715072,8.238809,7.362881,0.893683,21.351491,27.917753,1.0,123919+740343


In [6]:
training_data.columns

Index(['ID', 'X', 'Y', 'A', 'B', 'theta', 'total_flux', 'mu_max', 'mu_median',
       'mu_mean', 'R_fwhm', 'R_e', 'R10', 'R90', 'n_pix', 'ra', 'dec',
       're_arcsec', 'r_fwhm_arcsec', 'r_10_arcsec', 'r_90_arcsec', 'A_arcsec',
       'B_arcsec', 'axis_ratio', 'mag', 'mu', 'lsb', 'ID_known', 'class',
       'example_id', 'associated_lsb_ra', 'associated_lsb_dec'],
      dtype='object')

In [7]:
training_data.to_csv(os.path.join(table_dir, 'training_data_10x_rf.csv'), index=False)

In [4]:
def gather_training_data(parent_folder, band='cfis_lsb-r', n_neighbors=1):
    pattern = os.path.join(parent_folder, "*_*", band, "CFIS_LSB.*.r_rebin_det_params.parquet")
    
    all_examples = []
    
    for file in tqdm(glob(pattern)):
        try:
            filename = os.path.basename(file)
            tile_numbers = filename.split('.')[1:3]
            tile_id = f"{tile_numbers[0]}.{tile_numbers[1]}"
            
            df = pd.read_parquet(file)
            
            if 'lsb' in df.columns:
                positive_examples = df[df['lsb'] == 1].copy()
                potential_negatives = df[df['lsb'].isna()].copy()
                
                if not positive_examples.empty and not potential_negatives.empty:
                    nn = NearestNeighbors(n_neighbors=len(potential_negatives), metric='euclidean')
                    nn.fit(potential_negatives[['ra', 'dec']])
                    
                    used_negatives = set()  # Set to keep track of used negative examples in this field
                    all_file_examples = []
                    
                    for idx, lsb_obj in positive_examples.iterrows():
                        lsb_df = pd.DataFrame({'ra': [lsb_obj['ra']], 'dec': [lsb_obj['dec']]})
                        
                        distances, indices = nn.kneighbors(lsb_df)
                        
                        # Find n_neighbors unique negative examples within this field
                        unique_negatives = []
                        for index in indices[0]:
                            if index not in used_negatives:
                                unique_negatives.append(index)
                                used_negatives.add(index)
                                if len(unique_negatives) == n_neighbors:
                                    break
                        
                        # If we couldn't find enough unique negatives, continue to the next positive example
                        if len(unique_negatives) < n_neighbors:
                            continue
                        
                        nearest_neighbors = potential_negatives.iloc[unique_negatives].copy()
                        
                        lsb_obj['example_id'] = f"{tile_id}.{lsb_obj['ID']}"
                        
                        nearest_neighbors['example_id'] = nearest_neighbors['ID'].apply(lambda x: f"{tile_id}.{x}")
                        nearest_neighbors['lsb'] = 0  # Set to 0 for negative examples
                        nearest_neighbors['associated_lsb_ra'] = lsb_obj['ra']
                        nearest_neighbors['associated_lsb_dec'] = lsb_obj['dec']
                        
                        all_file_examples.append(pd.concat([lsb_obj.to_frame().T, nearest_neighbors]))
                    
                    if all_file_examples:
                        all_examples.append(pd.concat(all_file_examples))
            
        except Exception as e:
            print(f"Error processing file {file}: {str(e)}")
            continue
    
    if all_examples:
        final_df = pd.concat(all_examples, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()

In [5]:
training_data = gather_training_data(data_dir, band='cfis_lsb-r', n_neighbors=10)

  0%|          | 0/20789 [00:00<?, ?it/s]

In [6]:
len(training_data)

11363