In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from tabulate import tabulate
from astropy.time import Time
from tqdm import tqdm
import csv
import os
import glob
import multiprocessing

In [2]:
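# Preliminary frequency reduction: remove files whose frequencies are all below 2 GHz.
# NOTE(review): check_file below keeps a file only when its *minimum* signal_frequency
# is >= 2000 MHz, so a file with any hit under 2 GHz is dropped too -- confirm intent.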


# Directory scanned for input pickles.
directory = "/datag/users/ctremblay/"

# Candidate inputs: every entry named "Summer_Project*.pkl" in that directory.
files = [
    entry
    for entry in os.listdir(directory)
    if entry.startswith("Summer_Project") and entry.endswith(".pkl")
]
full_paths = list(map(lambda entry: os.path.join(directory, entry), files))

def check_file(file_path, min_freq_mhz=2000):
    """Return ``file_path`` if every hit in the pickle is at or above a frequency floor.

    The pickle is loaded as a DataFrame. The file qualifies only when the frame
    is non-empty, has a ``signal_frequency`` column, and that column's minimum
    is not NaN and is >= ``min_freq_mhz``.

    NOTE(review): because the *minimum* is tested, a file holding even one hit
    below the floor is rejected. If the goal is only to discard files whose hits
    are all below the floor, the maximum should be tested instead -- confirm.

    Parameters
    ----------
    file_path : str
        Path of the pickled DataFrame to inspect.
    min_freq_mhz : float, optional
        Floor applied to the minimum ``signal_frequency``. Defaults to 2000,
        the value this notebook reports as "2000 MHz".

    Returns
    -------
    str or None
        ``file_path`` when the file qualifies, otherwise ``None``. Read or
        comparison errors are printed and also yield ``None`` so that one bad
        file does not abort a parallel scan.
    """
    try:
        # SECURITY: unpickling can execute arbitrary code; only use on trusted files.
        df = pd.read_pickle(file_path)
        if df.empty or 'signal_frequency' not in df.columns:
            return None
        min_freq = df['signal_frequency'].min()
        # min() skips NaN by default, but an all-NaN column still yields NaN.
        if pd.notna(min_freq) and min_freq >= min_freq_mhz:
            return file_path
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    return None

if __name__ == "__main__":
    # Screen every candidate in parallel; the pool size is multiprocessing.cpu_count().
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(func=check_file, iterable=full_paths)

    # check_file returns None for rejected or unreadable files; keep the real paths.
    high_freq_files = list(filter(lambda path: path is not None, results))
    print(f"\n {len(high_freq_files)} files with frequency ≥ 2000 MHz")



 50 files with frequency ≥ 2000 MHz


In [3]:


# --- 1. Load Crickets CSV ---
crickets_path = "Full_Crickets.csv"
intervals = pd.read_csv(crickets_path)
# Header cells may carry stray whitespace; normalise before renaming.
intervals.columns = intervals.columns.str.strip()

# Rename the bin columns, coerce both edges to numbers (unparseable -> NaN),
# then discard any band that is missing an edge.
intervals = (
    intervals
    .rename(columns={
        "rfi_bin_bots": "start_frequency",
        "rfi_bin_tops": "end_frequency",
    })
    .assign(
        start_frequency=lambda frame: pd.to_numeric(frame['start_frequency'], errors='coerce'),
        end_frequency=lambda frame: pd.to_numeric(frame['end_frequency'], errors='coerce'),
    )
    .dropna(subset=['start_frequency', 'end_frequency'])
)

# --- 2. Define NRAO manual bands ---
# Hand-entered (start, end) pairs that are appended to the Crickets bands below.
# Units are presumably MHz, matching the 2000 MHz cut used elsewhere in this
# notebook -- TODO confirm against the NRAO source list.
# Notes on the table as written:
#   * it is not sorted, and several entries overlap (e.g. (9300.0, 9900.0) and
#     (9300.0, 9500.0)); merge_overlapping_bands collapses these later;
#   * entries with start == end (e.g. (2106.4, 2106.4)) are zero-width, so under
#     the inclusive comparison in filter_hits_by_rfi they only match a hit at
#     exactly that frequency.
nrao_ranges = [
    (2178.0, 2195.0), (2106.4, 2106.4), (2204.5, 2204.5), (2180, 2290),
    (2227.5, 2231.5), (2246.5, 2252.5), (2268.5, 2274.5), (2282.5, 2288.5),
    (2314.5, 2320.5), (2320.0, 2332.5), (2324.5, 2330.5), (2332.5, 2345.0),
    (2334.5, 2340.5), (2387.5, 2387.5), (2400.0, 2483.5), (2411.0, 2413.0),
    (2483.5, 2500.0), (2741.0, 2741.0), (2791.0, 2791.0), (3700.0, 4200.0),
    (5648.5, 5663.5), (5659.5, 5670.5), (5695.5, 5704.5), (5742.5, 5757.5),
    (5765.0, 5769.0), (5796.0, 5804.0), (6108.1, 6138.1), (6137.75, 6167.75),
    (6182.0, 6212.0), (6360.14, 6390.14), (6389.79, 6419.79), (6772.0, 6778.0),
    (7250.0, 7850.0), (9300.0, 9900.0), (9300.0, 9500.0), (10740.0, 10770.0),
    (10820.0, 10850.0), (10957.0, 10993.0), (11037.0, 11073.0), (11230.0, 11260.0),
    (11310.0, 11340.0), (11447.0, 11483.0), (11527.0, 11563.0), (11700.0, 12000.0),
    (12000.0, 12700.0), (13400.0, 13750.0), (17800.0, 20200.0), (29500.0, 30000.0),
    (34875.0, 34875.0), (36286.0, 36286.0)
]

# --- 3. Combine Crickets + Manual RFI bands ---
# Stack the two band tables (Crickets first, manual second) under a fresh 0..n-1 index.
manual_df = pd.DataFrame(data=nrao_ranges, columns=['start_frequency', 'end_frequency'])
combined_df = pd.concat(
    objs=[intervals.loc[:, ['start_frequency', 'end_frequency']], manual_df],
    ignore_index=True,
)

# --- 4. Merge overlapping bands ---
def merge_overlapping_bands(bands, tol=1e-6):
    """Collapse overlapping or touching (start, end) bands into disjoint ones.

    Bands are visited in order of increasing start. A band is folded into the
    previously emitted one when its start is no more than ``tol`` past that
    band's end; otherwise it opens a new output band.

    Parameters
    ----------
    bands : iterable of (start, end)
        Input bands in any order.
    tol : float, optional
        Largest gap between two bands that is still treated as adjacent.

    Returns
    -------
    list
        Bands sorted by start. Unmerged bands are passed through as given;
        merged ones are ``(start, end)`` tuples.
    """
    merged = []
    for band in sorted(bands, key=lambda pair: pair[0]):
        if merged:
            last_lo, last_hi = merged[-1]
            lo, hi = band
            if lo <= last_hi + tol:
                # Overlapping or adjacent: stretch the previous band instead of adding one.
                merged[-1] = (last_lo, max(last_hi, hi))
                continue
        merged.append(band)
    return merged

# Turn the combined table into plain (start, end) tuples and merge them.
combined_bands = list(
    combined_df[['start_frequency', 'end_frequency']].itertuples(index=False, name=None)
)
merged_bands = merge_overlapping_bands(combined_bands)

# --- 5. Save merged bands to CSV ---
# Written to the current working directory so later cells can read it by bare filename.
merged_df = pd.DataFrame(data=merged_bands, columns=['start_frequency', 'end_frequency'])
save_path = os.path.join(os.getcwd(), "Full_Crickets_merged.csv")
merged_df.to_csv(path_or_buf=save_path, index=False)
print(f" Saved merged RFI band list to {save_path}")


 Saved merged RFI band list to /mnt_home/ellambishop/SETI-2025/cosmic-testing/Full_Crickets_merged.csv


In [4]:
# Remove hits that fall inside the merged NRAO + CRICKETS RFI bands

# --- Update these paths ---
filtered_files = high_freq_files  # List of full file paths
output_dir = '/datax/scratch/ellambishop/rfi_removed_hits'
os.makedirs(name=output_dir, exist_ok=True)

# --- Load and clean RFI bands ---
intervals = pd.read_csv("Full_Crickets_merged.csv")
intervals.columns = intervals.columns.str.strip()
# The rename only has an effect if the CSV still uses the raw Crickets column names.
intervals = (
    intervals
    .rename(columns={
        "rfi_bin_bots": "start_frequency",
        "rfi_bin_tops": "end_frequency",
    })
    .assign(
        start_frequency=lambda frame: pd.to_numeric(frame['start_frequency'], errors='coerce'),
        end_frequency=lambda frame: pd.to_numeric(frame['end_frequency'], errors='coerce'),
    )
)
# One row per band, columns (start, end); bands with a missing edge are excluded.
rfi_bands = (
    intervals
    .dropna(subset=['start_frequency', 'end_frequency'])
    .loc[:, ['start_frequency', 'end_frequency']]
    .to_numpy()
)


def filter_hits_by_rfi(df, bands=None):
    """Drop every row whose ``signal_frequency`` lies inside an RFI band.

    A hit is removed when ``start <= signal_frequency <= end`` for at least one
    band (both edges inclusive). Rows with a NaN frequency are kept.

    Rather than testing every band against every row, the bands are sorted by
    start and a running maximum of their ends is built. A frequency is inside
    some band exactly when the furthest end among all bands starting at or
    below it is still >= the frequency. Each hit therefore costs one binary
    search, and the bands may overlap or arrive unsorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Hits table; must contain a numeric ``signal_frequency`` column.
    bands : array-like of (start, end), optional
        RFI bands to excise. Defaults to the module-level ``rfi_bands``.
        Bands with a NaN edge are ignored.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall outside every band.
    """
    if bands is None:
        bands = rfi_bands
    bands = np.asarray(bands, dtype=float).reshape(-1, 2)
    bands = bands[~np.isnan(bands).any(axis=1)]

    freqs = df['signal_frequency'].to_numpy(dtype=float)
    if len(bands) == 0 or len(freqs) == 0:
        # Nothing to excise (or nothing to test): keep every row.
        return df[np.ones(len(freqs), dtype=bool)]

    order = np.argsort(bands[:, 0], kind="stable")
    starts = bands[order, 0]
    # furthest_end[i] = largest end among the i+1 bands with the smallest starts.
    furthest_end = np.maximum.accumulate(bands[order, 1])

    # Index of the last band whose start is <= the frequency (-1 if none).
    last_started = np.searchsorted(starts, freqs, side="right") - 1
    in_rfi = (last_started >= 0) & (freqs <= furthest_end[np.clip(last_started, 0, None)])

    return df[~in_rfi]


def process_file(filepath):
    """Filter one hits pickle and write the survivors under ``output_dir``.

    The frame is read from ``filepath``, rows with ``signal_frequency`` below
    2000 are dropped, and the remainder is passed through
    ``filter_hits_by_rfi``. A non-empty result is pickled to ``output_dir``
    under the input's base name; an empty result is reported and not saved.
    Any exception is printed rather than raised. Returns ``None``.
    """
    destination = os.path.join(output_dir, os.path.basename(filepath))

    try:
        hits = pd.read_pickle(filepath)

        # Frequency floor first, then the RFI band cut.
        above_floor = hits.loc[hits['signal_frequency'] >= 2000]
        survivors = filter_hits_by_rfi(above_floor)

        if not survivors.empty:
            survivors.to_pickle(destination)
            print(f"{filepath}: Saved filtered hits ({len(survivors)} rows)")
        else:
            print(f"{filepath}: No hits remain after filtering, skipping save.")

    except Exception as e:
        print(f"Error processing {filepath}: {e}")


if __name__ == "__main__":
    print(f"Processing {len(filtered_files)} files with multiprocessing...")
    # process_file returns nothing, so the mapped results are deliberately discarded;
    # the pool size is multiprocessing.cpu_count().
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pool.map(func=process_file, iterable=filtered_files)

    print("Done filtering all files.")


Processing 50 files with multiprocessing...


/datag/users/ctremblay/Summer_Project_RA10_Dec0.26.pkl: Saved filtered hits (344999 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.22.pkl: Saved filtered hits (478099 rows)
/datag/users/ctremblay/Summer_Project_RA9_Dec0.169.pkl: Saved filtered hits (1000000 rows)
/datag/users/ctremblay/Summer_Project_RA9_Dec0.175.pkl: Saved filtered hits (252400 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.6.pkl: Saved filtered hits (89908 rows)
/datag/users/ctremblay/Summer_Project_RA9_Dec0.184.pkl: Saved filtered hits (1000000 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.9.pkl: Saved filtered hits (80743 rows)
/datag/users/ctremblay/Summer_Project_RA9_Dec0.183.pkl: Saved filtered hits (662375 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.10.pkl: Saved filtered hits (464136 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.16.pkl: Saved filtered hits (122589 rows)
/datag/users/ctremblay/Summer_Project_RA10_Dec0.24.pkl: Saved filtered hits (477338 rows)
/datag/users

In [2]:
# File splitting: bucket hits by VLASS vs. non-VLASS file_uri, then by source_name category

def categorize_and_append(df_subset, incoh_list, phase_list, other_list):
    """Split ``df_subset`` by ``source_name`` and append the pieces to the buckets.

    Matching is a case-insensitive substring test on ``source_name`` (missing
    names are treated as empty strings):

    * contains ``"incoherent"``   -> appended to ``incoh_list``
    * contains ``"phase_center"`` -> appended to ``phase_list``
    * contains neither            -> appended to ``other_list``

    The first two tests are independent, so a row whose name contains both
    substrings is appended to both buckets. A bucket receives nothing when no
    row matches it, and an empty ``df_subset`` is a no-op.
    """
    if df_subset.empty:
        return

    source_lower = df_subset["source_name"].fillna("").str.lower()
    incoherent_mask = source_lower.str.contains("incoherent")
    phase_center_mask = source_lower.str.contains("phase_center")
    other_mask = ~(incoherent_mask | phase_center_mask)

    for mask, bucket in (
        (incoherent_mask, incoh_list),
        (phase_center_mask, phase_list),
        (other_mask, other_list),
    ):
        if mask.any():
            bucket.append(df_subset[mask])


# sorted() makes the processing order, and hence the row order of the
# concatenated outputs, reproducible; glob itself returns files in arbitrary order.
file_list = sorted(glob.glob("/datax/scratch/ellambishop/rfi_removed_hits/Summer*.pkl"))

# Initialize category buckets
vlass_incoherent = []
vlass_phase_center = []
vlass_other = []

non_vlass_incoherent = []
non_vlass_phase_center = []
non_vlass_other = []

for file in file_list:
    if 'Dec-45' in file:
        continue  # Skip large files

    try:
        df = pd.read_pickle(file)
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

    if "file_uri" not in df.columns or "source_name" not in df.columns:
        # Say so instead of silently dropping the file from every bucket.
        print(f"Skipping {file}: missing 'file_uri' and/or 'source_name' column")
        continue

    # Case-insensitive 'vlass' test on the URI; rows without a URI count as non-VLASS.
    vlass_mask = df["file_uri"].str.contains("vlass", case=False, na=False)

    # Diagnostics kept from the original cell.
    print(f"Unique file_uri samples: {df['file_uri'].dropna().unique()[:5]}")
    print(f"Does file_uri contain 'vlass' anywhere? {vlass_mask.any()}")

    # Boolean indexing already returns new frames, so no extra .copy() is needed.
    df_vlass = df[vlass_mask]
    df_non_vlass = df[~vlass_mask]

    categorize_and_append(df_vlass, vlass_incoherent, vlass_phase_center, vlass_other)
    categorize_and_append(df_non_vlass, non_vlass_incoherent, non_vlass_phase_center, non_vlass_other)

# Destination for the organized outputs.
# NOTE(review): this rebinds the same global `output_dir` that process_file reads,
# so re-running the RFI-filter cell after this one writes to this directory instead.
output_dir = "/datax/scratch/ellambishop/new_hits_organized"
os.makedirs(name=output_dir, exist_ok=True)

def save_if_not_empty(dfs_list, filename, out_dir=None):
    """Concatenate a list of frames and pickle the result, if there is anything to save.

    Parameters
    ----------
    dfs_list : list of pandas.DataFrame
        Frames to stack (the index is reset to 0..n-1). An empty list is a no-op.
    filename : str
        File name of the pickle to create inside the output directory.
    out_dir : str, optional
        Directory to write into. Defaults to the module-level ``output_dir`` as
        it is bound at call time, which keeps existing two-argument calls working.

    Returns
    -------
    None
        The function only writes the file and prints the path it saved to.
    """
    if not dfs_list:
        return
    if out_dir is None:
        out_dir = output_dir
    out_path = os.path.join(out_dir, filename)
    pd.concat(dfs_list, ignore_index=True).to_pickle(out_path)
    print(f"Saved: {out_path}")

# VLASS buckets
save_if_not_empty(dfs_list=vlass_incoherent, filename="vlass_incoherent.pkl")
save_if_not_empty(dfs_list=vlass_phase_center, filename="vlass_phase_center.pkl")
save_if_not_empty(dfs_list=vlass_other, filename="vlass_other.pkl")

# Non-VLASS buckets
save_if_not_empty(dfs_list=non_vlass_incoherent, filename="non_vlass_incoherent.pkl")
save_if_not_empty(dfs_list=non_vlass_phase_center, filename="non_vlass_phase_center.pkl")
save_if_not_empty(dfs_list=non_vlass_other, filename="non_vlass_other.pkl")


Unique file_uri samples: ['/mnt/cosmic-storage-2/data3/vla_target/25A-404.sb48193145.eb48229053.60757.05847386574/25A-404.sb48193145.eb48229053.60757.05847386574.8.1/25A-404.sb48193145.eb48229053.60757.05847386574.8.1.AC.C992.0000.raw.seticore.hits'
 '/mnt/cosmic-storage-2/data3/vla_target/25A-404.sb48193145.eb48229053.60757.05847386574/25A-404.sb48193145.eb48229053.60757.05847386574.8.1/25A-404.sb48193145.eb48229053.60757.05847386574.8.1.BD.C544.0000.raw.seticore.hits'
 '/mnt/cosmic-storage-2/data3/vla_target/25A-404.sb48193145.eb48229053.60757.05847386574/25A-404.sb48193145.eb48229053.60757.05847386574.8.1/25A-404.sb48193145.eb48229053.60757.05847386574.8.1.AC.C608.0000.raw.seticore.hits'
 '/mnt/cosmic-storage-2/data3/vla_target/25A-404.sb48193145.eb48229053.60757.05847386574/25A-404.sb48193145.eb48229053.60757.05847386574.6.1/25A-404.sb48193145.eb48229053.60757.05847386574.6.1.AC.C672.0014.raw.seticore.hits'
 '/mnt/cosmic-storage-2/data3/vla_target/25A-404.sb48193145.eb48229053.6075

In [3]:
# Check which columns have the fewest unique values (candidate grouping keys)
import os
import pandas as pd
from collections import defaultdict

# Track unique values for each column, accumulated across all files.
# NOTE: every distinct value is held in a Python set, so high-cardinality
# columns make this memory-hungry.
global_uniques = defaultdict(set)

directory = '/datax/scratch/ellambishop/rfi_removed_hits/'
total = 0

for file in sorted(os.listdir(directory)):
    # Only pickles belong in the tally; any other directory entry would make
    # read_pickle fail.
    if not file.endswith('.pkl'):
        continue
    if 'Dec-45' in file:
        continue  # Skip large files

    file_path = os.path.join(directory, file)
    try:
        df = pd.read_pickle(file_path)
    except Exception as e:
        # One unreadable file should not abort the whole survey.
        print(f"Error reading {file_path}: {e}")
        continue

    print(f"Processing {file} ({len(df)} rows)")
    total += len(df)

    for col in df.columns:
        # Restrict to object / signed-int / unsigned-int / bytes dtypes; floats are skipped.
        # NOTE(review): object columns holding unhashable values (lists, arrays)
        # would still raise in unique() -- confirm none exist in these frames.
        if df[col].dtype.kind in {'O', 'i', 'u', 'S'}:
            unique_vals = df[col].unique()
            global_uniques[col].update(unique_vals)

# Rank the columns from fewest to most distinct values seen across all files.
print("\nGlobal grouping candidates with fewest unique values:")
summary = dict(zip(global_uniques.keys(), map(len, global_uniques.values())))
for col, count in sorted(summary.items(), key=lambda pair: pair[1]):
    print(f"{col:<25} → {count} unique values")

print(f"\nTotal rows across all files: {total}")


Processing Summer_Project_RA10_Dec0.1.pkl (712727 rows)
Processing Summer_Project_RA10_Dec0.10.pkl (464136 rows)
Processing Summer_Project_RA10_Dec0.11.pkl (157980 rows)
Processing Summer_Project_RA10_Dec0.12.pkl (179218 rows)
Processing Summer_Project_RA10_Dec0.13.pkl (112923 rows)
Processing Summer_Project_RA10_Dec0.14.pkl (1000000 rows)
Processing Summer_Project_RA10_Dec0.15.pkl (463468 rows)
Processing Summer_Project_RA10_Dec0.16.pkl (122589 rows)
Processing Summer_Project_RA10_Dec0.17.pkl (33540 rows)
Processing Summer_Project_RA10_Dec0.18.pkl (152741 rows)
Processing Summer_Project_RA10_Dec0.19.pkl (85043 rows)
Processing Summer_Project_RA10_Dec0.2.pkl (777537 rows)
Processing Summer_Project_RA10_Dec0.20.pkl (552798 rows)
Processing Summer_Project_RA10_Dec0.21.pkl (449162 rows)
Processing Summer_Project_RA10_Dec0.22.pkl (478099 rows)
Processing Summer_Project_RA10_Dec0.23.pkl (101294 rows)
Processing Summer_Project_RA10_Dec0.24.pkl (477338 rows)
Processing Summer_Project_RA10_Dec

In [7]:
# Re-print the unique-value ranking from the tallies already held in
# global_uniques and total (no files are re-read here).
print("\nGlobal grouping candidates with fewest unique values:")
summary = {name: len(global_uniques[name]) for name in global_uniques}
for col in sorted(summary, key=summary.get):
    count = summary[col]
    print(f"{col:<25} → {count} unique values")

print(f"\nTotal rows across all files: {total}")



Global grouping candidates with fewest unique values:
telescope_id              → 1 unique values
tuning                    → 2 unique values
signal_beam               → 6 unique values
signal_num_timesteps      → 6 unique values
num_timesteps             → 6 unique values
subband_offset            → 32 unique values
signal_coarse_channel     → 32 unique values
coarse_channel            → 32 unique values
num_channels              → 923 unique values
observation_id            → 1743 unique values
signal_drift_steps        → 1765 unique values
source_name               → 13295 unique values
beam_id                   → 31566 unique values
file_local_enumeration    → 32997 unique values
file_uri                  → 142591 unique values
start_channel             → 506857 unique values
signal_index              → 507937 unique values
id                        → 29914722 unique values

Total rows across all files: 29914722
