In [42]:
# Check for duplicates and nulls
print("\n🧼 BATSE")
print(batse.duplicated(subset=["trigger"]).sum(), "duplicate triggers")
print(batse.isnull().sum().sort_values(ascending=False).head())

print("\n🧼 Supernovae")
print(sne.duplicated(subset=["name"]).sum(), "duplicate names")
print(sne.isnull().sum().sort_values(ascending=False).head())

print("\n🧼 OSC")
print(osc.duplicated(subset=["name"]).sum(), "duplicate names")
print(osc.isnull().sum().sort_values(ascending=False).head())



🧼 BATSE
0 duplicate triggers
gal_lon     843
gal_lat     843
time_sec    843
tjd         843
datetime    843
dtype: int64

🧼 Supernovae
0 duplicate names
maxabsmag       1508
redshift        1507
distance_mpc    1507
type             131
dec                0
dtype: int64

🧼 OSC
0 duplicate names
redshift    4
name        0
ra_deg      0
dec_deg     0
datetime    0
dtype: int64


In [43]:
print("📘 BATSE Schema:")
print(batse.dtypes)

print("\n📘 SNe Schema:")
print(sne.dtypes)

print("\n📘 OSC Schema:")
print(osc.dtypes)


📘 BATSE Schema:
trigger                  float64
fluence_ch1              float64
err_ch1                  float64
fluence_ch2              float64
err_ch2                  float64
fluence_ch3              float64
err_ch3                  float64
fluence_ch4              float64
err_ch4                  float64
peak_flux_64ms           float64
err_flux_64ms            float64
time_flux_64ms           float64
peak_flux_256ms          float64
err_flux_256ms           float64
time_flux_256ms          float64
peak_flux_1024ms         float64
err_flux_1024ms          float64
time_flux_1024ms         float64
name                      object
burst_id                  object
tjd                      float64
time_sec                 float64
ra_deg                   float64
dec_deg                  float64
gal_lon                  float64
gal_lat                  float64
error_radius             float64
geocenter_angle          float64
overwrite_flag            object
overwritten_flag          o

In [5]:
md_lines = []

datasets = {
    "batse_master_grb_registry.csv": batse,
    "sne_1990s_cleaned.csv": sne,
    "open_supernova_catalog.csv": osc
}

for name, df in datasets.items():
    md_lines.append(f"###  {name}")
    md_lines.append("")
    md_lines.append(f"- **Rows**: {len(df)}")
    md_lines.append(f"- **Columns**: {len(df.columns)}")
    md_lines.append("- **Schema**:")
    for col in df.columns:
        md_lines.append(f"  - `{col}`: {df[col].dtype}")
    md_lines.append("")

# Save as .md or print
markdown_doc = "\n".join(md_lines)
with open(registry_dir / "DATASET_SCHEMA_LOG.md", "w") as f:
    f.write(markdown_doc)

print(" Markdown schema log saved to DATASET_SCHEMA_LOG.md")


 Markdown schema log saved to DATASET_SCHEMA_LOG.md


In [7]:
from pathlib import Path
import pandas as pd

# Define root directory to start the search
project_root = Path("..").resolve()
all_csv_files = list(project_root.rglob("*.csv"))

# Variants of coordinate column names we want to detect
ra_variants = {"ra", "RA", "ra_deg", "RA_deg"}
dec_variants = {"dec", "DEC", "dec_deg", "DEC_deg"}

print("🔍 Scanning CSV files for RA/Dec variants...\n")

# Scan each file
for csv_file in all_csv_files:
    try:
        df = pd.read_csv(csv_file, nrows=5)  # Read only header and a few rows
        cols = set(df.columns.str.lower())

        has_ra = any(col in cols for col in {"ra", "ra_deg"})
        has_dec = any(col in cols for col in {"dec", "dec_deg"})

        if has_ra or has_dec:
            print(f"📁 {csv_file.relative_to(project_root)}")
            print(f"    Columns: {list(df.columns)}\n")
    except Exception as e:
        print(f"⚠️ Skipped {csv_file.name} due to error: {e}")


🔍 Scanning CSV files for RA/Dec variants...

⚠️ Skipped bruno_batse_icecube_crossmatch.csv due to error: No columns to parse from file
⚠️ Skipped bruno_grb_icecube_matches.csv due to error: No columns to parse from file
⚠️ Skipped bruno_sn_icecube_matches.csv due to error: No columns to parse from file
⚠️ Skipped grb_icecube_crossmatch.csv due to error: No columns to parse from file
📁 data\exports\sn_bruno_candidates.csv
    Columns: ['name', 'ra_deg', 'dec_deg', 'discoverdate', 'claimedtype', 'redshift', 'distance_m', 'fluence']

📁 data\raw\fermi_swift_grb_catalog.csv
    Columns: ['id', 'GRB_name', 'ra_deg', 'dec_deg', 'Error_Radius', 'Redshift', 'Trigger_Time', 'LAT_Boresight', 'Swift_Trigger_Number', 'GBM_Trigger_Number', 'Detection_Flags', 'Likelihood_Detection', 'LLE_Significance', 'Likelihood_Significance', 'Position_Source']

📁 data\raw\Icecube_HESE.csv
    Columns: ['id', 'mjd', 'ra_deg', 'dec_deg', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'reconstruction', 'energy', 'd

In [44]:
from pathlib import Path
import pandas as pd

# Define project root dynamically (works inside /notebooks/)
project_root = Path("..").resolve()

# Core data directories
registry_dir   = project_root / "data" / "registry"
raw_data_dir   = project_root / "data" / "raw"
snfluxes_dir   = raw_data_dir / "snfluxes-public-master"
export_dir     = project_root / "data" / "exports"

# Key data files

fluence_table         = registry_dir / "Supernova_Bruno_Fluence_Table.xlsx"
grb_path              = registry_dir / "grb_catalog_cleaned.csv"
icecube_path          = raw_data_dir / "Icecube_HESE.csv"
sn_flux_file          = snfluxes_dir / "Nakazato_2013" / "nakazato-shen-z0.004-t_rev100ms-s20.0.fits"
uzc_fk_compact        = raw_data_dir / "uzc_fk_compact.csv"
osc_path              = registry_dir / "open_supernova_catalog.csv"
batse_path            = registry_dir / "batse_master_grb_registry.csv"
crossmatch_path       = export_dir / "sne_batse_crossmatches.csv"
sne_1990s_path        = registry_dir / "sne_1990s_cleaned.csv"

# Define which registry you want to use
selected_registry = "bruno_entropy_event_log"  # or "bruno_events_registry", "data_file_registry"

# Safe filename mapping
registry_files = {
    "bruno_events_registry": "bruno_events_registry.csv",
    "bruno_entropy_event_log": "bruno_entropy_event_log.csv",
    "data_file_registry": "data_file_registry.csv"
}

# Resolve the full path
registry_path = registry_dir / registry_files[selected_registry]

# Confirm
print(f"📄 Using registry: {registry_path}")



# Load registry (example)
registry = pd.read_csv(registry_path)

# Load BATSE and compute fluence
batse = pd.read_csv(batse_path)
batse["fluence_total_erg_cm2"] = (
    batse["fluence_ch1"].fillna(0) +
    batse["fluence_ch2"].fillna(0) +
    batse["fluence_ch3"].fillna(0) +
    batse["fluence_ch4"].fillna(0)
)
batse["fluence_total_J_m2"] = batse["fluence_total_erg_cm2"] * 0.1


📄 Using registry: D:\Bruno_Entropy_Project\data\registry\bruno_entropy_event_log.csv


In [25]:
import os
import pandas as pd

# Dynamically resolve path regardless of OS or cwd
project_root = os.path.abspath("..")  # Goes one level up from /notebooks/
exports_path = os.path.join(project_root, "data", "exports")  # ✅ Correct

# Fields to standardize
rename_map = {
    "Fluence": "Fluence_at_Earth_J_per_m2",
    "Fluence_J_m2": "Fluence_at_Earth_J_per_m2",
    "RA (J2000)": "RA_J2000",
    "Dec (J2000)": "Dec_J2000",
    "Distance (Mpc)": "Distance_Mpc",
    "Explosion Energy (erg)": "Explosion_Energy_erg",
    "Bruno Trigger Time (s)": "Bruno_Trigger_Time_s",
    "Detection Date (UTC)": "Detection_Date_UTC"
}

# Loop and clean headers
for f in os.listdir(exports_path):
    if f.endswith(".csv"):
        full_path = os.path.join(exports_path, f)
        try:
            df = pd.read_csv(full_path)
            original_cols = df.columns.tolist()
            cleaned = df.rename(columns=rename_map)
            if cleaned.columns.tolist() != original_cols:
                print(f"🛠️ Fixed headers in: {f}")
                cleaned.to_csv(full_path, index=False)
        except Exception as e:
            print(f"❌ Error in {f}: {e}")


❌ Error in grb_icecube_crossmatch.csv: No columns to parse from file


In [35]:
from pathlib import Path
import pandas as pd
import yaml

registry_dir = Path("../data/registry")
schema_files = list(registry_dir.glob("*.yaml"))

for schema_path in schema_files:
    with open(schema_path, "r") as f:
        schema = yaml.safe_load(f)

    # Skip non-dict schemas
    if not isinstance(schema, dict):
        continue

    updated = False
    for key in list(schema.keys()):
        # Fix field names
        if "fluence" in key.lower():
            schema["fluence_j_m2"] = {
                "title": "Fluence at Earth",
                "description": "Total gamma-ray burst fluence received at Earth in joules per square meter (J/m²)",
                "unit": "J/m²",
                "type": "number"
            }
            del schema[key]
            updated = True

    # Save if updated
    if updated:
        with open(schema_path, "w") as f:
            yaml.dump(schema, f)
        print(f"✅ Standardized fluence in: {schema_path.name}")


In [26]:
for file in registry_dir.glob("*.csv"):
    df = pd.read_csv(file)
    renamed = False

    for col in df.columns:
        if "fluence" in col.lower() and "j" in col.lower():
            df.rename(columns={col: "fluence_j_m2"}, inplace=True)
            renamed = True

    if renamed:
        df.to_csv(file, index=False)
        print(f"📝 Updated fluence column in: {file.name}")


📝 Updated fluence column in: bruno_entropy_event_log.csv
📝 Updated fluence column in: bruno_events_registry.csv
📝 Updated fluence column in: Supernova_Bruno_Fluence_Table.csv


In [33]:
from pathlib import Path
import pandas as pd
import yaml

registry_dir = Path("../data/registry")
schema_files = list(registry_dir.glob("*.yaml"))

for schema_path in schema_files:
    with open(schema_path, "r") as f:
        schema = yaml.safe_load(f)

    # Skip non-dict schemas
    if not isinstance(schema, dict):
        continue

    updated = False
    for key in list(schema.keys()):
        # Fix field names
        if "fluence" in key.lower():
            schema["fluence_j_m2"] = {
                "title": "Fluence at Earth",
                "description": "Total gamma-ray burst fluence received at Earth in joules per square meter (J/m²)",
                "unit": "J/m²",
                "type": "number"
            }
            del schema[key]
            updated = True

    # Save if updated
    if updated:
        with open(schema_path, "w") as f:
            yaml.dump(schema, f)
        print(f"✅ Standardized fluence in: {schema_path.name}")


In [37]:
for file in registry_dir.glob("*.csv"):
    df = pd.read_csv(file)
    renamed = False

    for col in df.columns:
        if "fluence" in col.lower() and "j" in col.lower():
            df.rename(columns={col: "fluence_j_m2"}, inplace=True)
            renamed = True

    if renamed:
        df.to_csv(file, index=False)
        print(f"📝 Updated fluence column in: {file.name}")


📝 Updated fluence column in: bruno_entropy_event_log.csv
📝 Updated fluence column in: bruno_events_registry.csv
📝 Updated fluence column in: Supernova_Bruno_Fluence_Table.csv


In [38]:
from pathlib import Path
import yaml

# Directory with schemas
registry_dir = Path("../data/registry")
schema_files = list(registry_dir.glob("*.yaml"))

# Define the normalized fluence field
normalized_key = "fluence_j_m2"
standard_field = {
    "title": "Fluence at Earth",
    "description": "Total gamma-ray burst fluence received at Earth in joules per square meter (J/m²)",
    "unit": "J/m²",
    "type": "number"
}

def replace_fluence_fields(schema: dict) -> bool:
    updated = False
    if not isinstance(schema, dict):
        return False

    # Search top-level
    for key in list(schema.keys()):
        if "fluence" in key.lower() and key != normalized_key:
            schema[normalized_key] = standard_field
            del schema[key]
            updated = True

    # Search inside "properties"
    if "properties" in schema and isinstance(schema["properties"], dict):
        props = schema["properties"]
        for key in list(props.keys()):
            if "fluence" in key.lower() and key != normalized_key:
                props[normalized_key] = standard_field
                del props[key]
                updated = True

    return updated

# Scan and apply changes
for schema_path in schema_files:
    try:
        with open(schema_path, "r") as f:
            schema = yaml.safe_load(f)

        if replace_fluence_fields(schema):
            with open(schema_path, "w", encoding="utf-8") as f:
                yaml.dump(schema, f, sort_keys=False, allow_unicode=True)
            print(f"✅ Fixed fluence field in: {schema_path.name}")
    except Exception as e:
        print(f"⚠️ Error reading {schema_path.name}: {e}")


In [39]:
from pathlib import Path
import pandas as pd

# Directory to scan for .csv files
registry_dir = Path("../data/registry")  # Adjust as needed
csv_files = list(registry_dir.glob("*.csv"))

# Known fluence column variants
fluence_aliases = [
    "Fluence at Earth (J/m²)",
    "fluence_total_J_m2",
    "fluence_total_j_m2",
    "fluence_total_erg_cm2",  # Will convert if needed
    "Fluence at Earth",
]

# Conversion factor from erg/cm² to J/m²
erg_to_joules = 0.1

def fix_fluence_column(df: pd.DataFrame) -> pd.DataFrame:
    for alias in fluence_aliases:
        if alias in df.columns:
            # If the column is in erg/cm², convert it
            if alias == "fluence_total_erg_cm2":
                df["fluence_j_m2"] = df[alias] * erg_to_joules
            else:
                df["fluence_j_m2"] = df[alias]
            df.drop(columns=[alias], inplace=True)
            return df
    return df

# Process each CSV
for file in csv_files:
    try:
        backup_path = file.with_suffix(".csv.bak")
        file.rename(backup_path)
        df.to_csv(file, index=False)

        original_cols = set(df.columns)
        df = fix_fluence_column(df)
        if "fluence_j_m2" in df.columns and set(df.columns) != original_cols:
            df.to_csv(file, index=False)
            print(f"✅ Updated fluence field in: {file.name}")
    except Exception as e:
        print(f"⚠️ Could not process {file.name}: {e}")


In [40]:
from pathlib import Path
import pandas as pd

# Directory with .csv files
registry_dir = Path("../data/registry")  # Update if needed
csv_files = list(registry_dir.glob("*.csv"))

# Known variants
fluence_aliases = {
    "Fluence at Earth (J/m²)": "direct",
    "fluence_total_J_m2": "direct",
    "fluence_total_j_m2": "direct",
    "Fluence at Earth": "direct",
    "fluence_total_erg_cm2": "convert",
}

# Conversion factor for erg/cm² to J/m²
erg_to_joules = 0.1

# Tracking log
log = []

def fix_fluence_column(df: pd.DataFrame, file_name: str) -> pd.DataFrame:
    for col, mode in fluence_aliases.items():
        if col in df.columns:
            if mode == "convert":
                df["fluence_j_m2"] = df[col] * erg_to_joules
            else:
                df["fluence_j_m2"] = df[col]
            df.drop(columns=[col], inplace=True)
            log.append({"file": file_name, "original_column": col, "action": mode})
            return df
    return df

# Process files
for file in csv_files:
    try:
        df = pd.read_csv(file)
        cols_before = set(df.columns)
        df = fix_fluence_column(df, file.name)
        if "fluence_j_m2" in df.columns and set(df.columns) != cols_before:
            df.to_csv(file, index=False)
            print(f"✅ Updated fluence field in: {file.name}")
    except Exception as e:
        print(f"⚠️ Skipped {file.name}: {e}")

# Save report log
if log:
    report_df = pd.DataFrame(log)
    report_path = registry_dir / "fluency_column_update_report.csv"
    report_df.to_csv(report_path, index=False)
    print(f"📄 Report saved to: {report_path}")
else:
    print("ℹ️ No updates were made to any files.")


ℹ️ No updates were made to any files.


In [41]:
def generate_schema_from_csv(csv_path, yaml_path):
    import pandas as pd, yaml

    df = pd.read_csv(csv_path, nrows=5)  # sample first few rows
    schema = {
        "columns": [
            {
                "name": col,
                "type": str(df[col].dtype),
                "description": "TBD"
            } for col in df.columns
        ]
    }

    with open(yaml_path, "w") as f:
        yaml.dump(schema, f, sort_keys=False)

    print(f"✅ Schema written to {yaml_path}")


In [3]:
import pandas as pd
from pathlib import Path

log_path = Path("../data/registry/bruno_entropy_event_log.csv")
backup_path = log_path.with_suffix(".csv.bak")

rename_map = {
    "Event Name": "Event_Name",
    "Detection Date (UTC)": "Detection_Date_UTC",
    "Source Galaxy": "Source_Galaxy",
    "RA (J2000)": "RA_J2000",
    "Dec (J2000)": "Dec_J2000",
    "Distance (Mpc)": "Distance_Mpc",
    "Explosion Energy (erg)": "Explosion_Energy_erg",
    "Bruno Trigger Time (s)": "Bruno_Trigger_Time_s",
    "Estimated Collapse Time (UTC)": "Estimated_Collapse_Time_UTC",
    "Fluence at Earth (J/m²)": "Fluence_at_Earth_J_per_m2",
    "Bruno Threshold Crossed": "Bruno_Threshold_Crossed",
    "Neutrino Detected": "Neutrino_Detected",
    "Neutrino Energy (TeV)": "Neutrino_Energy_TeV",
    "Positional Match Confidence": "Positional_Match_Confidence"
}

df = pd.read_csv(log_path)
df.rename(columns=rename_map, inplace=True)

log_path.rename(backup_path)
df.to_csv(log_path, index=False)
print(f"✅ Headers cleaned and original backed up to: {backup_path}")


✅ Headers cleaned and original backed up to: ..\data\registry\bruno_entropy_event_log.csv.bak


In [4]:
import pandas as pd
import yaml
from pathlib import Path

def generate_yaml_preview(csv_path, nrows=5):
    csv_path = Path(csv_path)
    df = pd.read_csv(csv_path, nrows=nrows)

    schema = {
        "dataset_name": csv_path.stem,
        "description": "TBD - describe this dataset",
        "columns": []
    }

    for col in df.columns:
        dtype = str(df[col].dtype)
        schema["columns"].append({
            "name": col,
            "type": dtype,
            "description": "TBD"
        })

    # Print to screen only
    print("\n--- YAML PREVIEW ---\n")
    print(yaml.dump(schema, sort_keys=False, default_flow_style=False))

    # Optional: Save if you want
    # yaml_path = csv_path.with_suffix(".schema.yaml")
    # with open(yaml_path, "w") as f:
    #     yaml.dump(schema, f)
    # print(f"✅ Schema saved: {yaml_path}")

# EXAMPLE USE:
generate_yaml_preview("../data/exports/final/Bruno_Highlight_Events.csv")



--- YAML PREVIEW ---

dataset_name: Bruno_Highlight_Events
description: TBD - describe this dataset
columns:
- name: Name
  type: object
  description: TBD
- name: RA (deg)
  type: float64
  description: TBD
- name: Dec (deg)
  type: float64
  description: TBD
- name: Discovery Date
  type: object
  description: TBD
- name: Distance_Mpc
  type: float64
  description: TBD
- name: "Fluence (J/m\xB2)"
  type: float64
  description: TBD
- name: Label
  type: object
  description: TBD



In [5]:
import pandas as pd
import yaml

# Find all CSV and YAML schema files
csv_files = list(extract_path.glob("*.csv"))
yaml_files = list(extract_path.glob("*.yaml"))

# Build a map of dataset base names → schema path
yaml_map = {f.stem.replace("_schema", ""): f for f in yaml_files}

validation_results = []

# Validate each CSV against its matching YAML schema
for csv_path in csv_files:
    base_name = csv_path.stem
    if base_name in yaml_map:
        yaml_path = yaml_map[base_name]
        try:
            df = pd.read_csv(csv_path, nrows=1)  # Just read headers
            with open(yaml_path, 'r') as f:
                schema = yaml.safe_load(f)

            csv_columns = set(df.columns)
            schema_columns = {col['name'] for col in schema.get("columns", [])}

            missing_in_csv = schema_columns - csv_columns
            extra_in_csv = csv_columns - schema_columns

            validation_results.append({
                "csv": csv_path.name,
                "schema": yaml_path.name,
                "missing_in_csv": list(missing_in_csv),
                "extra_in_csv": list(extra_in_csv),
                "status": "✅ PASS" if not missing_in_csv and not extra_in_csv else "⚠️ MISMATCH"
            })

        except Exception as e:
            validation_results.append({
                "csv": csv_path.name,
                "schema": yaml_path.name,
                "error": str(e),
                "status": "❌ ERROR"
            })
    else:
        validation_results.append({
            "csv": csv_path.name,
            "schema": None,
            "status": "❗ NO SCHEMA FOUND"
        })

import pandas as pd
#import ace_tools as tools; tools.display_dataframe_to_user(name="CSV Schema Validation", dataframe=pd.DataFrame(validation_results))


NameError: name 'extract_path' is not defined

In [7]:
from pathlib import Path

# Redefine extract_path since it's out of scope in this new cell
extract_path = Path("/mnt/data/registry_unzipped")

# Re-run the CSV/YAML validation logic
import pandas as pd
import yaml

# Find all CSV and YAML schema files
csv_files = list(extract_path.glob("*.csv"))
yaml_files = list(extract_path.glob("*.yaml"))

# Build a map of dataset base names → schema path
yaml_map = {f.stem.replace("_schema", ""): f for f in yaml_files}

validation_results = []

# Validate each CSV against its matching YAML schema
for csv_path in csv_files:
    base_name = csv_path.stem
    if base_name in yaml_map:
        yaml_path = yaml_map[base_name]
        try:
            df = pd.read_csv(csv_path, nrows=1)  # Just read headers
            with open(yaml_path, 'r') as f:
                schema = yaml.safe_load(f)

            csv_columns = set(df.columns)
            schema_columns = {col['name'] for col in schema.get("columns", [])}

            missing_in_csv = schema_columns - csv_columns
            extra_in_csv = csv_columns - schema_columns

            validation_results.append({
                "csv": csv_path.name,
                "schema": yaml_path.name,
                "missing_in_csv": list(missing_in_csv),
                "extra_in_csv": list(extra_in_csv),
                "status": "✅ PASS" if not missing_in_csv and not extra_in_csv else "⚠️ MISMATCH"
            })

        except Exception as e:
            validation_results.append({
                "csv": csv_path.name,
                "schema": yaml_path.name,
                "error": str(e),
                "status": " ERROR"
            })
    else:
        validation_results.append({
            "csv": csv_path.name,
            "schema": None,
            "status": " NO SCHEMA FOUND"
        })

#import ace_tools as tools; tools.display_dataframe_to_user(name="CSV Schema Validation", dataframe=pd.DataFrame(validation_results))


# 🧠 Bruno Entropy Project — Structure Notebook

This notebook provides the **core structure and starter paths** for accessing all datasets and outputs within the `Bruno_Entropy_Project`.

Use this as your consistent baseline for:

- 🔍 Loading and inspecting cleaned registries
- 🔄 Accessing curated outputs and crossmatches
- 📦 Linking entropy fluence models with matched SN–GRB–IceCube events
- 📚 Ensuring schema-validated data science workflows

---

## 📁 Directory Layout Overview



In [14]:
from pathlib import Path
import pandas as pd

# Dynamically define project root (assuming this notebook is in /notebooks/)
project_root = Path("..").resolve()

# 📁 Core data directories
registry_dir   = project_root / "data" / "registry"
raw_data_dir   = project_root / "data" / "raw"
export_dir     = project_root / "data" / "exports"
snfluxes_dir   = raw_data_dir / "snfluxes-public-master"

# 📁 Subdirectories inside exports
final_dir         = export_dir / "final"
crossmatch_dir    = export_dir / "crossmatch"
entropy_tables_dir = export_dir / "entropy_tables"

# 📄 Registry-level data files
fluence_table         = registry_dir / "Supernova_Bruno_Fluence_Table.csv"
grb_path              = registry_dir / "grb_catalog_cleaned.csv"
icecube_path          = raw_data_dir / "Icecube_HESE.csv"
sn_flux_file          = snfluxes_dir / "Nakazato_2013" / "nakazato-shen-z0.004-t_rev100ms-s20.0.fits"
uzc_fk_compact        = raw_data_dir / "uzc_fk_compact.csv"
osc_path              = registry_dir / "open_supernova_catalog.csv"
batse_path            = registry_dir / "batse_master_grb_registry.csv"
sne_1990s_path        = registry_dir / "sne_1990s_cleaned.csv"
icecat1_path          = raw_data_dir / "icecat1.csv"


# 🧠 Registry selector
selected_registry = "bruno_entropy_event_log"  # options: bruno_events_registry, data_file_registry
registry_files = {
    "bruno_events_registry": "bruno_events_registry.csv",
    "bruno_entropy_event_log": "bruno_entropy_event_log.csv",
    "data_file_registry": "data_file_registry.csv"
}
registry_path = registry_dir / registry_files[selected_registry]
print(f"📄 Using registry: {registry_path}")

# 📄 Final curated datasets
highlight_path       = final_dir / "Bruno_Highlight_Events.csv"
clustered_candidates = final_dir / "bruno_clustered_entropy_candidates.csv"
relaxed_candidates   = final_dir / "bruno_relaxed_entropy_candidates.csv"
typed_relaxed        = final_dir / "typed_bruno_relaxed_candidates.csv"
sn_bruno_path        = final_dir / "sn_bruno_candidates.csv"

# 📄 Crossmatch datasets
batse_sn_crossmatch  = crossmatch_dir / "sne_batse_crossmatches.csv"
grb_icecube_cross    = crossmatch_dir / "grb_icecube_crossmatch.csv"
bruno_icecube_cross  = crossmatch_dir / "bruno_icecube_crossmatch.csv"
sn_grb_matrix_path   = crossmatch_dir / "bruno_sn_grb_matrix.csv"

# 📄 Entropy output tables
entropy_data_path         = entropy_tables_dir / "Entropy_Data.csv"
entropy_horizon_path      = entropy_tables_dir / "Entropy_Horizon_Table.csv"
entropy_water_vapor_path  = entropy_tables_dir / "Entropy_Water_Vapor.csv"

# ✅ Example usage
registry = pd.read_csv(registry_path)
batse = pd.read_csv(batse_path)

# Compute BATSE total fluence
batse["fluence_total_erg_cm2"] = (
    batse["fluence_ch1"].fillna(0) +
    batse["fluence_ch2"].fillna(0) +
    batse["fluence_ch3"].fillna(0) +
    batse["fluence_ch4"].fillna(0)
)
batse["fluence_total_J_m2"] = batse["fluence_total_erg_cm2"] * 0.1


📄 Using registry: D:\Bruno_Entropy_Project\data\registry\bruno_entropy_event_log.csv


In [29]:
print("📊 Registry preview:")
display(registry.head(3))


📊 Registry preview:


Unnamed: 0,Event_Name,Detection_Date_UTC,Source_Galaxy,RA_J2000,Dec_J2000,Distance_Mpc,Explosion_Energy_erg,Bruno_Trigger_Time_s,Estimated_Collapse_Time_UTC,fluence_j_m2,Bruno_Threshold_Crossed,Neutrino_Detected,Neutrino_Energy_TeV,Positional_Match_Confidence,Notes
0,IceCube-49427574,2025-03-30 08:31:06,UGC 11572,307.29,10.74,63.0,1e+52,1.936,2025-03-30 08:31:04.064,2.1e-05,True,True,147.57,High,Confirmed spatial + temporal match. First offi...


In [30]:
import yaml

schema_path = registry_path.with_name(registry_path.stem + "_schema.yaml")
if schema_path.exists():
    with open(schema_path) as f:
        schema = yaml.safe_load(f)
    print("✅ Schema loaded with", len(schema['columns']), "columns")
else:
    print("⚠️ No schema found:", schema_path)


✅ Schema loaded with 15 columns


In [31]:
from glob import glob
available_registries = [Path(f).stem for f in glob(str(registry_dir / "*.csv"))]
print("📁 Available registries:", available_registries)


📁 Available registries: ['basic_table_cleaned', 'BATSE_Flux_Catalog', 'batse_master_grb_registry', 'bruno_entropy_event_log', 'bruno_events_registry', 'data_file_registry', 'grb_catalog_cleaned', 'Open_Supernova_Catalog', 'registry', 'sne_1990s_cleaned', 'Supernova_Bruno_Fluence_Table']


In [32]:
import pandas as pd
from astropy.time import Time

# Load IceCat-1
icecat_path = raw_data_dir / "icecat1.csv"
icecat = pd.read_csv(icecat_path)

# Convert MJD to datetime
icecat["event_time"] = Time(icecat["EVENTMJD"], format="mjd").to_datetime()

# Standardize position columns
icecat["ra_deg"] = icecat["RA"]
icecat["dec_deg"] = icecat["DEC"]

# Optional: Event filter by quality or energy
icecat = icecat[icecat["ENERGY"] > 100]  # TeV-scale neutrinos

# Preview
icecat[["event_time", "ra_deg", "dec_deg", "ENERGY", "I3TYPE"]].head()


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Bruno_Entropy_Project\\data\\raw\\icecat1.csv'

### Kato data extract ###

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [41]:

# ✅ Step 1: Load Kato m15 ν_x luminosity data
file_path = "D:/Bruno_Entropy_Project/data/raw/Kato_2017/collapse/m15/total_nux/lightcurve.dat"
df_kato = pd.read_csv(file_path, sep=r"\s+", header=None, comment="#")

FileNotFoundError: [Errno 2] No such file or directory: 'D:/Bruno_Entropy_Project/data/raw/Kato_2017/collapse/m15/total_nux/lightcurve.dat'

In [34]:
# ✅ Step 2: Extract relevant columns
time_kato = df_kato[1]  # Time after bounce [s]
lum_mev_s = df_kato[7]  # Energy luminosity [MeV/s]

NameError: name 'lum_kato' is not defined

In [35]:
import numpy as np

distance_kpc = 50
distance_cm = distance_kpc * 3.086e21

dt = time_kato.diff().fillna(0)
fluence_kato = (lum_kato_erg * dt).cumsum() / (4 * np.pi * distance_cm**2)


NameError: name 'time_kato' is not defined

In [36]:
avg_energy_mev = 15
fluence_norm_kato = fluence_kato / (avg_energy_mev * mev_to_erg)


NameError: name 'fluence_kato' is not defined

In [37]:
plt.figure(figsize=(10, 5))
plt.plot(time_kato, fluence_norm_kato, label="Kato m15 ν_x", color="tab:purple")
plt.plot(time_15, fluence_norm_15, label="Bruno 15 Msun", color="tab:blue")
plt.plot(time_nakazato, fluence_norm_nakazato, label="Nakazato", color="tab:green")
plt.xlabel("Time (s)")
plt.ylabel("Neutrino Count (particles/cm²)")
plt.title("Kato vs Bruno vs Nakazato — Particle Fluence")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


NameError: name 'plt' is not defined

In [45]:
import pandas as pd
import numpy as np

# ✅ Load SKII_SKI_table.dat safely
ski_table_path = "D:/Bruno_Entropy_Project/data/raw/SK_Baksan/SKII_SKI_table.dat"

# Step 1: Load everything as string to avoid dtype warnings
df_ski = pd.read_csv(ski_table_path, sep=r"\s+", header=None, dtype=str)

# Step 2: Assign column names (13 columns expected)
df_ski.columns = [
    "tan2_theta", "delta_m2", "chi2", "B8", "hep",
    "B8_unc", "hep_unc", "cross_section",
    "B8_norm", "es_I", "er_I", "es_II", "er_II"
]

# Step 3: Replace '--' or other placeholders with NaN and convert to numeric
df_ski.replace({"--": np.nan, "*": np.nan}, inplace=True)
for col in df_ski.columns:
    df_ski[col] = pd.to_numeric(df_ski[col], errors="coerce")

# ✅ Preview result
print("✅ Cleaned SKII_SKI_table:")
display(df_ski.head())


✅ Cleaned SKII_SKI_table:


Unnamed: 0,tan2_theta,delta_m2,chi2,B8,hep,B8_unc,hep_unc,cross_section,B8_norm,es_I,er_I,es_II,er_II
0,,,,,,,,,,,,,
1,0.0001,1e-09,24.5585,2.3156,21.3428,0.0652,17.7367,1.3177,0.0924,0.3346,-0.1233,-0.3047,-0.2724
2,0.0001,1.047e-09,24.5585,2.3156,21.3511,0.0652,17.7377,1.3177,0.0925,0.3349,-0.1224,-0.3052,-0.2716
3,0.0001,1.096e-09,24.5586,2.3156,21.3522,0.0652,17.7378,1.3177,0.0927,0.3349,-0.1226,-0.3053,-0.2718
4,0.0001,1.148e-09,24.5587,2.3156,21.3522,0.0652,17.7378,1.3177,0.0927,0.3349,-0.1226,-0.3053,-0.2718
