<a href="https://colab.research.google.com/github/javermeire12/Simulations/blob/main/DESI_Data_Verification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# DESI FITS File Verification - What's Actually In The Data?
#
# Prints the HDU layout of the catalogue, its column names, the first few
# rows, and a min/max/mean summary for every numeric column.
print("🔍 DESI FITS FILE VERIFICATION")
print("="*50)

import numpy as np
from astropy.io import fits
import matplotlib.pyplot as plt

# Path of the catalogue to inspect (resolved relative to the working directory).
fits_file = "LRG_SGC_clustering.dat.fits"

# Basic file info
print(f"📂 Analyzing file: {fits_file}")

# Open the file and report on its structure and contents.
with fits.open(fits_file) as hdul:
    print("\n📋 FITS FILE STRUCTURE:")
    hdul.info()

    # The table is read from extension 1; extension 0 is the primary HDU.
    data = hdul[1].data

    print("\n📊 DATA OVERVIEW:")
    print(f"Number of objects: {len(data)}")
    print(f"Data columns: {data.columns.names}")

    # Show first few rows (fewer if the table is shorter than five rows).
    print("\n🔢 FIRST 5 ROWS:")
    for i in range(min(5, len(data))):
        print(f"Row {i}: {data[i]}")

    # Statistical summary, one line per column.
    print("\n📈 DATA RANGES:")
    for col_name in data.columns.names:
        try:
            col_data = data[col_name]
            if np.issubdtype(col_data.dtype, np.number):
                print(f"{col_name}: {col_data.min():.4f} to {col_data.max():.4f} (mean: {col_data.mean():.4f})")
            else:
                print(f"{col_name}: {type(col_data[0])} data")
        except Exception as exc:
            # Best-effort summary: one unreadable column must not abort the
            # report, but say why it failed instead of hiding the cause.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            print(f"{col_name}: Could not analyze ({type(exc).__name__}: {exc})")

print("\n✅ File inspection complete!")


🔍 DESI FITS FILE VERIFICATION
📂 Analyzing file: LRG_SGC_clustering.dat.fits

📋 FITS FILE STRUCTURE:
Filename: LRG_SGC_clustering.dat.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU       6   ()      
  1  LSS           1 BinTableHDU     40   662492R x 13C   [K, D, K, D, D, 1A, D, D, D, D, D, D, D]   

📊 DATA OVERVIEW:
Number of objects: 662492
Data columns: ['TARGETID', 'Z', 'NTILE', 'RA', 'DEC', 'PHOTSYS', 'FRAC_TLOBS_TILES', 'WEIGHT_ZFAIL', 'WEIGHT_SYS', 'WEIGHT', 'WEIGHT_COMP', 'NX', 'WEIGHT_FKP']

🔢 FIRST 5 ROWS:
Row 0: (np.int64(39627322696933879), np.float64(0.8313465884280199), np.int64(1), np.float64(21.68178218932335), np.float64(-19.38721438383455), 'S', np.float64(0.8228485153497735), np.float64(1.0039407014846802), np.float64(1.0417450666427612), np.float64(1.2736069511619756), np.float64(2.0), np.float64(0.00026451912209734076), np.float64(0.2743340306116948))
Row 1: (np.int64(39627322701121757), np.float64(0.5589686591779

In [2]:
# Verify coordinate conversion and distance calculations
#
# Converts redshifts to comoving distances with the Planck18 cosmology, prints
# a small table of sky and Cartesian positions, then range-checks the
# distances and the RA/DEC columns.
print("🌌 COORDINATE & DISTANCE VERIFICATION")
print("="*50)

from astropy.cosmology import Planck18
from astropy import units as u
from astropy.coordinates import SkyCoord

# The file is opened once and every read of `data` happens inside this block,
# rather than relying on the table staying readable after the file is closed.
with fits.open("LRG_SGC_clustering.dat.fits") as hdul:
    data = hdul[1].data

    # Comoving distance for every object, computed once and reused for both
    # the sample table and the summary statistics below.
    distances = Planck18.comoving_distance(data['Z']).to(u.Mpc).value

    # Take the first 10 galaxies for verification, or all of them if the
    # table is shorter, so the sample never indexes past the end.
    sample_size = min(10, len(data))

    print(f"🔢 ANALYZING FIRST {sample_size} GALAXIES:")
    # 'Z' is the redshift column; 'Z_cart' is the Cartesian z coordinate.
    print(f"{'Index':<6} {'RA':<10} {'DEC':<10} {'Z':<8} {'Distance':<12} {'X':<10} {'Y':<10} {'Z_cart':<10}")
    print("-" * 80)

    # Convert the whole sample to Cartesian coordinates in one call.
    sample_coords = SkyCoord(
        ra=data['RA'][:sample_size] * u.deg,
        dec=data['DEC'][:sample_size] * u.deg,
        distance=distances[:sample_size] * u.Mpc,
    )
    cartesian = sample_coords.cartesian
    sample_rows = zip(
        data['RA'][:sample_size],
        data['DEC'][:sample_size],
        data['Z'][:sample_size],
        distances[:sample_size],
        cartesian.x.to(u.Mpc).value,
        cartesian.y.to(u.Mpc).value,
        cartesian.z.to(u.Mpc).value,
    )
    for i, (ra, dec, z, distance, x, y, z_cart) in enumerate(sample_rows):
        print(f"{i:<6} {ra:<10.3f} {dec:<10.3f} {z:<8.4f} {distance:<12.1f} {x:<10.1f} {y:<10.1f} {z_cart:<10.1f}")

    # Check if distances make sense for cosmological survey
    print("\n🎯 DISTANCE VALIDATION:")
    print(f"Redshift range: {data['Z'].min():.3f} to {data['Z'].max():.3f}")
    print(f"Distance range: {distances.min():.1f} to {distances.max():.1f} Mpc")
    print(f"Mean distance: {distances.mean():.1f} Mpc")
    print(f"Median distance: {np.median(distances):.1f} Mpc")

    # These should be cosmological distances (thousands of Mpc)
    if distances.mean() > 2000:
        print("✅ COSMOLOGICAL DISTANCES CONFIRMED - This is real survey data!")
    else:
        print("⚠️ DISTANCES TOO SMALL - Possible error in data or conversion")

    # Column extrema, computed once and reused by the report and the checks.
    ra_min, ra_max = data['RA'].min(), data['RA'].max()
    dec_min, dec_max = data['DEC'].min(), data['DEC'].max()

    print("\n📊 COORDINATE VERIFICATION:")
    print(f"RA range: {ra_min:.1f}° to {ra_max:.1f}°")
    print(f"DEC range: {dec_min:.1f}° to {dec_max:.1f}°")
    print(f"Total objects: {len(data)}")

    # Range checks: RA must lie in [0, 360] and DEC in [-90, 90] degrees.
    if 0 <= ra_min and ra_max <= 360:
        if -90 <= dec_min and dec_max <= 90:
            print("✅ COORDINATES IN VALID RANGES")
        else:
            print("⚠️ DECLINATION OUT OF RANGE")
    else:
        print("⚠️ RIGHT ASCENSION OUT OF RANGE")


🌌 COORDINATE & DISTANCE VERIFICATION
🔢 ANALYZING FIRST 10 GALAXIES:
Index  RA         DEC        Z        Distance     X          Y          Z         
--------------------------------------------------------------------------------
0      21.682     -19.387    0.8313   2954.8       2590.1     1029.8     -980.9    
1      21.761     -19.423    0.5590   2141.0       1875.3     748.6      -712.0    
2      21.818     -19.393    0.7975   2860.8       2505.2     1002.9     -949.9    
3      21.856     -19.404    0.7986   2864.0       2507.2     1005.7     -951.5    
4      21.868     -19.385    0.8341   2962.3       2593.3     1040.8     -983.2    
5      21.936     -19.381    0.6394   2395.7       2096.3     844.2      -795.0    
6      22.132     -19.406    0.9124   3172.3       2771.6     1127.3     -1054.0   
7      22.169     -19.411    0.5376   2071.3       1809.2     737.2      -688.4    
8      22.325     -19.423    0.6862   2538.0       2214.2     909.2      -844.0    
9      20.9

In [3]:
# Quick sanity checklist.
#
# Everything below is fixed reference text: the edge counts, ratios and
# z-score are literals typed into this cell, not values computed from the
# catalogue. The checklist is emitted as one newline-joined block.
print("\n".join([
    # Banner
    "🔧 QUICK SANITY CHECKS",
    "=" * 50,

    # 1. Edge counts of the real network per radius
    "1. CHECK YOUR EDGE COUNTS INCREASE WITH RADIUS:",
    "   r=100: 38k edges",
    "   r=150: 119k edges",
    "   r=200: 266k edges",
    "   r=250: 500k edges",
    "   r=300: 828k edges",
    "   r=400: ??? edges",
    "   ✅ Should always INCREASE - if not, something's wrong!",

    # 2. Edge counts of the random comparison networks
    "\n2. CHECK RANDOM NETWORKS ALSO INCREASE:",
    "   Random should also increase with radius",
    "   If random goes: 7k → 26k → 60k → 116k → 198k",
    "   ✅ That's correct - more radius = more connections",

    # 3. Real-to-random ratios
    "\n3. CHECK YOUR RATIOS MAKE SENSE:",
    "   Real/Random ratios: 5.0 → 4.6 → 4.4 → 4.3 → 4.2",
    "   ✅ Should be >1 (real better than random)",
    "   ✅ Slight decrease is expected (efficiency decay)",

    # 4. Significance
    "\n4. CHECK YOUR Z-SCORES:",
    "   Should be >3 for strong evidence",
    "   1057σ is extremely strong (maybe too strong?)",

    # Yes/no questions summarising the checks
    "\n🎯 KEY VALIDATION QUESTIONS:",
    "A) Do edge counts increase monotonically? (YES = good)",
    "B) Are ratios all >1? (YES = real > random)",
    "C) Are distances ~3000 Mpc? (YES = cosmological)",
    "D) Do you have ~660k total galaxies? (YES = full DESI)",

    # Pointer back to the verification cells
    "\n📋 TO VERIFY DATA INTEGRITY:",
    "Run the verification cells above to see:",
    "- What columns are in your FITS file",
    "- If coordinates look like real RA/DEC",
    "- If redshifts convert to reasonable distances",
    "- If you're actually analyzing galaxy data",

    # Closing line
    "\n✅ Your results LOOK correct, but verification is always smart!",
]))


🔧 QUICK SANITY CHECKS
1. CHECK YOUR EDGE COUNTS INCREASE WITH RADIUS:
   r=100: 38k edges
   r=150: 119k edges
   r=200: 266k edges
   r=250: 500k edges
   r=300: 828k edges
   r=400: ??? edges
   ✅ Should always INCREASE - if not, something's wrong!

2. CHECK RANDOM NETWORKS ALSO INCREASE:
   Random should also increase with radius
   If random goes: 7k → 26k → 60k → 116k → 198k
   ✅ That's correct - more radius = more connections

3. CHECK YOUR RATIOS MAKE SENSE:
   Real/Random ratios: 5.0 → 4.6 → 4.4 → 4.3 → 4.2
   ✅ Should be >1 (real better than random)
   ✅ Slight decrease is expected (efficiency decay)

4. CHECK YOUR Z-SCORES:
   Should be >3 for strong evidence
   1057σ is extremely strong (maybe too strong?)

🎯 KEY VALIDATION QUESTIONS:
A) Do edge counts increase monotonically? (YES = good)
B) Are ratios all >1? (YES = real > random)
C) Are distances ~3000 Mpc? (YES = cosmological)
D) Do you have ~660k total galaxies? (YES = full DESI)

📋 TO VERIFY DATA INTEGRITY:
Run the veri