# 01 — Data Exploration

Goals:
- Inspect raw files in `data/raw/`
- Produce quick histograms / invariant mass plots
- Validate shapes, units, and event content

In [5]:
import h5py
import pandas as pd
from pathlib import Path

# Directory holding the raw input files, relative to the notebook's working directory
raw_dir = Path('..') / 'data' / 'raw'

# Gather every HDF5 file (both common extensions) in a stable, sorted order
files = sorted([*raw_dir.glob('*.h5'), *raw_dir.glob('*.hdf5')])

# Fail early with a clear message: the fallback below evaluates files[0] eagerly,
# which would otherwise raise a bare IndexError when no HDF5 file is present
if not files:
    raise FileNotFoundError(f"No .h5/.hdf5 files found in {raw_dir.resolve()}")

# Prefer the file whose name contains 'backgroundMC'; otherwise use the first file found
bg_file = next((path for path in files if 'backgroundMC' in path.name), files[0])

print(f"Applying memory-safe bypass! Scanning {bg_file.name} directly via h5py...")

# Filled by the find_physics_matrix callback during traversal of the file
extracted_data = []
# Upper bound on the number of rows (events) read, to keep memory use manageable
NUM_EVENTS = 100_000

# Callback for h5py's visititems: captures the first 2-D dataset with exactly 2100 columns
def find_physics_matrix(name, obj):
    """Read the leading rows of the first 2100-column dataset that is visited.

    Intended to be passed to ``visititems``. On a match, the first
    ``NUM_EVENTS`` rows are appended to the module-level ``extracted_data``
    list and a non-None value is returned so that traversal stops instead of
    reading further matching datasets that would never be used.

    Args:
        name: Path of the visited object inside the HDF5 file.
        obj: The visited object (a group or a dataset).

    Returns:
        ``name`` when a matching dataset was read (this stops the traversal),
        otherwise ``None`` so the traversal continues.

    Raises:
        OSError: If the slice cannot be read from the file; the original
            error is chained as the cause.
    """
    is_target = (
        isinstance(obj, h5py.Dataset)
        and len(obj.shape) == 2
        and obj.shape[1] == 2100
    )
    if not is_target:
        return None

    print(f"Target matrix found! Internal path: '{name}' | Shape: {obj.shape}")
    try:
        # Slice on the dataset so only the first NUM_EVENTS rows are requested,
        # not the whole matrix
        extracted_data.append(obj[:NUM_EVENTS])
    except OSError as err:
        # NOTE(review): the recorded run failed here with a missing HDF5 plugin
        # directory; presumably the dataset uses a compression filter this h5py
        # build cannot load -- confirm before relying on this code path.
        raise OSError(
            f"Could not read dataset '{name}' from the HDF5 file. If the original "
            "error mentions a missing plugin directory, the dataset may use a "
            "compression filter that this h5py installation cannot load."
        ) from err

    # Any non-None return value ends the visititems traversal after the first match
    return name

# Walk the file's internal tree in read-only mode; the callback fills `extracted_data`
with h5py.File(bg_file, mode='r') as f:
    f.visititems(find_physics_matrix)

# Report a miss first; otherwise wrap the first captured slice in a DataFrame
if not extracted_data:
    print("Error: The 2100-column data matrix was not found within the file.")
else:
    df_bg = pd.DataFrame(data=extracted_data[0])
    print(f"\nData successfully recovered! DataFrame Shape: {df_bg.shape}")

    # Preview: top 5 rows, restricted to the first 6 columns
    display(df_bg.iloc[:5, :6])

Applying memory-safe bypass! Scanning events_LHCO2020_backgroundMC_Pythia.h5 directly via h5py...
Target matrix found! Internal path: 'df/block0_values' | Shape: (1000000, 2100)


OSError: Can't synchronously read data (can't open directory (/usr/local/hdf5/lib/plugin). Please verify its existence)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Column 0 is taken as the leading particle's transverse momentum (pT)
pT_leading = df_bg.iloc[:, 0]

# New 10x6-inch figure holding a 100-bin histogram of the loaded data
# (the loaded physics data replaces the earlier random placeholder)
plt.figure(figsize=(10, 6))
plt.hist(pT_leading, bins=100, alpha=0.7, color='teal')

# Logarithmic y-axis: standard in particle physics because
# high-energy events are rare
plt.yscale('log')
plt.grid(axis='y', linestyle='--', alpha=0.3)

# Axis labels and title so the figure stands on its own
plt.xlabel('$p_T$ (GeV)', fontsize=12)
plt.ylabel('Number of Events', fontsize=12)
plt.title('Transverse Momentum ($p_T$) of the Leading Particle', fontsize=14)
plt.show()