In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load the event table from CSV. The preview shows one row per event with the
# Particle_pz, Particle_E and Particle_pT columns used by the later cells, plus
# an 'Unnamed: 0' column that looks like a previously saved index — TODO confirm.
df = pd.read_csv('first_emission_50gev.csv')
df.head()

Unnamed: 0,Event,Name,Pid,Particle_px,Particle_py,Particle_pz,Particle_E,Particle_pT
0,0,pi0,111,0.160985,0.23589,36.9805,36.9818,0.285588
1,1,pi0,111,0.236153,0.365678,22.8023,22.8068,0.435303
2,2,pi-,-211,0.072274,0.046684,0.632455,0.653362,0.08604
3,3,pi-,-211,0.036291,0.250416,41.8182,41.8192,0.253032
4,4,pi0,111,-0.149215,0.046229,8.4318,8.43433,0.156212


# Data Preprocessing Steps
1. Ensure 4 * 10^5 events
2. Columns used: pT, pz
3. Rescale pz to pz' using a reference energy of 50 GeV
- $p_{z}^{\prime}\equiv E_{ref}\frac{p_{z}}{E}$
4. Split into two training sets: one for pT and one for pz'
5. Randomly partition into batches/vectors of size 100
6. Sort each vector in increasing order

In [14]:
# Step 1: Compare the number of loaded events with the 400,000-event target
print(f"Current number of events: {len(df)}")
print(f"Target number of events: 400,000")

# NOTE: this cell only reports a shortfall. It does not add, duplicate, or
# bootstrap events, so a smaller input simply continues through the following
# steps at its actual size.
if len(df) < 400000:
    print(f"\nNote: We have {len(df)} events, which is less than 400,000")
    print("We'll work with the available data for now")

Current number of events: 10000
Target number of events: 400,000

Note: We have 10000 events, which is less than 400,000
We'll work with the available data for now


In [15]:
# Steps 2 & 3: pull the pT, pz and E columns out of the frame and rescale
# pz to pz' = E_ref * (pz / E), with E_ref = 50 GeV

E_ref = 50.0  # Reference energy in GeV

# Column values as NumPy arrays, unpacked in the order pT, pz, E
pT, pz, E = (
    df[column].values
    for column in ('Particle_pT', 'Particle_pz', 'Particle_E')
)

# Rescaled pz: the ratio pz / E is formed first, then multiplied by E_ref
pz_prime = E_ref * (pz / E)

print(f"Original pz range: [{pz.min():.3f}, {pz.max():.3f}]")
print(f"Rescaled pz' range: [{pz_prime.min():.3f}, {pz_prime.max():.3f}]")
print(f"pT range: [{pT.min():.3f}, {pT.max():.3f}]")

Original pz range: [0.254, 49.526]
Rescaled pz' range: [42.608, 50.000]
pT range: [0.002, 1.706]


In [16]:
# Step 4: Build the two training sets (pT and pz') as independent copies,
# so later processing cannot modify the arrays extracted from the frame

training_pT, training_pz_prime = (values.copy() for values in (pT, pz_prime))

print(f"Training set for pT: {len(training_pT)} events")
print(f"Training set for pz': {len(training_pz_prime)} events")

Training set for pT: 10000 events
Training set for pz': 10000 events


In [17]:
# Step 5: Randomly partition into batches/vectors of size 100 and sort each vector

# Number of values in each batch/vector; passed to create_sorted_batches below
batch_size = 100

def create_sorted_batches(data, batch_size=100, seed=42):
    """
    Randomly partition data into batches of given size and sort each batch.

    The shuffle uses a private random generator, so calling this function
    does not reseed or advance NumPy's global random state.

    Parameters
    ----------
    data : array-like
        One-dimensional sequence of values to partition. The input itself
        is never modified; a copy is shuffled.
    batch_size : int, default 100
        Number of values per batch. Must be a positive integer.
    seed : int or None, default 42
        Seed for the private generator. The default reproduces the ordering
        produced by ``np.random.seed(42)`` followed by ``np.random.shuffle``.
        Pass ``None`` for a non-reproducible shuffle.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data) // batch_size, batch_size)`` in which
        every row is sorted in increasing order. Trailing values that do not
        fill a complete batch are dropped.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Work on a copy so the caller's array keeps its original order
    shuffled_data = np.array(data, copy=True)

    # A local legacy RandomState gives the same stream as np.random.seed(seed)
    # without touching the global generator that other cells may rely on
    rng = np.random.RandomState(seed)
    rng.shuffle(shuffled_data)

    # Keep only as many values as fill complete batches
    n_batches = len(shuffled_data) // batch_size
    batches = shuffled_data[:n_batches * batch_size].reshape(n_batches, batch_size)

    # Sort within each batch (row-wise)
    return np.sort(batches, axis=1)

# Create sorted batches for both pT and pz'.
# Both calls shuffle with the same fixed seed inside create_sorted_batches, and
# the two arrays have equal length, so they receive the same permutation:
# row i of each output is built from the same events. Each row is then sorted
# independently, so positions within a row no longer pair up between the arrays.
pT_batches = create_sorted_batches(training_pT, batch_size)
pz_prime_batches = create_sorted_batches(training_pz_prime, batch_size)

# Report the resulting (n_batches, batch_size) shapes and preview the first pT batch
print(f"pT batches shape: {pT_batches.shape}")
print(f"pz' batches shape: {pz_prime_batches.shape}")
print(f"\nNumber of batches: {pT_batches.shape[0]}")
print(f"Batch size: {pT_batches.shape[1]}")
print(f"\nExample of first batch (pT):")
print(f"  First 5 values: {pT_batches[0, :5]}")
print(f"  Last 5 values: {pT_batches[0, -5:]}")
print(f"  Min: {pT_batches[0].min():.4f}, Max: {pT_batches[0].max():.4f}")

pT batches shape: (100, 100)
pz' batches shape: (100, 100)

Number of batches: 100
Batch size: 100

Example of first batch (pT):
  First 5 values: [0.0181661 0.0658976 0.0662864 0.0906684 0.0908882]
  Last 5 values: [0.587231 0.591635 0.66802  0.681434 0.775617]
  Min: 0.0182, Max: 0.7756


In [18]:
# Sanity check: every value in a batch must be <= its right-hand neighbour
print("Verifying that batches are sorted:")
print(f"Is first pT batch sorted? {np.all(pT_batches[0, :-1] <= pT_batches[0, 1:])}")
print(f"Is first pz' batch sorted? {np.all(pz_prime_batches[0, :-1] <= pz_prime_batches[0, 1:])}")

# Spot-check five distinct, randomly chosen batch indices in both arrays.
# For each index the pT batch is tested before the pz' batch, and all()
# stops at the first unsorted batch it encounters.
random_indices = np.random.choice(pT_batches.shape[0], size=5, replace=False)
all_sorted = all(
    np.all(batches[idx, :-1] <= batches[idx, 1:])
    for idx in random_indices
    for batches in (pT_batches, pz_prime_batches)
)

print(f"All checked batches are sorted: {all_sorted}")

Verifying that batches are sorted:
Is first pT batch sorted? True
Is first pz' batch sorted? True
All checked batches are sorted: True


## Save Preprocessed Data

In [19]:
# Save the preprocessed batches as numpy arrays.
# The file names are relative, so both .npy files are written to the current
# working directory; each holds one 2-D array of sorted batches.
np.save('pT_batches_sorted.npy', pT_batches)
np.save('pz_prime_batches_sorted.npy', pz_prime_batches)

# Confirm what was written and the shape of each array
print("Preprocessed data saved:")
print(f"  - pT_batches_sorted.npy: shape {pT_batches.shape}")
print(f"  - pz_prime_batches_sorted.npy: shape {pz_prime_batches.shape}")

Preprocessed data saved:
  - pT_batches_sorted.npy: shape (100, 100)
  - pz_prime_batches_sorted.npy: shape (100, 100)
