In [None]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.file_locations import intermediate_files_location

from src.ntuple_variables.variables import combined_training_vars, combined_training_vars_with_duplicates


In [None]:
# Read the training-variable table from the intermediate-files directory and
# show it as the cell output.
train_vars_path = f"{intermediate_files_location}/presel_df_train_vars.parquet"
all_df = pl.read_parquet(train_vars_path)
all_df

# Checking for NaNs and Small Ranges

In [None]:
# Scan every training variable and report columns that are entirely missing
# or whose usable values span a very small range.
for var in tqdm(combined_training_vars):
    # astype(float) yields a float copy, so NaN can be assigned without
    # touching the data held by all_df.
    vals = all_df.get_column(var).to_numpy().astype(float)

    # Values with magnitude above 1e10 are treated as missing.
    vals[np.abs(vals) > 1e10] = np.nan

    nan_mask = np.isnan(vals)
    num_nans = np.sum(nan_mask)
    num_not_nans = np.sum(~nan_mask)

    if num_not_nans == 0:
        print(f"{var} has no non-NaN values")
        # np.min / np.max raise ValueError on an empty array, so skip the
        # range check instead of aborting the whole scan.
        continue

    non_nan_vals = vals[~nan_mask]
    min_val = np.min(non_nan_vals)
    max_val = np.max(non_nan_vals)

    # Flag variables whose usable values span less than 0.1.
    if np.abs(max_val - min_val) < 1e-1:
        print(f"{var} has a very small range, from {min_val} to {max_val}, with {num_nans} NaN and {num_not_nans} not-NaNs")


# Checking For Equal Variables

In [None]:
import hashlib

def hash_vec(v):
    """Return the SHA-1 hex digest of the raw bytes of array-like ``v``.

    ``v`` is first made C-contiguous so that strided views hash the same as
    an equivalent packed array. Only the underlying bytes are hashed, so two
    inputs produce the same digest only when their byte content is identical.
    """
    hasher = hashlib.sha1()
    hasher.update(np.ascontiguousarray(v))
    return hasher.hexdigest()

# Digest of every candidate column, keyed by variable name.
hashes = {
    var: hash_vec(all_df.get_column(var).to_numpy())
    for var in tqdm(combined_training_vars_with_duplicates)
}


In [None]:
# For each digest, remember the LAST variable (in `hashes` order) that produced
# it. Later entries overwrite earlier ones in the comprehension, so that
# variable serves as the representative kept for its group.
last_var_for_digest = {digest: var for var, digest in hashes.items()}

# Every variable that shares its digest with a later variable is recorded as a
# duplicate of that group's representative. The loop variable is named
# `digest` rather than `hash` so the builtin hash() is not overwritten for the
# rest of the kernel session.
known_duplicates = []
for var, digest in hashes.items():
    representative = last_var_for_digest[digest]
    if representative != var:
        print(f"{var:<40} = {representative:<30}")
        known_duplicates.append(var)


In [None]:
# Show the list of variables flagged as duplicates by the hash comparison.
known_duplicates