In [None]:
import pandas as pd
import polars as pl
import numpy as np
import psutil
import os


In [None]:
def get_system_memory_usage_gb():
    """Report system-wide memory usage as read from ``/proc/meminfo``.

    Returns:
        A ``(used_gb, total_gb, percent)`` tuple of floats, where "used" is
        ``MemTotal - MemAvailable``, or ``None`` when the file cannot be read
        or parsed, or when either of those two fields is absent.
    """
    # /proc/meminfo reports its values in kB, so this many kB make one GB.
    kb_per_gb = 1024 * 1024
    try:
        with open("/proc/meminfo", "r") as meminfo_file:
            raw_lines = meminfo_file.readlines()

        fields_kb = {}
        for raw in raw_lines:
            pieces = raw.split(":")
            if len(pieces) >= 2:
                # First whitespace-separated token after the colon is the number.
                tokens = pieces[1].split()
                if tokens:
                    fields_kb[pieces[0]] = int(tokens[0])

        mem_total = fields_kb.get("MemTotal")
        mem_available = fields_kb.get("MemAvailable")
        if mem_total is None or mem_available is None:
            return None

        in_use = mem_total - mem_available
        return (
            in_use / kb_per_gb,
            mem_total / kb_per_gb,
            (in_use / mem_total) * 100.0,
        )
    except Exception:
        # Best-effort probe: any failure (missing file, bad number, zero total)
        # is reported to the caller as "unknown".
        return None

def print_system_memory_usage_gb():
    """Print a one-line ``[mem] used/total GB (percent%)`` summary of system memory.

    Prints nothing when ``get_system_memory_usage_gb()`` cannot determine the
    usage (it returns ``None`` in that case).
    """
    usage = get_system_memory_usage_gb()
    # The getter returns None (not a 3-tuple) on failure, so check before
    # unpacking; unpacking None directly would raise TypeError.
    if usage is None:
        return
    used_gb, total_gb, percent = usage
    print(f"[mem] {used_gb:5.2f}/{total_gb:5.2f} GB ({percent:5.1f}%)", flush=True)

# Baseline reading: report system memory usage before any data files are loaded.
print_system_memory_usage_gb()

In [None]:
# Load every parquet file in the intermediate-files directory into a list of
# Polars frames, printing system memory usage after each file.
intermediate_dir = "/nevis/riverside/data/leehagaman/ngem/intermediate_files/"

pl_dfs = []

print_system_memory_usage_gb()

# os.listdir returns entries in arbitrary order; sort them so the load order
# (and therefore the row order once the frames are concatenated) is the same
# on every run.
for file in sorted(os.listdir(intermediate_dir)):
    if not file.endswith(".parquet"):
        continue
    print(f"Reading {file}")
    curr_pl_df = pl.read_parquet(os.path.join(intermediate_dir, file))
    # Normalize numeric dtypes in a single pass:
    #   Float64 -> Float32 (lossy; narrows the float columns)
    #   Int32   -> Int64   (presumably so integer columns agree across files — confirm)
    curr_pl_df = curr_pl_df.with_columns(
        [
            pl.col(pl.Float64).cast(pl.Float32),
            pl.col(pl.Int32).cast(pl.Int64),
        ]
    )
    pl_dfs.append(curr_pl_df)
    print(f"Read {file}, estimated size: {curr_pl_df.estimated_size() / 1e9:.2f} GB")
    print_system_memory_usage_gb()



In [None]:
def align_columns_for_concat(dfs):
    """Give every frame in ``dfs`` the same columns in the same (sorted) order.

    A column that a frame lacks is added as an all-null column, cast to the
    dtype that column has in the first frame that contains it. Input frames
    are not modified; a new list of frames is returned in the same order.

    Args:
        dfs: iterable-once-safe sequence (it is traversed twice) of Polars
            DataFrames.

    Returns:
        List of DataFrames, one per input frame, all with the union of column
        names sorted alphabetically.
    """
    # First-seen dtype wins for each column name.
    first_dtype = {}
    for frame in dfs:
        for name, dtype in zip(frame.columns, frame.dtypes):
            if name not in first_dtype:
                first_dtype[name] = dtype

    # Every column name is a key of first_dtype, so its sorted keys are the
    # union of all columns in the target order.
    ordered_names = sorted(first_dtype)

    result = []
    for frame in dfs:
        present = set(frame.columns)
        absent = [name for name in ordered_names if name not in present]
        if absent:
            # Fill the gaps with typed null columns.
            frame = frame.with_columns(
                [pl.lit(None).cast(first_dtype[name]).alias(name) for name in absent]
            )
        result.append(frame.select(ordered_names))

    return result

In [None]:
# Give every per-file frame the same columns in the same order so they can be stacked.
aligned = align_columns_for_concat(pl_dfs)
print_system_memory_usage_gb()
# Stack the aligned frames row-wise into one Polars DataFrame and report its estimated size.
total_concat_pl_df = pl.concat(aligned, how="vertical")
print(f"Total concatenated Polars DataFrame size: {total_concat_pl_df.estimated_size() / 1e9:.2f} GB")

# Last expression of the cell: display the combined frame.
total_concat_pl_df


In [None]:
# Convert the combined Polars frame to a pandas DataFrame, then report system memory usage.
total_concat_pd_df = total_concat_pl_df.to_pandas()
print_system_memory_usage_gb()

# Write the pandas frame to a pickle file in the intermediate-files directory,
# then report system memory usage again.
total_concat_pd_df.to_pickle("/nevis/riverside/data/leehagaman/ngem/intermediate_files/total_concat_pd_df.pkl")
print_system_memory_usage_gb()



In [None]:
# 1/0 raises ZeroDivisionError before print is ever called.
# NOTE(review): presumably a deliberate stop so that "Run All" halts here, before
# the experiment cells below — confirm, and consider an explicit raise with a message.
print(1/0)

In [None]:


# Synthetic benchmark: compare the memory footprint of two same-schema frames,
# and of their concatenation, in pandas versus Polars.
N = 10_000_000
SEED = 0  # fixed seed so the generated data (and the printed sizes) are reproducible
rng = np.random.default_rng(SEED)


def _make_sample_data(rng, n_rows):
    """Return a dict of three random columns (ints, floats, strings), each of length ``n_rows``."""
    return {
        "ints": rng.integers(0, 1000, n_rows),  # upper bound is exclusive
        "floats": rng.standard_normal(n_rows),
        "strings": rng.choice(["apple", "banana", "cherry"], n_rows),
    }


# Create two independent sample datasets with the same schema
data1 = _make_sample_data(rng, N)
data2 = _make_sample_data(rng, N)

# Measure pandas memory usage (deep=True also counts the Python string objects)
df1_pd = pd.DataFrame(data1)
df2_pd = pd.DataFrame(data2)
pandas_mem_1 = df1_pd.memory_usage(deep=True).sum() / 1e6
print(f"Pandas memory df1: {pandas_mem_1:.2f} MB")
pandas_mem_2 = df2_pd.memory_usage(deep=True).sum() / 1e6
print(f"Pandas memory df2: {pandas_mem_2:.2f} MB")
# NOTE(review): memory_usage() includes the index by default and this concat keeps
# both frames' original row labels — confirm whether the index should count here.
df_pd = pd.concat([df1_pd, df2_pd])
pandas_mem_concat = df_pd.memory_usage(deep=True).sum() / 1e6
print(f"Pandas memory concatenated: {pandas_mem_concat:.2f} MB")

# Convert to Polars and repeat the same three measurements
df1_pl = pl.from_pandas(df1_pd)
df2_pl = pl.from_pandas(df2_pd)
polars_mem_1 = df1_pl.estimated_size() / 1e6
print(f"Polars memory df1: {polars_mem_1:.2f} MB")
polars_mem_2 = df2_pl.estimated_size() / 1e6
print(f"Polars memory df2: {polars_mem_2:.2f} MB")
df_pl = pl.concat([df1_pl, df2_pl], how="vertical")
polars_mem_concat = df_pl.estimated_size() / 1e6
print(f"Polars memory concatenated: {polars_mem_concat:.2f} MB")


In [None]:
# Load /nevis/riverside/data/leehagaman/ngem/intermediate_files/all_df.pkl into pandas
# twice, giving two separate frames holding the same data.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
df1_pd = pd.read_pickle("/nevis/riverside/data/leehagaman/ngem/intermediate_files/all_df.pkl")
df2_pd = pd.read_pickle("/nevis/riverside/data/leehagaman/ngem/intermediate_files/all_df.pkl")

# pandas footprint of each frame (MB) and of their concatenation (GB)
pandas_mem_1 = df1_pd.memory_usage(deep=True).sum() / 1e6
print(f"Pandas memory df1: {pandas_mem_1:.2f} MB")
pandas_mem_2 = df2_pd.memory_usage(deep=True).sum() / 1e6
print(f"Pandas memory df2: {pandas_mem_2:.2f} MB")
df_pd = pd.concat([df1_pd, df2_pd])
# This assignment was left unfinished (a syntax error for the whole cell); it now
# holds the value the following print reports. Unit is GB, unlike the MB figures above.
pandas_mem_concat = df_pd.memory_usage(deep=True).sum() / 1e9
print(f"Pandas memory concatenated: {pandas_mem_concat:.2f} GB")

# Convert both frames to Polars, then drop the per-frame pandas references
df1_pl = pl.from_pandas(df1_pd)
df2_pl = pl.from_pandas(df2_pd)

del df1_pd, df2_pd

polars_mem_1 = df1_pl.estimated_size() / 1e6
print(f"Polars memory df1: {polars_mem_1:.2f} MB")
polars_mem_2 = df2_pl.estimated_size() / 1e6
print(f"Polars memory df2: {polars_mem_2:.2f} MB")

# In-memory concat is disabled here; the frames are instead combined by writing
# them to parquet and reading both files back in one call (below).
#df_pl = pl.concat([df1_pl, df2_pl], how="vertical")
#polars_mem_concat = df_pl.estimated_size() / 1e6
#print(f"Polars memory concatenated: {polars_mem_concat:.2f} MB")

# Round-trip through parquet files in the current working directory
df1_pl.write_parquet("df1_pl.parquet")
df2_pl.write_parquet("df2_pl.parquet")

del df1_pl, df2_pl

# Read both files back as a single Polars frame and report its estimated size
df_pl = pl.read_parquet(["df1_pl.parquet", "df2_pl.parquet"])
polars_mem = df_pl.estimated_size() / 1e6
print(f"Polars memory: {polars_mem:.2f} MB")


In [None]:
# List the entries in the intermediate-files directory (shown as the cell's output).
os.listdir("/nevis/riverside/data/leehagaman/ngem/intermediate_files/")

In [None]:
# Load a single intermediate file, curr_df_pl_0.parquet, with Polars
df_pl = pl.read_parquet("/nevis/riverside/data/leehagaman/ngem/intermediate_files/curr_df_pl_0.parquet")

# Convert to pandas and display only the filetype, filename, run, subrun and event columns
df = df_pl.to_pandas()
df[["filetype", "filename", "run", "subrun", "event"]]