In [1]:
import polars as pl

In [4]:
%%timeit
def quick_sparsity_check(data_path: str):
    """Print null-based sparsity statistics for the ``feature_*`` columns.

    Lazily scans the parquet data at ``data_path``, counts the nulls in every
    column whose name starts with ``feature_``, and prints:

    * the total number of rows,
    * the overall null fraction across all feature columns,
    * the five feature columns with the highest null fraction.

    Args:
        data_path: Path handed to ``pl.scan_parquet``.

    Returns:
        None. All results are printed.
    """
    # Scan all partitions
    lf = pl.scan_parquet(data_path)

    # Resolve the column names through the schema explicitly; ``lf.columns``
    # does the same resolution implicitly and warns about it in current polars.
    feature_cols = [
        name for name in lf.collect_schema().names() if name.startswith('feature_')
    ]

    # Fetch the row count and every per-column null count in one query
    # instead of collecting twice.
    counts = (lf
              .select([
                  pl.len().alias('__total_rows__'),
                  *(pl.col(name).null_count() for name in feature_cols),
              ])
              .collect()
              .row(0))
    total_rows, null_stats = counts[0], counts[1:]
    print(total_rows)

    # With no rows or no feature columns the ratios below would divide by zero.
    if total_rows == 0 or not feature_cols:
        print("Overall sparsity: n/a (no rows or no feature_ columns)")
        return

    overall_sparsity = sum(null_stats) / (len(feature_cols) * total_rows)
    print(f"Overall sparsity: {overall_sparsity:.2%}")

    # Null fraction per feature column
    sparsity_per_col = {
        name: nulls / total_rows
        for name, nulls in zip(feature_cols, null_stats)
    }

    # Show most sparse features
    print("\nTop 5 most sparse features:")
    ranked = sorted(sparsity_per_col.items(), key=lambda item: item[1], reverse=True)
    for col, sparsity in ranked[:5]:
        print(f"{col}: {sparsity:.2%}")

# Run the sparsity check against the training parquet directory
quick_sparsity_check(
    data_path="~/Interning/Kaggle/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/"
)



47127338
Overall sparsity: 2.08%

Top 5 most sparse features:
feature_21: 17.90%
feature_26: 17.90%
feature_27: 17.90%
feature_31: 17.90%
feature_39: 9.13%
47127338
Overall sparsity: 2.08%

Top 5 most sparse features:
feature_21: 17.90%
feature_26: 17.90%
feature_27: 17.90%
feature_31: 17.90%
feature_39: 9.13%


KeyboardInterrupt: 

In [None]:
# NOTE(review): appears to be a leftover kernel smoke test, unrelated to the
# sparsity analysis — consider removing this cell.
print("hello world@!")

In [5]:
# Lazily scan the training parquet directory; nothing is read until .collect()
lf = pl.scan_parquet(
    "~/Interning/Kaggle/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/"
)

In [7]:
# Column names of the lazy frame, resolved from its schema
col_names = (
    lf
    .collect_schema()
    .names()
)

In [None]:
# Feature columns, filtered from the schema names already resolved in
# `col_names` (avoids the implicit schema resolution behind `lf.columns`).
feature_cols = [name for name in col_names if name.startswith('feature_')]

# Simple sparsity check - one null count per feature column, as a tuple
# ordered like `feature_cols`.
null_stats = (lf
              .select([pl.col(name).null_count() for name in feature_cols])
              .collect()
              .row(0))


In [13]:
# Display the feature column list.
# NOTE(review): the saved output below shows polars Expr objects rather than
# the column-name strings built in the cell above — it looks stale; re-run
# after a kernel restart to confirm.
feature_cols

[<Expr ['col("feature_00").null_count()'] at 0x3C4A2DCF0>,
 <Expr ['col("feature_01").null_count()'] at 0x3C4A2DB70>,
 <Expr ['col("feature_02").null_count()'] at 0x3C4A2DB40>,
 <Expr ['col("feature_03").null_count()'] at 0x3C4A2DC90>,
 <Expr ['col("feature_04").null_count()'] at 0x3C4A2DD20>,
 <Expr ['col("feature_05").null_count()'] at 0x3C4A2DC30>,
 <Expr ['col("feature_06").null_count()'] at 0x3C4A2D480>,
 <Expr ['col("feature_07").null_count()'] at 0x3C4A2DBD0>,
 <Expr ['col("feature_08").null_count()'] at 0x3C4A2DE10>,
 <Expr ['col("feature_09").null_count()'] at 0x3C4A2DB10>,
 <Expr ['col("feature_10").null_count()'] at 0x3C4A2DA80>,
 <Expr ['col("feature_11").null_count()'] at 0x3C4A2D990>,
 <Expr ['col("feature_12").null_count()'] at 0x3C4A2D930>,
 <Expr ['col("feature_13").null_count()'] at 0x3C4A2DA20>,
 <Expr ['col("feature_14").null_count()'] at 0x3C4A2D9C0>,
 <Expr ['col("feature_15").null_count()'] at 0x3C4A2D8A0>,
 <Expr ['col("feature_16").null_count()'] at 0x3C4A2D600