In [None]:
import dask.dataframe as dd

# Load the dataset in Dask (memory-efficient)
data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'
train_data = dd.read_parquet(data_path)

# Take a 10% random sample of the data for initial inspection
sample_data = train_data.sample(frac=0.1, random_state=42).compute()  # Converts to pandas


In [None]:

# Check the first few rows of the sample
print(sample_data.head())

# Check the data types of each column
print(sample_data.dtypes)

# Get general information about the data, including non-null counts
print(sample_data.info())


In [None]:
# Get the total number of rows (compute once for efficiency)
total_rows = len(train_data)

# Count missing values in each column
missing_values = train_data.isna().sum().compute()  # .compute() to get concrete values in pandas

# Identify columns where the count of NaNs is equal to the total number of rows
all_nan_columns = missing_values[missing_values == total_rows].index
print("Columns with all NaN values:", all_nan_columns.tolist())

In [None]:
# Count missing values in each column
missing_values = sample_data.isna().sum()
print("Missing values per column:\n", missing_values)

# Show only columns with missing values for better clarity
missing_columns = missing_values[missing_values > 0]
print("Columns with missing values:\n", missing_columns)


In [None]:
# Filter to exclude the first 85 days
sample_data = sample_data[sample_data['date_id'] >= 85]
print("Data after filtering the first 85 days:", sample_data.shape)


In [None]:
# Apply backward-fill, then forward-fill
sample_data = sample_data.bfill().ffill()

# Verify that there are no remaining missing values
print("Remaining missing values:", sample_data.isna().sum().sum())  # Should be 0 if all NaNs are filled


In [1]:
#Refined code to avoid sampling dataset, instead applying operations directly and finishing up.

import dask.dataframe as dd

# Load the entire dataset with Dask
train_data = dd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet')

# Filter out the first 85 days
train_data = train_data[train_data['date_id'] >= 85]
train_data = train_data.repartition(partition_size="50MB") 
#repartitioned due to previous issue where sometimes an entire partition was NaN and therefore could not be forward/backward filled.
#this allows me to avoid dropping data(although it shouldn't effect my data too much regradless)
# Apply backward-fill and forward-fill to handle NaN values
train_data = train_data.bfill().ffill()

In [2]:
print("Remaining missing values:", train_data.isna().sum().sum().compute())  # Should be 0 if all NaNs are filled

ValueError: All NaN partition encountered in `fillna`. Try using ``df.repartition`` to increase the partition size, or specify `limit` in `fillna`.