In [None]:
import dask.dataframe as dd

# Location of the training parquet data inside the Kaggle input directory
data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Open the dataset through Dask (memory-efficient) instead of reading it straight into pandas
train_data = dd.read_parquet(data_path)

# Draw a seeded 10% random sample for a first look; .compute() turns it into a pandas object
sample_data = (
    train_data
    .sample(random_state=42, frac=0.1)
    .compute()
)


In [None]:

# Structural overview of the sample.
# Two further checks are left disabled from earlier exploration:
#   print(sample_data.head())   -> first few rows of the sample
#   print(sample_data.info())   -> general information, including non-null counts

# Data type of each column
column_dtypes = sample_data.dtypes
print(column_dtypes)


In [None]:
# Row count of the full dataset, taken once and reused in the comparison below
total_rows = len(train_data)

# NaN count per column; .compute() materialises the Dask result as concrete pandas values
missing_values = train_data.isna().sum().compute()

# A column is entirely NaN exactly when its NaN count equals the number of rows
all_nan_columns = missing_values.loc[missing_values.eq(total_rows)].index
print("Columns with all NaN values:", all_nan_columns.tolist())

In [None]:
# Check what percentage of each column consists of NaN values.
# This should help determine which columns are causing the NaN-fill error.

import pandas as pd

# Percentage of NaN entries in each column of the full dataset
# (.compute() materialises the Dask result as concrete pandas values)
nan_percentage = train_data.isna().mean().compute() * 100

# Sort columns by NaN percentage in descending order (highest NaN percentage at the top)
nan_percentage_sorted = nan_percentage.sort_values(ascending=False)

# Display every column without row truncation. option_context lifts the limit only
# for this print and restores the previous setting afterwards, even if the print
# raises; reset_option would instead fall back to the pandas default and discard
# any value that had been configured earlier in the session.
with pd.option_context('display.max_rows', None):
    print("NaN percentage per column (sorted):\n", nan_percentage_sorted)

# Output pasted from an earlier run, kept for reference. It is stored as comments
# rather than as a bare string so that the cell does not echo it as its result.
#
# NaN percentage per column (sorted):
#  feature_26      17.900406
# feature_21      17.900406
# feature_27      17.900406
# feature_31      17.900406
# feature_42       9.125593
# feature_39       9.125593
# feature_50       9.026816
# feature_53       9.026816
# feature_00       6.752030
# feature_01       6.752030
# feature_02       6.752030
# feature_03       6.752030
# feature_04       6.752030
# feature_15       2.566024
# feature_41       2.319274
# feature_44       2.319274
# feature_52       2.217180
# feature_55       2.217180
# feature_74       1.026493
# feature_73       1.026493


In [None]:

# NaN count for every column of the pandas sample.
# NOTE(review): this rebinds `missing_values`, which an earlier cell computed over
# the full dataset; from here on the name refers to the sample counts.
missing_values = sample_data.isnull().sum()
print("Missing values per column:\n", missing_values)

# Narrow the report to the columns that contain at least one NaN
missing_columns = missing_values.loc[missing_values.gt(0)]
print("Columns with missing values:\n", missing_columns)


In [None]:
# Drop the first 85 days: keep only the rows whose date_id is 85 or greater
sample_data = sample_data.loc[sample_data['date_id'].ge(85)]
print("Data after filtering the first 85 days:", sample_data.shape)


In [None]:
# Fill gaps in two passes: backward-fill first, then forward-fill whatever is left.
# NOTE(review): backward-fill copies values from later rows into earlier ones --
# confirm that this is acceptable for the intended use of the data.
sample_data = sample_data.bfill()
sample_data = sample_data.ffill()

# Total NaN count over the whole frame; it should be 0 if all NaNs were filled
remaining_nan_total = sample_data.isna().sum().sum()
print("Remaining missing values:", remaining_nan_total)


In [None]:
# Refined code: instead of cleaning a pre-drawn sample, apply the operations directly to the full dataset (a sample is drawn only to estimate the medians).

import dask.dataframe as dd

# Re-open the entire training set with Dask
train_data = dd.read_parquet(
    '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'
)

# Filter out the first 85 days by keeping only the rows with date_id >= 85
from_day_85 = train_data['date_id'] >= 85
train_data = train_data[from_day_85]
# Step 1: helper run on each partition to remove rows that are empty in every column
def drop_empty_partitions(df):
    """Return ``df`` without the rows whose values are all NaN.

    Despite the name, this filters rows inside the frame it is given; it does
    not discard whole partitions. Rows with at least one non-NaN value are kept.
    """
    without_blank_rows = df.dropna(axis=0, how='all')
    return without_blank_rows

# Step 1 (continued): run the row-dropping helper on every partition
train_data = train_data.map_partitions(drop_empty_partitions)

# Step 2: Backward-fill, then forward-fill, each capped at 5000 consecutive rows,
# so short gaps are filled while larger missing sections are left for Step 4
train_data = train_data.bfill(limit=5000).ffill(limit=5000)

# Step 3: Sample 10% of the data to compute medians.
# The sample is seeded (same seed as the initial inspection sample) so that the
# medians, and therefore the values written in Step 4, are reproducible across runs.
sample_data = train_data.sample(frac=0.1, random_state=42).compute()

# Select only numeric columns and take one median per column
numeric_columns = sample_data.select_dtypes(include=['number'])
medians = numeric_columns.median()

# Step 4: Fill remaining NaNs using column-specific medians
train_data = train_data.fillna(medians.to_dict())

# Optional: total NaN count over the whole frame after all fills
remaining_na = train_data.isna().sum().sum().compute()
print("Remaining NaNs after fill:", remaining_na)