In [1]:
import dask.dataframe as dd

# Open the training parquet lazily through Dask so the full table is not
# pulled into memory up front.
data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'
train_data = dd.read_parquet(data_path)

# Draw a seeded 10% row sample for a first look; .compute() materialises the
# sampled rows as an in-memory pandas DataFrame.
sample_data = (
    train_data
    .sample(frac=0.1, random_state=42)
    .compute()
)


In [2]:
# Preview the first few rows of the sample
print(sample_data.head())

# Data type of each column
print(sample_data.dtypes)

# Structural summary of the sample (dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appends a stray "None" line.
# show_counts=True is passed explicitly because pandas leaves the non-null
# counts out by default once a frame exceeds the display.max_info_rows option,
# which a multi-million-row sample does.
sample_data.info(show_counts=True)


        date_id  time_id  symbol_id    weight  feature_00  feature_01  \
587075       65      332         38  1.351800         NaN         NaN   
757733       81      813         19  3.195052         NaN         NaN   
172256       21      503         10  1.109825         NaN         NaN   
757596       81      802         10  1.189921         NaN         NaN   
650440       71        8          8  1.632438         NaN         NaN   

        feature_02  feature_03  feature_04  feature_05  ...  responder_0  \
587075         NaN         NaN         NaN    0.315594  ...     0.256910   
757733         NaN         NaN         NaN   -1.569358  ...    -0.098848   
172256         NaN         NaN         NaN   -0.064054  ...     0.202636   
757596         NaN         NaN         NaN   -1.472570  ...     2.127164   
650440         NaN         NaN         NaN    0.621679  ...    -0.428931   

        responder_1  responder_2  responder_3  responder_4  responder_5  \
587075    -0.082456    -0.040

In [3]:
# Tally the NaN entries column by column across the sample
missing_values = sample_data.isnull().sum()
print("Missing values per column:\n", missing_values)

# Narrow the tally down to the columns that contain at least one NaN
missing_columns = missing_values.loc[missing_values.gt(0)]
print("Columns with missing values:\n", missing_columns)


Missing values per column:
 date_id              0
time_id              0
symbol_id            0
weight               0
feature_00      318231
                 ...  
responder_5          0
responder_6          0
responder_7          0
responder_8          0
partition_id         0
Length: 93, dtype: int64
Columns with missing values:
 feature_00    318231
feature_01    318231
feature_02    318231
feature_03    318231
feature_04    318231
feature_08     29889
feature_15    120995
feature_16        17
feature_17     20035
feature_18        14
feature_19        14
feature_21    843597
feature_26    843597
feature_27    843597
feature_31    843597
feature_32     47846
feature_33     47846
feature_37        94
feature_39    430378
feature_40      6782
feature_41    109176
feature_42    430378
feature_43      6782
feature_44    109176
feature_45     31506
feature_46     31506
feature_47        12
feature_50    425744
feature_51      1408
feature_52    104421
feature_53    425744
feature_54   

In [4]:
# Keep only the rows whose date_id is 85 or later (i.e. drop date_id 0-84)
sample_data = sample_data.loc[sample_data['date_id'].ge(85)]
print("Data after filtering the first 85 days:", sample_data.shape)


Data after filtering the first 85 days: (4633309, 93)


In [5]:
# Impute gaps in two passes over the frame in its current row order:
# first copy the next available value backward into each NaN, then copy the
# previous available value forward into whatever is still empty.
# NOTE(review): the fill is not grouped by symbol_id, and the backward pass
# takes values from later rows -- confirm this is acceptable for time-ordered data.
sample_data = sample_data.bfill()
sample_data = sample_data.ffill()

# Sanity check: total NaN count over the whole frame (expected to be 0)
print("Remaining missing values:", sample_data.isnull().sum().sum())  # Should be 0 if all NaNs are filled


Remaining missing values: 0


In [6]:
# Disabled draft kept as a bare triple-quoted string: nothing between the quotes
# is executed. Because the string is the cell's last expression, the notebook
# echoes its repr as the cell output.
# The draft repeats the date_id >= 85 filter and the bfill/ffill imputation on
# the full Dask frame instead of on a sample.
# NOTE(review): the draft contains no .compute() or write step, so as written it
# would presumably only build a lazy Dask graph -- confirm before enabling it.
''' Refined code to avoid sampling dataset, instead applying operations directly and finishing up.

import dask.dataframe as dd

# Load the entire dataset with Dask
train_data = dd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet')

# Filter out the first 85 days
train_data = train_data[train_data['date_id'] >= 85]

# Apply backward-fill and forward-fill to handle NaN values
train_data = train_data.bfill().ffill()
'''


" Refined code to avoid sampling dataset, instead applying operations directly and finishing up.\n\nimport dask.dataframe as dd\n\n# Load the entire dataset with Dask\ntrain_data = dd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet')\n\n# Filter out the first 85 days\ntrain_data = train_data[train_data['date_id'] >= 85]\n\n# Apply backward-fill and forward-fill to handle NaN values\ntrain_data = train_data.bfill().ffill()\n"