In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, parquet file I/O (e.g. pd.read_parquet)
import matplotlib.pyplot as plt
import seaborn as sns

# Turn on the Intel(R) Extension for Scikit-learn; the message printed in this
# cell's output confirms that the patch was applied.
# NOTE(review): presumably this must run before any `sklearn` import elsewhere
# in the notebook for the patch to take effect — confirm against the sklearnex docs.
from sklearnex import patch_sklearn
patch_sklearn()

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
/kaggle/input/jane-street-real-time-market

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Loading Dataset
Given the size of the dataset and the memory limits of a Kaggle kernel, the data is loaded one partition file at a time and then concatenated into a single dataframe. In addition, a memory-reducing function casts every integer and float column to the smallest data type that can hold the column's range of values. Integer values are preserved exactly; floating-point values may lose some precision when stored in a narrower type, but this downside should not affect training by much.

In [2]:
def reduce_memory_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place (columns are reassigned) and also returned,
    so both ``reduce_memory_usage(df)`` and ``df = reduce_memory_usage(df)`` work.

    Only plain NumPy signed-integer, unsigned-integer and float columns are
    touched. Everything else (bool, object/string, category, datetime, pandas
    nullable extension dtypes, ...) is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    float16_as32 : bool, default True
        When true, float columns are never stored as float16: float32 is the
        narrowest float type used (a float16 input column is promoted to
        float32). When false, float16 is used wherever the column's min/max
        fit in its range, at the cost of precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    if float16_as32:
        float_types = (np.float32, np.float64)
    else:
        float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Decide by dtype kind rather than by the dtype's string name, so that
        # bool / unsigned / extension columns are not mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()
        no_values = pd.isna(c_min) or pd.isna(c_max)  # empty or all-NaN column

        if col_type.kind == 'f':
            if no_values:
                # Nothing to preserve: the narrowest allowed float holds NaN fine.
                target = float_types[0]
            else:
                lo, hi = float(c_min), float(c_max)
                # Inclusive bounds. Infinite values fit no finite range, so such
                # columns fall through with target=None and keep their dtype.
                target = next(
                    (t for t in float_types
                     if float(np.finfo(t).min) <= lo and hi <= float(np.finfo(t).max)),
                    None,
                )
        else:
            if no_values:
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            target = next(
                (t for t in candidates
                 if np.iinfo(t).min <= lo and hi <= np.iinfo(t).max),
                None,
            )

        if target is not None and np.dtype(target) != col_type:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f}MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}% \n'.format(decrease))

    return df

In [None]:
# Per-partition frames are collected here before being stitched together
samples = []

# Read each of the ten train partitions in turn and shrink its dtypes right away
for i in range(10):
    file_path = f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet"
    print('Processing file: ', file_path)
    # reduce_memory_usage hands back the frame it was given, downcast in place
    chunk = reduce_memory_usage(pd.read_parquet(file_path), False)

    # The whole partition is kept; to subsample instead, append something like
    # chunk.sample(n=50000, random_state=14)
    samples.append(chunk)

# Stack all partitions into one DataFrame with a fresh 0..n-1 index
df = pd.concat(samples, ignore_index=True)

Processing file:  /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet
Memory usage of dataframe is 654.51 MB
Memory usage after optimization is: 435.72MB
Decreased by 33.4% 

Processing file:  /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
Memory usage of dataframe is 944.04 MB
Memory usage after optimization is: 548.24MB
Decreased by 41.9% 

Processing file:  /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
Memory usage of dataframe is 1022.35 MB
Memory usage after optimization is: 593.72MB
Decreased by 41.9% 

Processing file:  /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
Memory usage of dataframe is 1352.24 MB
Memory usage after optimization is: 693.36MB
Decreased by 48.7% 

Processing file:  /kaggle/input/jane-street-real-time-market-data-forecasting/train.parque

In [None]:
# Remove the column display limit so wide frames are shown in full rather than truncated
pd.options.display.max_columns = None

In [None]:
# Columns matching 'responder_' followed by any character other than '6' are
# removed, which leaves responder_6 in place
other_responders = df.filter(regex='responder_[^6]').columns
df = df.drop(columns=other_responders)
df.head()

In [None]:
# Report how many rows the combined frame holds
row_count = df.shape[0]
print(f"Total number of rows: {row_count}")

In [None]:
# Number of distinct 'date_id' values, and that count expressed in years of 365.25 days
# NOTE(review): if date_id only counts trading days, dividing by 365.25 understates
# the calendar span — confirm what date_id represents.
unique_dates = df['date_id'].nunique()
years_spanned = unique_dates / 365.25

print(f"Number of unique days (date_id): {unique_dates}")
print(f"Number of years: {years_spanned:.2f}")

In [None]:
# Heat map of the raw values. Passing the whole concatenated frame to imshow would
# require a dense array copy of a multi-gigabyte table, and the default equal
# aspect ratio would squash it into an unreadable sliver. Draw an evenly spaced
# subset of rows instead and let the axes stretch to fill the figure.
heatmap_max_rows = 2000
heatmap_step = max(1, len(df) // heatmap_max_rows)
heatmap_values = (
    df.iloc[::heatmap_step]
    .select_dtypes(include='number')   # imshow needs purely numeric data
    .to_numpy(dtype=np.float32)        # float32 keeps NaNs and avoids float16 image data
)

fig, ax = plt.subplots(figsize=(12, 6))
image = ax.imshow(heatmap_values, cmap='hot', interpolation='nearest', aspect='auto')
ax.set(
    title=f"Raw values, every {heatmap_step}th row",
    xlabel="column position",
    ylabel="sampled row",
)
fig.colorbar(image, ax=ax)
plt.show()