In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, parquet file I/O (e.g. pd.read_parquet)
import matplotlib.pyplot as plt
import seaborn as sns

# from sklearnex import patch_sklearn
# patch_sklearn()

# Selects where the training parquet partitions are read from: True uses a path
# relative to the working directory, False uses the /kaggle/input/ mount.
IS_LOCAL = True

# Loading Dataset
Given the size of the dataset and the memory limits of Kaggle kernels, the data is loaded into the DataFrame in chunks. In addition, I use a memory-reducing function that casts every float and int column to the most space-efficient data type that can still hold its values. (The precision of floating-point values may be reduced, but this downside should have little effect on training.)

In [2]:
def reduce_memory_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` in place and report the memory saved.

    Only plain NumPy signed-integer, unsigned-integer and floating-point columns
    are touched. Every other column (object, category, bool, datetime, pandas
    extension dtypes, ...) is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    float16_as32 : bool, default True
        When True, float columns whose values would fit in float16 are stored
        as float32 instead, trading memory for precision. When False they are
        stored as float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Integer columns keep their signedness and move to the narrowest integer
    type whose range contains the column's min and max (bounds inclusive).
    Float columns are chosen by value range only, so precision can be lost.
    Columns without any non-null value are left unchanged.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types ordered narrowest first; the first one that fits wins.
    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column. Bool,
        # datetime, object, category and extension dtypes must not be coerced
        # to floats just because they are "not int".
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()

        # min()/max() are NaN for empty or all-NaN columns: there is no range
        # to reason about, so leave the column alone.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        if col_type.kind in 'iu':
            ladder = signed_ladder if col_type.kind == 'i' else unsigned_ladder
            for candidate in ladder:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            half, single = np.finfo(np.float16), np.finfo(np.float32)
            if half.min <= c_min and c_max <= half.max:
                target = np.float32 if float16_as32 else np.float16
            elif single.min <= c_min and c_max <= single.max:
                target = np.float32
            else:
                # Includes columns containing +/-inf, which stay at float64.
                target = np.float64
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f}MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}% \n'.format(saved_pct))

    return df

In [3]:
# The partition files use the same relative layout locally and on Kaggle; only
# the root differs, so pick the path template once up front.
if IS_LOCAL:
    path_template = "jane-street-real-time-market-data-forecasting/train.parquet/partition_id={}/part-0.parquet"
else:
    path_template = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={}/part-0.parquet"

# Collects one full DataFrame per partition file (partitions 0 through 9).
samples = []

for i in range(10):
    file_path = path_template.format(i)
    print('Processing file: ', file_path)

    chunk = pd.read_parquet(file_path)
    # Optional memory savers, currently disabled:
    #   reduce_memory_usage(chunk, False)                      # downcast dtypes
    #   chunk = chunk.sample(n=50000, random_state=14)         # keep a row sample
    samples.append(chunk)

# Stack the partitions into one frame with a fresh 0..n-1 index.
df = pd.concat(samples, ignore_index=True)

Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=7/part-0.parquet
Processing file:  jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
Processing file:  jane-stree

In [4]:
# Show every column when a frame is displayed; without this pandas truncates wide frames.
pd.options.display.max_columns = None

In [5]:
# Disabled filter: the regex 'responder_[^6]' matches every responder column
# except responder_6, so enabling this line would drop those columns.
# df = df[df.columns.drop(list(df.filter(regex='responder_[^6]')))]
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
0,0,0,1,3.889038,,,,,,0.851033,0.242971,0.2634,-0.891687,11,7,76,-0.883028,0.003067,-0.744703,,-0.169586,,-1.335938,-1.707803,0.91013,,1.636431,1.522133,-1.551398,-0.229627,,,1.378301,-0.283712,0.123196,,,,0.28118,0.269163,0.349028,-0.012596,-0.225932,,-1.073602,,,-0.181716,,,,0.564021,2.088506,0.832022,,0.204797,,,-0.808103,,-2.037683,0.727661,,-0.989118,-0.345213,-1.36224,,,,,,-1.251104,-0.110252,-0.491157,-1.02269,0.152241,-0.659864,,,-0.261412,-0.211486,-0.335556,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504
1,0,0,7,1.370613,,,,,,0.676961,0.151984,0.192465,-0.521729,11,7,76,-0.865307,-0.225629,-0.582163,,0.317467,,-1.250016,-1.682929,1.412757,,0.520378,0.744132,-0.788658,0.641776,,,0.2272,0.580907,1.128879,,,,-1.512286,-1.414357,-1.823322,-0.082763,-0.184119,,,,,,,,,-10.835207,-0.002704,-0.621836,,1.172836,,,-1.625862,,-1.410017,1.063013,,0.888355,0.467994,-1.36224,,,,,,-1.065759,0.013322,-0.592855,-1.052685,-0.393726,-0.741603,,,-0.281207,-0.182894,-0.245565,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639
2,0,0,9,2.285698,,,,,,1.056285,0.187227,0.249901,-0.77305,11,7,76,-0.675719,-0.199404,-0.586798,,-0.814909,,-1.296782,-2.040234,0.639589,,1.597359,0.657514,-1.350148,0.364215,,,-0.017751,-0.317361,-0.122379,,,,-0.320921,-0.95809,-2.436589,0.070999,-0.245239,,,,,,,,,-1.420632,-3.515137,-4.67776,,0.535897,,,-0.72542,,-2.29417,1.764551,,-0.120789,-0.063458,-1.36224,,,,,,-0.882604,-0.072482,-0.617934,-0.86323,-0.241892,-0.709919,,,0.377131,0.300724,-0.106842,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828
3,0,0,10,0.690606,,,,,,1.139366,0.273328,0.306549,-1.262223,42,5,150,-0.694008,3.004091,0.114809,,-0.251882,,-1.902009,-0.979447,0.241165,,-0.392359,-0.224699,-2.129397,-0.855287,,,0.404142,-0.578156,0.105702,,,,0.544138,-0.087091,-1.500147,-0.201288,-0.038042,,,,,,,,,0.382074,2.669135,0.611711,,2.413415,,,1.313203,,-0.810125,2.939022,,3.988801,1.834661,-1.36224,,,,,,-0.697595,1.074309,-0.206929,-0.530602,4.765215,0.571554,,,-0.226891,-0.251412,-0.215522,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516
4,0,0,14,0.44057,,,,,,0.9552,0.262404,0.344457,-0.613813,44,3,16,-0.947351,-0.030018,-0.502379,,0.646086,,-1.844685,-1.58656,-0.182024,,-0.969949,-0.673813,-1.282132,-1.399894,,,0.043815,-0.320225,-0.031713,,,,-0.08842,-0.995003,-2.635336,-0.196461,-0.618719,,,,,,,,,-2.0146,-2.321076,-3.711265,,1.253902,,,0.476195,,-0.771732,2.843421,,1.379815,0.411827,-1.36224,,,,,,-0.948601,-0.136814,-0.447704,-1.141761,0.099631,-0.661928,,,3.678076,2.793581,2.61825,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0


In [6]:
# Summary statistics for the columns of the combined frame.
df.describe()

: 

Based on the training dataset, some of the notable traits I found are:
- There are 79 features in total, named feature_xx, where xx runs from 00 to 78
- There are 9 responders in total, named responder_x, where x runs from 0 to 8 (the responder that matters for this competition is responder_6)
- There are 47,127,338 rows
- There are 1,699 unique days

In [9]:
# Report how many rows the combined frame holds
row_count = len(df)
print("Total number of rows: {}".format(row_count))

Total number of rows: 47127338


In [10]:
# Tally the distinct date_id values, then express that day count in years
# using 365.25 days per year
unique_dates = df['date_id'].nunique()
approx_years = unique_dates / 365.25

print("Number of unique days (date_id): {}".format(unique_dates))
print("Number of years: {:.2f}".format(approx_years))

Number of unique days (date_id): 1699
Number of years: 4.65


In [11]:
# Drawing the entire frame at once needs a dense copy of every row and, with
# imshow's default equal aspect, squeezes a very tall, narrow matrix into an
# unreadable sliver. Plot a bounded, evenly spaced subset of rows instead and
# let the image stretch to fill the axes.
HEATMAP_MAX_ROWS = 2_000
row_step = max(1, len(df) // HEATMAP_MAX_ROWS)
heatmap_rows = df.iloc[::row_step].head(HEATMAP_MAX_ROWS)

fig, ax = plt.subplots(figsize=(12, 6))
image = ax.imshow(
    heatmap_rows.to_numpy(dtype=float),  # NaNs are kept and drawn as blank cells
    cmap='hot',
    interpolation='nearest',
    aspect='auto',
)
ax.set(
    title=f"Raw values heatmap (every {row_step}th row, {len(heatmap_rows)} rows shown)",
    xlabel="Column position",
    ylabel="Sampled row",
)
fig.colorbar(image, ax=ax)
plt.show()

: 