### Importing the libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import joblib

import seaborn as sns
from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor


Set visualization style

In [2]:
sns.set_theme(style='white', palette='viridis')

Set pandas display options

In [3]:
pd.set_option('display.max_rows', 100)
set_config(transform_output='pandas')
pd.options.mode.chained_assignment = None

Change data type for memory optimization

In [4]:
dtypes = {
    'stock_id': np.uint8,
    'date_id': np.uint16,
    'seconds_in_bucket': np.uint16,
    'imbalance_buy_sell_flag': np.int8,
    'time_id': np.uint16,
}

### Downloading the data   

In [47]:
train = pd.read_csv('train.csv', dtype=dtypes).drop(['row_id', 'time_id'], axis=1)
test = pd.read_csv('example_test_files/test.csv', dtype=dtypes).drop(['row_id', 'time_id'], axis=1)
train.shape

# Memory cleanup
gc.collect()

517

## Handling missing data

In [11]:
missing_values = train.isnull().sum()
missing_percentage = (missing_values / len(train)) * 100
print("\nPercentage of Missing Values in Each Column:\n", missing_percentage)


Percentage of Missing Values in Each Column:
 stock_id                    0.000000
date_id                     0.000000
seconds_in_bucket           0.000000
imbalance_size              0.004200
imbalance_buy_sell_flag     0.000000
reference_price             0.004200
matched_size                0.004200
far_price                  55.256836
near_price                 54.547364
bid_price                   0.004200
bid_size                    0.000000
ask_price                   0.004200
ask_size                    0.000000
wap                         0.004200
target                      0.001680
dtype: float64


Handling missing values using median imputation

In [12]:
for column in train.columns:
    if train[column].isnull().any():
        train[column].fillna(train[column].median(), inplace=True)

Same for test set

In [42]:
missing_values = test.isnull().sum()
missing_percentage = (missing_values / len(test)) * 100
print("\nPercentage of Missing Values in Each Column:\n", missing_percentage)

for column in test.columns:
    if test[column].isnull().any():
        test[column].fillna(test[column].median(), inplace=True)


Percentage of Missing Values in Each Column:
 stock_id                    0.000000
date_id                     0.000000
seconds_in_bucket           0.000000
imbalance_size              0.000000
imbalance_buy_sell_flag     0.000000
reference_price             0.000000
matched_size                0.000000
far_price                  55.239394
near_price                 54.545455
bid_price                   0.000000
bid_size                    0.000000
ask_price                   0.000000
ask_size                    0.000000
wap                         0.000000
dtype: float64


In [35]:
def nan_handler(data):
    data_copy = data.copy()
    # Replace infinite values with NaN
    data_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
    for column in data_copy.columns:
        if data_copy[column].isnull().any():
            data_copy[column].fillna(data_copy[column].median(), inplace=True)
    return data_copy

### Feature engineering

In [14]:
def create_features(data):
    # Imbalance features
    data['bid_ask_size_imbalance'] = data['bid_size'] - data['ask_size']
    data['matched_imbalance_ratio'] = (data['imbalance_size'] - data['matched_size']) / (data['matched_size'] + data['imbalance_size'])

    # Price differences and ratios
    price_cols = ['ask_price', 'bid_price', 'far_price', 'near_price']
    for i in range(len(price_cols)):
        for j in range(i + 1, len(price_cols)):
            col1, col2 = price_cols[i], price_cols[j]
            data[f'{col1}_{col2}_diff'] = data[col1] - data[col2]
            data[f'{col1}_{col2}_ratio'] = data[col1] / data[col2]

    # Statistical features
    data['price_mean'] = data['wap'].rolling(window=10).mean()
    data['price_std'] = data['wap'].rolling(window=10).std()

    # Time-based features
    data['time_of_day'] = data['seconds_in_bucket'] // 60  # Convert seconds to minutes

    return data

# Apply the feature engineering to your datasets
train = create_features(train)
test = create_features(test)

list(train)

['stock_id',
 'date_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap',
 'target',
 'bid_ask_size_imbalance',
 'matched_imbalance_ratio',
 'ask_price_bid_price_diff',
 'ask_price_bid_price_ratio',
 'ask_price_far_price_diff',
 'ask_price_far_price_ratio',
 'ask_price_near_price_diff',
 'ask_price_near_price_ratio',
 'bid_price_far_price_diff',
 'bid_price_far_price_ratio',
 'bid_price_near_price_diff',
 'bid_price_near_price_ratio',
 'far_price_near_price_diff',
 'far_price_near_price_ratio',
 'price_mean',
 'price_std',
 'time_of_day']

In [43]:
# Replace infinite values with NaN
train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values (if not already done)
for column in train.columns:
    train[column].fillna(train[column].median(), inplace=True)
for column in test.columns:
    test[column].fillna(test[column].median(), inplace=True)

In [16]:
# Define the number of splits for TimeSeriesSplit
n_splits = 5
tss = TimeSeriesSplit(n_splits=n_splits)

In [38]:
# Define your feature engineering transformer
FeatureEngineering = FunctionTransformer(create_features)
Feature_cleaning = FunctionTransformer(nan_handler)

# Initialize the CatBoost Regressor
catboost_model = CatBoostRegressor(
    task_type='CPU',  # remove if not using GPU
    verbose=False,
    random_state=42  # for reproducibility
)

# Create a pipeline
pipeline = Pipeline([
    ('feature_engineering', FeatureEngineering),
    ('regressor', catboost_model)
])

In [37]:


X = train[['stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'bid_ask_size_imbalance', 'matched_imbalance_ratio', 'ask_price_bid_price_diff', 'ask_price_bid_price_ratio', 'ask_price_far_price_diff', 'ask_price_far_price_ratio', 'ask_price_near_price_diff', 'ask_price_near_price_ratio', 'bid_price_far_price_diff', 'bid_price_far_price_ratio', 'bid_price_near_price_diff', 'bid_price_near_price_ratio', 'far_price_near_price_diff', 'far_price_near_price_ratio', 'price_mean', 'price_std', 'time_of_day']]
y = train['target']


# Proceed with TimeSeriesSplit and model training
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import numpy as np

# Define TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=5)
validation_scores = []
training_scores = []

for train_index, val_index in tss.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    pipeline.fit(X_train, y_train)
    val_predictions = pipeline.predict(X_val)
    train_predictions = pipeline.predict(X_train)

    val_score = mean_absolute_error(y_val, val_predictions)
    train_score = mean_absolute_error(y_train, train_predictions)

    validation_scores.append(val_score)
    training_scores.append(train_score)

# Calculate and print average scores
avg_val_score = np.mean(validation_scores)
avg_train_score = np.mean(training_scores)

print(f"Average Training MAE: {avg_train_score}")
print(f"Average Validation MAE: {avg_val_score}")

# Train the final model on the entire dataset
final_model = pipeline.fit(X, y)

KeyError: "['bid_ask_size_imbalance', 'matched_imbalance_ratio', 'ask_price_bid_price_diff', 'ask_price_bid_price_ratio', 'ask_price_far_price_diff', 'ask_price_far_price_ratio', 'ask_price_near_price_diff', 'ask_price_near_price_ratio', 'bid_price_far_price_diff', 'bid_price_far_price_ratio', 'bid_price_near_price_diff', 'bid_price_near_price_ratio', 'far_price_near_price_diff', 'far_price_near_price_ratio', 'price_mean', 'price_std', 'time_of_day'] not in index"

Concatenate with data cleaning 

In [39]:
final_model_to_save = Pipeline([
    ('nan_handling', Feature_cleaning),
    ('trained_model', final_model)

])

In [41]:
final_model_to_save.predict(test)

array([-2.37462619, -0.38397499,  2.40504068, ...,  0.34452756,
        1.66055261, -1.70658602])

SAVE THE PRE TRAINED PIPELINE

In [46]:
joblib.dump(final_model_to_save, 'trained_pipeline.pkl')


['trained_pipeline.pkl']

In [48]:
loaded_model = joblib.load('trained_pipeline.pkl')


LOAD THE PRE TRAINED MODEL

In [49]:
loaded_model.predict(test)

array([-2.37462619, -0.38397499,  2.40504068, ...,  0.34452756,
        1.66055261, -1.70658602])