In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from joblib import dump, load

from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool

In [5]:
def nan_handler(data):
    """Return a cleaned copy of ``data`` with infinities removed and gaps imputed.

    Every ``+inf`` / ``-inf`` is first converted to NaN. Then, for each numeric
    column that contains at least one NaN, the missing entries are replaced by
    that column's median (computed on the frame being cleaned). Non-numeric
    columns are left untouched because a median is not defined for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.
    """
    # ``replace`` without ``inplace`` already returns a new frame, so the
    # caller's data stays intact.
    data_copy = data.replace([np.inf, -np.inf], np.nan)

    numeric_columns = data_copy.select_dtypes(include=[np.number]).columns
    for column in numeric_columns:
        values = data_copy[column]
        if values.isnull().any():
            # Assign the filled column back explicitly: calling
            # ``fillna(..., inplace=True)`` on ``frame[column]`` is chained
            # assignment and does not reliably update the parent frame.
            data_copy[column] = values.fillna(values.median())
    return data_copy

In [6]:
def create_features(data, window=10):
    """Build the engineered feature columns on a copy of ``data``.

    Added columns:

    * ``bid_ask_size_imbalance`` - ``bid_size - ask_size``.
    * ``matched_imbalance_ratio`` -
      ``(imbalance_size - matched_size) / (matched_size + imbalance_size)``.
    * ``<a>_<b>_diff`` and ``<a>_<b>_ratio`` for every unordered pair taken from
      ``ask_price``, ``bid_price``, ``far_price``, ``near_price``.
    * ``price_mean`` / ``price_std`` - rolling mean and standard deviation of
      ``wap`` over ``window`` consecutive rows.
    * ``time_of_day`` - ``seconds_in_bucket // 60``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame holding the columns listed above. It is not modified.
    window : int, default 10
        Number of consecutive rows used for the rolling ``wap`` statistics.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the feature columns appended.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect
    # (the function is also invoked through FunctionTransformer inside a
    # Pipeline, where silently growing the input is surprising).
    data = data.copy()

    # Imbalance features
    data['bid_ask_size_imbalance'] = data['bid_size'] - data['ask_size']
    data['matched_imbalance_ratio'] = (
        (data['imbalance_size'] - data['matched_size'])
        / (data['matched_size'] + data['imbalance_size'])
    )

    # Price differences and ratios for every unordered pair of price columns
    price_cols = ['ask_price', 'bid_price', 'far_price', 'near_price']
    for i, col1 in enumerate(price_cols):
        for col2 in price_cols[i + 1:]:
            data[f'{col1}_{col2}_diff'] = data[col1] - data[col2]
            data[f'{col1}_{col2}_ratio'] = data[col1] / data[col2]

    # Statistical features; the first ``window - 1`` rows are NaN.
    # NOTE(review): the window runs over consecutive rows of the frame as
    # given, with no grouping. If rows of different stock_id values are
    # interleaved, each window mixes them - confirm this is intended.
    wap_rolling = data['wap'].rolling(window=window)
    data['price_mean'] = wap_rolling.mean()
    data['price_std'] = wap_rolling.std()

    # Time-based features: integer division of seconds_in_bucket by 60
    data['time_of_day'] = data['seconds_in_bucket'] // 60

    return data

In [16]:


# CatBoost regressor trained on the GPU, silent, with a fixed seed for
# reproducibility.
catboost_model = CatBoostRegressor(task_type='GPU', verbose=False, random_state=42)

# Wrap the notebook's helper functions as scikit-learn transformers.
Feature_cleaning = FunctionTransformer(nan_handler)
FeatureEngineering = FunctionTransformer(create_features)

# Training pipeline: feature engineering followed by the regressor.
# Feature_cleaning is not one of its steps.
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering),
    ('regressor', catboost_model),
])




In [8]:
# Explicit integer dtypes for the id/flag columns, passed to pd.read_csv.
dtypes = dict(
    stock_id=np.uint8,
    date_id=np.uint16,
    seconds_in_bucket=np.uint16,
    imbalance_buy_sell_flag=np.int8,
    time_id=np.uint16,
)

In [14]:
# Read the raw CSVs with the compact dtypes, then discard the two identifier
# columns that are not used as features.
train_not_clean = pd.read_csv('train.csv', dtype=dtypes)
train_not_clean = train_not_clean.drop(columns=['row_id', 'time_id'])

test_not_clean = pd.read_csv('example_test_files/test.csv', dtype=dtypes)
test_not_clean = test_not_clean.drop(columns=['row_id', 'time_id'])


In [15]:
# Clean both raw frames with nan_handler; each frame is imputed with medians
# taken from that same frame.
train, test = (nan_handler(raw_frame) for raw_frame in (train_not_clean, test_not_clean))

In [None]:
# Split the cleaned training frame into features and target.
X = train.drop(columns='target')
y = train['target']

# Forward-chaining cross-validation: each fold trains on an initial run of
# rows and validates on the rows that follow it.
# NOTE(review): this is only a chronological split if `train` is sorted by
# time - confirm the row order of train.csv.
tss = TimeSeriesSplit(n_splits=5)
validation_scores = []
training_scores = []

for train_index, val_index in tss.split(X):
    # TimeSeriesSplit yields integer *positions*, so both X and y must be
    # indexed with .iloc. Plain y[...] is a label lookup and is only correct
    # while y happens to carry a default RangeIndex.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # The same pipeline object is refitted from scratch on every fold.
    pipeline.fit(X_train, y_train)
    val_predictions = pipeline.predict(X_val)
    train_predictions = pipeline.predict(X_train)

    val_score = mean_absolute_error(y_val, val_predictions)
    train_score = mean_absolute_error(y_train, train_predictions)

    validation_scores.append(val_score)
    training_scores.append(train_score)

# Calculate and print average scores across the folds
avg_val_score = np.mean(validation_scores)
avg_train_score = np.mean(training_scores)

print(f"Average Training MAE: {avg_train_score}")
print(f"Average Validation MAE: {avg_val_score}")



In [18]:
# Re-display the fold-averaged MAE values, one per line.
print(
    f"Average Training MAE: {avg_train_score}",
    f"Average Validation MAE: {avg_val_score}",
    sep="\n",
)

Average Training MAE: 6.073920117423537
Average Validation MAE: 6.5992352416445375


In [19]:
# Refit the pipeline on every available row. Pipeline.fit returns the pipeline
# itself, so final_model is another name for the same fitted object.
pipeline.fit(X, y)
final_model = pipeline

In [21]:
# Inference pipeline: clean the raw input with nan_handler first, then hand it
# to the already-fitted training pipeline (which adds features and predicts).
pre_trained_pipeline = Pipeline(steps=[
    ('feature_cleaning', Feature_cleaning),
    ('pre_trained_model', final_model),
])

In [22]:
# Smoke test: run the inference pipeline on the raw (uncleaned) test frame;
# cleaning and feature engineering happen inside the pipeline.
pre_trained_pipeline.predict(test_not_clean)

array([-1.87684567, -0.21027406,  2.02085565, ..., -0.26712981,
        1.24411629, -1.8548337 ])

In [25]:
# Persist the inference pipeline with joblib for use outside this notebook.
# NOTE(review): the FunctionTransformer steps wrap nan_handler and
# create_features, which are defined in this notebook; presumably the process
# that loads this file must be able to resolve those functions - confirm.
dump(pre_trained_pipeline, 'API/app/models/catboost_2.joblib')

['API/app/models/catboost_2.joblib']