### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load

from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from API.app.NanHandler import NanHandlerTransformer
from API.app.feature_creator import FeatureCreator
new_feature_creator = FeatureCreator()


In [2]:
# def nan_handler(data):
#     data_copy = data.copy()
#     data_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
#     for column in data_copy.columns:
#         if data_copy[column].isnull().any():
#             data_copy[column].fillna(data_copy[column].median(), inplace=True)
#     return data_copy

nan_handler = NanHandlerTransformer()

Set visualization style

In [3]:
sns.set_theme(style='white', palette='viridis')

Set pandas display options

In [4]:
pd.set_option('display.max_rows', 100)
set_config(transform_output='pandas')
pd.options.mode.chained_assignment = None

### Downloading the data   

In [5]:
train_not_clean = pd.read_csv('train.csv').drop(['row_id', 'time_id'], axis=1)
test_not_clean = pd.read_csv('example_test_files/test.csv').drop(['row_id', 'time_id'], axis=1)

In [7]:
test_not_clean.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap'],
      dtype='object')

In [5]:
train = nan_handler.transform(train_not_clean)
test = nan_handler.transform(test_not_clean)

### Feature engineering

In [20]:
def create_features(data):
    # Time-based Features
    data['intraday_momentum'] = data['wap'].diff()  # Change in WAP between consecutive data points
    data['time_decay'] = data['seconds_in_bucket'] / (data['seconds_in_bucket'].max() + 1)

    # Price and Volume Imbalance Features
    data['bid_ask_spread'] = data['ask_price'] - data['bid_price']
    data['imbalance_ratio'] = data['imbalance_size'] / (data['matched_size'] + 1e-9)

    # Statistical Features
    data['wap_mean'] = data['wap'].rolling(window=5).mean()
    data['wap_std'] = data['wap'].rolling(window=5).std()

    # Relative Price Features
    data['price_vs_ma'] = data['wap'] / data['wap_mean']  # WAP relative to moving average

     # Auction Imbalance Indicators
    # Assuming imbalance_buy_sell_flag is already encoded appropriately

    # Lagged Features
    data['wap_lag_1'] = data['wap'].shift(1)

    # Non-linear Transformations
    data['log_bid_size'] = np.log1p(data['bid_size'])
    data['log_ask_size'] = np.log1p(data['ask_size'])

    # Remove any infinite values created by feature engineering
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    return data

train = create_features(train)
test = create_features(test)

# Fill any NaN values
train.fillna(method='bfill', inplace=True)
test.fillna(method='bfill', inplace=True)

  train.fillna(method='bfill', inplace=True)
  test.fillna(method='bfill', inplace=True)


In [6]:
X = train.drop(['target'], axis=1)
y = train['target']

In [9]:
# Data Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(new_feature_creator.transform(X))

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [10]:
# Initialize CatBoost Regressor
catboost_model = CatBoostRegressor(
    iterations=300,
    depth=8,
    learning_rate=0.05,
    loss_function='RMSE',
    verbose=False,
    random_seed=42,
    task_type='GPU'
)


In [11]:
# Train the pipeline on the training set
catboost_model.fit(X_scaled, y)

<catboost.core.CatBoostRegressor at 0x7f5885855610>

In [42]:
returned = pd.DataFrame(catboost_model.predict(create_features(scaler.fit_transform(nan_handler.transform(test_not_clean))))).rename(columns= {0: 'target'})

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [57]:
pd.DataFrame(catboost_model.predict(new_feature_creator.transform(scaler.fit_transform(nan_handler.transform(test_not_clean)))))

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


AttributeError: 'NoneType' object has no attribute 'fillna'

In [43]:
returned

Unnamed: 0,target
0,-4.833314
1,1.050859
2,-1.113951
3,-0.937054
4,-0.251894
...,...
32995,-1.174585
32996,-2.007723
32997,0.512451
32998,0.233574


In [12]:
dump(catboost_model, 'API/app/models/usable_model.joblib')

['API/app/models/usable_model.joblib']

## Grid Search

In [20]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
import numpy as np

# Assuming your feature creation and preprocessing functions are already defined

# Feature Engineering: Add new features or refine existing ones
X = create_features(X)
X = nan_handler(X)

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define a range of hyperparameters for tuning
params = {
    'iterations': [300, 500, 700],
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15]
}

# Initialize the CatBoost model
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    verbose=False,
    random_seed=42,
    task_type='GPU'
)

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=catboost_model, param_grid=params, scoring='neg_mean_absolute_error', cv=3)
grid_search.fit(X_scaled, y)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if

Best Parameters: {'depth': 8, 'iterations': 300, 'learning_rate': 0.05}
Best MAE: 6.301275240953573


## SUBMISSION

In [4]:
pre_trained = load('API/app/models/catboost_3.joblib')
predictions = pre_trained.predict(X_val)
evaluation_metric = mean_absolute_error(y_val, predictions)
print(f"Evaluation Metric (MAE): {evaluation_metric}")

AttributeError: Can't get attribute 'nan_handler' on <module '__main__'>

In [45]:
submission_csv = pd.concat([pd.read_csv('example_test_files/sample_submission.csv').drop('target', axis=1),
 pd.DataFrame(pre_trained.predict(test_not_clean)).rename(columns = {0:'target'})
], axis = 1)

In [46]:
submission_csv.to_csv('submissions/submission1.csv')