In [1]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error



In [2]:
import pandas as pd

# Read the pre-processed dataset from the Kaggle input directory.
df_2 = pd.read_parquet(path='/kaggle/input/df-2-final-process/df_2.parquet')

# Show the first five rows as a quick sanity check.
df_2.head(5)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,far_price,near_price,bid_price,ask_price,...,auction_signal_strength_mean_0_300,auction_signal_strength_std_0_300,stock_vs_index_wap_ratio_min_0_300,stock_vs_index_wap_ratio_max_0_300,stock_vs_index_wap_ratio_mean_0_300,stock_vs_index_wap_ratio_std_0_300,spread_min_0_300,spread_max_0_300,spread_mean_0_300,spread_std_0_300
0,0,0,300,0.0,0,1.000241,1.000241,1.000241,1.000026,1.000241,...,3.3e-05,0.000153,0.999314,1.000355,0.999812,0.000225,0.000107,0.000535,0.000171,9.1e-05
1,0,0,310,0.0,0,0.999919,0.999919,0.999919,0.999812,0.999919,...,3.3e-05,0.000153,0.999314,1.000355,0.999812,0.000225,0.000107,0.000535,0.000171,9.1e-05
2,0,0,320,0.0,0,0.999919,0.999919,0.999919,0.999705,0.999919,...,3.3e-05,0.000153,0.999314,1.000355,0.999812,0.000225,0.000107,0.000535,0.000171,9.1e-05
3,0,0,330,0.0,0,0.999812,0.999812,0.999812,0.999705,0.999812,...,3.3e-05,0.000153,0.999314,1.000355,0.999812,0.000225,0.000107,0.000535,0.000171,9.1e-05
4,0,0,340,0.0,0,0.999491,0.999491,0.999491,0.999169,0.999383,...,3.3e-05,0.000153,0.999314,1.000355,0.999812,0.000225,0.000107,0.000535,0.000171,9.1e-05


In [3]:

# Remove the cap on displayed columns, then list every column name of df_2.
pd.options.display.max_columns = None
list(df_2.columns)


['stock_id',
 'date_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'far_price',
 'near_price',
 'bid_price',
 'ask_price',
 'ask_size',
 'wap',
 'target',
 'time_id',
 'row_id',
 'spread',
 'mid_price',
 'wap_diff',
 'is_buy_pressure',
 'auction_signal_strength',
 'rolling_avg_imbalance',
 'seconds_to_close',
 'spread_change',
 'wap_velocity',
 'wap_velocity_60s',
 'wap_lag_60s',
 'spread_lag_60s',
 'imbalance_lag_60s',
 'synthetic_index_wap',
 'wap_velocity_lag_60s',
 'spread_change_lag_60s',
 'stock_vs_index_wap_ratio',
 'window_label',
 'wap_diff_sd_max',
 'wap_diff_sd_mean',
 'wap_diff_sd_min',
 'wap_diff_sd_std',
 'auction_signal_strength_sd_max',
 'auction_signal_strength_sd_mean',
 'auction_signal_strength_sd_min',
 'auction_signal_strength_sd_std',
 'stock_vs_index_wap_ratio_sd_max',
 'stock_vs_index_wap_ratio_sd_mean',
 'stock_vs_index_wap_ratio_sd_min',
 'stock_vs_index_wap_ratio_sd_std',
 'near_price_sd_max',
 'near_price_sd_me

In [4]:
# Remove the listed columns from df_2; errors='ignore' skips any name that is
# not present, so re-running this cell is harmless.
drop_columns = ['time_id', 'row_id', 'window_label', 'window_label_order']
df_2 = df_2.drop(labels=drop_columns, axis=1, errors='ignore')


In [5]:
# Count missing values per column and report only the columns that have any.
null_counts = df_2.isna().sum()
null_counts = null_counts.loc[null_counts.gt(0)]
print(null_counts)


far_price    35834
dtype: int64


In [6]:
# Categorical features are listed by hand; every remaining column except the
# target and the date identifier is treated as numeric.
cat_cols = ['stock_id']
numeric_cols = [
    name
    for name in df_2.columns
    if name not in (*cat_cols, 'target', 'date_id')
]

In [7]:
# Full feature list: numeric columns first, then the categorical ones.
all_features = [*numeric_cols, *cat_cols]

# The categorical id is stored as strings.
df_2['stock_id'] = df_2['stock_id'].astype(str)

# Every numeric column is passed through pd.to_numeric (raises on values that
# cannot be parsed as numbers).
for numeric_col in numeric_cols:
    df_2[numeric_col] = pd.to_numeric(df_2[numeric_col])

In [8]:
# Chronological split point: position of the 80% mark in the ascending list of
# distinct date_id values.
unique_days = sorted(pd.unique(df_2['date_id']))
split_index = int(0.8 * len(unique_days))

In [9]:
# Training period: rows whose date_id is among the first `split_index`
# distinct days (ascending).
train_days = sorted(df_2['date_id'].unique())[:split_index]
train_df = (
    df_2.loc[df_2['date_id'].isin(train_days)]
    .copy()
    .reset_index(drop=True)  # row labels become 0..n-1
)

# Target and features. Only 'target' is dropped, so 'date_id' stays in X_train.
y_train = train_df['target']
X_train = train_df.drop(columns=['target'])

# Per-row day identifiers, kept separately to build the date-based CV folds.
train_date_ids = train_df['date_id'].copy()


In [10]:
# Hold-out period: rows whose date_id is among the distinct days from
# position `split_index` onwards (the days not used for training).
test_days = sorted(df_2['date_id'].unique())[split_index:]
test_df = df_2.loc[df_2['date_id'].isin(test_days)].copy()
test_df = test_df.reset_index(drop=True)

# Target and features. Only 'target' is dropped, so 'date_id' stays in X_test.
y_test = test_df['target']
X_test = test_df.drop(columns=['target'])


In [11]:
def time_based_cv_from_date_ids(date_ids, n_splits=4):
    """Yield expanding-window (train, validation) index pairs split on whole days.

    The distinct values of ``date_ids`` are sorted and cut into ``n_splits + 1``
    consecutive chunks of ``len(unique) // (n_splits + 1)`` days each. Fold
    ``i`` trains on the first ``i`` chunks and validates on the chunk that
    follows, so validation days always come after every training day of the
    same fold. Days left over when the number of distinct dates is not
    divisible by ``n_splits + 1`` lie after the last chunk and are not used
    by any fold.

    Parameters
    ----------
    date_ids : pandas.Series
        One date identifier per row, in the same row order as the data that
        will be indexed with the returned folds.
    n_splits : int, default=4
        Number of (train, validation) folds to produce.

    Yields
    ------
    train_idx, test_idx : numpy.ndarray
        Positional (0-based) row indices. Positions are returned instead of
        index labels so the folds stay valid even when ``date_ids`` does not
        carry a default 0..n-1 index.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, or if there are fewer than
        ``n_splits + 1`` distinct dates (which would give empty folds).
        Because this is a generator, the error surfaces on first iteration.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    unique_dates = sorted(date_ids.unique())
    split_size = len(unique_dates) // (n_splits + 1)
    if split_size == 0:
        raise ValueError(
            f"Need at least {n_splits + 1} distinct dates for n_splits={n_splits}, "
            f"got {len(unique_dates)}"
        )

    for i in range(1, n_splits + 1):
        boundary = i * split_size
        train_days = unique_dates[:boundary]
        test_days = unique_dates[boundary:boundary + split_size]

        # Boolean membership mask -> positions of the True entries.
        train_idx = np.flatnonzero(np.asarray(date_ids.isin(train_days)))
        test_idx = np.flatnonzero(np.asarray(date_ids.isin(test_days)))
        yield train_idx, test_idx


In [12]:
# Materialise the folds into a list. The helper is a generator and can be
# consumed only once, so keeping the raw generator in a notebook variable would
# leave any re-run of the search cell with no folds.
time_cv = list(time_based_cv_from_date_ids(train_date_ids, n_splits=4))


In [13]:
# Search space sampled by the randomised hyper-parameter search.
param_dist = {
    'learning_rate': np.linspace(0.01, 0.2, 20),
    'depth': np.arange(4, 10),
    'l2_leaf_reg': np.logspace(-1, 2, 10),
    'iterations': [100, 200, 300]
}

# 20 sampled candidates, each scored by negative MAE on the date-based folds.
# random_state pins which candidates are drawn so a re-run evaluates the same
# 20 parameter sets.
random_cv = RandomizedSearchCV(
    estimator=CatBoostRegressor(verbose=0, random_state=42, task_type="GPU"),
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=time_cv,
    verbose=2,
    n_jobs=1,
    random_state=42
)

# NOTE(review): X_train was built by dropping only 'target', so 'date_id' is
# among the model inputs here - confirm that is intended.
random_cv.fit(X_train, y_train, cat_features=cat_cols)

print("Best Parameters from Random Search:")
print(random_cv.best_params_)

# Estimator refit on all of X_train with the best sampled parameters.
best_model = random_cv.best_estimator_

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] END depth=5, iterations=200, l2_leaf_reg=1.0, learning_rate=0.2; total time=   5.3s
[CV] END depth=5, iterations=200, l2_leaf_reg=1.0, learning_rate=0.2; total time=   6.7s
[CV] END depth=5, iterations=200, l2_leaf_reg=1.0, learning_rate=0.2; total time=   8.9s
[CV] END depth=5, iterations=200, l2_leaf_reg=1.0, learning_rate=0.2; total time=  11.9s
[CV] END depth=7, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04; total time=   5.7s
[CV] END depth=7, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04; total time=   8.0s
[CV] END depth=7, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04; total time=  10.9s
[CV] END depth=7, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04; total time=  15.4s
[CV] END depth=5, iterations=300, l2_leaf_reg=10.0, learning_rate=0.2; total time=   5.8s
[CV] END depth=5, iterations=300, l2_leaf_reg=10.0, learning_rate=0.2; total 

In [19]:
# Score the current best_model on the held-out days.
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f"MAE on Test Set: {mae:.4f}")

MAE on Test Set: 5.2229


In [42]:
import pickle

# Persist the refit random-search winner. The search object is referenced
# directly instead of `best_model`, because a later cell rebinds `best_model`
# to the grid-search result; that way the saved artefact does not depend on
# the order in which cells were executed.
with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(random_cv.best_estimator_, f)

In [20]:
import pandas as pd

# Tabulate the raw random-search results.
cv_results_df = pd.DataFrame(random_cv.cv_results_)

# Keep the parameter dict, the aggregate scores and the per-fold test scores.
columns = ['params', 'mean_test_score', 'std_test_score']
columns += [
    name
    for name in cv_results_df.columns
    if name.startswith('split') and name.endswith('_test_score')
]

# Best candidates first: scores are negative MAE, so the highest score is the
# lowest MAE.
cv_results_df = cv_results_df.loc[:, columns].sort_values('mean_test_score', ascending=False)

# Flip the sign so the numbers read as plain MAE. The per-fold columns (every
# entry of `columns` after the first three) keep their original names.
cv_results_df['mean_MAE'] = -cv_results_df['mean_test_score']
cv_results_df['std_MAE'] = cv_results_df['std_test_score']
cv_results_df[columns[3:]] = -cv_results_df[columns[3:]]

# Show the ten best candidates.
cv_results_df[['params', 'mean_MAE', 'std_MAE'] + columns[3:]].head(10)


Unnamed: 0,params,mean_MAE,std_MAE,split0_test_score,split1_test_score,split2_test_score,split3_test_score
1,"{'learning_rate': 0.04, 'l2_leaf_reg': 4.64158...",5.946793,0.430277,6.50865,6.209062,5.443919,5.625541
4,"{'learning_rate': 0.09999999999999999, 'l2_lea...",5.94903,0.42731,6.509133,6.208749,5.458032,5.620205
19,"{'learning_rate': 0.08, 'l2_leaf_reg': 46.4158...",5.949285,0.428212,6.508904,6.210736,5.452563,5.624939
9,"{'learning_rate': 0.02, 'l2_leaf_reg': 0.21544...",5.949536,0.431182,6.51097,6.214103,5.443455,5.629613
11,"{'learning_rate': 0.09999999999999999, 'l2_lea...",5.953745,0.436594,6.528721,6.213185,5.445906,5.62717
12,"{'learning_rate': 0.060000000000000005, 'l2_le...",5.955098,0.432392,6.51914,6.219286,5.448974,5.632991
3,"{'learning_rate': 0.19, 'l2_leaf_reg': 100.0, ...",5.955117,0.429272,6.516547,6.216567,5.457157,5.630198
5,"{'learning_rate': 0.060000000000000005, 'l2_le...",5.955502,0.434989,6.532593,6.208982,5.454618,5.625817
13,"{'learning_rate': 0.17, 'l2_leaf_reg': 21.5443...",5.956098,0.433153,6.527286,6.21372,5.456848,5.626536
16,"{'learning_rate': 0.01, 'l2_leaf_reg': 0.21544...",5.958002,0.435283,6.52769,6.220989,5.448114,5.635216


In [21]:
# Candidates that seed the grid search.
# Rebuild the results table from the search object (the previous cell changed
# the signs and columns of its own copy).
cv_results_df = pd.DataFrame(random_cv.cv_results_)

# Keep the 3 candidates with the highest mean_test_score (negative MAE, so
# these are the 3 lowest-MAE candidates).
top_params = cv_results_df.sort_values('mean_test_score', ascending=False).iloc[:3]



In [22]:
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split

class CatBoostWithEarlyStop(BaseEstimator, RegressorMixin):
    """CatBoostRegressor wrapper that early-stops on an internal validation tail.

    All keyword arguments are stored in ``self.params`` and forwarded to
    ``CatBoostRegressor`` at fit time. ``get_params`` / ``set_params`` are
    overridden to expose that dict, because the ``**params`` constructor gives
    scikit-learn no named arguments to introspect.

    ``fit`` holds out the last 20% of the rows it receives (no shuffling) as
    the evaluation set for early stopping, so ``iterations`` acts as an upper
    bound on the number of trees.

    Defaults applied unless the caller supplies them in ``params``:
    ``verbose=0``, ``early_stopping_rounds=50``, ``random_seed=42``.
    """

    # Alternative spellings under which a caller may already have configured
    # logging / seeding; the defaults are skipped when any of them is present.
    # NOTE(review): alias lists follow the CatBoost parameter docs - confirm
    # against the installed version.
    _VERBOSITY_KEYS = ("verbose", "verbose_eval", "silent", "logging_level")
    _SEED_KEYS = ("random_seed", "random_state")

    def __init__(self, **params):
        self.params = params
        self.model = None  # set by fit()

    def fit(self, X, y):
        """Fit on the first 80% of rows, early-stopping on the last 20%.

        Returns ``self``.
        """
        # shuffle=False keeps the incoming row order, so the validation part is
        # the tail of X. That is the most recent data only if the rows arrive
        # in chronological order - assumed here, verify against the caller.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )

        # Merge defaults into a copy so self.params is left untouched and a
        # caller-supplied value never collides with a hard-coded keyword.
        model_params = dict(self.params)
        if not any(key in model_params for key in self._VERBOSITY_KEYS):
            model_params["verbose"] = 0
        model_params.setdefault("early_stopping_rounds", 50)
        if not any(key in model_params for key in self._SEED_KEYS):
            model_params["random_seed"] = 42

        self.model = CatBoostRegressor(**model_params)

        self.model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            cat_features=self.params.get("cat_features", [])
        )
        return self

    def predict(self, X):
        """Return predictions of the fitted CatBoost model for ``X``.

        Raises ``NotFittedError`` when called before ``fit``.
        """
        if self.model is None:
            raise NotFittedError(
                "This CatBoostWithEarlyStop instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        return self.model.predict(X)

    def get_params(self, deep=True):
        """Return a shallow copy of the stored parameters (``deep`` is unused)."""
        return self.params.copy()

    def set_params(self, **params):
        """Update the stored parameters in place and return ``self``."""
        self.params.update(params)
        return self


In [23]:
from sklearn.model_selection import GridSearchCV

# Collect the distinct values each hyper-parameter takes among the top
# random-search candidates; the grid is their cross-product.
top_learning_rates = sorted(set(p['learning_rate'] for p in top_params['params']))
top_depths = sorted(set(p['depth'] for p in top_params['params']))
top_l2_regs = sorted(set(p['l2_leaf_reg'] for p in top_params['params']))
top_iterations = sorted(set(p['iterations'] for p in top_params['params']))

grid_params = {
    'learning_rate': top_learning_rates,
    'depth': top_depths,
    'l2_leaf_reg': top_l2_regs,
    'iterations': top_iterations,  # upper bound: the wrapper early-stops
    'cat_features': [cat_cols],  # Important! Needed inside wrapper
    'task_type': ['GPU']
}

# Reuse the date-based splitter defined earlier in the notebook (it was
# previously re-declared twice in this cell). The folds are materialised into
# a list because the splitter is a one-shot generator.
time_cv = list(time_based_cv_from_date_ids(train_date_ids, n_splits=4))

grid_cv = GridSearchCV(
    estimator=CatBoostWithEarlyStop(),
    param_grid=grid_params,
    scoring='neg_mean_absolute_error',
    cv=time_cv,
    verbose=2,
    n_jobs=1
)

grid_cv.fit(X_train, y_train)

# Best model and params
print("Best Parameters from Grid Search:")
print(grid_cv.best_params_)
best_model = grid_cv.best_estimator_

Fitting 4 folds for each of 12 candidates, totalling 48 fits
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04, task_type=GPU; total time=   4.4s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04, task_type=GPU; total time=   5.6s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04, task_type=GPU; total time=   7.4s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.04, task_type=GPU; total time=   9.3s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.08, task_type=GPU; total time=   3.5s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2_leaf_reg=4.641588833612777, learning_rate=0.08, task_type=GPU; total time=   5.8s
[CV] END cat_features=['stock_id'], depth=4, iterations=200, l2

In [24]:
# Score the current best_model on the held-out days.
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f" MAE on Test Set: {mae:.4f}")

 MAE on Test Set: 5.2225


In [25]:
# Display the parameter combination with the best mean CV score in the grid search.
grid_cv.best_params_

{'cat_features': ['stock_id'],
 'depth': 7,
 'iterations': 200,
 'l2_leaf_reg': 46.41588833612778,
 'learning_rate': 0.04,
 'task_type': 'GPU'}

In [27]:
import pickle

# Step 1: Stack the training and hold-out periods. Both frames were re-indexed
# from 0, so ignore_index=True is needed to avoid duplicate row labels.
X_full = pd.concat([X_train, X_test], axis=0, ignore_index=True)
y_full = pd.concat([y_train, y_test], axis=0, ignore_index=True)

# Step 2: Retrain best model on full data.
# A plain CatBoostRegressor is used here (no eval_set is passed), so it trains
# with the grid-search winner's parameters on every row.
final_model = CatBoostRegressor(
    **grid_cv.best_params_,
    verbose=0,
    random_state=42
)
final_model.fit(X_full, y_full, cat_features=cat_cols)

# Step 3: Save to pickle
with open("final_catboost_model.pkl", "wb") as f:
    pickle.dump(final_model, f)