In [None]:
import os
import joblib 

import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import numpy as np 
import gc
from sklearn.ensemble import RandomForestRegressor

from joblib import Parallel, delayed

# import kaggle_evaluation.jane_street_inference_server


In [None]:
def reduce_mem_usage(df, float16_as32=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Every other column (object, category, bool, unsigned integer, datetime,
    pandas extension dtypes) is left exactly as it is.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.
        float16_as32: when a float column's range fits in float16, store it as
            float32 if True (default) or as float16 if False. float16 saves
            memory but keeps far fewer significant digits.

    Returns:
        The same DataFrame object, after downcasting.
    """
    # memory_usage() reports bytes per column; sum them and convert B -> KB -> MB.
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    f16_info = np.finfo(np.float16)
    f32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Dispatch on the NumPy dtype kind rather than on the dtype's name, so
        # bool / unsigned / datetime / extension columns are never pushed
        # through the float branch by accident.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind == 'i':
            # Smallest signed type whose inclusive range holds the column.
            # An empty column has NaN bounds, matches nothing and is left alone.
            for candidate in signed_int_types:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if f16_info.min < c_min and c_max < f16_info.max:
                target = np.float32 if float16_as32 else np.float16
            elif f32_info.min < c_min and c_max < f32_info.max:
                target = np.float32
            else:
                # Includes all-NaN columns and columns containing +/-inf.
                target = np.float64
            df[col] = df[col].astype(target)

    # Memory footprint after the conversion.
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))

    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Read training partitions 6-9 one at a time, shrinking each frame before it
# is kept, then stitch them into a single frame with a fresh 0..n-1 index.
data = []
for i in range(6, 10):
    df = reduce_mem_usage(
        pl.read_parquet(
            f'/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}'
        ).to_pandas(),
        float16_as32=False,
    )
    data.append(df)
df = pd.concat(data, ignore_index=True)
print(df.shape)
# The per-partition frames are no longer needed once concatenated.
del data
gc.collect()


In [None]:

gc.collect()
# Flag to determine if the script is in training mode or not
TRAINING = True

# The 79 model input columns: feature_00 ... feature_78
feature_names = [f"feature_{i:02d}" for i in range(79)]
# Number of validation dates to use
num_valid_dates = 100

# Number of folds for cross-validation
N_fold = 5


# Distinct dates in order of first appearance (pandas `unique` does not sort).
# NOTE(review): the split below is chronological only if `df` rows are ordered
# by date_id — confirm against the loading cell.
dates = df['date_id'].unique()

# Validation dates are the last `num_valid_dates` dates
valid_dates = dates[-num_valid_dates:]

# Training dates are all dates except the last `num_valid_dates` dates
train_dates = dates[:-num_valid_dates]

# Create the directory that stores the trained models. Unlike shelling out to
# `mkdir`, this is portable and does not complain when the cell is re-run.
os.makedirs('models', exist_ok=True)


# If in training mode, prepare validation data
if TRAINING:
    # Build the row mask once and select rows and columns in a single step, so
    # the full feature frame is not copied before filtering.
    valid_mask = df['date_id'].isin(valid_dates)
    X_valid = df.loc[valid_mask, feature_names]
    y_valid = df.loc[valid_mask, 'responder_6']
    w_valid = df.loc[valid_mask, 'weight']
    del valid_mask

# Initialize a list to store trained models
models = []

# Function to train a model or load a pre-trained model
def train(model_dict, model_name='lgb', fold=None):
    """Fit (or load) the model registered under ``model_name`` for one fold.

    Training mode (global ``TRAINING`` is true): the model is fitted on every
    training date whose position in the global ``train_dates`` is not congruent
    to ``fold`` modulo ``N_fold``, with early stopping against the global
    validation split (``X_valid``, ``y_valid``, ``w_valid``). Side effects:

    * appends the validation R^2 to ``model_results.txt``;
    * writes validation predictions to
      ``./models/{model_name}_{fold}_predictions.txt``;
    * dumps the fitted model to ``./models/{model_name}_{fold}.model``.

    Inference mode: the previously dumped model for this fold is loaded and
    appended to the global ``models`` list.

    Args:
        model_dict: mapping from model name to an unfitted estimator.
        model_name: one of ``'lgb'``, ``'xgb'`` or ``'cbt'`` when training.
        fold: fold index. When omitted, the module-level loop variable ``i`` is
            used, which is how existing callers select the fold.

    Returns:
        None.

    Raises:
        KeyError: ``model_name`` is not a key of ``model_dict`` (training mode).
        ValueError: ``model_name`` has no fitting recipe here (training mode).
    """
    if fold is None:
        # Backward compatibility: callers drive the fold through the global `i`.
        fold = i

    if not TRAINING:
        # NOTE: joblib.load unpickles the file; only load model files you trust.
        models.append(joblib.load(f'./models/{model_name}_{fold}.model'))
        return

    # Get the model from the dictionary
    model = model_dict[model_name]
    if model_name not in ('lgb', 'cbt', 'xgb'):
        # Without this guard an unknown name would skip fitting entirely and
        # fail later inside predict() with a less helpful error.
        raise ValueError(f"Unsupported model_name: {model_name!r}")

    # Keep every training date except those belonging to the held-out fold
    selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_fold != fold]

    # Compute the row mask once and reuse it for features, target and weights
    train_mask = df['date_id'].isin(selected_dates)
    X_train = df.loc[train_mask, feature_names]
    y_train = df.loc[train_mask, 'responder_6']
    w_train = df.loc[train_mask, 'weight']
    del train_mask

    if model_name == 'lgb':
        # LightGBM: custom R^2 metric, early stopping, log every 10 rounds
        model.fit(X_train, y_train, w_train,
                  eval_metric=[r2_lgb],
                  eval_set=[(X_valid, y_valid, w_valid)],
                  callbacks=[
                      lgb.early_stopping(100),
                      lgb.log_evaluation(10)
                  ])

    elif model_name == 'cbt':
        # CatBoost takes its evaluation set as a Pool carrying the weights
        evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)

        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[evalset],
                  verbose=10,
                  early_stopping_rounds=100)

    else:  # 'xgb'
        # NOTE(review): XGBoost presumably treats a custom callable metric such
        # as r2_xgb as "lower is better" when early stopping — confirm, and if
        # so switch to an EarlyStopping callback with maximize=True.
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[(X_valid, y_valid)],
                  sample_weight_eval_set=[w_valid],
                  verbose=10,
                  early_stopping_rounds=100)

    predictions = model.predict(X_valid)
    r2 = r2_xgb(y_valid, predictions, w_valid)
    with open('model_results.txt', 'a') as file:
        file.write(f"{model_name} R^2: {r2:.4f}\n")

    # Save predictions to a text file
    predictions_path = f'./models/{model_name}_{fold}_predictions.txt'
    np.savetxt(predictions_path, predictions, fmt='%.6f')

    print(f"Predictions saved for {model_name} to {predictions_path}")
    # Save the trained model to a file
    joblib.dump(model, f'./models/{model_name}_{fold}.model')

    # Release the training slices before the next fold is built
    del X_train
    del y_train
    del w_train
    gc.collect()
    return

# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Weighted R^2 computed against zero rather than against the target mean.

    Computes ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)``
    where ``avg_w`` is the ``sample_weight``-weighted average.

    Args:
        y_true: target values (array-like).
        y_pred: predictions, same length as ``y_true``.
        sample_weight: optional per-row weights; ``None`` means unweighted.

    Returns:
        The score as a float (1.0 is a perfect fit).
    """
    # Promote everything to float64 first. With float16 inputs np.average also
    # accumulates in float16, and the sums overflow to inf (score becomes nan)
    # once they exceed ~65504.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)

    residual = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    # The tiny epsilon keeps the division defined when every target is zero.
    baseline = np.average(y_true ** 2, weights=sample_weight) + 1e-38
    r2 = 1 - residual / baseline
    return r2

# Custom R2 metric for LightGBM
def r2_lgb(y_true, y_pred, sample_weight):
    """LightGBM evaluation callback: weighted R^2 measured against zero.

    Computes ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)``
    where ``avg_w`` is the ``sample_weight``-weighted average.

    Args:
        y_true: target values (array-like).
        y_pred: predictions, same length as ``y_true``.
        sample_weight: per-row weights, or ``None`` for an unweighted score.

    Returns:
        Tuple ``('r2', score, True)``: metric name, value, and a flag saying
        that higher values are better.
    """
    # Promote to float64 so float16 inputs cannot overflow inside np.average
    # (float16 sums become inf above ~65504, which would turn the score to nan).
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)

    residual = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    # The tiny epsilon keeps the division defined when every target is zero.
    baseline = np.average(y_true ** 2, weights=sample_weight) + 1e-38
    r2 = 1 - residual / baseline
    return 'r2', r2, True

# Custom R2 metric for CatBoost
class r2_cbt(object):
    """Custom CatBoost evaluation metric: weighted R^2 measured against zero.

    ``evaluate`` accumulates the weighted squared error and the weighted
    squared target; ``get_final_error`` turns that pair into
    ``1 - error / (weight + 1e-38)``.
    """

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted squared error, weighted squared target)``.

        ``approxes`` must hold exactly one prediction sequence of the same
        length as ``target``; ``weight`` may be ``None`` (every row counts 1.0).
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        preds = approxes[0]
        squared_error_total = 0.0
        squared_target_total = 0.0

        for idx in range(len(preds)):
            row_weight = weight[idx] if weight is not None else 1.0
            squared_target_total += row_weight * (target[idx] ** 2)
            squared_error_total += row_weight * ((preds[idx] - target[idx]) ** 2)

        return squared_error_total, squared_target_total

    def get_final_error(self, error, weight):
        """Combine the two accumulated sums into the final score."""
        return 1 - error / (weight + 1e-38)

# Registry of pre-configured regressors, keyed by the short name passed to
# train(). This is the GPU configuration.
model_dict = {
    'lgb': lgb.LGBMRegressor(
        n_estimators=500,
        device='gpu',
        objective='l2',
    ),
    'xgb': xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=6,
        tree_method='gpu_hist',
        device="cuda",
        objective='reg:squarederror',
        eval_metric=r2_xgb,
        disable_default_eval_metric=True,
    ),
    'cbt': cbt.CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        task_type='GPU',
        loss_function='RMSE',
        eval_metric=r2_cbt(),
    ),
}

# CPU alternative (swap in for the dictionary above when no GPU is available):
# model_dict = {
#     'lgb': lgb.LGBMRegressor(n_estimators=500, device='cpu', gpu_use_dp=True, objective='l2'),
#     'xgb': xgb.XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=6, tree_method='hist',
#                             objective='reg:squarederror', eval_metric=r2_xgb,
#                             disable_default_eval_metric=True),
#     'cbt': cbt.CatBoostRegressor(iterations=500, learning_rate=0.05, task_type='CPU',
#                                  loss_function='RMSE', eval_metric=r2_cbt()),
# }

# Report which device each model was configured for (header text: "Is GPU enabled:").
print("是否啟用 GPU：")
print(f"LightGBM: {'gpu' if 'gpu' in model_dict['lgb'].device else 'cpu'}")
print(f"XGBoost: {'gpu_hist' if model_dict['xgb'].tree_method == 'gpu_hist' else 'hist'}")
print(f"CatBoost: {'GPU' if 'GPU' in model_dict['cbt']._init_params.get('task_type', 'CPU') else 'CPU'}")


In [None]:
# Fit all three libraries on every fold. train() picks the fold up from the
# global loop variable, so that variable has to stay named `i`.
for i in range(N_fold):
    for model_name in ('lgb', 'xgb', 'cbt'):
        train(model_dict, model_name)

# Classification experiments: the cells below predict the sign of responder_6.

In [2]:
#%%
import os
import pandas as pd
import polars as pl
import numpy as np 
# from cuml.ensemble import RandomForestRegressor as cuRF
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import joblib
import kaggle_evaluation.jane_street_inference_server

def reduce_mem_usage(df, float16_as32=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Every other column (object, category, bool, unsigned integer, datetime,
    pandas extension dtypes) is left exactly as it is.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.
        float16_as32: when a float column's range fits in float16, store it as
            float32 if True (default) or as float16 if False. float16 saves
            memory but keeps far fewer significant digits.

    Returns:
        The same DataFrame object, after downcasting.
    """
    # memory_usage() reports bytes per column; sum them and convert B -> KB -> MB.
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    f16_info = np.finfo(np.float16)
    f32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Dispatch on the NumPy dtype kind rather than on the dtype's name, so
        # bool / unsigned / datetime / extension columns are never pushed
        # through the float branch by accident.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind == 'i':
            # Smallest signed type whose inclusive range holds the column.
            # An empty column has NaN bounds, matches nothing and is left alone.
            for candidate in signed_int_types:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if f16_info.min < c_min and c_max < f16_info.max:
                target = np.float32 if float16_as32 else np.float16
            elif f32_info.min < c_min and c_max < f32_info.max:
                target = np.float32
            else:
                # Includes all-NaN columns and columns containing +/-inf.
                target = np.float64
            df[col] = df[col].astype(target)

    # Memory footprint after the conversion.
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))

    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
# Root of the competition data on Kaggle. Every read below goes through this
# variable, so pointing the notebook at another copy means changing one line.
input_path = '/kaggle/input/jane-street-real-time-market-data-forecasting'
# Flag to determine if the script is in training mode or not
TRAINING = True
data = []
for i in [7,8,9]:
    df = pl.read_parquet(f'{input_path}/train.parquet/partition_id={i}')
    df = df.to_pandas()
    # Replace missing values with a -1 sentinel before downcasting.
    df = df.fillna(-1)
    df = reduce_mem_usage(df,float16_as32=False)
    data.append(df)
df = pd.concat(data).reset_index(drop=True)
print(df.shape)
del data


Memory usage of dataframe is 2132.85 MB
Memory usage after optimization is: 1093.61 MB
Decreased by 48.7%
Memory usage of dataframe is 2067.02 MB
Memory usage after optimization is: 1059.86 MB
Decreased by 48.7%
Memory usage of dataframe is 2112.32 MB
Memory usage after optimization is: 1083.09 MB
Decreased by 48.7%
(18750160, 92)


In [3]:
gc.collect()

# The 79 model input columns: feature_00 ... feature_78
feature_names = [f"feature_{i:02d}" for i in range(79)]
# Binary target: 1 when responder_6 is strictly positive, otherwise 0.
# A vectorised comparison replaces a per-row Python lambda and yields the same
# int64 column.
df['up_down'] = (df['responder_6'] > 0).astype(np.int64)

In [13]:
# Output directory for the classifiers. os.makedirs is portable and, with
# exist_ok=True, silent when the cell is run again.
os.makedirs('class_models', exist_ok=True)
def train_models(train_X, train_y, test_X, test_y,categorical_feature,feature_name):
    """Train each enabled classifier and score it on the hold-out split.

    For every entry of the local ``models`` dictionary the model is fitted with
    early stopping against ``(test_X, test_y)``. Its hold-out predictions are
    written to ``./class_models/{name}_predictions.txt`` and the fitted model is
    dumped to ``./class_models/{name}.model``.

    Args:
        train_X, train_y: training features and binary labels.
        test_X, test_y: hold-out features and labels, used both as the
            early-stopping evaluation set and for the reported accuracy.
        categorical_feature: column names passed to CatBoost as categorical
            features (only used by the CatBoost branch).
        feature_name: not used by this function; kept so existing calls work.

    Returns:
        dict mapping model name to hold-out accuracy.
    """
    # The function writes into this directory, so make sure it exists instead
    # of relying on an earlier statement having created it.
    os.makedirs('./class_models', exist_ok=True)

    # Only referenced by the commented-out CatBoost entry below.
    cat_param = {
        'depth': 8,
        'learning_rate': 0.05,
        'iterations': 500,
        'bagging_temperature': 0.5,
        'min_data_in_leaf': 1,
        'task_type': "GPU"
    }
    # NOTE(review): in the recorded run the XGBoost validation logloss rises
    # from the first rounds on; max_depth=20 may be overfitting — confirm.
    models = {
        'XGBoost': XGBClassifier(learning_rate=0.1,n_estimators=500,max_depth=20,min_child_weight = 1,
                                gamma=0.,subsample=0.8,device="cuda",objective='binary:logistic')
        # 'CatBoost': CatBoostClassifier(**cat_param, cat_features=categorical_feature, verbose=100),
        # # 'RandomForest': RandomForestClassifier(),
        # 'LightGBM': lgb.LGBMClassifier(device='gpu',boosting_type='gbdt',num_leaves=65,max_depth=8,
        #                                learning_rate=0.1,n_estimators=500,objective='binary',
        #                                min_child_samples=1,subsample=0.8,subsample_freq=4,
        #                                colsample_bytree=0.8,random_state=1)

    }

    accuracies = {}

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        if model_name == 'CatBoost':
            # CatBoost needs Pools so the categorical columns are declared.
            train_pool = Pool(data=train_X, label=train_y, cat_features=categorical_feature)
            test_pool = Pool(data=test_X, label=test_y, cat_features=categorical_feature)
            model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100, verbose=10)
            predictions = model.predict(test_pool)

        elif model_name == 'LightGBM':
            model.fit(train_X, train_y, eval_set=[(test_X, test_y)],callbacks=[
                        lgb.early_stopping(100),
                        lgb.log_evaluation(10)
                    ])
            predictions = model.predict(test_X)
        else:
            # XGBoost-style fit signature.
            model.fit(train_X, train_y, eval_set=[(test_X, test_y)], early_stopping_rounds=100, verbose=10)
            predictions = model.predict(test_X)
        accuracy = accuracy_score(test_y, predictions)
        accuracies[model_name] = accuracy
        # Save predictions to a text file
        predictions_path = f'./class_models/{model_name}_predictions.txt'
        np.savetxt(predictions_path, predictions, fmt='%.6f')
        print(f"{model_name} Accuracy: {accuracy:.4f}")
        joblib.dump(model, f'./class_models/{model_name}.model')

    return accuracies

# Column lists: the 79 anonymised features on their own, and the same list
# extended with symbol_id, which is what the models are actually fed.
feature_name = [f"feature_{idx:02d}" for idx in range(79)]
categorical_feature = ['symbol_id']
feature_names = [f"feature_{idx:02d}" for idx in range(79)] + ['symbol_id']

# Number of trailing dates held out as the test split
num_test_dates = 100

# Distinct dates present in the frame
dates = df['date_id'].unique()
print(dates)
# The last `num_test_dates` dates form the test split; the rest is training data
test_dates = dates[-num_test_dates:]
train_dates = dates[:-num_test_dates]

train_data = df[df['date_id'].isin(train_dates)]
test_data = df[df['date_id'].isin(test_dates)]
print(len(train_data))
print(len(test_data))

# Split each subset into model inputs and the binary target, dropping the
# intermediate frame as soon as it has been sliced.
train_X, train_y = train_data[feature_names], train_data['up_down']
del train_data
test_X, test_y = test_data[feature_names], test_data['up_down']
del test_data

del train_dates, test_dates, dates
gc.collect()

[1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231
 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245
 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259
 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287
 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301
 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315
 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329
 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343
 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357
 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371
 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385
 1386 

0

In [14]:
# Train the enabled classifiers and keep their hold-out accuracies by name.
results = train_models(
    train_X, train_y,
    test_X, test_y,
    categorical_feature,
    feature_name,
)


Training XGBoost...




[0]	validation_0-logloss:0.69139
[10]	validation_0-logloss:0.69247
[20]	validation_0-logloss:0.69517
[30]	validation_0-logloss:0.69786
[40]	validation_0-logloss:0.70059
[50]	validation_0-logloss:0.70306
[60]	validation_0-logloss:0.70553
[70]	validation_0-logloss:0.70810
[80]	validation_0-logloss:0.71024
[90]	validation_0-logloss:0.71274
[100]	validation_0-logloss:0.71507
[101]	validation_0-logloss:0.71523
XGBoost Accuracy: 0.5277


In [6]:
# One line per model: name followed by its hold-out accuracy.
for model_name in results:
    accuracy_value = results[model_name]
    print(f"{model_name}: {accuracy_value:.4f}")

XGBoost: 0.5277
CatBoost: 0.5422
LightGBM: 0.5419
