In [1]:
import numpy as np
import pandas as pd

In [2]:
def preprocess(df):
    """Clean one raw tick frame and derive the base columns the factors rely on.

    Steps, in order:
      * rename the CSV index column ``'Unnamed: 0'`` to ``'tick'``;
      * forward-fill missing last-trade prices;
      * treat a zero best bid / best ask as missing and forward-fill it;
      * add ``mid`` (average of best bid and ask), the row-to-row ``return``
        of ``mid`` and the row-to-row increment of ``volume``;
      * rename the book columns to ``b1..b5`` / ``a1..a5`` (sizes get a ``_v``
        suffix), ``lastPx`` to ``price``, the original ``volume`` to
        ``volume_sum`` and the increment to ``volume``;
      * add ``amount = price * volume``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lastPx``, ``BP1``, ``SP1`` and ``volume``. The other
        book columns are renamed when present.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified because the first ``rename``
        already returns a new object.
    """
    df = df.rename(columns={'Unnamed: 0': 'tick'})
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    df['lastPx'] = df['lastPx'].ffill()
    df['BP1'] = df['BP1'].replace(0, np.nan).ffill()
    df['SP1'] = df['SP1'].replace(0, np.nan).ffill()
    df['mid'] = (df['BP1'] + df['SP1']) / 2
    df['return'] = (df['mid'] / df['mid'].shift(1) - 1).fillna(0)
    df['diff_v'] = (df['volume'] - df['volume'].shift(1)).fillna(0)
    df = df.rename(columns={'lastPx': 'price', 'BP1':'b1', 'BP2':'b2', 'BP3':'b3', 'BP4':'b4', 'BP5':'b5', 
                            'SP1':'a1', 'SP2':'a2', 'SP3':'a3', 'SP4':'a4', 'SP5':'a5',
                            'BV1':'b1_v', 'BV2':'b2_v', 'BV3':'b3_v', 'BV4':'b4_v', 'BV5':'b5_v',
                            'SV1':'a1_v', 'SV2':'a2_v', 'SV3':'a3_v', 'SV4':'a4_v', 'SV5':'a5_v',
                            'volume': 'volume_sum', 'diff_v': 'volume'})
    df['amount'] = df['price'] * df['volume']
    # NOTE(review): zeros were already replaced by NaN above and NaN != 0 is
    # True, so these two filters do not remove any row. Leading rows without a
    # valid quote therefore survive with NaN in b1/a1/mid. Kept as-is so the
    # row count seen by callers does not change.
    df = df[df['b1']!=0]
    df = df[df['a1']!=0]
    return df

In [3]:
### HF factors
def get_realvar(df, lookback_len, lookback_shift):
    """Rolling sample variance of ``return``.

    The series is lagged by ``lookback_shift`` rows, then the variance is
    taken over windows of ``lookback_len`` rows. Undefined values (warm-up
    rows, single-row windows) are returned as 0.
    """
    lagged_returns = df['return'].shift(lookback_shift)
    window = lagged_returns.rolling(lookback_len)
    return window.var().fillna(0)

def get_realskew(df, lookback_len, lookback_shift):
    """Rolling skewness of ``return``.

    The series is lagged by ``lookback_shift`` rows and the skewness is taken
    over windows of ``lookback_len`` rows. Undefined values are returned as 0.
    """
    lagged_returns = df['return'].shift(lookback_shift)
    window = lagged_returns.rolling(lookback_len)
    return window.skew().fillna(0)

def get_realkurtosis(df, lookback_len, lookback_shift):
    """Rolling kurtosis of ``return`` as computed by pandas' rolling ``kurt``.

    The series is lagged by ``lookback_shift`` rows and the kurtosis is taken
    over windows of ``lookback_len`` rows. Undefined values are returned as 0.
    """
    lagged_returns = df['return'].shift(lookback_shift)
    window = lagged_returns.rolling(lookback_len)
    return window.kurt().fillna(0)

def get_realupvar(df, lookback_len, lookback_shift):
    """Rolling variance of the positive part of ``return``.

    Non-positive (and missing) returns are replaced by 0 before the variance
    is taken over ``lookback_len`` rows of the series lagged by
    ``lookback_shift`` rows. Undefined values are returned as 0.

    Side effect: writes (or overwrites) the column ``return_up`` on ``df``.
    """
    returns = df['return']
    df['return_up'] = returns.where(returns > 0).fillna(0)
    lagged_up = df['return_up'].shift(lookback_shift)
    return lagged_up.rolling(lookback_len).var().fillna(0)

def get_realdownvar(df, lookback_len, lookback_shift):
    """Rolling variance of the negative part of ``return``.

    Non-negative (and missing) returns are replaced by 0 before the variance
    is taken over ``lookback_len`` rows of the series lagged by
    ``lookback_shift`` rows. Undefined values are returned as 0.

    Side effect: writes (or overwrites) the column ``return_down`` on ``df``.
    """
    returns = df['return']
    df['return_down'] = returns.where(returns < 0).fillna(0)
    lagged_down = df['return_down'].shift(lookback_shift)
    return lagged_down.rolling(lookback_len).var().fillna(0)

def get_ratio_upvar(df, lookback_len, lookback_shift):
    """Ratio of the upside rolling variance to the total rolling variance.

    Both terms come from ``get_realupvar`` and ``get_realvar`` with the same
    window arguments. Rows where the total variance is 0 yield NaN or inf;
    they are not cleaned here.
    """
    upside_var = get_realupvar(df, lookback_len, lookback_shift)
    total_var = get_realvar(df, lookback_len, lookback_shift)
    return upside_var / total_var

def get_ratio_downvar(df, lookback_len, lookback_shift):
    """Ratio of the downside rolling variance to the total rolling variance.

    Both terms come from ``get_realdownvar`` and ``get_realvar`` with the same
    window arguments. Rows where the total variance is 0 yield NaN or inf;
    they are not cleaned here.
    """
    downside_var = get_realdownvar(df, lookback_len, lookback_shift)
    total_var = get_realvar(df, lookback_len, lookback_shift)
    return downside_var / total_var

def get_trendratio(df, lookback_len, lookback_shift):
    """Net price move over the window divided by the summed absolute moves.

    The numerator is ``price`` minus ``price`` ``lookback_len`` rows earlier;
    the denominator is the rolling sum of absolute one-row price changes.
    Both are lagged by ``lookback_shift`` rows. ``+inf`` and NaN results are
    returned as 0.
    """
    price = df['price']
    step_sizes = price.diff().abs().fillna(0)
    path_length = step_sizes.shift(lookback_shift).rolling(lookback_len).sum().fillna(0)
    net_move = (price - price.shift(lookback_len)).shift(lookback_shift)
    ratio = net_move / path_length
    return ratio.replace(np.inf, 0).fillna(0)

def get_windowreturn(df, lookback_len, lookback_shift):
    """Compounded return over the window.

    Log returns ``log(return + 1)`` are lagged by ``lookback_shift`` rows,
    summed over ``lookback_len`` rows and converted back with ``exp(.) - 1``.
    Warm-up rows are left as NaN (no fill is applied here).
    """
    log_returns = np.log(df['return'] + 1)
    summed = log_returns.shift(lookback_shift).rolling(lookback_len).sum()
    return np.exp(summed) - 1

def get_minreturn(df, lookback_len, lookback_shift):
    """Smallest ``return`` within the window.

    The series is lagged by ``lookback_shift`` rows and the minimum is taken
    over ``lookback_len`` rows. Warm-up rows are returned as 0.
    """
    lagged_returns = df['return'].shift(lookback_shift)
    window = lagged_returns.rolling(lookback_len)
    return window.min().fillna(0)

def calculate_mdd(series):
    """Return the maximum drawdown of ``series``.

    The drawdown at each position is the relative distance between the
    running maximum so far and the current value; the largest one is returned.
    """
    running_peak = np.maximum.accumulate(series)
    relative_drop = (running_peak - series) / running_peak
    worst = np.max(relative_drop)
    return worst

def get_mdd(df, lookback_len, lookback_shift):
    """Rolling maximum drawdown of ``price``.

    ``price`` is lagged by ``lookback_shift`` rows and ``calculate_mdd`` is
    applied to each window of ``lookback_len`` rows (as a raw ndarray).
    Warm-up rows are left as NaN (no fill is applied here).
    """
    lagged_price = df['price'].shift(lookback_shift)
    window = lagged_price.rolling(lookback_len)
    return window.apply(calculate_mdd, raw=True)

def get_corrVP_price(df, lookback_len, lookback_shift):
    """Rolling correlation between ``price`` and ``volume``.

    Both series are lagged by ``lookback_shift`` rows; the correlation is
    taken over ``lookback_len`` rows. Undefined values are returned as 0.
    """
    lagged_price = df['price'].shift(lookback_shift)
    lagged_volume = df['volume'].shift(lookback_shift)
    rolling_corr = lagged_price.rolling(lookback_len).corr(lagged_volume)
    return rolling_corr.fillna(0)

def get_corrVP_mid(df, lookback_len, lookback_shift):
    """Rolling correlation between ``mid`` and ``volume``.

    Both series are lagged by ``lookback_shift`` rows; the correlation is
    taken over ``lookback_len`` rows. Undefined values are returned as 0.
    """
    lagged_mid = df['mid'].shift(lookback_shift)
    lagged_volume = df['volume'].shift(lookback_shift)
    rolling_corr = lagged_mid.rolling(lookback_len).corr(lagged_volume)
    return rolling_corr.fillna(0)

def get_corrVR(df, lookback_len, lookback_shift):
    """Rolling correlation between ``return`` and ``volume``.

    Both series are lagged by ``lookback_shift`` rows; the correlation is
    taken over ``lookback_len`` rows. Undefined values are returned as 0.
    """
    lagged_return = df['return'].shift(lookback_shift)
    lagged_volume = df['volume'].shift(lookback_shift)
    rolling_corr = lagged_return.rolling(lookback_len).corr(lagged_volume)
    return rolling_corr.fillna(0)

def get_Amihud(df, lookback_len, lookback_shift):
    """Amihud-style illiquidity: average absolute return per unit of traded amount.

    The rolling sum of ``|return|`` over ``lookback_len`` rows (after lagging
    by ``lookback_shift`` rows) is divided by ``lookback_len`` and by the
    lagged ``amount`` of the current row. NaN results are returned as 0.

    The previous version summed ``|return.diff()|`` (the change in return)
    even though the variable was named ``abs_return``; the measure is defined
    on absolute returns, so the stray ``diff`` has been removed.

    Rows whose ``amount`` is 0 produce ``inf`` (or NaN when the numerator is
    also 0); ``inf`` is not replaced here.
    """
    abs_return = df['return'].abs().fillna(0)
    sum_abs_return = abs_return.shift(lookback_shift).rolling(lookback_len).sum()
    lagged_amount = df['amount'].shift(lookback_shift)
    return (1 / (lookback_len) * sum_abs_return / lagged_amount).fillna(0)

def get_BAspread(df, lookback_len, lookback_shift):
    """Rolling mean of a depth-weighted bid/ask imbalance.

    For each side, price times size is summed over the five book levels with
    weights 1, 0.8, 0.6, 0.4 and 0.2. The imbalance is
    ``(bid - ask) / (bid + ask)``; it is lagged by ``lookback_shift`` rows and
    averaged over ``lookback_len`` rows. Undefined values are returned as 0.

    Side effect: writes (or overwrites) the column ``spread`` on ``df``.
    """
    level_weights = (1, 0.8, 0.6, 0.4, 0.2)
    # Terms are added level 1 first, left to right, as in a written-out sum.
    bid_value = sum(
        weight * df[f'b{level}'] * df[f'b{level}_v']
        for level, weight in enumerate(level_weights, start=1)
    )
    ask_value = sum(
        weight * df[f'a{level}'] * df[f'a{level}_v']
        for level, weight in enumerate(level_weights, start=1)
    )
    df["spread"] = (bid_value - ask_value) / (bid_value + ask_value)
    lagged = df["spread"].shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)

def delta_V_A(a1, a1_v):
    """Ask-side volume change between the first and last element.

    ``a1`` holds best-ask prices and ``a1_v`` the matching sizes (ndarrays or
    other indexable sequences). Returns the last size if the price fell, the
    size difference if the price is unchanged, and 0 otherwise (including
    when the price difference is NaN).
    """
    price_move = a1[-1] - a1[0]
    if price_move < 0:
        return a1_v[-1]
    if price_move == 0:
        return a1_v[-1] - a1_v[0]
    return 0

def delta_V_B(b1, b1_v):
    """Bid-side volume change between the first and last element.

    ``b1`` holds best-bid prices and ``b1_v`` the matching sizes (ndarrays or
    other indexable sequences). Returns 0 if the price fell, the size
    difference if the price is unchanged, and the last size otherwise
    (including when the price difference is NaN).
    """
    price_move = b1[-1] - b1[0]
    if price_move < 0:
        return 0
    if price_move == 0:
        return b1_v[-1] - b1_v[0]
    return b1_v[-1]
    
def get_VOI(df, lookback_len, lookback_shift):
    """Z-scored volume order imbalance (VOI) at the best quotes.

    For every row after the first, the ask-side and bid-side volume changes
    are computed against the previous row with the same rules as
    ``delta_V_A`` / ``delta_V_B``:

      * ask: last size if the ask fell, size change if unchanged, else 0;
      * bid: 0 if the bid fell, size change if unchanged, else last size.

    The raw imbalance is ``bid change - ask change`` (NaN -> 0). The result is
    the current raw value minus the rolling mean, divided by the rolling
    standard deviation, both taken over ``lookback_len`` rows of the raw
    series lagged by ``lookback_shift`` rows. ``+/-inf`` and NaN become 0.

    Fixes relative to the previous version:
      * the bid loop called ``delta_V_A`` and wrote into ``delta_Va``, so the
        bid term was always 0 and the ask term was overwritten;
      * the intermediate ``delta_Va`` / ``delta_Vb`` columns were dropped only
        from a local copy and therefore leaked into the caller's frame; they
        are now plain local arrays.

    Side effect: writes (or overwrites) the column ``ori_VOI`` on ``df``,
    as before.
    """
    n_rows = len(df)
    ask_price = df['a1'].to_numpy(dtype=float)
    ask_size = df['a1_v'].to_numpy(dtype=float)
    bid_price = df['b1'].to_numpy(dtype=float)
    bid_size = df['b1_v'].to_numpy(dtype=float)

    # Row 0 has no predecessor, so both deltas stay 0 there.
    delta_Va = np.zeros(n_rows)
    delta_Vb = np.zeros(n_rows)
    if n_rows > 1:
        ask_move = np.diff(ask_price)
        bid_move = np.diff(bid_price)
        # Branch order mirrors delta_V_A / delta_V_B so NaN moves fall into
        # the same "else" branch as in the scalar helpers.
        delta_Va[1:] = np.where(
            ask_move < 0, ask_size[1:],
            np.where(ask_move == 0, np.diff(ask_size), 0.0))
        delta_Vb[1:] = np.where(
            bid_move < 0, 0.0,
            np.where(bid_move == 0, np.diff(bid_size), bid_size[1:]))

    df['ori_VOI'] = pd.Series(delta_Vb - delta_Va, index=df.index).fillna(0)

    lagged = df['ori_VOI'].shift(lookback_shift)
    mean = lagged.rolling(lookback_len).mean()
    std = lagged.rolling(lookback_len).std()
    return ((df['ori_VOI'] - mean)/std).replace([np.inf, -np.inf], 0).fillna(0)

def get_BAspread_1_mean(df, lookback_len, lookback_shift):
    """Rolling mean of ``b1 - a1`` (level-1 bid minus ask), lagged by ``lookback_shift`` rows; NaN -> 0."""
    gap = df['b1'] - df['a1']
    lagged_gap = gap.shift(lookback_shift)
    return lagged_gap.rolling(lookback_len).mean().fillna(0)
def get_BAspread_2_mean(df, lookback_len, lookback_shift):
    """Rolling mean of ``b2 - a2`` (level-2 bid minus ask), lagged by ``lookback_shift`` rows; NaN -> 0."""
    gap = df['b2'] - df['a2']
    lagged_gap = gap.shift(lookback_shift)
    return lagged_gap.rolling(lookback_len).mean().fillna(0)
def get_BAspread_3_mean(df, lookback_len, lookback_shift):
    """Rolling mean of ``b3 - a3`` (level-3 bid minus ask), lagged by ``lookback_shift`` rows; NaN -> 0."""
    gap = df['b3'] - df['a3']
    lagged_gap = gap.shift(lookback_shift)
    return lagged_gap.rolling(lookback_len).mean().fillna(0)
def get_BAspread_4_mean(df, lookback_len, lookback_shift):
    """Rolling mean of ``b4 - a4`` (level-4 bid minus ask), lagged by ``lookback_shift`` rows; NaN -> 0."""
    gap = df['b4'] - df['a4']
    lagged_gap = gap.shift(lookback_shift)
    return lagged_gap.rolling(lookback_len).mean().fillna(0)
def get_BAspread_5_mean(df, lookback_len, lookback_shift):
    """Rolling mean of ``b5 - a5`` (level-5 bid minus ask), lagged by ``lookback_shift`` rows; NaN -> 0."""
    gap = df['b5'] - df['a5']
    lagged_gap = gap.shift(lookback_shift)
    return lagged_gap.rolling(lookback_len).mean().fillna(0)
def get_ap_sum_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the average ask price across levels 1-5, lagged by ``lookback_shift`` rows; NaN -> 0."""
    level_total = df['a1'] + df['a2'] + df['a3'] + df['a4'] + df['a5']
    level_average = 1/5 * level_total
    lagged = level_average.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_bp_sum_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the average bid price across levels 1-5, lagged by ``lookback_shift`` rows; NaN -> 0."""
    level_total = df['b1'] + df['b2'] + df['b3'] + df['b4'] + df['b5']
    level_average = 1/5 * level_total
    lagged = level_average.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_av_sum_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the average ask size across levels 1-5, lagged by ``lookback_shift`` rows; NaN -> 0."""
    level_total = df['a1_v'] + df['a2_v'] + df['a3_v'] + df['a4_v'] + df['a5_v']
    level_average = 1/5 * level_total
    lagged = level_average.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_bv_sum_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the average bid size across levels 1-5, lagged by ``lookback_shift`` rows; NaN -> 0."""
    level_total = df['b1_v'] + df['b2_v'] + df['b3_v'] + df['b4_v'] + df['b5_v']
    level_average = 1/5 * level_total
    lagged = level_average.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_av_1_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``a1_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['a1_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_av_2_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``a2_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['a2_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_av_3_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``a3_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['a3_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_av_4_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``a4_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['a4_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_av_5_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``a5_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['a5_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_1_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``b1_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['b1_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_2_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``b2_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['b2_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_3_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``b3_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['b3_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_4_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``b4_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['b4_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_5_mean(df, lookback_len, lookback_shift):
    """Rolling mean of the two-row change in ``b5_v``, lagged by ``lookback_shift`` rows; NaN -> 0."""
    size = df['b5_v']
    two_row_change = size - size.shift(2)
    lagged = two_row_change.shift(lookback_shift)
    return lagged.rolling(lookback_len).mean().fillna(0)
def get_depth_price_range(df, lookback_len, lookback_shift):
    """Relative range of ``a1`` within the window: rolling max / rolling min - 1.

    ``a1`` is lagged by ``lookback_shift`` rows and the window spans
    ``lookback_len`` rows. NaN results are returned as 0.
    """
    lagged_ask = df['a1'].shift(lookback_shift)
    window_high = lagged_ask.rolling(lookback_len).max()
    window_low = lagged_ask.rolling(lookback_len).min()
    return (window_high / window_low - 1).fillna(0)

import numba as nb

@nb.jit(nopython=True)
def age(prices):
    """Count how many consecutive values directly before the last one equal it.

    Walks backwards from the second-to-last element and stops at the first
    value that differs from ``prices[-1]``. For a window of length ``n`` the
    result lies in ``0 .. n-1``.

    The previous loop used ``range(2, len(prices))``, which never inspected
    ``prices[0]``: a window of two always returned 0 and a fully constant
    window returned ``n-2`` instead of ``n-1``.
    """
    last_value = prices[-1]
    streak = 0
    # Offsets 2..len(prices) address prices[-2] down to prices[0] inclusive.
    for back in range(2, len(prices) + 1):
        if prices[-back] != last_value:
            break
        streak += 1
    return streak

def get_BAage(df, lookback_len, lookback_shift):
    """Rolling ``age`` of the best bid ``b1``.

    ``b1`` is lagged by ``lookback_shift`` rows and ``age`` is evaluated on
    each window of ``lookback_len`` rows through pandas' numba engine.
    Warm-up rows are returned as 0.
    """
    lagged_bid = df['b1'].shift(lookback_shift)
    window = lagged_bid.rolling(lookback_len)
    return window.apply(age, engine='numba', raw=True).fillna(0)

def get_cofi(df, lookback_len, lookback_shift):
    """Rolling mean of the level-1 order flow imbalance (OFI).

    Per row, with ``d`` the one-row price change::

        e =  b1_v       * 1[d(b1) >= 0]
           - b1_v[t-1]  * 1[d(b1) <= 0]
           - a1_v       * 1[d(a1) <= 0]
           + a1_v[t-1]  * 1[d(a1) >= 0]

    ``e`` is lagged by ``lookback_shift`` rows and averaged over
    ``lookback_len`` rows; NaN results are returned as 0.

    The previous version reused the bid-side inequalities on the ask side
    (``>=`` for the current ask size, ``<=`` for the previous one), which
    flips the sign of the ask contribution whenever the ask price moves.
    """
    bid_move = df['b1'].diff()
    ask_move = df['a1'].diff()
    bid_now = df['b1_v'] * np.where(bid_move >= 0, 1, 0)
    bid_prev = df['b1_v'].shift() * np.where(bid_move <= 0, 1, 0)
    # Ask side mirrors the bid side: a falling/unchanged ask counts the
    # current ask size, a rising/unchanged ask counts the previous one.
    ask_now = df['a1_v'] * np.where(ask_move <= 0, 1, 0)
    ask_prev = df['a1_v'].shift() * np.where(ask_move >= 0, 1, 0)
    imbalance = bid_now - bid_prev - ask_now + ask_prev
    return imbalance.shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

def get_bp_rank(df, lookback_len, lookback_shift):
    """Rolling rank of the best bid ``b1`` rescaled as ``rank / lookback_len * 2 - 1``.

    ``b1`` is lagged by ``lookback_shift`` rows; the rank is that of the
    newest value within its window of ``lookback_len`` rows. Warm-up rows are
    returned as 0.
    """
    lagged_bid = df['b1'].shift(lookback_shift)
    window_rank = lagged_bid.rolling(lookback_len).rank()
    rescaled = window_rank / lookback_len * 2 - 1
    return rescaled.fillna(0)

def get_ap_rank(df, lookback_len, lookback_shift):
    """Rolling rank of the best ask ``a1`` rescaled as ``rank / lookback_len * 2 - 1``.

    ``a1`` is lagged by ``lookback_shift`` rows; the rank is that of the
    newest value within its window of ``lookback_len`` rows. Warm-up rows are
    returned as 0.
    """
    lagged_ask = df['a1'].shift(lookback_shift)
    window_rank = lagged_ask.rolling(lookback_len).rank()
    rescaled = window_rank / lookback_len * 2 - 1
    return rescaled.fillna(0)

@nb.jit(nopython=True)
def first_location_of_maximum(x):
    """Return the 1-based position of the first element equal to ``max(x)``.

    Falls through without an explicit return when no element compares equal
    to the maximum.
    """
    peak = max(x)
    for position, value in enumerate(x):
        if value == peak:
            return position + 1
        
def get_price_idxmax(df, lookback_len, lookback_shift):
    """Rolling 1-based position of the first maximum of ``a1`` inside the window.

    ``a1`` is lagged by ``lookback_shift`` rows and
    ``first_location_of_maximum`` is evaluated on each window of
    ``lookback_len`` rows through pandas' numba engine. NaN -> 0.
    """
    lagged_ask = df['a1'].shift(lookback_shift)
    window = lagged_ask.rolling(lookback_len)
    return window.apply(first_location_of_maximum, engine='numba', raw=True).fillna(0)

@nb.jit(nopython=True)
def mean_second_derivative_centra(x):
    """Average of a second-difference style term over the window.

    Sums ``(x[i+5] - 2*x[i+3] + x[i]) / 2`` for ``i`` in
    ``range(len(x) - 5)`` and divides the total by ``2 * (len(x) - 5)``.
    For ``len(x) < 5`` the loop body never runs and the divisor is negative;
    ``len(x) == 5`` makes the divisor 0.
    """
    term_count = len(x) - 5
    total = 0
    for start in range(term_count):
        total += (x[start+5]-2*x[start+3]+x[start])/2
    return total/(2*term_count)

def get_center_deri_two(df, lookback_len, lookback_shift):
    """Rolling ``mean_second_derivative_centra`` of the best ask ``a1``.

    ``a1`` is lagged by ``lookback_shift`` rows and the helper is evaluated
    on each window of ``lookback_len`` rows through pandas' numba engine.
    NaN -> 0.
    """
    lagged_ask = df['a1'].shift(lookback_shift)
    window = lagged_ask.rolling(lookback_len)
    return window.apply(mean_second_derivative_centra, engine='numba', raw=True).fillna(0)

def get_quasi(df, lookback_len, lookback_shift):
    """Rolling sum of absolute one-row changes in the best ask ``a1``.

    The absolute changes are lagged by ``lookback_shift`` rows and summed
    over ``lookback_len`` rows. Warm-up rows are returned as 0.
    """
    abs_step = df['a1'].diff(1).abs()
    lagged_step = abs_step.shift(lookback_shift)
    return lagged_step.rolling(lookback_len).sum().fillna(0)

def get_weighted_price_to_mid(df, lookback_len, lookback_shift):
    """Rolling mean of (size-weighted book price - ``mid``).

    The size-weighted price uses all five ask and bid levels: the sum of
    price times size divided by the sum of sizes. The difference to ``mid``
    is lagged by ``lookback_shift`` rows and averaged over ``lookback_len``
    rows. NaN results are returned as 0.
    """
    levels = range(1, 6)
    ask_sizes = df[[f'a{k}_v' for k in levels]].values
    bid_sizes = df[[f'b{k}_v' for k in levels]].values
    ask_prices = df[[f'a{k}' for k in levels]].values
    bid_prices = df[[f'b{k}' for k in levels]].values

    traded_value = (ask_sizes * ask_prices + bid_sizes * bid_prices).sum(axis=1)
    total_size = (ask_sizes + bid_sizes).sum(axis=1)
    distance_to_mid = traded_value / total_size - df['mid']
    return distance_to_mid.shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

In [4]:
# Window lengths (in rows) for which every factor is evaluated, and the common lag.
lookback_len = [1, 2, 4, 8, 16, 32, 64, 128, 256]
lookback_shift = 0

# Registry: '<function name>_<shift>_<length>' -> (function, length, shift).
# Every callable global whose name starts with 'get_' is registered once per
# window length; lengths form the outer loop so entries are grouped by length.
functions = {
    f'{func_name}_{lookback_shift}_{length}': (func, length, lookback_shift)
    for length in lookback_len
    for func_name, func in globals().items()
    if callable(func) and func_name.startswith('get_')
}


print('numbers of factors:', len(functions))

numbers of factors: 414


In [5]:
### train
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression, Ridge
import lightgbm as lgb
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import joblib
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.simplefilter('ignore')

# Rolling-window LightGBM training.
# For every stock and every monthly CSV that exists: preprocess each trading
# day, resample to 1-second bars, build the 30-second-ahead mid return label,
# compute every registered factor, then train on `window_size` days and
# evaluate on the next day, sliding forward one day at a time.
stock_list = ['0050', '2330', '2603']
year_list = ['2020', '2021', '2022', '2023']
month_list = [str(i).zfill(2) for i in range(1, 13)]
for stock in stock_list:
    # One column per test date: top-10 feature names followed by that day's R2 and IC.
    df_stock = pd.DataFrame()
    # Most recent model that was actually fitted for this stock; stays None
    # when no month provided enough days, so nothing bogus is saved below.
    fitted_model = None
    for year in year_list:
        for month in month_list:
            file_name = f'{stock}_md_{year}{month}_{year}{month}.csv'
            data_address = f'/Users/ianzou/Desktop/Cornorstone/MAFM_6100/project_code/{stock}/'
            try:
                df = pd.read_csv(f'{data_address}{file_name}')
                print(f'I found {stock} in {year}.{month}. Now you can open it.')
            except FileNotFoundError:
                # Only a missing monthly file is expected; other read errors should surface.
                print(f'{stock}: data in {year}.{month} does not exist.')
                continue

            # Preprocess day by day so forward-fills and diffs never cross days.
            date_list = df['date'].unique().tolist()
            df_prc = pd.concat([preprocess(df[df['date'] == date]) for date in date_list])

            df = df_prc.copy()
            date_list = df['date'].unique().tolist()
            daily_frames = []
            for date in date_list:
                start_time = pd.to_datetime(f'{date} 09:00:00')
                # Work on a copy so the column assignment below never targets a view of df.
                df_tmp = df[df['date'] == date].copy()
                # NOTE(review): 90000000 looks like 09:00:00.000 in HHMMSSmmm form,
                # yet the difference is interpreted as plain milliseconds -- confirm
                # the encoding of the 'time' column.
                df_tmp['time'] = pd.to_timedelta(df_tmp['time']-90000000, unit='ms') + start_time
                # Use the timestamp as index; 'date' is rebuilt after resampling.
                df_tmp = df_tmp.set_index('time').drop(columns=['date'])
                # Downsample to one row per second (mean of the ticks in each second).
                df_tmp = df_tmp.resample('1s').mean()
                df_tmp['date'] = df_tmp.index.date.astype(str)
                df_tmp['time'] = df_tmp.index.time.astype(str)
                df_tmp = df_tmp.reset_index(drop=True)
                # Label: relative change of mid 30 rows (seconds) ahead, within the same day.
                df_tmp['label'] = (df_tmp['mid'].shift(-30) / df_tmp['mid'] - 1).fillna(0)
                daily_frames.append(df_tmp)

            df_resampled = pd.concat(daily_frames).fillna(0).reset_index(drop=True)

            # Append one column per registered factor. The frame is rebuilt on
            # every iteration on purpose: several factor functions write helper
            # columns into the frame they receive, and the resulting column
            # order is part of the feature layout the model is trained on.
            for name, (func, *args) in tqdm(functions.items()):
                if 'get_ipython' in name:
                    continue
                result = func(df_resampled, *args)
                var_name = name.replace('get_', '')
                df_resampled = pd.concat([df_resampled, result.rename(var_name)], axis=1)

            # Number of training days in each rolling window.
            window_size = 5
            date_list = df_resampled['date'].unique().tolist()

            # LightGBM hyper-parameters.
            params = {
                'objective': 'regression',
                'metric': 'mse',
                'learning_rate': 0.001,
                'num_leaves': 31,
                'max_depth': -1,
                'n_estimators': 1000,
                'random_state': 42,
                'n_jobs': -1
            }

            model = lgb.LGBMRegressor(boosting_type='gbdt', **params)

            # Per-test-day evaluation metrics.
            r2_list = []
            ic_list = []

            # feature name -> list of importances, one entry per fitted window.
            feature_importances = {}

            for i in tqdm(range(window_size, len(date_list))):
                # Training set: the `window_size` days preceding the test day.
                X_train = df_resampled.loc[(df_resampled['date'] < date_list[i]) & (df_resampled['date'] >= date_list[i-window_size]), :]
                X_train = X_train.drop(['tick', 'date', 'time'], axis=1)

                y_train = X_train['label']
                X_train = X_train.drop(['label'], axis=1).replace([np.inf, -np.inf], 0).fillna(0)

                # Test set: the single day following the training window.
                X_test = df_resampled.loc[(df_resampled['date'] == date_list[i]), :]
                X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

                y_test = X_test['label']
                X_test = X_test.drop(['label'], axis=1).replace([np.inf, -np.inf], 0).fillna(0)

                feature_cols = X_train.columns

                model.fit(X_train, y_train)
                fitted_model = model

                y_pred = model.predict(X_test)

                # Out-of-sample R2 and information coefficient (Pearson correlation).
                r2 = r2_score(y_test, y_pred)
                corr = np.corrcoef(y_test, y_pred)[0, 1]

                r2_list.append(r2)
                ic_list.append(corr)

                # LGBMRegressor exposes `feature_importances_`; it has no
                # `coef_` attribute (that belongs to linear models).
                for feature, importance in zip(feature_cols, model.feature_importances_):
                    feature_importances.setdefault(feature, []).append(importance)

                # Rank features by their mean importance over the windows fitted so far.
                dfFeatImp = pd.DataFrame({
                    'feature': list(feature_importances),
                    'importance': [np.mean(values) for values in feature_importances.values()],
                })
                dfFeatImp = dfFeatImp.sort_values('importance', ascending = False).reset_index(drop=True)

                df_stock[f'{date_list[i]}'] = list(dfFeatImp['feature'].iloc[:10].reset_index(drop=True))+[r2, corr]
                # Checkpoint after every test day so partial results survive an interruption.
                df_stock.transpose().to_csv(f'./{stock}_lasso_resampled_tmp.csv')

                print(f'Total training number: {len(date_list)-window_size}, completed: {i-window_size+1}, IC: {corr}')

            # Average metrics over all test days of this month.
            r2_mean = np.mean(r2_list)
            ic_mean = np.mean(ic_list)

            print('平均 R2 值:', r2_mean)
            print('平均 IC 值:', ic_mean)

    df_stock.transpose().to_csv(f'./{stock}_lgbm_resampled.csv')
    if fitted_model is not None:
        joblib.dump(fitted_model, f'loan_{stock}_lgbm.pkl')
    else:
        print(f'{stock}: no model was fitted, nothing saved.')

0050: data in 2020.01 does not exist.
0050: data in 2020.02 does not exist.
0050: data in 2020.03 does not exist.
0050: data in 2020.04 does not exist.
I found 0050 in 2020.05. Now you can open it.


100%|██████████| 414/414 [04:34<00:00,  1.51it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
### predict
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import warnings
import joblib

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.simplefilter('ignore')

# Out-of-sample prediction.
# For every stock and every selected month: rebuild the same 1-second
# feature frame that training uses, load the saved model and store the
# predictions next to the realised labels.
stock_list = ['2603', '2330', '0050']
year_list = ['2023']
month_list = ['09']

for stock in stock_list:
    # One frame of (date, y_pred, y_true) per month that was found.
    monthly_results = []
    for year in year_list:
        for month in month_list:
            file_name = f'{stock}_md_{year}{month}_{year}{month}.csv'
            data_address = 'autodl-tmp/chou/'
            try:
                df = pd.read_csv(f'{data_address}{file_name}')
                print(f'I found {stock} in {year}.{month}. Now you can open it.')
            except FileNotFoundError:
                # Only a missing monthly file is expected; other read errors should surface.
                print(f'{stock}: data in {year}.{month} does not exist.')
                continue

            # Preprocess day by day so forward-fills and diffs never cross days.
            # (This pass used to be executed twice with the first result discarded.)
            date_list = df['date'].unique().tolist()
            df_prc = pd.concat([preprocess(df[df['date'] == date]) for date in date_list])

            df = df_prc.copy()
            date_list = df['date'].unique().tolist()
            daily_frames = []
            for date in date_list:
                start_time = pd.to_datetime(f'{date} 09:00:00')
                # Work on a copy so the column assignment below never targets a view of df.
                df_tmp = df[df['date'] == date].copy()
                # NOTE(review): 90000000 looks like 09:00:00.000 in HHMMSSmmm form,
                # yet the difference is interpreted as plain milliseconds -- confirm
                # the encoding of the 'time' column.
                df_tmp['time'] = pd.to_timedelta(df_tmp['time']-90000000, unit='ms') + start_time
                # Use the timestamp as index; 'date' is rebuilt after resampling.
                df_tmp = df_tmp.set_index('time').drop(columns=['date'])
                # Downsample to one row per second (mean of the ticks in each second).
                df_tmp = df_tmp.resample('1s').mean()
                df_tmp['date'] = df_tmp.index.date.astype(str)
                df_tmp['time'] = df_tmp.index.time.astype(str)
                df_tmp = df_tmp.reset_index(drop=True)
                # Label: relative change of mid 30 rows (seconds) ahead, within the same day.
                df_tmp['label'] = (df_tmp['mid'].shift(-30) / df_tmp['mid'] - 1).fillna(0)
                daily_frames.append(df_tmp)

            df_resampled = pd.concat(daily_frames).fillna(0).reset_index(drop=True)

            # Append one column per registered factor. The frame is rebuilt on
            # every iteration on purpose: several factor functions write helper
            # columns into the frame they receive, and the resulting column
            # order must match the layout the model was trained on.
            for name, (func, *args) in tqdm(functions.items()):
                if 'get_ipython' in name:
                    continue
                result = func(df_resampled, *args)
                var_name = name.replace('get_', '')
                df_resampled = pd.concat([df_resampled, result.rename(var_name)], axis=1)

            # NOTE: joblib.load unpickles the file and can execute arbitrary
            # code; only load model files produced by a trusted training run.
            model = joblib.load(f'loan_{stock}_lgbm.pkl')
            X_test = df_resampled
            X_date = list(X_test['date'])
            X_test = X_test.drop(['tick', 'date', 'time'], axis=1)
            y_test = list(X_test['label'])
            X_test = X_test.drop(['label'], axis=1).replace([np.inf, -np.inf], 0).fillna(0)
            y_pred = list(model.predict(X_test))

            monthly_results.append(pd.DataFrame({'date': X_date, 'y_pred': y_pred, 'y_true': y_test}))

    # After all months, write one CSV per stock (empty when no month was found).
    if monthly_results:
        result_df = pd.concat(monthly_results, ignore_index=True)
    else:
        result_df = pd.DataFrame()
    result_df.to_csv(f'./predict_{stock}_lgbm_resample.csv')