In [1]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense,GRU,Dropout, Bidirectional
from sklearn.preprocessing import MinMaxScaler
from keras.utils import Sequence
import warnings
warnings.filterwarnings('ignore')


data_clean = pd.read_csv("future_taobao_ss2403_tick.csv")


# 1. 数据加载和预处理
data_clean['last_price'] = pd.to_numeric(data_clean['last_price'], errors='coerce')

# pre_features = ['last_price','bid_price1','ask_price1','bid_price2','ask_price2','bid_price3','ask_price3','bid_price4','ask_price4','bid_price5','ask_price5','bid_volume1','bid_volume2','bid_volume3','bid_volume4','bid_volume5','ask_volume1','ask_volume2','ask_volume3','ask_volume4','ask_volume5']
pre_features = ['last_price','volume','open_interest','bid_price1','ask_price1','bid_price2','ask_price2','bid_price3','ask_price3','bid_price4','ask_price4','bid_price5','ask_price5','bid_volume1','bid_volume2','bid_volume3','bid_volume4','bid_volume5','ask_volume1','ask_volume2','ask_volume3','ask_volume4','ask_volume5']

for feature in pre_features:
    data_clean[feature + '_diff'] =  data_clean[feature].diff()
data_clean['last_price_bid_diff'] =  data_clean['last_price'] - data_clean['bid_price1']  
data_clean['last_price_ask_diff'] =  data_clean['last_price'] - data_clean['ask_price1']  
data_clean['last_price_highest_diff'] =  data_clean['last_price'] - data_clean['highest']  
data_clean['last_price_lowest_diff'] =  data_clean['last_price'] - data_clean['lowest']  
data_clean['datetime'] = pd.to_datetime(data_clean['datetime'])

# Initialize features list with pre_features
features = list(pre_features)

# Add difference features for each pre_feature
diff_features = [feature + '_diff' for feature in pre_features]
features.extend(diff_features)

# Add specific price difference features
additional_features = [
    'last_price_bid_diff', 'last_price_ask_diff', 'last_price_highest_diff', 'last_price_lowest_diff'
]
features.extend(additional_features)

# Define label
data_clean.dropna(subset=['bid_price1'], inplace=True)
data_clean['label'] = (data_clean['bid_price1'].shift(-100) > data_clean['bid_price1']+5).astype(int)

label_df = pd.DataFrame(data_clean['label'])
label_df.to_csv('label.csv')

# 3. 分割数据
# Now you can filter the data between two dates
train_data = data_clean[(data_clean['datetime'] >= '2023-12-01 09:00:00') & 
                        (data_clean['datetime'] < '2024-01-15 09:00:00')]

test_data = data_clean[(data_clean['datetime'] >= '2024-01-19 09:00:00') & 
                        (data_clean['datetime'] < '2024-01-22 09:00:00')]


# 初始化归一化器
scaler = MinMaxScaler(feature_range=(0, 1))

train_data[features] = scaler.fit_transform(train_data[features])

# 将 DataFrame 转换为 NumPy 数组
X_train = np.array(train_data[features])
y_train = np.array(train_data['label'])

# 删除 NaN 值
mask = ~np.isnan(X_train).any(axis=1)
X_train = X_train[mask]
y_train = y_train[mask]

# 首先，确保 X_train 和 X_test 没有 NaN 值
X_train = X_train[~np.isnan(X_train).any(axis=1)]
y_train = y_train[~np.isnan(X_train).any(axis=1)]

class TimeseriesGenerator(Sequence):
    def __init__(self, data, labels, length, stride=1, batch_size=32):
        self.data = data
        self.labels = labels
        self.length = length
        self.stride = stride
        self.batch_size = batch_size

    def __len__(self):
        return max(int(np.ceil((len(self.data) - self.length) / float(self.stride * self.batch_size))), 0)

    def __getitem__(self, idx):
        batch_x = []
        batch_y = []

        start = idx * self.batch_size * self.stride
        end = start + self.batch_size * self.stride + self.length

        for i in range(start, min(end, len(self.data) - self.length), self.stride):
            batch_x.append(self.data[i: i + self.length])
            batch_y.append(self.labels[i + self.length])

        return np.array(batch_x), np.array(batch_y)

# 定义时间步长和步长
time_steps = 300
stride = 1  # 增加步长以减少内存使用

# 创建数据生成器
train_generator = TimeseriesGenerator(X_train, y_train, length=time_steps, stride=stride, batch_size=32)

# model = Sequential()
# model.add(LSTM(units=50, input_shape=(time_steps, X_train.shape[1])))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(optimizer='adam', loss='binary_crossentropy')

# # 使用生成器训练模型
# model.fit(train_generator, epochs=5)
# model.save('model_taobao_lstm_limit_order.h5')

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from keras.models import load_model
from datetime import datetime, time
model = load_model('model_taobao_lstm_limit_order.h5')

def parse_time_range(time_range_str):
    """解析时间范围字符串并返回时间对象的开始和结束时间"""
    start_str, end_str = time_range_str.split('-')
    start_time = datetime.strptime(start_str, "%H:%M").time()
    end_time = datetime.strptime(end_str, "%H:%M").time()
    return start_time, end_time

def is_time_in_ranges(time_to_check, time_ranges):
    """判断给定时间是否在时间范围数组内"""
    for time_range in time_ranges:
        start_time, end_time = parse_time_range(time_range)
        if start_time <= time_to_check <= end_time:
            return True
    return False


def prepare_data_for_prediction(test_data, time_steps, scaler):

    # 使用归一化
    scaled_data = scaler.transform(test_data[features].dropna())

    # 重塑数据以适应模型
    X = np.array([scaled_data[i:i+time_steps] for i in range(len(scaled_data)-time_steps+1)])
    return X


# 定义时间范围数组
notrade_time = ["11:20-11:30","14:50-15:00","0:30-1:00"]
# 准备数据
X_test = prepare_data_for_prediction(test_data, time_steps, scaler)
# 检查数据形状
if X_test.shape[1] != time_steps or X_test.shape[2] != len(features):
    raise ValueError("测试数据的形状不正确，应该是 (样本数, 时间步长, 特征数)")
# 批量预测
predictions = model.predict(X_test, verbose=0)
# 确保test_data的索引与predictions对齐
aligned_test_data = test_data.iloc[time_steps - 1:]


initial_funds = 100000
funds = initial_funds
stock_quantity = 0
stock_price = 0
buy_threshold = 0.3
sold_threshold = 0.2
transactions = []
minute_count = 0
buy_price = 0
for prediction,  (index, row)  in zip(predictions, aligned_test_data.iterrows()):
    current_probability = prediction[0]
    # print(current_probability)
    # print(row['last_price'])
    if '.' in str(row['datetime']):
        # 如果有小数点，分割为主时间部分和纳秒部分
        time, nano_part = str(row['datetime']).split('.')
    else:
        # 如果没有小数点，则没有纳秒部分
        time = str(row['datetime'])
        nano_part = '0'
    if is_time_in_ranges(datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
.time(),notrade_time):
            continue
    if current_probability is not None:
        # current_price = row['bid_price1']
        buy_price = row['ask_price1']
        sell_price = row['bid_price1']
        minute_count = minute_count + 1
            
        if current_probability > buy_threshold and stock_quantity == 0 :
            print(current_probability)
            stock_quantity = funds // buy_price
            funds -= stock_quantity * buy_price
            fee = stock_quantity * 2
            funds -= fee
            stock_price = buy_price
            last_buy_price = buy_price  # 记录买入价格
            print(f"Time: {row['datetime']} - Action: BUY at {buy_price}, Quantity: {stock_quantity},Funds:{funds}")
            transactions.append({
                'action': 'buy',
                'time': row['datetime'],
                'price': buy_price,
                'quantity': stock_quantity,
                'funds_remaining': funds
            })
            minute_count = 0
        # elif minute_count > 30 and stock_quantity > 0 and current_probability<sold_threshold :
        # elif minute_count > 100 and stock_quantity > 0:
        elif (stock_quantity > 0 and sell_price > last_buy_price) or (stock_quantity>0 and minute_count > 100):
            print(current_probability)
            funds += stock_quantity * sell_price
            price_diff = sell_price - last_buy_price  # 计算价格差异
            # 判断价格差异是涨、跌还是平
            if price_diff > 0:
                direction = '涨'
            elif price_diff < 0:
                direction = '跌'
            else:
                direction = '平'
            print(f"Time: {row['datetime']} - Action: SELL at {sell_price}, Quantity: {stock_quantity},Funds:{funds}, Price Change: {direction}")
            transactions.append({
                'action': 'sell',
                'time': row['datetime'],
                'price': sell_price,
                'quantity': stock_quantity,
                'funds_remaining': funds,
                'price_change': direction
            })
            stock_quantity = 0
            last_buy_price = 0  # 重置买入价格为0

    
if len(transactions)>0:
    transactions_df = pd.DataFrame(transactions)
    print(transactions_df['price_change'].value_counts())
transactions_df.to_csv('transactions_tick_lstm.csv')







0.053287964
0.05896363
0.057650417
0.05889701
0.058328282
0.0569202
0.057121567
0.056735408
0.05671724
0.056075435
0.059683457
0.069371164
0.066168934
0.06710093
0.06658642
0.06644497
0.06293817
0.06355983
0.06753346
0.066361226
0.06601425
0.06505565
0.061520685
0.062581494
0.061299477
0.06999928
0.06511042
0.064400725
0.06518881
0.061693016
0.06278765
0.062324498
0.062401347
0.062420435
0.062350422
0.06489754
0.06310621
0.050879396
0.05974857
0.056776647
0.058457807
0.057393674
0.054426737
0.07670524
0.07165232
0.054939657
0.061409455
0.059931964
0.06892488
0.05103661
0.054048516
0.050748978
0.054755464
0.04992211
0.053540945
0.051742207
0.055313572
0.05488104
0.05198595
0.053406924
0.055602014
0.054081585
0.054205958
0.054151483
0.0542245
0.053909477
0.054049525
0.05378503
0.05386204
0.0534625
0.05592865
0.0550065
0.055307787
0.052599452
0.053064577
0.052826915
0.05553731
0.052185528
0.05515377
0.053976044
0.049989417
0.048351087
0.07208595
0.06480559
0.0668568
0.06620679
0.06590081


NameError: name 'transactions_df' is not defined