In [15]:
import tushare as ts
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# 1. Load the raw tick data and coerce column types
# ---------------------------------------------------------------------------
data_clean = pd.read_csv("future_ssmain_tick.csv")

# Values that cannot be parsed become NaN; rows with NaN features or labels
# are dropped right before training.
data_clean['close'] = pd.to_numeric(data_clean['close'], errors='coerce')
data_clean['trade_time'] = pd.to_datetime(data_clean['trade_time'])
data_clean['volume'] = pd.to_numeric(data_clean['数量'], errors='coerce')
# NOTE(review): every rolling / shifted feature below treats row order as time
# order. Presumably the CSV is already sorted by trade_time -- confirm.

# ---------------------------------------------------------------------------
# 2. Feature engineering
# ---------------------------------------------------------------------------
# Window lengths are expressed in rows (ticks). 'volume' is the window of the
# rolling volume mean; keep it in sync with the window predict_next_move
# applies to volume at prediction time.
rolling_windows = {'mean': 300, 'std': 300, 'rsi': 400, 'volume': 300}
ewm_spans = {'short': 200, 'long': 800, 'signal': 800}

# Number of rows the label looks ahead.
LABEL_HORIZON = 1000

# Rolling price statistics.
data_clean['rolling_mean'] = data_clean['close'].rolling(window=rolling_windows['mean']).mean()
data_clean['rolling_std'] = data_clean['close'].rolling(window=rolling_windows['std']).std()

# RSI built from simple rolling means of gains and losses.
# When the window holds no losses but some gains, rs is infinite and RSI is
# 100; when it holds neither, rs is NaN and the row is dropped later.
delta = data_clean['close'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=rolling_windows['rsi']).mean()
avg_loss = loss.rolling(window=rolling_windows['rsi']).mean()
rs = avg_gain / avg_loss
data_clean['RSI'] = 100 - (100 / (1 + rs))

# MACD line and its signal line.
short_ema = data_clean['close'].ewm(span=ewm_spans['short'], adjust=False).mean()
long_ema = data_clean['close'].ewm(span=ewm_spans['long'], adjust=False).mean()
data_clean['MACD'] = short_ema - long_ema
data_clean['MACD_signal'] = data_clean['MACD'].ewm(span=ewm_spans['signal'], adjust=False).mean()

# One-row-lagged copies of the indicators (not part of `features` below).
data_clean['RSI_shifted'] = data_clean['RSI'].shift(1)
data_clean['MACD_shifted'] = data_clean['MACD'].shift(1)
data_clean['MACD_signal_shifted'] = data_clean['MACD_signal'].shift(1)

# Volume features: row-to-row change and rolling mean.
data_clean['volume_change'] = data_clean['volume'].diff()
data_clean['volume_rolling_mean'] = data_clean['volume'].rolling(window=rolling_windows['volume']).mean()

# Row-to-row relative price change.
data_clean['price_change_rate'] = data_clean['close'].pct_change()

# Label: 1 when the close LABEL_HORIZON rows ahead is higher than the current
# close, 0 otherwise. A comparison against NaN evaluates to False, so without
# the mask the last LABEL_HORIZON rows (whose future price does not exist)
# would all be labelled 0. They are kept as NaN instead and dropped below.
future_close = data_clean['close'].shift(-LABEL_HORIZON)
label_known = future_close.notna() & data_clean['close'].notna()
data_clean['label'] = (future_close > data_clean['close']).astype(float).where(label_known)

# Time-of-day features.
data_clean['hour'] = data_clean['trade_time'].dt.hour
data_clean['minute'] = data_clean['trade_time'].dt.minute

# ---------------------------------------------------------------------------
# 3. Train / test split by date
# ---------------------------------------------------------------------------
# NOTE(review): labels are computed on the full frame before splitting, so the
# last LABEL_HORIZON training rows are labelled with prices from the test
# window (assuming chronological row order). Consider purging them.
train_data = data_clean[(data_clean['trade_time'] >= '2023-10-01 09:00:00') &
                        (data_clean['trade_time'] < '2023-10-25 09:00:00')]

test_data = data_clean[(data_clean['trade_time'] >= '2023-10-25 09:00:00') &
                       (data_clean['trade_time'] < '2023-10-31 09:00:00')]

features = ['rolling_mean', 'rolling_std', 'RSI', 'MACD', 'volume_rolling_mean', 'price_change_rate', 'volume_change','hour','minute']

# Keep only rows where every feature and the label are known, so X and y stay
# aligned row for row.
train_rows = train_data.dropna(subset=features + ['label'])
X_train = train_rows[features]
y_train = train_rows['label'].astype(int)

test_rows = test_data.dropna(subset=features + ['label'])
X_test = test_rows[features]
y_test = test_rows['label'].astype(int)

# ---------------------------------------------------------------------------
# 4. Model training
# ---------------------------------------------------------------------------
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=200, learning_rate=0.05, max_depth=8)
model.fit(X_train, y_train)

In [16]:
def predict_next_move(tick, model, rolling_windows, ewm_spans, historical_data, feature_columns=None):
    """Append one tick to the running history and score the newest row.

    Parameters
    ----------
    tick : mapping or pandas.Series
        A single observation. Must provide 'close' and 'volume', plus any raw
        column named in the feature list that is not computed here. The
        object is not modified.
    model : object
        Anything exposing ``predict_proba(X)`` that returns one
        ``[p_class0, p_class1]`` row per input row.
    rolling_windows : dict
        Window lengths in rows under the keys 'mean', 'std' and 'rsi'. An
        optional 'volume' key sets the rolling volume-mean window
        (default 300).
    ewm_spans : dict
        EWM spans under the keys 'short', 'long' and 'signal'.
    historical_data : pandas.DataFrame or None
        History returned by the previous call; pass an empty frame or None
        on the first call.
    feature_columns : list of str, optional
        Columns handed to the model. Defaults to the module-level
        ``features`` list.

    Returns
    -------
    tuple
        ``(probability_of_class_1, historical_data)``. The probability is
        None while the history is shorter than the longest window/span, or
        when any feature of the newest row is NaN.

    Notes
    -----
    Every call recomputes all indicators over the whole history, so the cost
    of a replay grows quadratically with the number of ticks.
    """
    # Build the new row without touching the caller's object; 'current' is the
    # price column the indicators are computed on.
    tick_frame = pd.DataFrame([tick])
    tick_frame['current'] = tick_frame['close']

    if historical_data is None or len(historical_data) == 0:
        # Concatenating onto an empty frame is deprecated in pandas because it
        # affects dtype inference; start the history from the tick itself.
        historical_data = tick_frame.reset_index(drop=True)
    else:
        historical_data = pd.concat([historical_data, tick_frame], ignore_index=True)

    # Not enough rows yet for the longest rolling window / EWM span.
    required_rows = max(rolling_windows['mean'], rolling_windows['std'],
                        rolling_windows['rsi'], ewm_spans['long'])
    if len(historical_data) < required_rows:
        return None, historical_data

    if feature_columns is None:
        feature_columns = features
    feature_columns = list(feature_columns)

    price = historical_data['current']

    # Rolling mean and standard deviation of the price.
    historical_data['rolling_mean'] = price.rolling(window=rolling_windows['mean'], min_periods=1).mean()
    historical_data['rolling_std'] = price.rolling(window=rolling_windows['std'], min_periods=1).std()

    # RSI. The ratio is deliberately left unguarded so that it matches the
    # formula used to build the training features: no losses in the window
    # gives rs = inf and RSI = 100, while no movement at all gives NaN and the
    # row is skipped by the NaN check below.
    delta = price.diff()
    gain = (delta.where(delta > 0, 0)).fillna(0)
    loss = (-delta.where(delta < 0, 0)).fillna(0)
    avg_gain = gain.rolling(window=rolling_windows['rsi'], min_periods=1).mean()
    avg_loss = loss.rolling(window=rolling_windows['rsi'], min_periods=1).mean()
    rs = avg_gain / avg_loss
    historical_data['RSI'] = 100 - (100 / (1 + rs))

    # MACD line and signal line.
    short_ema = price.ewm(span=ewm_spans['short'], adjust=False, min_periods=1).mean()
    long_ema = price.ewm(span=ewm_spans['long'], adjust=False, min_periods=1).mean()
    historical_data['MACD'] = short_ema - long_ema
    historical_data['MACD_signal'] = historical_data['MACD'].ewm(span=ewm_spans['signal'], adjust=False, min_periods=1).mean()

    # One-row-lagged copies of the indicators.
    historical_data['RSI_shifted'] = historical_data['RSI'].shift(1)
    historical_data['MACD_shifted'] = historical_data['MACD'].shift(1)
    historical_data['MACD_signal_shifted'] = historical_data['MACD_signal'].shift(1)

    # Rolling volume mean; the window defaults to 300 rows when
    # rolling_windows has no 'volume' entry.
    volume_window = rolling_windows.get('volume', 300)
    historical_data['volume_rolling_mean'] = historical_data['volume'].rolling(window=volume_window).mean()

    # Row-to-row price and volume changes.
    historical_data['price_change_rate'] = historical_data['close'].pct_change()
    historical_data['volume_change'] = historical_data['volume'].diff()

    # Score only the newest row.
    X_new = historical_data.iloc[-1:][feature_columns]
    if X_new.isnull().values.any():
        # At least one feature is undefined for this tick: no prediction.
        return None, historical_data

    prediction_proba = model.predict_proba(X_new)
    probability_of_one = prediction_proba[0][1]
    return probability_of_one, historical_data




# Tick-by-tick backtest over test_data.
# The history starts empty and is grown by predict_next_move on every tick.
historical_data = pd.DataFrame()

initial_funds = 100000
funds = initial_funds
stock_quantity = 0
stock_price = 0
buy_price = 0          # entry price of the open position (0 when flat)
buy_threshold = 0.8    # enter when P(class 1) exceeds this while flat
sold_threshold = 0.6   # exit when P(class 1) drops below this (after the hold period)
transactions = []
# Counts ticks that produced a prediction since the last buy (ticks, not
# minutes, despite the name).
minute_count = 0

for idx, row in test_data.iterrows():
    current_probability, historical_data = predict_next_move(row, model, rolling_windows, ewm_spans, historical_data)
    if current_probability is not None:
        current_price = row['close']
        minute_count = minute_count + 1

        if current_probability > buy_threshold and stock_quantity == 0 :
            # Enter: buy as many whole units as the funds allow, then pay a
            # fee of 2 per unit.
            print(current_probability)
            stock_quantity = funds // current_price
            funds -= stock_quantity * current_price
            fee = stock_quantity * 2
            funds -= fee
            stock_price = current_price
            buy_price = current_price  # remember the entry price
            print(f"Time: {row['trade_time']} - Action: BUY at {current_price}, Quantity: {stock_quantity},Funds:{funds}")
            transactions.append({
                'action': 'buy',
                'time': row['trade_time'],
                'price': current_price,
                'quantity': stock_quantity,
                'funds_remaining': funds
            })
            minute_count = 0
        elif minute_count > 1000 and stock_quantity > 0 and current_probability<sold_threshold:
            # Exit: only after more than 1000 scored ticks in the position and
            # once the predicted probability has fallen below the threshold.
            print(current_probability)
            funds += stock_quantity * current_price
            price_diff = current_price - buy_price  # exit price minus entry price
            # Classify the round trip as up, down or flat.
            if price_diff > 0:
                direction = '涨'
            elif price_diff < 0:
                direction = '跌'
            else:
                direction = '平'
            print(f"Time: {row['trade_time']} - Action: SELL at {current_price}, Quantity: {stock_quantity},Funds:{funds}, Price Change: {direction}")
            transactions.append({
                'action': 'sell',
                'time': row['trade_time'],
                'price': current_price,
                'quantity': stock_quantity,
                'funds_remaining': funds,
                'price_change': direction
            })
            stock_quantity = 0
            buy_price = 0  # flat again


# Summarise and export the trades. The frame is always built with a fixed set
# of columns so that a run with no trades (or with buys only, which carry no
# 'price_change') neither raises nor writes a stale frame from an earlier run.
transactions_df = pd.DataFrame(
    transactions,
    columns=['action', 'time', 'price', 'quantity', 'funds_remaining', 'price_change'],
)
if not transactions_df.empty:
    print(transactions_df['price_change'].value_counts())
transactions_df.to_csv('transactions_tick_xgb.csv')

0.8090602
Time: 2023-10-25 09:09:16 - Action: BUY at 14780.0, Quantity: 6.0,Funds:11308.0
0.48344377
Time: 2023-10-25 09:20:01 - Action: SELL at 14775.0, Quantity: 6.0,Funds:99958.0, Price Change: 跌
0.83261263
Time: 2023-10-25 09:22:24 - Action: BUY at 14755.0, Quantity: 6.0,Funds:11416.0
0.36029854
Time: 2023-10-25 09:34:02 - Action: SELL at 14755.0, Quantity: 6.0,Funds:99946.0, Price Change: 平
0.8082622
Time: 2023-10-25 09:50:18 - Action: BUY at 14795.0, Quantity: 6.0,Funds:11164.0
0.5993341
Time: 2023-10-25 10:01:49 - Action: SELL at 14795.0, Quantity: 6.0,Funds:99934.0, Price Change: 平
0.8048687
Time: 2023-10-25 10:12:10 - Action: BUY at 14750.0, Quantity: 6.0,Funds:11422.0
0.57766724
Time: 2023-10-25 10:36:49 - Action: SELL at 14745.0, Quantity: 6.0,Funds:99892.0, Price Change: 跌
0.8030178
Time: 2023-10-25 11:00:00 - Action: BUY at 14705.0, Quantity: 6.0,Funds:11650.0
0.5621281
Time: 2023-10-25 11:20:57 - Action: SELL at 14740.0, Quantity: 6.0,Funds:100090.0, Price Change: 涨
0.816

In [14]:
# Summarise the recorded trades and export them to CSV.
# Building the frame unconditionally with a fixed column list avoids a
# NameError when no trade was recorded and a KeyError when only buys (which
# carry no 'price_change') were recorded.
transactions_df = pd.DataFrame(
    transactions,
    columns=['action', 'time', 'price', 'quantity', 'funds_remaining', 'price_change'],
)
if not transactions_df.empty:
    # value_counts ignores NaN, so buy rows do not appear in the tally.
    print(transactions_df['price_change'].value_counts())
transactions_df.to_csv('transactions_tick_xgb.csv')

price_change
涨    8
跌    5
平    3
Name: count, dtype: int64
