In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# 1. 数据加载和预处理
data = pd.read_csv("merged_sorted_file1.csv")
data_clean = data.dropna(subset=['current']).copy()
# 1. 数据加载和预处理
# data = pd.read_csv("SF主力连续.csv",encoding="gb2312")
# data['time'] = pd.to_datetime(data['date'].astype(str) + ' ' + data['datetime'])
# data = data.sort_values(by='time')
# data_clean = data.dropna(subset=['current']).copy()


# 2. 特征生成
# Calculate rolling mean and standard deviation
data_clean['rolling_mean'] = data_clean['current'].rolling(window=600).mean()
data_clean['rolling_std'] = data_clean['current'].rolling(window=600).std()
# data_clean = data_clean.fillna(data_clean.median())

# Calculate RSI
delta = data_clean['current'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=800).mean()
avg_loss = loss.rolling(window=800).mean()
rs = avg_gain / avg_loss
data_clean['RSI'] = 100 - (100 / (1 + rs))

# Calculate MACD
short_ema = data_clean['current'].ewm(span=200, adjust=False).mean()
long_ema = data_clean['current'].ewm(span=800, adjust=False).mean()
data_clean['MACD'] = short_ema - long_ema
data_clean['MACD_signal'] = data_clean['MACD'].ewm(span=800, adjust=False).mean()

# Shift RSI and MACD to use them as features for next timestep
data_clean['RSI_shifted'] = data_clean['RSI'].shift(1)
data_clean['MACD_shifted'] = data_clean['MACD'].shift(1)
data_clean['MACD_signal_shifted'] = data_clean['MACD_signal'].shift(1)

# Define label
data_clean['label'] = (data_clean['current'].shift(-100) > data_clean['current']).astype(int)


# # 2. 特征生成
# # Calculate rolling mean and standard deviation
# data_clean['rolling_mean'] = data_clean['current'].rolling(window=5).mean()
# data_clean['rolling_std'] = data_clean['current'].rolling(window=5).std()
# data_clean = data_clean.fillna(data_clean.median())

# # Calculate RSI
# delta = data_clean['current'].diff()
# gain = (delta.where(delta > 0, 0)).fillna(0)
# loss = (-delta.where(delta < 0, 0)).fillna(0)
# avg_gain = gain.rolling(window=14).mean()
# avg_loss = loss.rolling(window=14).mean()
# rs = avg_gain / avg_loss
# data_clean['RSI'] = 100 - (100 / (1 + rs))

# # Calculate MACD
# short_ema = data_clean['current'].ewm(span=12, adjust=False).mean()
# long_ema = data_clean['current'].ewm(span=26, adjust=False).mean()
# data_clean['MACD'] = short_ema - long_ema
# data_clean['MACD_signal'] = data_clean['MACD'].ewm(span=9, adjust=False).mean()

# # Shift RSI and MACD to use them as features for next timestep
# data_clean['RSI_shifted'] = data_clean['RSI'].shift(1)
# data_clean['MACD_shifted'] = data_clean['MACD'].shift(1)
# data_clean['MACD_signal_shifted'] = data_clean['MACD_signal'].shift(1)

# # Define label
# data_clean['label'] = (data_clean['current'].shift(-120) > data_clean['current']).astype(int)



# 3. 分割数据
data_clean['date_only'] = pd.to_datetime(data_clean['time']).dt.date
# Updated the data split to use 'date_only'
first_date = data_clean['date_only'].iloc[0]
first_month_data = data_clean[data_clean['date_only'] <= first_date + pd.Timedelta(days=20)]
# first_month_data = data_clean[(data_clean['date_only'] > first_date + pd.Timedelta(days=40)) & 
#                                (data_clean['date_only'] <= first_date + pd.Timedelta(days=70))]
features = ['current', 'rolling_mean', 'rolling_std', 'RSI_shifted', 'MACD_shifted', 'MACD_signal_shifted']
X_first_month = first_month_data[features]
y_first_month = first_month_data['label']

X_train_month_clean = X_first_month.dropna()
y_train_month_clean = y_first_month[X_train_month_clean.index]


# 4. 模型训练
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train_month_clean, y_train_month_clean)  # Use the cleaned data for training



KeyboardInterrupt: 

In [17]:
#涨跌预测
import collections
# 5. 使用第二个月的数据进行预测
second_month_data = data_clean[(data_clean['date_only'] > first_date + pd.Timedelta(days=20)) & 
                               (data_clean['date_only'] <= first_date + pd.Timedelta(days=30))]

# 预测的初始化
probabilities_second_month = []


initial_funds = 100000
funds = initial_funds
stock_quantity = 0
stock_price = 0
buy_threshold = 0.9
sold_threshold = 0.5
transactions = []

# 在逐行预测和模拟交易部分，初始化买入价格为0
buy_price = 0
minute_count = 0
daily_transaction_count = 0  # 新增：每天的交易次数计数器

for idx, row in second_month_data.iterrows():
    current_row = row[features].fillna(method='ffill')
    current_probability = rf.predict_proba([current_row])[0][1]
        
    current_price = row['current']
    minute_count = minute_count + 1

    flag = 1
    # time_string = row['time'].strftime('%H:%M:%S')
    time_string = row['time'][-8:]  # 获取时间部分，例如 "22:59:10"
    if time_string.startswith("22:59:"):
        daily_transaction_count = 0
        flag = 0
        
    if current_probability > buy_threshold and stock_quantity == 0 and daily_transaction_count<10 and flag==1:
        print(current_probability)
        daily_transaction_count += 1  # 新增：每次交易后增加计数器
        stock_quantity = funds // current_price
        funds -= stock_quantity * current_price
        stock_price = current_price
        fee = stock_quantity * current_price * 0.0001
        funds -= fee
        buy_price = current_price  # 记录买入价格
        print(f"Time: {row['time']} - Action: BUY at {current_price}, Quantity: {stock_quantity},Funds:{funds}")
        transactions.append({
            'action': 'buy',
            'time': row['time'],
            'price': current_price,
            'quantity': stock_quantity,
            'funds_remaining': funds
        })
        minute_count = 0
    # elif minute_count > 100 and stock_quantity > 0 or  (stock_quantity > 0 and flag==0) :
    elif current_probability < sold_threshold and stock_quantity > 0 and minute_count>100 or (stock_quantity > 0 and flag==0):
        print(current_probability)
        funds += stock_quantity * current_price
        fee = stock_quantity * current_price * 0.0001
        funds -= fee
        price_diff = current_price - buy_price  # 计算价格差异
        # 判断价格差异是涨、跌还是平
        if price_diff > 0:
            direction = '涨'
        elif price_diff < 0:
            direction = '跌'
        else:
            direction = '平'
        print(f"Time: {row['time']} - Action: SELL at {current_price}, Quantity: {stock_quantity},Funds:{funds}, Price Change: {direction}")
        transactions.append({
            'action': 'sell',
            'time': row['time'],
            'price': current_price,
            'quantity': stock_quantity,
            'funds_remaining': funds,
            'price_change': direction
        })
        stock_quantity = 0
        buy_price = 0  # 重置买入价格为0
    
    

transactions_df = pd.DataFrame(transactions)
print(transactions_df['price_change'].value_counts())
transactions_df.to_csv('transactions_classify.csv')

0.92
Time: 2023-09-22 09:01:55 - Action: BUY at 7516.0, Quantity: 13.0,Funds:2290.0
0.49
Time: 2023-09-22 09:03:15 - Action: SELL at 7498.0, Quantity: 13.0,Funds:99764.0, Price Change: 跌
0.91
Time: 2023-09-22 09:13:20 - Action: BUY at 7468.0, Quantity: 13.0,Funds:2678.0
0.49
Time: 2023-09-22 09:18:10 - Action: SELL at 7474.0, Quantity: 13.0,Funds:99840.0, Price Change: 涨
0.91
Time: 2023-09-25 09:12:42 - Action: BUY at 7512.0, Quantity: 13.0,Funds:2182.0
0.49
Time: 2023-09-25 09:16:10 - Action: SELL at 7512.0, Quantity: 13.0,Funds:99838.0, Price Change: 平
0.91
Time: 2023-09-25 13:31:22 - Action: BUY at 7506.0, Quantity: 13.0,Funds:2258.0
0.4
Time: 2023-09-25 13:34:09 - Action: SELL at 7494.0, Quantity: 13.0,Funds:99680.0, Price Change: 跌
0.91
Time: 2023-09-25 14:22:12 - Action: BUY at 7486.0, Quantity: 13.0,Funds:2360.0
0.49
Time: 2023-09-25 14:24:36 - Action: SELL at 7488.0, Quantity: 13.0,Funds:99704.0, Price Change: 涨
0.92
Time: 2023-09-26 09:00:07 - Action: BUY at 7432.0, Quantity: 