In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# 1. Data loading and preprocessing
data = pd.read_csv("merged_sorted_file1.csv")
# Rows without a price cannot feed any indicator, so they are dropped up front.
# .copy() detaches the result from `data` so the column assignments below are safe.
data_clean = data.dropna(subset=['current']).copy()

# 2. Feature generation
# Rolling mean and standard deviation of the price over the last 600 rows
data_clean['rolling_mean'] = data_clean['current'].rolling(window=600).mean()
data_clean['rolling_std'] = data_clean['current'].rolling(window=600).std()

# RSI computed from simple rolling averages of gains and losses (800-row window).
# The first diff is NaN; fillna(0) treats it as "no move".
delta = data_clean['current'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=800).mean()
avg_loss = loss.rolling(window=800).mean()
rs = avg_gain / avg_loss
data_clean['RSI'] = 100 - (100 / (1 + rs))

# MACD: difference of a 200-span and an 800-span EMA, plus an 800-span signal line
short_ema = data_clean['current'].ewm(span=200, adjust=False).mean()
long_ema = data_clean['current'].ewm(span=800, adjust=False).mean()
data_clean['MACD'] = short_ema - long_ema
data_clean['MACD_signal'] = data_clean['MACD'].ewm(span=800, adjust=False).mean()

# Shift RSI and MACD by one row so each row only sees the previous row's indicator values
data_clean['RSI_shifted'] = data_clean['RSI'].shift(1)
data_clean['MACD_shifted'] = data_clean['MACD'].shift(1)
data_clean['MACD_signal_shifted'] = data_clean['MACD_signal'].shift(1)

# Label: 1 if the price LABEL_HORIZON rows ahead is higher than the current price, else 0
LABEL_HORIZON = 1000
future_price = data_clean['current'].shift(-LABEL_HORIZON)
data_clean['label'] = (future_price > data_clean['current']).astype(int)
# The last LABEL_HORIZON rows have no future price. `NaN > x` evaluates to False, so
# those rows receive label 0 even though their outcome is unknown. Remember which rows
# have a real look-ahead value so the mislabelled tail can be kept out of training.
label_known = future_price.notna()

# 3. Split data
data_clean['date_only'] = pd.to_datetime(data_clean['time']).dt.date
first_date = data_clean['date_only'].iloc[0]
# NOTE: despite the variable name, the training window is the 30-day span
# (first_date + 60 days, first_date + 90 days], not the first month of data.
window_start = first_date + pd.Timedelta(days=60)
window_end = first_date + pd.Timedelta(days=90)
first_month_data = data_clean[(data_clean['date_only'] > window_start) &
                              (data_clean['date_only'] <= window_end)]
features = ['current', 'rolling_mean', 'rolling_std', 'RSI_shifted', 'MACD_shifted', 'MACD_signal_shifted']
X_first_month = first_month_data[features]
y_first_month = first_month_data['label']

# Keep only rows with every feature present AND a genuinely known label.
train_rows = X_first_month.notna().all(axis=1) & label_known.loc[X_first_month.index]
X_train_month_clean = X_first_month[train_rows]
y_train_month_clean = y_first_month[train_rows]
assert not X_train_month_clean.empty, "no usable training rows in the selected date window"

# 4. Model training
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_month_clean, y_train_month_clean)  # train on the cleaned rows only



In [14]:
import pandas as pd

def predict_next_move(tick, model, rolling_windows, ewm_spans, historical_data):
    """Append one tick to the history and predict the label for that newest row.

    The indicator columns (rolling mean/std, RSI, MACD and their shifted variants)
    are recomputed over the *entire* history on every call, so the cost of a call
    grows with the length of ``historical_data``.

    Parameters
    ----------
    tick : dict or pandas.Series
        A single tick; must provide ``'last_price'``. The caller's object is not
        modified: a copy is taken before the ``'current'`` field is added.
    model : object
        Anything exposing ``predict(X)``; it is called with a one-row DataFrame
        holding the six feature columns.
    rolling_windows : dict
        Window sizes under the keys ``'mean'``, ``'std'`` and ``'rsi'``.
    ewm_spans : dict
        EMA spans under the keys ``'short'``, ``'long'`` and ``'signal'``.
    historical_data : pandas.DataFrame
        Ticks accumulated so far (may be empty). A new frame is returned; the
        frame passed in is not modified.

    Returns
    -------
    tuple
        ``(prediction, historical_data)``. ``prediction`` is the first element of
        ``model.predict(...)``, or ``None`` when there is not yet enough history
        or a feature of the newest row is NaN. ``historical_data`` is the history
        including the new tick; it carries the indicator columns only once the
        history is long enough for them to be computed.
    """
    # Work on a copy so the caller's tick is left untouched.
    tick = tick.copy()
    # Use 'last_price' as 'current' for calculations
    tick['current'] = tick['last_price']

    # Append the new tick to the historical data
    historical_data = pd.concat([historical_data, pd.DataFrame([tick])], ignore_index=True)

    # Every window must be filled before the newest row can have complete features;
    # below that, skip the indicator computation altogether.
    min_rows = max(
        rolling_windows['mean'],
        rolling_windows['std'],
        rolling_windows['rsi'],
        ewm_spans['long'],
    )
    if len(historical_data) < min_rows:
        # Not enough data to predict
        return None, historical_data

    price = historical_data['current']

    # Rolling mean and standard deviation
    historical_data['rolling_mean'] = price.rolling(window=rolling_windows['mean']).mean()
    historical_data['rolling_std'] = price.rolling(window=rolling_windows['std']).std()

    # RSI from simple rolling averages of gains and losses
    delta = price.diff()
    gain = (delta.where(delta > 0, 0)).fillna(0)
    loss = (-delta.where(delta < 0, 0)).fillna(0)
    avg_gain = gain.rolling(window=rolling_windows['rsi']).mean()
    avg_loss = loss.rolling(window=rolling_windows['rsi']).mean()
    rs = avg_gain / avg_loss
    historical_data['RSI'] = 100 - (100 / (1 + rs))

    # MACD and its signal line
    short_ema = price.ewm(span=ewm_spans['short'], adjust=False).mean()
    long_ema = price.ewm(span=ewm_spans['long'], adjust=False).mean()
    historical_data['MACD'] = short_ema - long_ema
    historical_data['MACD_signal'] = historical_data['MACD'].ewm(span=ewm_spans['signal'], adjust=False).mean()

    # Shift RSI and MACD by one row so the newest row uses the previous row's values
    historical_data['RSI_shifted'] = historical_data['RSI'].shift(1)
    historical_data['MACD_shifted'] = historical_data['MACD'].shift(1)
    historical_data['MACD_signal_shifted'] = historical_data['MACD_signal'].shift(1)

    # Feature columns, in the same order as the `features` list used for training.
    feature_columns = ['current', 'rolling_mean', 'rolling_std',
                       'RSI_shifted', 'MACD_shifted', 'MACD_signal_shifted']
    # Use the last row for prediction
    X_new = historical_data.iloc[-1:][feature_columns]

    # The shifted indicators need one extra row, and RSI is NaN when a window has
    # neither gains nor losses, so NaN is still possible here.
    if X_new.isnull().values.any():
        return None, historical_data

    prediction = model.predict(X_new)
    return prediction[0], historical_data

# Example usage.
# Window sizes and EMA spans handed to predict_next_move.
rolling_windows = dict(mean=600, std=600, rsi=800)
ewm_spans = dict(short=200, long=800, signal=800)

# Start from an empty history; predict_next_move returns the grown history on every call.
historical_data = pd.DataFrame()

# `rf` is expected to be the trained RandomForestClassifier and `ticks` a DataFrame
# of incoming ticks; both must already exist in the kernel.
for index, tick in ticks.iterrows():
    prediction, historical_data = predict_next_move(
        tick, rf, rolling_windows, ewm_spans, historical_data
    )
    # A prediction of None means the feature row could not be built yet.
    outcome = "Insufficient data for prediction" if prediction is None else prediction
    print(outcome)


Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficient data for prediction
Insufficie