In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Load data
daily_data = pd.read_csv('baba_daily.csv')
min_data = pd.read_csv('baba_min.csv')

# Convert to datetime format
daily_data['date'] = pd.to_datetime(daily_data['date'])
min_data['date'] = pd.to_datetime(min_data['date'])
min_data['day'] = min_data['date'].dt.date

# Calculate the target value
sixtieth_min = min_data.groupby('day').nth(59).reset_index()
targets = sixtieth_min.set_index('day')['close'] - min_data.groupby('day').nth(9)['close']
targets = targets.reset_index()
targets.columns = ['day', 'target']
targets['day'] = pd.to_datetime(targets['day'])

# Feature engineering
final_data = daily_data.copy()
final_data['is_high_open'] = (final_data['open'] > final_data['high'].shift(1)).astype(int)
final_data['is_low_open'] = (final_data['open'] < final_data['low'].shift(1)).astype(int)
first_10_mins = min_data.groupby('day').head(10)
first_open = first_10_mins.groupby('day').first()['open']
tenth_close = first_10_mins.groupby('day').last()['close']
final_data['trend_10min'] = (tenth_close - first_open) / first_open
first_10_high = first_10_mins.groupby('day')['high'].max()
first_10_low = first_10_mins.groupby('day')['low'].min()
final_data['volatility_10min'] = (first_10_high - first_10_low) / first_10_low
average_volume_yesterday = min_data.groupby('day')['volume'].mean().shift(1)
average_volume_10min = first_10_mins.groupby('day')['volume'].mean()
final_data['relative_volume'] = average_volume_10min / average_volume_yesterday
final_data = pd.merge(final_data, targets, left_on='date', right_on='day')

# Define features and target
feature_columns = ['is_high_open', 'is_low_open', 'trend_10min', 'volatility_10min', 'relative_volume']
X = final_data[feature_columns]
y = final_data['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute NaN values
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_imputed, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_imputed)
rmse = mean_squared_error(y_test, y_pred, squared=False)
accuracy = ((y_pred > 0) == (y_test > 0)).mean()

comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Difference': y_pred - y_test
})
print(comparison_df)

print(f"RMSE: {rmse}")
print(f"Accuracy: {accuracy}")


    Actual  Predicted  Difference
19 -1.1407  -0.019654    1.121046
16 -0.3900   0.517950    0.907950
15  0.0250  -0.019654   -0.044654
26  0.9350  -0.055644   -0.990644
4  -0.5998  -0.055644    0.544156
12  0.4450   0.517950    0.072950
37 -1.0801  -0.019654    1.060446
27  0.4000  -0.055644   -0.455644
RMSE: 0.7663283107615511
Accuracy: 0.5
