# Bitcoin Price Prediction using Random Forest

In [None]:
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import joblib
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'bitcoin_price_sentiment_addmean.csv'
data = pd.read_csv(file_path)

# Parse the 'date' column; values that cannot be parsed become NaT
# (errors='coerce') and are removed together with other incomplete rows below.
data['date'] = pd.to_datetime(data['date'], dayfirst=False, errors='coerce')

# Drop every row with a missing value in any column. This also removes the
# rows whose date failed to parse, so a separate date-only dropna is redundant.
data = data.dropna()

# The train/test split further down is positional and is meant to follow the
# time sequence, so put the rows in chronological order (mergesort is stable:
# rows sharing a timestamp keep their file order). Then rebuild a 0..n-1 index
# so index labels and row positions agree again after rows were dropped.
data = data.sort_values('date', kind='mergesort').reset_index(drop=True)

# Fail early if nothing is left after cleaning
if data.empty:
    raise ValueError("The dataset is empty after processing. Please check the input data.")

# Define features and target variable
x = data[['Open', 'High', 'Low', 'Volume', 'sentiment_scores']]
y = data['Close']

# Split the data into training and testing sets based on time sequence:
# the first 80% of the rows are used for training, the remainder for testing.
# train_size must be computed before anything below indexes with it.
train_size = int(len(data) * 0.8)
if not 0 < train_size < len(data):
    raise ValueError(
        "Not enough rows to build non-empty training and testing sets "
        f"(rows: {len(data)}, training rows: {train_size})."
    )

# Standardize the features.
# The scaler is fitted on the training rows only, so the mean/variance it
# stores (and that gets exported later) contain no information from the test
# period. The same fitted transform is then applied to all rows.
scaler = StandardScaler()
scaler.fit(x.iloc[:train_size])
x_scaled = scaler.transform(x)

# Positional slices keep features and target aligned row-for-row.
x_train = x_scaled[:train_size]
x_test = x_scaled[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

# Output the start and end dates for the training and testing sets
print(f"Training set start date: {data['date'].iloc[0]}")
print(f"Training set end date: {data['date'].iloc[train_size - 1]}")
print(f"Testing set start date: {data['date'].iloc[train_size]}")
print(f"Testing set end date: {data['date'].iloc[-1]}")

# Train the Random Forest Regressor model
# (100 trees; random_state is fixed so repeated runs build the same forest).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Measure the fit duration with perf_counter(): it is monotonic and
# high-resolution, so the elapsed time cannot be distorted by system clock
# adjustments the way a time.time() difference can.
start_time = time.perf_counter()
rf_model.fit(x_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time

# Evaluate the model: predict on the held-out rows and score the predictions
# against the true closing prices.
y_pred = rf_model.predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Output evaluation results, one line per figure
report_lines = (
    f"Training time: {training_time:.2f} seconds",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2%}",
)
print(*report_lines, sep="\n")

# Export the fitted model and the fitted scaler.
# Both are needed together at inference time: new inputs must go through the
# same scaler before being passed to the model.
model_export_path = 'rf_model.pkl'
scaler_export_path = 'rf_scaler.pkl'

for label, artifact, export_path in (
    ("Model", rf_model, model_export_path),
    ("Scaler", scaler, scaler_export_path),
):
    joblib.dump(artifact, export_path)
    print(f"{label} exported to: {export_path}")

# Visualize the prediction results
# Look up the test rows by index *label* (.loc): y_test.index holds labels,
# which stop matching row positions once rows have been dropped, so a
# positional .iloc lookup could select the wrong rows or go out of bounds.
test_results = data.loc[y_test.index]

# Plot actual vs. predicted closing prices against the test-period dates.
plt.figure(figsize=(12, 6))
plt.plot(test_results['date'], y_test.values, label='Actual', color='blue')
plt.plot(test_results['date'], y_pred, label='Predicted', color='orange', linestyle='--')
plt.title('Bitcoin Price Prediction')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.grid(True)
plt.show()


This notebook demonstrates how to train a Random Forest model to predict Bitcoin's closing price from historical price, volume, and sentiment-score data.