In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller


In [None]:
# --- Load raw data -----------------------------------------------------------
# Every input CSV lives in one folder; edit DATA_DIR to run the notebook elsewhere.
DATA_DIR = Path(r"C:\Users\walte\Downloads")

# Analysis window, inclusive on both ends.
start_date = "2015-01-01"
end_date = "2025-01-01"


def load_macro_series(filename):
    """Read a CSV keyed by `observation_date` and expose it as a datetime `date`.

    Parameters
    ----------
    filename : str
        File name inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The file's columns with `observation_date` replaced by a datetime
        `date` column placed last.
    """
    frame = pd.read_csv(DATA_DIR / filename)
    # Assign-then-drop leaves the value column(s) first and `date` last.
    frame['date'] = pd.to_datetime(frame['observation_date'])
    return frame.drop('observation_date', axis=1)


def clip_to_window(frame):
    """Return the rows of `frame` whose `date` lies in [start_date, end_date]."""
    inside = (frame["date"] >= start_date) & (frame["date"] <= end_date)
    return frame[inside]


btc = pd.read_csv(DATA_DIR / "BTC.csv")
btc["date"] = pd.to_datetime(btc["date"])

rates = load_macro_series("DFF.csv")
unemploy = load_macro_series("UNRATE.csv")
bond = load_macro_series("DAAA.csv")
reserve = load_macro_series("WRESBAL.csv")

# Apply the same window to every frame so none of them silently carries
# out-of-range rows into later cells.
btc = clip_to_window(btc)
rates = clip_to_window(rates)
bond = clip_to_window(bond)
unemploy = clip_to_window(unemploy)
reserve = clip_to_window(reserve)

In [None]:
# Join BTC prices with the rate and bond series on matching dates.
# Inner joins keep only the dates that appear in every joined frame.
# `unemploy` and `reserve` are loaded but currently left out of the join.
df = (
    btc
    .merge(rates, on='date', how='inner')
    .merge(bond, on='date', how='inner')
)

print(df.head())

In [None]:
# --- Stationarity check: Augmented Dickey-Fuller on BTC log returns ----------
# The returns are derived here from `close` instead of reading
# df['log_return'], because that column is only created by a later cell; this
# cell therefore also works on a fresh kernel run top to bottom.
close_by_date = df.sort_values('date')['close']
adf_input = np.log(close_by_date / close_by_date.shift(1))

# The first return is NaN (no previous close); the test needs a series of
# finite values, so drop NaN and any +/-inf produced by a zero close.
adf_input = adf_input.replace([np.inf, -np.inf], np.nan).dropna()

result = adfuller(adf_input)
print("Log Return Dickey Fuller Results:")

# result[0] = test statistic, result[1] = p-value, result[4] = critical values.
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
print(f'Critical Values: {result[4]}')

# Null hypothesis of the ADF test: the series has a unit root (non-stationary).
if result[1] < 0.05:
    print("Reject the null hypothesis: The series is stationary.")
else:
    print("Fail to reject the null hypothesis: The series is non-stationary.")

In [None]:
# --- Baseline: random forest on one-row-lagged features ----------------------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Target: log return of the close relative to the previous row's close.
df['log_return'] = np.log(df['close'] / df['close'].shift(1))

# Predictors are the previous row's values only, so no same-row information
# about the target is available to the model.
lag_features = ['close', 'open', 'high', 'low', 'DFF', 'DAAA']
for col in lag_features:
    df[f'{col}_lag1'] = df[col].shift(1)

# Shifting leaves NaN in the first row; drop every row with a missing value.
df = df.dropna().reset_index(drop=True)

feature_cols = [f'{col}_lag1' for col in lag_features]
X = df[feature_cols]
y = df['log_return']

# Chronological 80/20 split: the test set is the most recent 20% of rows.
split_index = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# Take the square root explicitly: the `squared=False` shortcut was deprecated
# and later removed from scikit-learn, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Evaluation Metrics:")
print(f"MAE:  {mae:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"R squared:   {r2:.6f}")

# Actual vs predicted returns over the held-out period.
test_dates = df['date'].iloc[split_index:]
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(test_dates, y_test, label='Actual')
ax.plot(test_dates, y_pred, label='Predicted')
ax.legend()
ax.set_title('Actual vs Predicted Log Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Log Return')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# --- Tuned random forest: multi-row lags plus rolling statistics -------------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Target: log return of the close relative to the previous row's close.
df['log_return'] = np.log(df['close'] / df['close'].shift(1))

# Lags 1..5 of every raw input column.
lag_features = ['close', 'open', 'high', 'low', 'DFF', 'DAAA']
lag_cols = []
for col in lag_features:
    for lag in range(1, 6):
        lag_name = f'{col}_lag{lag}'
        df[lag_name] = df[col].shift(lag)
        lag_cols.append(lag_name)

# Rolling statistics are computed on values shifted by one row. Without the
# shift, each 5-row window would include the current row's log_return (the
# target itself) or the current close, leaking the answer into the features.
prev_return = df['log_return'].shift(1)
prev_close = df['close'].shift(1)
df['volatility_5d'] = prev_return.rolling(5).std()
df['log_return_ma5'] = prev_return.rolling(5).mean()
df['price_ma5'] = prev_close.rolling(5).mean()
rolling_cols = ['volatility_5d', 'log_return_ma5', 'price_ma5']

# Lagging and rolling leave NaN in the leading rows; drop every incomplete row.
df = df.dropna().reset_index(drop=True)

# List the features explicitly instead of matching substrings such as 'ma',
# which could also pick up unrelated same-row columns from the input data.
feature_cols = lag_cols + rolling_cols
X = df[feature_cols]
y = df['log_return']

# Chronological 80/20 split: the test set is the most recent 20% of rows.
split_index = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Time-ordered cross-validation folds for the hyper-parameter search.
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# Take the square root explicitly: the `squared=False` shortcut was deprecated
# and later removed from scikit-learn, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nEvaluation Metrics:")
print(f"MAE:  {mae:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"R squared:   {r2:.6f}")

# Actual vs predicted returns over the held-out period.
test_dates = df['date'].iloc[split_index:]
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(test_dates, y_test, label='Actual', alpha=0.8)
ax.plot(test_dates, y_pred, label='Predicted', alpha=0.8)
ax.legend()
ax.set_title('Actual vs Predicted Log Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Log Return')
ax.grid(True)
fig.tight_layout()
plt.show()