In [1]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [21]:
#Load the Data
weekly_data = pd.read_csv('c:/Users/jason.maughan/bitcoin2.csv')
weekly_data['Date'] = pd.to_datetime(weekly_data['Date'])


print(weekly_data)

# Convert to numpy array
prices = weekly_data.values.reshape(-1, 1)

          Date       Close
0   2012-05-28      5.1390
1   2012-06-04      5.2051
2   2012-06-11      5.4683
3   2012-06-18      6.1638
4   2012-06-25      6.3500
..         ...         ...
626 2024-05-27  68552.6000
627 2024-06-03  67781.4700
628 2024-06-10  69657.3400
629 2024-06-17  66661.0500
630 2024-06-24  63290.4300

[631 rows x 2 columns]


In [26]:
# Create lag features
def create_lag_features(df, lags):
    for lag in lags:
        df[f'lag_{lag}'] = df['y'].shift(lag)
    return df

In [27]:
# Rename columns to fit the standard
weekly_data.rename(columns={'Date': 'ds', 'Close': 'y'})

Unnamed: 0,ds,y
0,2012-05-28,5.1390
1,2012-06-04,5.2051
2,2012-06-11,5.4683
3,2012-06-18,6.1638
4,2012-06-25,6.3500
...,...,...
626,2024-05-27,68552.6000
627,2024-06-03,67781.4700
628,2024-06-10,69657.3400
629,2024-06-17,66661.0500


In [28]:
# Create lag features
lags = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]  # 12 lag features (weeks)
weekly_data = create_lag_features(weekly_data, lags)

# Drop rows with NaN values created by the lagging
#weekly_data.dropna()
print(weekly_data)

KeyError: 'y'

In [7]:
# Split data into features and target variable
X = weekly_data.drop(['ds', 'y'], axis=1)
y = weekly_data['y']

KeyError: "['ds'] not found in axis"

In [75]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [76]:
# Cell 3: Train the XGBoost model
# Convert the data into DMatrix format
train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
test_dmatrix = xgb.DMatrix(data=X_test, label=y_test)

In [77]:
# Define the parameters for the XGBoost model
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}


In [78]:
# Train the model
num_boost_round = 100
model = xgb.train(params, train_dmatrix, num_boost_round)

In [79]:
# Predict on the test set
preds = model.predict(test_dmatrix)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f'Test RMSE: {rmse}')

Test RMSE: 8033.022360854204


In [80]:
# Cell 4: Predict future prices
# Function to predict future prices
def predict_future_prices(model, X_last, num_weeks):
    future_prices = []
    current_X = X_last.copy()

    for _ in range(num_weeks):
        dmatrix = xgb.DMatrix(data=current_X.values.reshape(1, -1))
        next_price = model.predict(dmatrix)[0]
        future_prices.append(next_price)

        # Update the features for the next prediction
        new_row = current_X.values.flatten().tolist()[1:] + [next_price]
        current_X = pd.DataFrame([new_row], columns=current_X.columns)

    return future_prices

In [81]:
# Use the last available row in the training data to predict future prices
X_last = X.iloc[-1]
future_weeks = 10
future_predictions = predict_future_prices(model, X_last, future_weeks)

# Print future predictions
print(f'Future {future_weeks} weeks predictions: {future_predictions}')

ValueError: training data did not have the following fields: lag_1, lag_2, lag_3, lag_4, lag_5, lag_6, lag_7, lag_8, lag_9, lag_10, lag_11, lag_12