In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# 1. Load Data
# NOTE(review): every positional step below (pct_change, the 80% cut-off used
# to fit the scaler) assumes the CSV rows are already in chronological order
# -- confirm against the raw file.
df = pd.read_csv('../data/raw/BTC-USD_historical.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

# 2. Calculate Returns (The "Stationary" Step)
# Converts price/volume levels into row-over-row fractional changes
# (e.g. a close of 90,000 becomes +0.02).
df['Returns'] = df['Close'].pct_change()
df['Volume_Change'] = df['Volume'].pct_change()

# pct_change produces +/-inf when the previous value is 0 (e.g. zero volume).
# Turn those into NaN and drop them together with the first row, which has no
# previous value to compare against. Rebinding `df` (instead of inplace=True)
# keeps the step a plain expression.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# 3. Select Features
# Feed the model: Past Returns, Volume Changes
features = ['Returns', 'Volume_Change']
dataset = df[features].values

# 4. Scale Data (-1 to 1)
# Fit the scaler on the training portion ONLY (first 80% of rows, the same
# chronological 80/20 cut this cell uses for the test prices). Fitting on the
# whole series would leak the min/max of the test period into the training
# data. The fitted scaler is then applied to the full series, so values in the
# test period may fall slightly outside [-1, 1]; that is expected.
# Note: MinMaxScaler maps the training min/max to -1/+1, so a raw return of 0
# does not necessarily map to a scaled value of 0.
scaler_fit_rows = int(len(dataset) * 0.8)
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(dataset[:scaler_fit_rows])
scaled_data = scaler.transform(dataset)

# 5. Create Sequences
def create_sequences(data, seq_length=14, target_col=0):
    """Slice a time-ordered array into sliding windows and next-step targets.

    For every position ``i`` in ``range(seq_length, len(data))`` one sample is
    produced: the window ``data[i - seq_length:i]`` as input and
    ``data[i, target_col]`` (the row immediately after the window) as target.

    Parameters
    ----------
    data : array-like
        Array indexed as ``data[row, column]``; rows are consecutive steps.
    seq_length : int, default 14
        Number of consecutive rows in each input window. Must be >= 0.
    target_col : int, default 0
        Column of ``data`` used as the prediction target.

    Returns
    -------
    X : numpy.ndarray
        Windows, shape ``(len(data) - seq_length, seq_length, *data.shape[1:])``.
    y : numpy.ndarray
        Targets, one per window.

    When ``data`` has no more than ``seq_length`` rows, correctly shaped empty
    arrays are returned (zero samples) instead of shapeless ``(0,)`` arrays, so
    downstream code relying on ``X.shape[1:]`` keeps working.

    Raises
    ------
    ValueError
        If ``seq_length`` is negative.
    """
    data = np.asarray(data)
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    n_samples = len(data) - seq_length
    if n_samples <= 0:
        # Not enough rows for a single window: keep the trailing dimensions so
        # the result is still a valid (empty) batch.
        X = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        y = np.empty((0,) + data.shape[2:], dtype=data.dtype)
        return X, y

    # Window k covers rows [k, k + seq_length); its target is row k + seq_length.
    X = np.stack([data[start:start + seq_length] for start in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    y = data[seq_length:, target_col].copy()
    return X, y

# Window length, in rows, of every model input sequence.
SEQ_LENGTH = 14
X, y = create_sequences(scaled_data, seq_length=SEQ_LENGTH)

print(f"New Data Shape: {X.shape}")
print("Data is now stationary (centered around 0).")

# 6. Save Data
# Persist the model inputs/targets and the fitted scaler so a later step can
# load them and invert the scaling on its predictions.
np.save('../data/processed/X.npy', X)
np.save('../data/processed/y.npy', y)
joblib.dump(scaler, '../data/processed/scaler.pkl')

# CRITICAL: keep the unscaled closing prices of the test period; they are
# needed later to turn predicted % changes back into dollar prices.
# The test period is taken as the last 20% of the cleaned rows, and its first
# SEQ_LENGTH rows are skipped because they are consumed as the first input
# window, so predictions can only start SEQ_LENGTH rows into the test period.
# NOTE(review): this lines up with the test targets only if the consumer cuts
# at int(len(df) * 0.8) in row space. If it instead splits X/y at
# int(len(X) * 0.8), the lengths will differ -- confirm against the training
# notebook.
split_index = int(0.8 * len(df))
test_prices = df['Close'].values[split_index + SEQ_LENGTH:]

np.save('../data/processed/test_prices.npy', test_prices)
print("Saved raw test prices for reconstruction later.")

New Data Shape: (1812, 14, 2)
Data is now stationary (centered around 0).
Saved raw test prices for reconstruction later.
