In [1]:
# ISE 537 - Homework 4, Problem 2: Bitcoin Price Prediction using LSTM

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
warnings.filterwarnings('ignore')

plt.style.use("fivethirtyeight")
sns.set_style('whitegrid')
%matplotlib inline
plt.rcParams.update({'font.size': 15})

ModuleNotFoundError: No module named 'keras'

# Step 1: Load and Clean Bitcoin Data

**Note:** The data contains duplicate observations at each timestamp, so we collapse them to a single price per timestamp before modeling.

In [None]:
# Read the raw price observations (the path is relative to the working directory).
df = pd.read_csv('Bitcoin_Price__1_.csv')

# Report the raw size, then show the first ten rows as the cell output.
print("Original data shape: {}".format(df.shape))
print("\nFirst 10 rows:")
df.head(10)

In [None]:
# Compare the row count with the number of distinct timestamps; a ratio above 1
# means at least some timestamps occur more than once.
unique_timestamps = df['datetime'].nunique()
total_rows = len(df)

print(f"Number of unique timestamps: {unique_timestamps}")
print(f"Total rows: {total_rows}")
print(f"\nDuplicates per timestamp: {total_rows / unique_timestamps:.1f}")

In [None]:
# Collapse repeated observations: one row per timestamp holding the mean price.
# NOTE(review): groupby sorts rows by the 'datetime' key. If that column holds
# strings the order is lexical -- confirm it matches chronological order.
mean_price_by_timestamp = df.groupby('datetime')['price'].mean()
df_clean = mean_price_by_timestamp.reset_index()

print("Cleaned data shape: {}".format(df_clean.shape))
print("\nCleaned data:")
df_clean.head()

# Step 2: Visualize Bitcoin Price

In [None]:
# Full cleaned price series, plotted against its row position.
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title('Bitcoin Price History')
ax.plot(df_clean['price'])
ax.set_xlabel('Time', fontsize=18)
ax.set_ylabel('Bitcoin Price USD ($)', fontsize=18)
plt.show()

# Step 3: Prepare Data for LSTM

**Homework Requirements:**
- Normalize to [0, 1] using MinMaxScaler ✓
- 80% training, 20% testing (NOT 95%/5%) ✓
- Lookback window = 10 (NOT 60) ✓

In [None]:
# Keep only the price column: once as a one-column DataFrame, once as a 2-D array.
data = df_clean[['price']]
dataset = data.to_numpy()

# The first 80% of the rows (rounded up) form the training period.
train_fraction = 0.80
n_total = len(dataset)
training_data_len = int(np.ceil(n_total * train_fraction))
n_test = n_total - training_data_len

print(f"Total data points: {n_total}")
print(f"Training data length: {training_data_len} ({training_data_len/n_total*100:.1f}%)")
print(f"Testing data length: {n_test} ({(1-training_data_len/n_total)*100:.1f}%)")

In [None]:
# Scale prices with a MinMaxScaler fitted on the training period only.
# Fitting on the whole series would leak the test period's minimum/maximum
# into training and make the test RMSE look better than it really is.
# Training prices map to [0, 1]; later prices fall outside that range whenever
# they go beyond the extremes seen during training.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(dataset[:training_data_len])
scaled_data = scaler.transform(dataset)

print(f"Data normalized. Min: {scaled_data.min():.4f}, Max: {scaled_data.max():.4f}")

# Preview a few rows instead of echoing the entire array.
scaled_data[:5]

In [None]:
# Training windows: every sample is `lookback` consecutive scaled prices and
# its target is the scaled price that immediately follows the window.
lookback = 10
train_data = scaled_data[:int(training_data_len), :]

window_ends = range(lookback, len(train_data))
x_train = np.array([train_data[end - lookback:end, 0] for end in window_ends])
y_train = np.array([train_data[end, 0] for end in window_ends])

# The LSTM layer takes input shaped [samples, time steps, features].
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")

# Step 4: Build LSTM Model

**Homework Specifications:**
- LSTM layer: 4 units (NOT 128 and 64)
- Dense layer: 1 unit
- Optimizer: Adam
- Loss: Mean Squared Error
- Batch size: 256
- Epochs: 100

In [None]:
# Seed the random number generators before creating the model so that weight
# initialisation (and the training that follows) repeats from run to run.
# Without a seed the reported RMSE values change on every execution.
set_random_seed(42)

# Architecture from the homework: one 4-unit LSTM reading (lookback, 1)
# windows, followed by a single-unit Dense output.
model = Sequential()
model.add(LSTM(4, input_shape=(x_train.shape[1], 1)))
model.add(Dense(1))

# Adam optimizer with mean squared error loss, as specified.
model.compile(optimizer='adam', loss='mean_squared_error')

# Display model summary
model.summary()

In [None]:
# Fit with the homework's batch size and epoch count; 10% of the training
# samples are held out for validation (validation_split), and verbose=2
# prints one line per epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=100,
    validation_split=0.1,
    verbose=2,
)

print("\nTraining completed!")

# Step 5: Make Predictions

In [None]:
# Start `lookback` rows before the split so the first test window is complete.
test_data = scaled_data[training_data_len - lookback:, :]

# Test targets are taken from the unscaled array, i.e. in original price units.
y_test = dataset[training_data_len:, :]

# One window of `lookback` scaled prices per test target.
x_test = np.array(
    [test_data[end - lookback:end, 0] for end in range(lookback, len(test_data))]
)

# Same [samples, time steps, features] layout as the training input.
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Predict on both splits and map the scaled outputs back to price units.
train_predictions = scaler.inverse_transform(model.predict(x_train))
test_predictions = scaler.inverse_transform(model.predict(x_test))

print("Predictions completed!")

# Question (a): Calculate RMSE and Analyze Performance

In [None]:
# The training targets are stored scaled; convert them back to prices so that
# both RMSE values are expressed in dollars.
y_train_prices = scaler.inverse_transform(y_train.reshape(-1, 1))

# RMSE = sqrt(MSE) between true and predicted prices. The predictions were
# already inverse-transformed when they were generated, so they are reused
# here instead of running the model again.
train_rmse = np.sqrt(mean_squared_error(y_train_prices, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print("=" * 70)
print("QUESTION (a): RMSE Analysis")
print("=" * 70)
print(f"\nTraining RMSE: ${train_rmse:.2f}")
print(f"Testing RMSE: ${test_rmse:.2f}")
print(f"\nTest/Train RMSE Ratio: {test_rmse/train_rmse:.3f}")

# Express each RMSE relative to the mean price of its own period.
train_mean_price = y_train_prices.mean()
test_mean_price = y_test.mean()
print(f"\nTraining RMSE as % of mean price: {(train_rmse/train_mean_price)*100:.2f}%")
print(f"Testing RMSE as % of mean price: {(test_rmse/test_mean_price)*100:.2f}%")

## Answer to Question (a):

**Training RMSE:** [See output above]

**Testing RMSE:** [See output above]

**Is the performance satisfactory?**

[Write your analysis based on the RMSE values - if they're < 5% of mean price, performance is excellent]

**Is there evidence of overfitting?**

[Analyze the Test/Train RMSE ratio:
- If ratio < 1.2: No significant overfitting
- If ratio 1.2-1.5: Mild overfitting  
- If ratio > 1.5: Significant overfitting]

# Question (b): Plot True vs Predicted Prices

Plot the last third of the training period followed by the entire testing period.

In [None]:
# True prices of the training period.
train_data_plot = data[:training_data_len]

# Full-length arrays filled with NaN so each prediction series lines up with
# the original row positions; matplotlib leaves the NaN stretches blank.
# The first `lookback` rows have no prediction because no complete window
# precedes them. The stored train_predictions are reused so the plotted values
# are exactly the ones scored by the RMSE, without another model pass.
train_predictions_plot = np.full(dataset.shape, np.nan)
train_predictions_plot[lookback:training_data_len, :] = train_predictions

test_predictions_plot = np.full(dataset.shape, np.nan)
test_predictions_plot[training_data_len:len(dataset), :] = test_predictions

# Start of the last third of the training samples that have a prediction.
train_plot_start = training_data_len - (training_data_len - lookback) // 3

print(f"Plotting from index {train_plot_start} to {len(dataset)}")
print(f"Last 1/3 of training: {training_data_len - train_plot_start} points")
print(f"Testing period: {len(dataset) - training_data_len} points")

In [None]:
# True vs. predicted prices over the last third of training plus all of testing.
fig, ax = plt.subplots(figsize=(16, 8))

# Row ranges shared by the true and predicted series.
train_span = slice(train_plot_start, training_data_len)
test_span = slice(training_data_len, None)

# Line styling: solid for true prices, dashed for predictions.
true_style = dict(linewidth=2, alpha=0.7)
pred_style = dict(linewidth=2, linestyle='--', alpha=0.7)

# True prices
ax.plot(data.index[train_span], data[train_span],
        label='True Price (Training)', color='blue', **true_style)
ax.plot(data.index[test_span], data[test_span],
        label='True Price (Testing)', color='green', **true_style)

# Predicted prices
ax.plot(data.index[train_span], train_predictions_plot[train_span],
        label='Predicted Price (Training)', color='cyan', **pred_style)
ax.plot(data.index[test_span], test_predictions_plot[test_span],
        label='Predicted Price (Testing)', color='red', **pred_style)

# Mark the first row of the testing period.
ax.axvline(x=data.index[training_data_len], color='black', linestyle=':',
           linewidth=2, label='Train/Test Split')

ax.set_xlabel('Time Step', fontsize=12, fontweight='bold')
ax.set_ylabel('Bitcoin Price ($)', fontsize=12, fontweight='bold')
ax.set_title('Bitcoin Price Prediction using LSTM\n(Last 1/3 of Training Period + Testing Period)',
             fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Answer to Question (b):

**Does the performance look good on the plot?**

[Analyze the plot:
- How closely do predicted prices track true prices?
- Does the model capture trends and volatility?
- Is there a drop in performance at the train/test boundary?
- Overall assessment of visual performance]

# Summary

**Model Configuration:**
- Lookback window: 10
- Training/Testing split: 80/20
- LSTM units: 4
- Batch size: 256
- Epochs: 100

**Key Results:**
- Training RMSE: [value]
- Testing RMSE: [value]
- Performance assessment: [excellent/satisfactory/needs improvement]
- Overfitting: [none/mild/significant]