In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, Dense
from sklearn.metrics import mean_squared_error

In [3]:
root_path = Path("/Users/adminnistrator/Documents/Stanford/STATS 207/STATS-207-Final-Project")

In [4]:
electricity_data = pd.read_csv(root_path / "clean_data.csv")

# Aggregate the data: compute the monthly average electricity price at the national level
electricity_data['date'] = pd.to_datetime(
    electricity_data[['year', 'month']].assign(day=1)
)  # Create a date column for time series analysis

# Calculate the monthly average price
national_avg_price = electricity_data.groupby('date')['price'].mean().reset_index()

national_avg_price.to_csv(root_path / "national_avg_price.csv", index = False)

In [10]:
national_avg_price['date'] = pd.to_datetime(national_avg_price['date'])
national_avg_price.set_index('date', inplace=True)
national_avg_price.head()

Unnamed: 0_level_0,price
date,Unnamed: 1_level_1
2001-01-01,7.136839
2001-02-01,7.271097
2001-03-01,7.443806
2001-04-01,7.544065
2001-05-01,7.675194


In [28]:
def create_lagged_features(data, lag=1):
    """
    Create lagged features for supervised learning.
    :param data: Input time series (numpy array or pandas Series).
    :param lag: Number of lagged observations to include.
    :return: Feature matrix X and target vector y.
    """
    X, y = [], []
    for i in range(len(data) - lag):
        X.append(data[i:i + lag])
        y.append(data[i + lag])
    return np.array(X), np.array(y)

# Function to create and compile LSTM model
def create_lstm_model(input_shape):
    # Use the Input layer to specify the input shape
    inputs = Input(shape=input_shape)
    x = LSTM(50, activation='relu')(inputs)  # Add LSTM layer after Input
    outputs = Dense(1)(x)  # Output layer
    model = Model(inputs=inputs, outputs=outputs)  # Create the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
# 2. Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
data_normalized = scaler.fit_transform(national_avg_price)

# Create lagged features with a window size of 12
X, y = create_lagged_features(data_normalized, lag)

# Reshape X for LSTM (samples, timesteps, features)
X = X.reshape((X.shape[0], X.shape[1], 1))

In [30]:
tscv = TimeSeriesSplit(n_splits=5)

# 4. Train and evaluate the model using cross-validation
results = []
rmse_values = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and train the LSTM model
    model = create_lstm_model((X_train.shape[1], X_train.shape[2]))
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

    # Evaluate the model
    loss = model.evaluate(X_test, y_test, verbose=0)
    results.append(loss)

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_values.append(rmse)
    results.append(model.evaluate(X_test, y_test, verbose=0))

# Print cross-validation results
print(f"Cross-validation losses: {results}")
print(f"Mean loss: {np.mean(results)}")
print(f"Cross-validation RMSE values: {rmse_values}")
print(f"Average RMSE: {np.mean(rmse_values)}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Cross-validation losses: [0.014076460152864456, 0.014076460152864456, 0.0019427213119342923, 0.0019427213119342923, 0.001077650929801166, 0.001077650929801166, 0.0009068173822015524, 0.0009068173822015524, 0.004165248014032841, 0.004165248014032841]
Mean loss: 0.004433779558166861
Cross-validation RMSE values: [0.11864426380840773, 0.044076314062877056, 0.03282759495483703, 0.030113409387336726, 0.06453873279677005]
Average RMSE: 0.058040063002045714
