In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.dates as mdates



def get_statistics(train_df, val_df, test_df, target_column='Close'):
    """
    Print summary statistics of one column for the train, validation, and test splits.

    For each split a header line is printed, followed by the mean, variance,
    standard deviation, minimum, maximum, 25th percentile, median and
    75th percentile of ``target_column``. Nothing is returned.

    Parameters:
    - train_df: DataFrame containing the training data.
    - val_df: DataFrame containing the validation data.
    - test_df: DataFrame containing the test data.
    - target_column: Name of the column to summarise (default is 'Close').
    """
    # (label, statistic) pairs in the exact order they are reported.
    summary = (
        ("Mean", lambda series: series.mean()),
        ("Variance", lambda series: series.var()),
        ("Standard Deviation", lambda series: series.std()),
        ("Minimum", lambda series: series.min()),
        ("Maximum", lambda series: series.max()),
        ("25th Percentile", lambda series: series.quantile(0.25)),
        ("Median", lambda series: series.median()),
        ("75th Percentile", lambda series: series.quantile(0.75)),
    )
    splits = (("Training", train_df), ("Validation", val_df), ("Test", test_df))

    for split_name, frame in splits:
        print(f"\nStatistics for {split_name} DataFrame:")
        for label, statistic in summary:
            print(f"{label}: {statistic(frame[target_column])}")


def plot_close(train_df, val_df, test_df, target_column='Close', xlim_start=None, xlim_end=None):
    """
    Plot one column of the train, validation, and test DataFrames on a shared time axis.

    Each split is drawn in its own colour (blue / orange / green) and the
    figure is displayed with ``plt.show()``.

    Side effect (kept for backward compatibility): the three DataFrames passed
    in are modified in place -- their index is converted with
    ``pd.to_datetime`` and ``target_column`` is converted with
    ``pd.to_numeric(errors='coerce')``, so unparseable values become NaN.

    Parameters:
    - train_df: DataFrame containing the training data.
    - val_df: DataFrame containing the validation data.
    - test_df: DataFrame containing the test data.
    - target_column: The column name of the target variable to plot (default is 'Close').
    - xlim_start: Optional left x-axis limit (anything ``pd.to_datetime`` accepts).
      Defaults to the earliest training timestamp.
    - xlim_end: Optional right x-axis limit (anything ``pd.to_datetime`` accepts).
      Defaults to the latest test timestamp.
    """
    # Make sure every split has a datetime index and a numeric target column.
    for frame in (train_df, val_df, test_df):
        frame.index = pd.to_datetime(frame.index)
        frame[target_column] = pd.to_numeric(frame[target_column], errors='coerce')

    plt.figure(figsize=(14, 6))

    # One line per split, each in its own colour.
    plt.plot(train_df.index, train_df[target_column], label='Train', color='blue')
    plt.plot(val_df.index, val_df[target_column], label='Validation', color='orange')
    plt.plot(test_df.index, test_df[target_column], label='Test', color='green')

    # Add labels and title
    plt.title(f'{target_column} over Time for Train, Validation, and Test Sets')
    plt.xlabel('Date')
    plt.ylabel(target_column)

    # Major ticks every 6 months, labelled as Year-Month.
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Each bound falls back to the data range independently. The previous
    # version referenced an undefined `xlim_end` and raised NameError whenever
    # `xlim_start` was supplied.
    left = train_df.index.min() if xlim_start is None else pd.to_datetime(xlim_start)
    right = test_df.index.max() if xlim_end is None else pd.to_datetime(xlim_end)
    plt.xlim(left, right)

    # Add a legend to differentiate the sets
    plt.legend()

    plt.tight_layout()  # Adjust layout to prevent clipping of tick-labels
    plt.show()
    
    
def split_time_series(df, train_size=0.86, val_size=0.09):
    """
    Split ordered data chronologically into train, validation, and test parts.

    The first ``train_size`` fraction of the rows becomes the training set, the
    next ``val_size`` fraction the validation set, and whatever remains the
    test set. Cut positions are truncated to integers; rows are not shuffled.

    Parameters:
    - df: The data to split (sliced positionally with ``df[a:b]``).
    - train_size: Fraction of rows assigned to the training set.
    - val_size: Fraction of rows assigned to the validation set.

    Returns:
    - (train_df, val_df, test_df): The three consecutive slices of ``df``.
    """
    n_rows = len(df)
    first_cut = int(n_rows * train_size)
    second_cut = int(n_rows * (train_size + val_size))
    return df[:first_cut], df[first_cut:second_cut], df[second_cut:]


def prepare_sequences(df: pd.DataFrame, target_column="Close", sequence_length=30):
    """
    Build sliding-window samples for sequence models such as an LSTM.

    Every column except ``target_column`` is treated as a feature. Sample ``i``
    consists of the feature rows ``i .. i + sequence_length - 1`` and is paired
    with the target value of row ``i + sequence_length``, i.e. the row directly
    after the window.

    Parameters:
    - df: DataFrame holding the feature columns and the target column.
    - target_column: Name of the column to predict (default is 'Close').
    - sequence_length: Number of consecutive rows in each window.

    Returns:
    - X: Array of windows, shape (n_samples, sequence_length, n_features).
    - y: Array of targets, shape (n_samples,).
      Both arrays are empty when ``df`` has no more than ``sequence_length`` rows.
    """
    is_feature = df.columns != target_column
    feature_matrix = df[df.columns[is_feature]].values
    target_values = df[target_column].values

    window_starts = range(len(df) - sequence_length)
    windows = [feature_matrix[start:start + sequence_length] for start in window_starts]
    labels = [target_values[start + sequence_length] for start in window_starts]

    return np.array(windows), np.array(labels)

def prepare_data_for_predictions(train_df, validation_df, test_df, n_lags=50, target_column='Close'):
    """
    Build one window of lagged features for every row of the test set.

    The window for test row ``i`` holds the ``n_lags`` feature rows that
    immediately precede it: the tail of the validation data for the first test
    rows, then progressively more of the test data itself. This is the same
    "window before the target row" layout that ``prepare_sequences`` produces.

    The previous implementation appended the already-seen test rows to the
    history twice (once through ``historical_data`` and once through
    ``test_df.iloc[:i]``), so from the second test row on the windows contained
    duplicated rows. The windows are now sliced from a single concatenation.

    Parameters:
    - train_df: DataFrame containing the training data. Not used; kept so
      existing callers keep working.
    - validation_df: DataFrame containing the validation data.
    - test_df: DataFrame containing the test data (to predict on).
    - n_lags: Number of lagged time steps to use as features (must be >= 1).
    - target_column: The column name of the target variable; it is dropped
      from the features.

    Returns:
    - X_test: Numpy array of shape (len(test_df), n_lags, n_features)
      containing the lagged features for prediction.

    Raises:
    - ValueError: If ``n_lags`` is smaller than 1, if the validation data has
      fewer than ``n_lags`` complete rows, or if the result contains NaN.
    """
    if n_lags < 1:
        raise ValueError(f"n_lags must be at least 1, got {n_lags}.")

    # Rows with missing values are discarded before the target column is removed.
    validation_features = validation_df.dropna().drop(columns=[target_column])
    test_features = test_df.dropna().drop(columns=[target_column])

    if len(validation_features) < n_lags:
        raise ValueError(f"Not enough validation data for lagging: {len(validation_features)} found, but {n_lags} required.")

    # History = last n_lags validation rows followed by every test row.
    # Position n_lags + i in this array is test row i, so the slice
    # [i, i + n_lags) is exactly the n_lags rows that precede it.
    history = pd.concat([validation_features.iloc[-n_lags:], test_features]).values
    X_test = np.array([history[i:i + n_lags] for i in range(len(test_features))])

    # Verify that X_test contains no NaN values
    if np.any(np.isnan(X_test)):
        raise ValueError("X_test contains NaN values.")

    return X_test


def predict_future_prices(model, last_sequence, n_days=7):
    """
    Forecast several steps ahead by feeding each prediction back into the input window.

    Starting from ``last_sequence``, the model predicts one value, the window is
    shifted one step along the time axis, and the prediction is written into
    the last feature of the last time step. This repeats ``n_days`` times.
    ``last_sequence`` itself is not modified.

    NOTE(review): ``np.roll`` wraps the oldest time step around to the end, so
    apart from the last feature the newest row still carries the oldest row's
    values. Presumably the last feature column is meant to hold the price --
    confirm against how the input windows are built.

    Parameters:
    - model: Object with a ``predict`` method that accepts an array of shape
      (1, time_steps, n_features) and returns an array indexable as ``[0, 0]``.
    - last_sequence: 2-D array (time_steps, n_features) holding the most recent window.
    - n_days: Number of future steps to predict.

    Returns:
    - future_prices: List with the ``n_days`` predicted values, in order.
    """
    future_prices = []

    # Work on a copy so the caller's window is left untouched.
    current_sequence = last_sequence.copy()

    for _ in range(n_days):
        # Take the scalar out of the (1, 1) prediction. Assigning the 2-D array
        # itself into a single element relies on an array-to-scalar conversion
        # that NumPy has deprecated.
        next_price = model.predict(current_sequence[np.newaxis, :, :])[0, 0]
        future_prices.append(next_price)

        # Shift the window one step to the left and store the new prediction
        # in the last feature of the (now) last time step.
        current_sequence = np.roll(current_sequence, -1, axis=0)
        current_sequence[-1, -1] = next_price

    return future_prices


def build_and_compile_model(input_shape, hidden_dims, dense_units, dropout_rate, lr):
    """
    Build and compile a stacked LSTM regression model.

    The network consists of one LSTM layer per entry in ``hidden_dims`` (each
    followed by Dropout), one ReLU Dense layer per entry in ``dense_units``,
    and a single-unit linear output layer. It is compiled with mean squared
    error loss and the Adam optimizer.

    The previous version compiled with ``optimizer='adam'`` and therefore
    ignored ``lr``; the learning rate is now passed to the optimizer.

    Parameters:
    - input_shape: Shape of one input sample, (sequence_length, n_features).
    - hidden_dims: Sequence with the number of units of each LSTM layer.
    - dense_units: Sequence with the number of units of each hidden Dense layer.
    - dropout_rate: Dropout rate applied after every LSTM layer.
    - lr: Learning rate for the Adam optimizer.

    Returns:
    - model: The compiled Keras ``Sequential`` model.
    """
    # Imported here so the block does not depend on a change to the file's import section.
    from keras.optimizers import Adam

    model = Sequential()
    last_lstm = len(hidden_dims) - 1
    for i, units in enumerate(hidden_dims):
        # Only the first layer needs to be told the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if i == 0 else {}
        # All but the last LSTM layer must emit the full sequence so the next
        # LSTM layer receives 3-D input.
        model.add(LSTM(units, return_sequences=i < last_lstm, **first_layer_kwargs))
        model.add(Dropout(dropout_rate))
    for dense_unit in dense_units:
        model.add(Dense(dense_unit, activation='relu'))
    model.add(Dense(1))  # Output layer
    model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return model

def save_results_and_plots(history, model_name, model_dir):
    """
    Save the training and validation loss curves of one training run as a PNG.

    The image is written to ``<model_dir>/<model_name>_loss_plot.png`` and the
    figure is closed afterwards. The training history itself is not written
    to disk.

    Parameters:
    - history: Object whose ``history`` attribute is a mapping with the
      per-epoch series 'loss' and 'val_loss'.
    - model_name: Name used as the prefix of the output file.
    - model_dir: Directory the image is written to.
    """
    loss_curves = history.history

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(loss_curves['loss'], label='Train Loss')
    ax.plot(loss_curves['val_loss'], label='Validation Loss')
    ax.set_title('Model Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()

    fig.savefig(os.path.join(model_dir, f'{model_name}_loss_plot.png'))
    plt.close(fig)

def save_predictions_plot(y_true, y_pred, dates, model_name, model_dir, prediction_type):
    """
    Save a plot comparing actual and predicted values over a range of dates.

    The image is written to
    ``<model_dir>/<model_name>_<prediction_type>_predictions_plot.png`` and the
    figure is closed afterwards.

    Parameters:
    - y_true: Actual values, one per entry in ``dates``.
    - y_pred: Predicted values, one per entry in ``dates``.
    - dates: Dates used for the x-axis.
    - model_name: Name used in the title and as the prefix of the output file.
    - model_dir: Directory the image is written to.
    - prediction_type: Label for the data being plotted; used in the title and the file name.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(dates, y_true, label='Actual Values', color='blue', marker='o')
    ax.plot(dates, y_pred, label='Predicted Values', color='orange', marker='x')

    # One tick per day, rotated so the full dates stay readable.
    plt.xticks(rotation=45)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))

    ax.set_title(f'{model_name} - {prediction_type} - Predictions vs Actual Values')
    ax.set_xlabel('Date')
    ax.set_ylabel('Values')
    ax.legend()
    fig.tight_layout()

    fig.savefig(os.path.join(model_dir, f'{model_name}_{prediction_type}_predictions_plot.png'))
    plt.close(fig)


In [None]:
# Main code starts here
# Loads the dataset, splits it chronologically, standardises it with statistics
# learned from the training split only, and builds the LSTM input windows.
print('starting...')
df = pd.read_csv(r"transformed_data.csv")
# Discard the last 220 rows of the file.
# NOTE(review): the reason for this cut-off is not visible here -- confirm.
df = df.iloc[: -220]
df = df.set_index('index')
df.dropna(inplace = True)
print('data loaded')
# Split the data into train, validation, and test sets
train_df,val_df,  test_df = split_time_series(df)

# Two scalers: `scaler` standardises every column (including 'Close'),
# `target_scaler` standardises 'Close' alone and is used later to convert
# predictions back to the original units.
scaler = StandardScaler()
target_scaler = StandardScaler()

# Unscaled feature/target views of each split. X_train, y_train, X_val and
# y_val are overwritten further down by the windowed, scaled arrays; y_test
# keeps the unscaled test targets.
X_train = train_df.drop('Close', axis = 1)
y_train = train_df['Close']

X_val = val_df.drop('Close', axis = 1)
y_val = val_df['Close']


X_test = test_df.drop('Close', axis = 1)
y_test = test_df['Close']

# Fit the target scaler on the training targets only. The transformed arrays
# are not referenced again in the visible code; the fit is what matters.
train_target_scaled = target_scaler.fit_transform(y_train.values.reshape(-1,1))
val_target_scaled = target_scaler.transform(y_val.values.reshape(-1,1))
test_target_scaled = target_scaler.transform(y_test.values.reshape(-1,1))


# Fit the feature scaler on the training split and apply it to all splits.
train_scaled = scaler.fit_transform(train_df)
val_scaled = scaler.transform(val_df)
test_scaled =  scaler.transform(test_df)

# Convert scaled arrays back to DataFrame with original indices and columns
train_scaled_df = pd.DataFrame(train_scaled, columns= df.columns, index=train_df.index)
val_scaled_df = pd.DataFrame(val_scaled, columns= df.columns, index=val_df.index)
test_scaled_df = pd.DataFrame(test_scaled, columns= df.columns, index=test_df.index)

# Preparing sequences for training and validation data
sequence_length = 50
num_features = 50  # Not referenced anywhere else in the visible code

# From here on X_* are 3-D window arrays and y_* are scaled 'Close' targets.
X_train, y_train = prepare_sequences(train_scaled_df, target_column="Close", sequence_length=sequence_length)
X_val, y_val = prepare_sequences(val_scaled_df, target_column="Close", sequence_length=sequence_length)

input_shape = (X_train.shape[1], X_train.shape[2])  # (sequence_length, number of features)

In [None]:
# Re-create the unscaled splits and plot the 'Close' column of each one.
# plot_close converts the index and the target column of the frames it is given.
train_df, val_df, test_df = split_time_series(df)
plot_close(train_df, val_df, test_df)

In [None]:
# Example usage:
# Print summary statistics of 'Close' for each split.
# Requires train_df, val_df, and test_df to exist already (see split_time_series).
get_statistics(train_df, val_df, test_df, target_column='Close')

In [None]:
# Exhaustive grid search over LSTM architectures and training settings.
# For every combination a model is trained, saved, evaluated on the validation
# and test splits, and its metrics are appended to results_summary.xlsx.
hyperparams = {
    'hidden_dims': [
        [128, 64],            # A common starting point with two layers
        [256, 128],           # Slightly larger architecture for increased capacity
        [128],                # Simple architecture for baseline comparison
        [512, 256],           # Larger model for capturing complex patterns
        [256, 128, 64],       # Three-layer architecture for capturing interactions
        [256, 256],           # Two large layers for deeper learning
        [128, 64, 32, 16],    # More layers with decreasing size for capturing nuances
        [128, 128],           # Two equal-sized layers for balanced capacity
        [512, 256, 128],      # A deeper stack with a significant capacity
        [128, 64, 64],        # Wider layers for more feature learning
        [256, 128, 64, 32]    # Complex model with a diverse structure
    ],
    'dense_units': [
        [64, 32, 16], # Smaller layers for basic configurations
        [128, 64],            # Balanced dense layers
        [256, 128],           # Larger dense layers for improved learning
        [64, 32],             # Smaller layers to test overfitting
        [128, 128],           # Two equal-sized layers
        [64, 64, 32],         # Three-layer structure for capturing interactions
        [256, 128, 64],       # Larger dense layers to explore complex interactions
        [128, 32, 16],        # Simpler setup with a clear decline
        [128, 64, 32, 16],    # Deeper architecture with declining size
        [128, 64, 64],        # Slightly wider layers with an added layer
        [128, 32]             # A basic two-layer configuration
    ],
    'dropout_rate': [0.1, 0.2, 0.25, 0.3, 0.5],  # Start with lower rates for complex models
    'lr': [1e-3, 1e-4, 1e-5],    # Moderate learning rates to start, with room for adjustment
    'batch_size': [4, 8, 16, 32, 64],  # Mid-sized batch to start, adjusting downwards and upwards
    'epochs': [100, 200]  # Aiming for a balance between training time and model convergence
}

# Directory to save models
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

# Column layout of the results workbook. 'Test Loss' is never filled in; it is
# kept only so the file has the same columns as the ones written previously.
result_columns = ['Model Name', 'Test Loss', 'Validation MSE', 'Validation MAE Month',
                  'Validation MSE Month', 'Test MSE', 'Test MAE', 'Test MSE Month']
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# results_df = pd.read_excel(r"results_summary.xlsx")

# ---- Work that does not depend on the model: done once, not per iteration ----
future_days = 30       # number of points shown in the prediction plots
prediction_days = 80   # number of points used for the "Month" metrics

# Validation targets in original units. y_val[i] belongs to row
# sequence_length + i of val_df, so the matching index starts there.
y_val_actual = target_scaler.inverse_transform(y_val.reshape(-1, 1))
val_target_index = val_df.index[sequence_length:]
val_dates = pd.to_datetime(val_target_index[:future_days])

# Test windows must come from the *scaled* frames: the model is trained on
# scaled features, so feeding it raw values produces meaningless predictions.
X_test = prepare_data_for_predictions(train_df, val_scaled_df, test_scaled_df, n_lags=sequence_length)
real_test_values = test_df['Close']
# The test plot gets its own dates instead of reusing the validation dates.
test_dates = pd.to_datetime(test_df.index[:future_days])

counter = 1
# Loop through all combinations of hyperparameters
for params in product(*hyperparams.values()):
    hidden_dims, dense_units, dropout_rate, lr, batch_size, epochs = params

    # Build the model
    model = build_and_compile_model(input_shape=(X_train.shape[1], X_train.shape[2]),
                                    hidden_dims=hidden_dims,
                                    dense_units=dense_units,
                                    dropout_rate=dropout_rate,
                                    lr=lr
                                   )

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights = True)

    # Train the model (shuffle=False keeps the windows in chronological order)
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[early_stopping],shuffle=False)

    # Save the model
    model_name = f'model_{counter}_hidden_dims_{str(hidden_dims).replace(", ", "_")}_dense_units_{str(dense_units).replace(", ", "_")}_dropout_{dropout_rate}_lr_{lr}_batch_{batch_size}_epochs_{epochs}'
    model.save(os.path.join(model_dir, f'{model_name}.h5'))
    counter+=1
    # Save results and plots
    save_results_and_plots(history, model_name, model_dir)

    # ---- Validation set ----
    y_val_pred = model.predict(X_val)
    y_val_pred_actual = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1))

    val_mse = mean_squared_error(y_val_actual, y_val_pred_actual)
    print("Validation MSE:", val_mse)

    save_predictions_plot(y_val_actual[:future_days], y_val_pred_actual[:future_days],
                          val_dates, model_name, model_dir, 'Validation')

    # Metrics restricted to the first `prediction_days` validation predictions
    validation_valuations = pd.DataFrame(
        {
            'Prediction': y_val_pred_actual.ravel()[:prediction_days],
            'Close': y_val_actual.ravel()[:prediction_days],
        },
        index=val_target_index[:prediction_days],
    )
    mse_val_week = mean_squared_error(validation_valuations['Close'],
                                      validation_valuations['Prediction'])
    mae_val_week = mean_absolute_error(validation_valuations['Close'],
                                       validation_valuations['Prediction'])

    print(f'Mean Squared Error for Validation Predictions: {mse_val_week}')

    # ---- Test set ----
    predictions_test = model.predict(X_test)
    predictions_test_actual = target_scaler.inverse_transform(predictions_test)
    preds = pd.DataFrame(predictions_test_actual, columns=['Prediction'], index=test_df.index)

    test_valuations = pd.DataFrame(index=test_df.index[:prediction_days])
    test_valuations['Prediction'] = preds['Prediction'].iloc[:prediction_days]
    test_valuations['Close'] = real_test_values.iloc[:prediction_days]

    mse_test = mean_squared_error(test_valuations['Close'], test_valuations['Prediction'])
    mae_test = mean_absolute_error(test_valuations['Close'], test_valuations['Prediction'])

    print(f'Mean Squared Error for Test Predictions: {mse_test}')

    save_predictions_plot(y_test[:future_days], predictions_test_actual[:future_days],
                          test_dates, model_name, model_dir, 'Test Set')

    # Collect the metrics. DataFrame.append no longer exists in pandas 2.x, so
    # the summary frame is rebuilt from the accumulated rows instead.
    # 'Test MSE Month' is computed over the same rows as 'Test MSE'.
    result_rows.append({
        'Model Name': model_name,
        'Validation MSE': val_mse,
        'Validation MAE Month': mae_val_week,
        'Validation MSE Month': mse_val_week,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test MSE Month': mse_test
    })
    results_df = pd.DataFrame(result_rows, columns=result_columns)

    # Save all results to Excel after every model so an interrupted run keeps its progress
    results_df.to_excel(os.path.join(model_dir, 'results_summary.xlsx'), index=False)

print('Results and plots saved!')

In [None]:
from tensorflow.keras.models import load_model

# Load a previously saved model from disk and bind it to `model`.
# NOTE(review): the path is relative and uses Windows-style separators --
# confirm it resolves in the environment this notebook runs in.
model = load_model(r".\models\models_saved\model_9_hidden_dims_[128_64]_dense_units_[64_32_16]_dropout_0.5_lr_0.0001_batch_4_epochs_200.h5")

In [None]:
# Predict on the validation set with `model`, the network returned by
# load_model. The name `best_model` used here before is never defined in this
# notebook, so the cell raised NameError.
y_val_pred = model.predict(X_val)

In [None]:
# ---- Validation metrics, in the original units of 'Close' ----
y_val_actual = target_scaler.inverse_transform(y_val.reshape(-1, 1))
y_val_pred_actual = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1))

val_mse = mean_squared_error(y_val_actual, y_val_pred_actual)
print("Validation MSE:", val_mse)

val_rmse = np.sqrt(val_mse)
print("Validation RMSE:", val_rmse)

val_mae = mean_absolute_error(y_val_actual, y_val_pred_actual)
print("Validation MAE:", val_mae)


# ---- Test predictions ----
# The windows must be built from the *scaled* frames: the model is trained on
# scaled features, so raw values would be on a scale it has never seen.
X_test = prepare_data_for_predictions(train_df, val_scaled_df, test_scaled_df, n_lags=sequence_length)

prediction_days = 80

predictions_test = model.predict(X_test)
predictions_test_actual = target_scaler.inverse_transform(predictions_test)
preds = pd.DataFrame(predictions_test_actual, columns=['Prediction'], index=test_df.index)
real_test_values = test_df['Close']

# Metrics are computed on the first `prediction_days` test rows only.
test_valuations = pd.DataFrame(index=test_df.index[:prediction_days])
test_valuations['Prediction'] = preds['Prediction'].iloc[:prediction_days]
test_valuations['Close'] = real_test_values.iloc[:prediction_days]

mse_test = mean_squared_error(test_valuations['Close'], test_valuations['Prediction'])
mae_test = mean_absolute_error(test_valuations['Close'], test_valuations['Prediction'])

# Same rows as mse_test; the name is kept because later cells may refer to it.
mse_test_month = mse_test
rmse_test = np.sqrt(mse_test_month)


print(f'Test MSE: {mse_test_month}')
print(f'Test RMSE: {rmse_test}')
print(f'Test MAE: {mae_test}')

# ---- Plot the first `future_days` test predictions against the actual values ----
future_days = 30

prediction_type = 'Test'
model_name = 'best_model_hidden_dims_[128_64]_dense_units_[64_32_16]_dropout_0.5_lr_0.0001_batch_4_epochs_200'
# Use the dates actually present in the test index, so the x-axis matches the
# plotted rows even when the data skips calendar days.
dates = pd.to_datetime(test_df.index[:future_days])


plt.figure(figsize=(10, 5))
plt.plot(dates, y_test[:future_days], label='Actual Values', color='blue', marker='o')
plt.plot(dates, predictions_test_actual[:future_days], label='Predicted Values', color='orange', marker='x')

# Formatting the x-axis
plt.xticks(rotation=45)
plt.gca().xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(plt.matplotlib.dates.DayLocator(interval=1))

# Adding titles and labels
plt.title(f'{model_name} - {prediction_type} - Predictions vs Actual Values')
plt.xlabel('Date')
plt.ylabel('Values')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
    
# Validation and test metrics for the loaded model, plus a saved test plot.
# Uses `dates`, `future_days`, `model_name` and `model_dir` defined in earlier cells.

# ---- Validation metrics, in the original units of 'Close' ----
y_val_actual = target_scaler.inverse_transform(y_val.reshape(-1, 1))
y_val_pred_actual = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1))
mse_val = mean_squared_error(y_val_actual, y_val_pred_actual)

prediction_days = 80
# y_val[i] belongs to row sequence_length + i of val_df.
start_index = len(val_df) - sequence_length
preds_val = pd.DataFrame(y_val_pred_actual, columns=['Prediction_val'], index=val_df.index[sequence_length:])

# Metrics restricted to the first `prediction_days` validation predictions
validation_valuations = pd.DataFrame(index=preds_val.iloc[:prediction_days].index)
validation_valuations['Prediction'] = preds_val['Prediction_val'].iloc[:prediction_days]
validation_valuations['Close'] = y_val_actual.ravel()[:prediction_days]

mse_val_week = mean_squared_error(validation_valuations['Close'],
                                  validation_valuations['Prediction'])
mae_val_week = mean_absolute_error(validation_valuations['Close'],
                                   validation_valuations['Prediction'])

print(f'Mean Squared Error for Validation Predictions: {mse_val_week}')


# ---- Test predictions ----
# The windows must be built from the *scaled* frames: the model is trained on
# scaled features, so raw values would be on a scale it has never seen.
X_test = prepare_data_for_predictions(train_df, val_scaled_df, test_scaled_df, n_lags=sequence_length)

predictions_test = model.predict(X_test)
predictions_test_actual = target_scaler.inverse_transform(predictions_test)
preds = pd.DataFrame(predictions_test_actual, columns=['Prediction'], index=test_df.index)
real_test_values = test_df['Close']

# Metrics are computed on the first `prediction_days` test rows only.
test_valuations = pd.DataFrame(index=test_df.index[:prediction_days])
test_valuations['Prediction'] = preds['Prediction'].iloc[:prediction_days]
test_valuations['Close'] = real_test_values.iloc[:prediction_days]

mse_test = mean_squared_error(test_valuations['Close'], test_valuations['Prediction'])
mae_test = mean_absolute_error(test_valuations['Close'], test_valuations['Prediction'])

print(f'Mean Squared Error for Test Predictions: {mse_test}')

save_predictions_plot(y_test[:future_days], predictions_test_actual[:future_days], dates, model_name, model_dir, 'Test Set')

# Same rows as mse_test; the name is kept because other cells may refer to it.
mse_test_month = mse_test

In [None]:
# Close-up of the boundary between the splits: the last `days_val` rows of the
# validation set followed by the first `days_val` rows of the test set.
plt.figure(figsize=(12, 6))

# Plot the validation set
days_val = 14
target_column= 'Close'
plt.plot(val_df.iloc[-days_val:].index, val_df.iloc[-days_val:][target_column], label='Validation', color='orange')

# Plot the test set
plt.plot(test_df.iloc[:days_val].index, test_df.iloc[:days_val][target_column], label='Test', color='green')

# Add labels and title
# NOTE(review): the title mentions the train set, but only validation and test are drawn here.
plt.title(f'{target_column} over Time for Train, Validation, and Test Sets')
plt.xlabel('Date')
plt.ylabel(target_column)

# Add a legend to differentiate the sets
plt.legend()

# Show the plot
plt.show()
    