In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Load the dataset
file_path = 'apple_stock_data.csv'
df = pd.read_csv(file_path)

In [None]:
#Display the first few rows of the dataframe
print(df.head())
print("-------------------------------------------------------------------------------------------------------------")
#Display information about the dataset
print(df.info())
print("-------------------------------------------------------------------------------------------------------------")
#Get summary statistics
print(df.describe())
print("-------------------------------------------------------------------------------------------------------------")

In [3]:
# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Sort by date
df = df.sort_values('Date')

# Set 'Date' as the index
df.set_index('Date', inplace=True)

# Select all the columns
data = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']]


In [None]:
# Check if any value is missing in the entire DataFrame
print(data.isnull().values.any())
print("-------------------------------------------------------------------------------------------------------------")
# Check for missing values in each column
print(data.isnull().sum())


# Calculate the correlation matrix
correlation_matrix = data.corr()

# Print the correlation matrix
print(correlation_matrix)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Plot distributions
plt.figure(figsize=(15, 10))
for i, column in enumerate(['Open', 'High', 'Low', 'Close', 'Volume']):
    plt.subplot(3, 2, i + 1)
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

#remove Dividends column and Stock splits as the show very weak significance to the problem, their correlation to the target variable is very low.

In [4]:
# Select the relevant columns
data = df[['Open', 'High', 'Low', 'Close', 'Volume']]

# Normalize the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)


# Create a DataFrame with the scaled data
scaled_df = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
# Display the first few rows of the scaled dataframe
print(scaled_df.head())
# Display the statistics of the 'Close' column to verify normalization
close_stats = scaled_df['Close'].describe()
print(close_stats)
# Check if the minimum value is 0 and the maximum value is 1
print(f"Min value: {scaled_df['Close'].min()}")
print(f"Max value: {scaled_df['Close'].max()}")

# Split the data into training, validation, and test sets
train_size = int(len(scaled_df) * 0.7)
val_size = int(len(scaled_df) * 0.15)
train_data = scaled_df[:train_size]
val_data = scaled_df[train_size:train_size + val_size]
test_data = scaled_df[train_size + val_size:]

# Display the shapes of the datasets
print(f"Training data shape: {train_data.shape}")
print(f"Validation data shape: {val_data.shape}")
print(f"Test data shape: {test_data.shape}")

                               Open      High       Low     Close    Volume
Date                                                                       
1980-12-12 00:00:00-05:00  0.000308  0.000307  0.000311  0.000310  0.063198
1980-12-15 00:00:00-05:00  0.000284  0.000281  0.000285  0.000283  0.023699
1980-12-16 00:00:00-05:00  0.000249  0.000247  0.000250  0.000249  0.014246
1980-12-17 00:00:00-05:00  0.000257  0.000257  0.000261  0.000259  0.011647
1980-12-18 00:00:00-05:00  0.000270  0.000270  0.000274  0.000273  0.009897
count    10950.000000
mean         0.104518
std          0.222362
min          0.000000
25%          0.001031
50%          0.001969
75%          0.085925
max          1.000000
Name: Close, dtype: float64
Min value: 0.0
Max value: 1.0
Training data shape: (7664, 5)
Validation data shape: (1642, 5)
Test data shape: (1644, 5)


In [5]:
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[i:i+seq_length]
        y = data[i+seq_length][3]  # Target is the 'Close' price
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 60  # You can adjust this value
X_train, y_train = create_sequences(train_data.values, seq_length)
X_val, y_val = create_sequences(val_data.values, seq_length)
X_test, y_test = create_sequences(test_data.values, seq_length)

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, {y_val.shape}")
print(f"Test data shape: {X_test.shape}, {y_test.shape}")



Training data shape: (7604, 60, 5), (7604,)
Validation data shape: (1582, 60, 5), (1582,)
Test data shape: (1584, 60, 5), (1584,)


In [6]:
import keras

# Define the LSTM model
model = keras.Sequential()
model.add(keras.layers.LSTM(units=50, return_sequences=True, input_shape=(seq_length, 5)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(units=50, return_sequences=False))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Save the model
model.save('apple_stock_lstm_model.h5')


ModuleNotFoundError: No module named 'keras'