# AAPL LSTM Training

### Imports

In [688]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np


### Preprocess Data

In [689]:
# Load the processed daily AAPL data, parse the timestamps, order the rows
# chronologically (oldest first) and discard any row with a missing value.
# The original row labels are kept, so the index runs in descending order.
df = (
    pd.read_csv('./processed_data/AAPL_daily_data_splits_processed.csv')
    .assign(time_stamp=lambda frame: pd.to_datetime(frame['time_stamp']))
    .sort_values('time_stamp')
    .dropna()
)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6024 entries, 6023 to 0
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   time_stamp          6024 non-null   datetime64[ns]
 1   open                6024 non-null   float64       
 2   high                6024 non-null   float64       
 3   low                 6024 non-null   float64       
 4   close               6024 non-null   float64       
 5   volume              6024 non-null   int64         
 6   SMA_10              6024 non-null   float64       
 7   EMA_10              6024 non-null   float64       
 8   SMA_20              6024 non-null   float64       
 9   EMA_20              6024 non-null   float64       
 10  SMA_50              6024 non-null   float64       
 11  EMA_50              6024 non-null   float64       
 12  SMA_100             6024 non-null   float64       
 13  EMA_100             6024 non-null   float64       
 1

In [690]:
# Target: the daily closing price.
y = df.loc[:, 'close']
y.head()

6023    0.83
6022    0.87
6021    0.92
6020    0.89
6019    0.90
Name: close, dtype: float64

In [691]:
# Features: every column except the target.
X = df.drop(columns='close')
X.head()

Unnamed: 0,time_stamp,open,high,low,volume,SMA_10,EMA_10,SMA_20,EMA_20,SMA_50,...,RSI,EMA_Fast,EMA_Slow,MACD,Signal,Upper_Band,Lower_Band,log_returns,rolling_volatility,momentum
6023,2000-08-15,0.84,0.86,0.83,2039300,0.846,0.852679,0.8785,0.874786,0.8974,...,36.745829,0.85781,0.882824,-0.025014,-0.019965,0.966346,0.790654,-0.011976,0.622251,-0.05
6022,2000-08-16,0.84,0.88,0.84,2565600,0.849,0.855828,0.875,0.87433,0.8982,...,52.218484,0.859685,0.881874,-0.022189,-0.020409,0.957971,0.792029,0.047068,0.633937,0.03
6021,2000-08-17,0.86,0.94,0.86,4837500,0.855,0.867496,0.872,0.878679,0.8994,...,64.679648,0.868964,0.884698,-0.015734,-0.019474,0.942367,0.801633,0.05588,0.654879,0.06
6020,2000-08-18,0.92,0.93,0.89,3396000,0.859,0.871588,0.8685,0.879757,0.9002,...,54.787717,0.872201,0.885091,-0.01289,-0.018157,0.926275,0.810725,-0.033152,0.648764,0.04
6019,2000-08-21,0.9,0.92,0.89,2401200,0.863,0.876754,0.87,0.881685,0.9012,...,57.299451,0.876477,0.886195,-0.009718,-0.01647,0.929471,0.810529,0.011173,0.630583,0.04


#### Train-Test Split

In [692]:
# Chronological 80/20 split: the rows are already sorted by time, so the first
# 80% become the training set and the most recent 20% the test set.
train_size = int(len(df) * 0.8)

X_train, X_test = X.iloc[:train_size].copy(), X.iloc[train_size:].copy()
y_train, y_test = y.iloc[:train_size].copy(), y.iloc[train_size:].copy()


In [693]:
# Cast the integer 'volume' column to float in both splits so that later
# log/scale steps can write float values back into it.
X_train = X_train.astype({'volume': float})
X_test = X_test.astype({'volume': float})



In [694]:
# Summarize the training features: index range, column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4819 entries, 6023 to 1205
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   time_stamp          4819 non-null   datetime64[ns]
 1   open                4819 non-null   float64       
 2   high                4819 non-null   float64       
 3   low                 4819 non-null   float64       
 4   volume              4819 non-null   float64       
 5   SMA_10              4819 non-null   float64       
 6   EMA_10              4819 non-null   float64       
 7   SMA_20              4819 non-null   float64       
 8   EMA_20              4819 non-null   float64       
 9   SMA_50              4819 non-null   float64       
 10  EMA_50              4819 non-null   float64       
 11  SMA_100             4819 non-null   float64       
 12  EMA_100             4819 non-null   float64       
 13  SMA_200             4819 non-null   float64       

In [695]:
# Summarize the test features: index range, column names, non-null counts and dtypes.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 1204 to 0
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   time_stamp          1205 non-null   datetime64[ns]
 1   open                1205 non-null   float64       
 2   high                1205 non-null   float64       
 3   low                 1205 non-null   float64       
 4   volume              1205 non-null   float64       
 5   SMA_10              1205 non-null   float64       
 6   EMA_10              1205 non-null   float64       
 7   SMA_20              1205 non-null   float64       
 8   EMA_20              1205 non-null   float64       
 9   SMA_50              1205 non-null   float64       
 10  EMA_50              1205 non-null   float64       
 11  SMA_100             1205 non-null   float64       
 12  EMA_100             1205 non-null   float64       
 13  SMA_200             1205 non-null   float64       
 1

In [696]:
# Preview the first (oldest) training rows; 'volume' is now float.
X_train.head()

Unnamed: 0,time_stamp,open,high,low,volume,SMA_10,EMA_10,SMA_20,EMA_20,SMA_50,...,RSI,EMA_Fast,EMA_Slow,MACD,Signal,Upper_Band,Lower_Band,log_returns,rolling_volatility,momentum
6023,2000-08-15,0.84,0.86,0.83,2039300.0,0.846,0.852679,0.8785,0.874786,0.8974,...,36.745829,0.85781,0.882824,-0.025014,-0.019965,0.966346,0.790654,-0.011976,0.622251,-0.05
6022,2000-08-16,0.84,0.88,0.84,2565600.0,0.849,0.855828,0.875,0.87433,0.8982,...,52.218484,0.859685,0.881874,-0.022189,-0.020409,0.957971,0.792029,0.047068,0.633937,0.03
6021,2000-08-17,0.86,0.94,0.86,4837500.0,0.855,0.867496,0.872,0.878679,0.8994,...,64.679648,0.868964,0.884698,-0.015734,-0.019474,0.942367,0.801633,0.05588,0.654879,0.06
6020,2000-08-18,0.92,0.93,0.89,3396000.0,0.859,0.871588,0.8685,0.879757,0.9002,...,54.787717,0.872201,0.885091,-0.01289,-0.018157,0.926275,0.810725,-0.033152,0.648764,0.04
6019,2000-08-21,0.9,0.92,0.89,2401200.0,0.863,0.876754,0.87,0.881685,0.9012,...,57.299451,0.876477,0.886195,-0.009718,-0.01647,0.929471,0.810529,0.011173,0.630583,0.04


In [697]:
# Preview the first test rows, i.e. the rows immediately after the training period.
X_test.head()

Unnamed: 0,time_stamp,open,high,low,volume,SMA_10,EMA_10,SMA_20,EMA_20,SMA_50,...,RSI,EMA_Fast,EMA_Slow,MACD,Signal,Upper_Band,Lower_Band,log_returns,rolling_volatility,momentum
1204,2019-10-11,58.24,59.41,58.08,41990210.0,56.502,56.734208,55.7285,55.840227,53.4834,...,76.653776,56.522124,55.41385,1.108274,0.903273,58.090714,53.366286,0.026252,0.224825,4.35
1203,2019-10-14,58.72,59.53,58.67,24413484.0,56.8,57.140715,55.928,56.138301,53.6428,...,75.469267,56.89872,55.677269,1.221451,0.966909,58.667816,53.188184,-0.001356,0.224828,2.98
1202,2019-10-15,59.1,59.41,58.72,23040483.0,57.068,57.447858,56.1105,56.394653,53.8526,...,73.185682,57.19584,55.910805,1.285036,1.030534,59.114087,53.106913,-0.002377,0.218657,2.68
1201,2019-10-16,58.34,58.81,58.3,19286694.0,57.453,57.65552,56.2555,56.603734,54.0394,...,69.052742,57.410326,56.109264,1.301063,1.08464,59.447691,53.063309,-0.004088,0.216721,3.85
1200,2019-10-17,58.77,59.04,58.38,17272897.0,57.815,57.867244,56.4345,56.814807,54.2206,...,70.87166,57.627199,56.310059,1.317141,1.13114,59.784519,53.084481,0.003918,0.211669,3.62


In [698]:
# Summarize the training target series (length, non-null count, dtype).
y_train.info()

<class 'pandas.core.series.Series'>
Index: 4819 entries, 6023 to 1205
Series name: close
Non-Null Count  Dtype  
--------------  -----  
4819 non-null   float64
dtypes: float64(1)
memory usage: 75.3 KB


In [699]:
# Preview the first (oldest) training targets.
y_train.head()

6023    0.83
6022    0.87
6021    0.92
6020    0.89
6019    0.90
Name: close, dtype: float64

In [700]:
# Summarize the test target series (length, non-null count, dtype).
y_test.info()

<class 'pandas.core.series.Series'>
Index: 1205 entries, 1204 to 0
Series name: close
Non-Null Count  Dtype  
--------------  -----  
1205 non-null   float64
dtypes: float64(1)
memory usage: 18.8 KB


In [701]:
# Repeats the earlier y_train summary; y_train has not been modified in between,
# so the output is identical.
y_train.info()

<class 'pandas.core.series.Series'>
Index: 4819 entries, 6023 to 1205
Series name: close
Non-Null Count  Dtype  
--------------  -----  
4819 non-null   float64
dtypes: float64(1)
memory usage: 75.3 KB


### Scaling

In [702]:
# The timestamp is not a numeric model input; remove it from both splits
# before scaling and tensor conversion.
X_train = X_train.drop(columns='time_stamp')
X_test = X_test.drop(columns='time_stamp')


In [703]:
ss = StandardScaler()

# Price-level columns to standardize. 'Upper_Band' and 'Lower_Band' are included
# because the data previews show them on the same raw price scale as the other
# columns here (about 0.8 in 2000, about 58 in 2019). They were previously in none
# of the scaling groups, so they reached the model unscaled while every other
# feature was normalized.
ss_features = ['open', 'high', 'low', 'SMA_10', 'EMA_10', 'SMA_20', 'EMA_20', 'SMA_50', 'EMA_50',
               'SMA_100', 'EMA_100', 'SMA_200', 'EMA_200', 'EMA_Fast', 'EMA_Slow',
               'Upper_Band', 'Lower_Band']

# Fit on the training split only, then apply the same statistics to the test split
# so no information from the test period leaks into the scaling.
# .loc writes the scaled values back into the existing frames.
X_train.loc[:, ss_features] = ss.fit_transform(X_train[ss_features])
X_test.loc[:, ss_features] = ss.transform(X_test[ss_features])


In [704]:
# Indicator / return columns that are rescaled with min-max scaling.
mm_features = ['RSI', 'MACD', 'Signal', 'log_returns', 'rolling_volatility', 'momentum']

# Learn the per-column min and max from the training split only, then apply
# that same mapping to both splits.
mm = MinMaxScaler().fit(X_train[mm_features])

X_train.loc[:, mm_features] = mm.transform(X_train[mm_features])
X_test.loc[:, mm_features] = mm.transform(X_test[mm_features])


In [705]:
# Volume gets its own two-step treatment: log1p to compress its range, then
# standardization with a scaler fitted on the training split only.
volume_scaler = StandardScaler()

# Training split: log-transform, then fit the scaler and standardize.
X_train.loc[:, 'volume'] = np.log1p(X_train['volume'].astype(float))
X_train.loc[:, 'volume'] = volume_scaler.fit_transform(X_train[['volume']])

# Test split: same log-transform, then reuse the statistics learned on train.
X_test.loc[:, 'volume'] = np.log1p(X_test['volume'].astype(float))
X_test.loc[:, 'volume'] = volume_scaler.transform(X_test[['volume']])



In [706]:
# Sanity check: feature and target row counts must match within each split.
print(f"Training Shape {X_train.shape} {y_train.shape}")
print(f"Testing Shape {X_test.shape} {y_test.shape}")

Training Shape (4819, 24) (4819,)
Testing Shape (1205, 24) (1205,)


### Tensors

In [707]:
# Convert the scaled feature frames and the target series to float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features.values, dtype=torch.float32) for features in (X_train, X_test)
)

y_train_tensor, y_test_tensor = (
    torch.tensor(target.values, dtype=torch.float32) for target in (y_train, y_test)
)

In [708]:
def create_sequences(input_data, seq_length, target_data=None):
    """Slice a time-ordered tensor into overlapping windows with next-step targets.

    Window ``i`` contains rows ``i .. i + seq_length - 1`` of ``input_data`` and
    is paired with the target value at row ``i + seq_length``.

    Args:
        input_data: Tensor whose first dimension is time, e.g. ``(n_steps, n_features)``.
        seq_length: Number of consecutive time steps per window (non-negative).
        target_data: Optional tensor (or array-like accepted by ``torch.as_tensor``)
            of per-step labels with the same length as ``input_data``. When omitted,
            the targets are taken from column 0 of ``input_data``. That is only the
            desired label if the label really is the first feature column.

    Returns:
        A pair ``(xs, ys)``: ``xs`` has shape
        ``(n_steps - seq_length, seq_length, *feature_dims)`` and ``ys`` is a float32
        tensor holding one target per window. If there are not enough rows to form
        a single window, both results have length 0 instead of raising.

    Raises:
        ValueError: If ``seq_length`` is negative or ``target_data`` does not have
            the same length as ``input_data``.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    if target_data is not None:
        target_data = torch.as_tensor(target_data)
        if len(target_data) != len(input_data):
            raise ValueError(
                f"target_data has {len(target_data)} rows but input_data has {len(input_data)}"
            )

    num_sequences = len(input_data) - seq_length
    if num_sequences <= 0:
        # torch.stack rejects an empty list, so build the empty results explicitly.
        empty_xs = input_data.new_empty((0, seq_length) + tuple(input_data.shape[1:]))
        return empty_xs, torch.empty(0, dtype=torch.float32)

    xs = torch.stack([input_data[start:start + seq_length] for start in range(num_sequences)])

    # The target for window i sits at row i + seq_length, so dropping the first
    # seq_length rows aligns the targets one-to-one with the windows.
    targets = input_data[:, 0] if target_data is None else target_data
    ys = targets[seq_length:].detach().clone().to(dtype=torch.float32)

    return xs, ys



In [709]:
seq_length = 10

# By default create_sequences takes its targets from column 0 of the feature
# tensor. Here that column is the standardized 'open' price ('close' and
# 'time_stamp' were dropped from X), not the label. So keep only the windows
# from it and take the labels from the 'close' tensors instead.
X_train_sequences = create_sequences(X_train_tensor, seq_length)[0]
X_test_sequences = create_sequences(X_test_tensor, seq_length)[0]

# Window i covers rows [i, i + seq_length), so its label is the close at row
# i + seq_length. Dropping the first seq_length labels gives that alignment.
y_train_sequences = y_train_tensor[seq_length:]
y_test_sequences = y_test_tensor[seq_length:]

assert len(X_train_sequences) == len(y_train_sequences), "train windows/labels misaligned"
assert len(X_test_sequences) == len(y_test_sequences), "test windows/labels misaligned"

print("X_train_sequences shape:", X_train_sequences.shape)
print("y_train_sequences shape:", y_train_sequences.shape)
print("X_test_sequences shape:", X_test_sequences.shape)
print("y_test_sequences shape:", y_test_sequences.shape)

X_train_sequences shape: torch.Size([4809, 10, 24])
y_train_sequences shape: torch.Size([4809])
X_test_sequences shape: torch.Size([1195, 10, 24])
y_test_sequences shape: torch.Size([1195])
