# LSTM REGRESSION WITH OHLC AND 30 DAYS FEATURES 

In [117]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    # data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0.025, 1, np.where(data['pr_change_on_current_day'] < -0.025, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    # data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 30):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

lstm_model = Sequential([
    LSTM(256, input_shape=(sequence_length, X_train_data_normalizer.shape[1]), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=True),
    Dropout(0.3),
    LSTM(16, return_sequences=True),
    Dropout(0.3),
    LSTM(8, return_sequences=True),
    Dropout(0.3),
    LSTM(4, return_sequences=False),
    Dropout(0.3),
    # Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)  # Adjust num_classes according to your problem
])

# Compile the model
lstm_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=5, monitor='val_loss', mode='min', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

# Train the model
history = lstm_model.fit(train_dataset, epochs=100, batch_size=64, validation_data=val_dataset, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

print(best_model.evaluate(test_dataset))
test_predictions = best_model.predict(test_dataset)
print(test_predictions.min())
test_predictions


1119 33
Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 174ms/step - loss: 7.7857e-04 - mse: 7.7857e-04 - val_loss: 4.7307e-04 - val_mse: 4.7307e-04
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 136ms/step - loss: 9.5424e-04 - mse: 9.5424e-04 - val_loss: 4.5200e-04 - val_mse: 4.5200e-04
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 119ms/step - loss: 8.2470e-04 - mse: 8.2470e-04 - val_loss: 4.9062e-04 - val_mse: 4.9062e-04
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 129ms/step - loss: 8.8748e-04 - mse: 8.8748e-04 - val_loss: 5.8304e-04 - val_mse: 5.8304e-04
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 140ms/step - loss: 9.0810e-04 - mse: 9.0810e-04 - val_loss: 5.8767e-04 - val_mse: 5.8767e-04
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 148ms/step - loss: 0.0010 - mse: 0.0010 - val_loss: 7.7219e-04 - val_m

0.008436078


# LSTM CLASSIFICATION MODEL WITH NO FEATURE ENGINEERING 

In [130]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# # Calculate deltas, moving averages, and Bollinger Bands
# for i in range(1, 30):
#     stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
#     stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
#     stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
#     stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
#     stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
#     stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
#     stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
#     stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
#     stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
#     stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
#     stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
#     stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

lstm_model = Sequential([
    LSTM(256, input_shape=(sequence_length, X_train_data_normalizer.shape[1]), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=True),
    Dropout(0.3),
    LSTM(16, return_sequences=True),
    Dropout(0.3),
    LSTM(8, return_sequences=True),
    Dropout(0.3),
    LSTM(4, return_sequences=False),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='softmax')  # 3 neurons for the 3 classes
])

# Compile the model
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=10, monitor='val_loss', mode='max', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='max')

# Train the model
history = lstm_model.fit(train_dataset, epochs=100, batch_size=64, validation_data=val_dataset, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

# Make predictions on the test set
test_predictions = best_model.predict(test_dataset)
from sklearn.metrics import accuracy_score,classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test_data[sequence_length:], test_predictions)

print('Accuracy:', accuracy)
print(classification_report(y_test_data[sequence_length:], test_predictions))
test_predictions 


1119 23
Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 91ms/step - accuracy: 0.4847 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.4937 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.5000 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.4807 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.4889 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.4852 - loss: 0.0000e+00 - val_accuracy: 0

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

# LSTM CLASSIFICATION WITH FEATURE ENGINEERING + INDICATORS

In [131]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

lstm_model = Sequential([
    LSTM(256, input_shape=(sequence_length, X_train_data_normalizer.shape[1]), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=True),
    Dropout(0.3),
    LSTM(16, return_sequences=True),
    Dropout(0.3),
    LSTM(8, return_sequences=True),
    Dropout(0.3),
    LSTM(4, return_sequences=False),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='softmax')  # 3 neurons for the 3 classes
])

# Compile the model
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=10, monitor='val_loss', mode='max', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='max')

# Train the model
history = lstm_model.fit(train_dataset, epochs=100, batch_size=64, validation_data=val_dataset, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

# Make predictions on the test set
test_predictions = best_model.predict(test_dataset)

from sklearn.metrics import accuracy_score,classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test_data[sequence_length:], test_predictions)

print('Accuracy:', accuracy)
print(classification_report(y_test_data[sequence_length:], test_predictions))
test_predictions 

1119 239
Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 92ms/step - accuracy: 0.4898 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 61ms/step - accuracy: 0.5004 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.4841 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.4947 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.4808 - loss: 0.0000e+00 - val_accuracy: 0.5442 - val_loss: 0.0000e+00
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.4976 - loss: 0.0000e+00 - val_accuracy: 

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

# LSTM CLASSIFICATION WITH ONLY TWO CLASSES WITH FEATURE ENGINEERING + INDICATORS 


In [140]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, 0), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)
from keras.utils import to_categorical
#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]

y_train_data = to_categorical(y_train_data)
y_test_data = to_categorical(y_test_data)
y_val_data = to_categorical(y_val_data)

print(len(X_test_data), len(X_test_data.columns))
# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)



from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

lstm_model = Sequential([
    LSTM(256, input_shape=(sequence_length, X_train_data_normalizer.shape[1]), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=True),
    Dropout(0.3),
    LSTM(16, return_sequences=True),
    Dropout(0.3),
    LSTM(8, return_sequences=True),
    Dropout(0.3),
    LSTM(4, return_sequences=False),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(2, activation='sigmoid')  # 3 neurons for the 3 classes
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy', 'recall','f1_score'])


# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=10, monitor='val_loss', mode='min', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

# Train the model
history = lstm_model.fit(train_dataset, epochs=100, batch_size=64, validation_data=val_dataset, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

# Make predictions on the test set
test_predictions = best_model.predict(test_dataset)
test_predictions_binary = np.argmax(test_predictions, axis=1)
y_test_data_binary = np.argmax(y_test_data, axis=1)

from sklearn.metrics import accuracy_score,classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test_data_binary[sequence_length:], test_predictions_binary)

print('Accuracy:', accuracy)

print(test_predictions )
print(classification_report(y_test_data_binary[sequence_length:], test_predictions_binary))

1119 239
Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 88ms/step - binary_accuracy: 0.5116 - f1_score: 0.3851 - loss: 0.6930 - recall: 0.4778 - val_binary_accuracy: 0.4558 - val_f1_score: 0.3131 - val_loss: 0.6976 - val_recall: 0.4558
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - binary_accuracy: 0.5059 - f1_score: 0.3359 - loss: 0.6931 - recall: 0.5059 - val_binary_accuracy: 0.4558 - val_f1_score: 0.3131 - val_loss: 0.6976 - val_recall: 0.4558
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - binary_accuracy: 0.5017 - f1_score: 0.3340 - loss: 0.6934 - recall: 0.5017 - val_binary_accuracy: 0.4558 - val_f1_score: 0.3131 - val_loss: 0.6982 - val_recall: 0.4558
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - binary_accuracy: 0.5145 - f1_score: 0.3397 - loss: 0.6927 - recall: 0.5145 - val_binary_accuracy: 0.4558 - val_f1_score: 0.3131 - val_

# gru MODEL with classification 

In [2]:
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, 0), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data = to_categorical(y_train_data)
y_test_data = to_categorical(y_test_data)
y_val_data = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define the GRU model
gru_model = tf.keras.Sequential([
tf.keras.layers.GRU(256, input_shape=(sequence_length, X_train_data_normalizer.shape[1]), return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(128, return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(64, return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(32, return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(16, return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(8, return_sequences=True),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.GRU(4, return_sequences=False),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Dense(2, activation='sigmoid') # 3 neurons for the 3 classes
])
# Compile the model
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'recall','f1_score','precision'],)

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=10, monitor='val_loss', mode='max', restore_best_weights=True)

# Define model checkpoint to save the best model during training
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='max')

# Train the model
history = gru_model.fit(train_dataset, epochs=100, batch_size=64, validation_data=val_dataset, callbacks=[early_stopping, model_checkpoint])

# Load the best saved model
best_model = tf.keras.models.load_model('best_model.keras')

# Make predictions on the test set
test_predictions = best_model.predict(test_dataset)
test_predictions_binary = np.argmax(test_predictions, axis=1)
y_test_data_binary = np.argmax(y_test_data, axis=1)

from sklearn.metrics import accuracy_score,classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test_data_binary[sequence_length:], test_predictions_binary)

print('Accuracy:', accuracy)

print(test_predictions )
print(classification_report(y_test_data_binary[sequence_length:], test_predictions_binary))


1119 239
Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 87ms/step - accuracy: 0.5123 - f1_score: 0.4063 - loss: 0.6930 - precision: 0.5120 - recall: 0.6203 - val_accuracy: 0.5442 - val_f1_score: 0.3524 - val_loss: 0.6920 - val_precision: 0.5000 - val_recall: 1.0000
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.5135 - f1_score: 0.4655 - loss: 0.6932 - precision: 0.5061 - recall: 0.5601 - val_accuracy: 0.5208 - val_f1_score: 0.4750 - val_loss: 0.6929 - val_precision: 0.5202 - val_recall: 0.5360
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.5090 - f1_score: 0.4625 - loss: 0.6935 - precision: 0.5090 - recall: 0.5111 - val_accuracy: 0.5401 - val_f1_score: 0.3872 - val_loss: 0.6919 - val_precision: 0.5028 - val_recall: 0.9025
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.5147 - f1_score: 0.4572 - loss: 

# Logistic Regression classifier + all features + technical indicators 

In [152]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)

# Create a logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data
logistic_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = logistic_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred

1119 239
Accuracy: 0.483467381590706
              precision    recall  f1-score   support

          -1       0.48      0.79      0.59       535
           0       0.00      0.00      0.00         4
           1       0.51      0.20      0.29       580

    accuracy                           0.48      1119
   macro avg       0.33      0.33      0.29      1119
weighted avg       0.49      0.48      0.43      1119



array([-1,  1, -1, ..., -1, -1, -1])

# Random Forest Classification + Technical Indicators +  Features 


In [153]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = rf_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred

1119 239
Accuracy: 0.4959785522788204
              precision    recall  f1-score   support

          -1       0.48      0.54      0.51       535
           0       0.00      0.00      0.00         4
           1       0.52      0.46      0.49       580

    accuracy                           0.50      1119
   macro avg       0.33      0.33      0.33      1119
weighted avg       0.50      0.50      0.49      1119



array([-1, -1, -1, ...,  1, -1,  1])

# SVM CLASSIFIER +  TECHNICAL INDICATORS + FEATURES ENGINEERING

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create an SVM classifier
svm_model = SVC(kernel='sigmoid', random_state=42)

# Train the model on the training data
svm_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = svm_model.predict(X_test_data_normalizer)
# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred

1119 239
Accuracy: 0.48614834673815904
              precision    recall  f1-score   support

          -1       0.48      0.59      0.53       535
           0       0.00      0.00      0.00         4
           1       0.52      0.39      0.45       580

    accuracy                           0.49      1119
   macro avg       0.33      0.33      0.33      1119
weighted avg       0.50      0.49      0.49      1119



array([-1, -1, -1, ...,  1,  1,  1])

In [163]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = 3,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Create a LightGBM classifier
lgbm_model = LGBMClassifier(random_state=42)

# Train the model on the training data
lgbm_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = lgbm_model.predict(X_test_data_normalizer)

# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred


1119 239
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58479
[LightGBM] [Info] Number of data points in the train set: 8862, number of used features: 234
[LightGBM] [Info] Start training from score -0.756017
[LightGBM] [Info] Start training from score -3.186894
[LightGBM] [Info] Start training from score -0.715051
Accuracy: 0.5040214477211796
              precision    recall  f1-score   support

          -1       0.49      0.79      0.60       535
           0       0.00      0.00      0.00         4
           1       0.56      0.24      0.34       580

    accuracy                           0.50      1119
   macro avg       0.35      0.34      0.31      1119
weighted avg       0.52      0.50      0.46      1119



array([-1, -1,  1, ..., -1, -1,  1])

# Gradient Boosting + Features + TA 

In [164]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, np.where(data['pr_change_on_current_day'] < 0, -1, 0)), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Create a Gradient Boosting classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model on the training data
gb_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = gb_model.predict(X_test_data_normalizer)


# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred


1119 239


Accuracy: 0.47989276139410186
              precision    recall  f1-score   support

          -1       0.49      0.99      0.65       535
           0       0.04      0.25      0.07         4
           1       0.88      0.01      0.02       580

    accuracy                           0.48      1119
   macro avg       0.47      0.42      0.25      1119
weighted avg       0.69      0.48      0.32      1119



array([-1, -1, -1, ..., -1, -1, -1])

# XGBOOST + TECHNICAL INDICATORS + FEATURES

In [167]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, 0), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Create an XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Train the model on the training data
xgb_model.fit(X_train_data_normalizer, y_train_data)

# Predict labels for the test set
y_pred = xgb_model.predict(X_test_data_normalizer)


# Calculate accuracy
accuracy = accuracy_score(y_test_data, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_data, y_pred))

y_pred

1119 239


Accuracy: 0.49776586237712245
              precision    recall  f1-score   support

           0       0.49      0.71      0.58       539
           1       0.53      0.30      0.39       580

    accuracy                           0.50      1119
   macro avg       0.51      0.51      0.48      1119
weighted avg       0.51      0.50      0.48      1119



array([0, 0, 0, ..., 0, 1, 1])

In [177]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import warnings
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

warnings.filterwarnings("ignore")


symbol_to_fetch = 'AAPL'
start_date = '2020-01-01'
end_date = '2024-05-01'
# Parameters
batch_size = 256
sequence_length = 30
stride = 1

def fetch_ticker_data(symbol, start_date, end_date):
    """Fetches stock data for a given symbol using yfinance."""
    ticker = yf.Ticker(symbol)
    data = ticker.history(start='1980-01-01', end=end_date)
    return data

def label_data(data):
    # Calculate the percentage change in price from one day to the next
    data['pr_change_on_last_day'] = data['Close'].pct_change()
    data['pr_change_on_current_day'] = data['pr_change_on_last_day'].shift(-1)
    data.iloc[0,-2] = 0
    data['sentiment'] = pd.Series(np.where(data['pr_change_on_current_day'] > 0, 1, 0), index=data.index)
    # data['perc_change'] = data['Percentage Change']
    # # Drop any rows with missing values
    # data.dropna(inplace=True)
    data.drop('pr_change_on_current_day',axis=1 , inplace=True)
    return data
stock = fetch_ticker_data(symbol_to_fetch, start_date, end_date)

# Calculate deltas, moving averages, and Bollinger Bands
for i in range(1, 90,5):
    stock[f"open_delta_{i}day"] = stock["Open"].diff(periods=i)
    stock[f"high_delta_{i}day"] = stock["High"].diff(periods=i)
    stock[f"low_delta_{i}day"] = stock["Low"].diff(periods=i)
    stock[f"close_delta_{i}day"] = stock["Close"].diff(periods=i)
    stock[f"rolling_mean_open_{i}day"] = stock["Open"].rolling(window=i).mean()
    stock[f"rolling_mean_high_{i}day"] = stock["High"].rolling(window=i).mean()
    stock[f"rolling_mean_low_{i}day"] = stock["Low"].rolling(window=i).mean()
    stock[f"rolling_mean_close_{i}day"] = stock["Close"].rolling(window=i).mean()
    stock[f"rolling_std_open_{i}day"] = stock["Open"].rolling(window=i).std()
    stock[f"rolling_std_high_{i}day"] = stock["High"].rolling(window=i).std()
    stock[f"rolling_std_low_{i}day"] = stock["Low"].rolling(window=i).std()
    stock[f"rolling_std_close_{i}day"] = stock["Close"].rolling(window=i).std()

stock['fast_ma'] = stock['Close'].rolling(window=20).mean()
stock['slow_ma'] = stock['Close'].rolling(window=50).mean()
stock['bollinger_high'] = stock['Close'].rolling(window=20).mean() + (2 * stock['Close'].rolling(window=20).std())
stock['bollinger_low'] = stock['Close'].rolling(window=20).mean() - (2 * stock['Close'].rolling(window=20).std())
stock['ema'] = stock['Close'].ewm(span=20, adjust=False).mean()
stock['envelope_high'] = stock['Close'].rolling(window=20).mean() * (1 + 0.05)
stock['envelope_low'] = stock['Close'].rolling(window=20).mean() * (1 - 0.05)
stock['macd_line'] = stock['Close'].ewm(span=12, adjust=False).mean() - stock['Close'].ewm(span=26, adjust=False).mean()
stock['macd_signal'] = stock['macd_line'].ewm(span=9, adjust=False).mean()

# RSI calculation
def calculate_rsi(data, rsi_period):
    delta = data['Close'].diff().dropna()
    gain = delta.where(delta > 0, 0).dropna()
    loss = -delta.where(delta < 0, 0).dropna()
    avg_gain = gain.rolling(window=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock['rsi'] = calculate_rsi(stock, 14)

# # Stochastic Oscillator calculation
# def calculate_stochastic(data, k_window, d_window):
#     high_low = data[['High', 'Low']]
#     c = data['Close']
#     highest = high_low.rolling(window=k_window).max()
#     lowest = high_low.rolling(window=k_window).min()
#     print(((c - lowest) / (highest - lowest)) * 100)
#     stochastic_k = ((c - lowest) / (highest - lowest)) * 100
#     stochastic_d = stochastic_k.rolling(window=d_window).mean()
#     return stochastic_k, stochastic_d
# stock['stochastic_k'], stock['stochastic_d'] = calculate_stochastic(stock, 14, 3)

# stock['stochastic_k']= calculate_stochastic(stock, 14, 3)[0]
# stock['stochastic_d']= calculate_stochastic(stock, 14, 3)[1]
stock['day'] = pd.to_datetime(stock.index).day
stock['month'] = pd.to_datetime(stock.index).month
stock['year'] = pd.to_datetime(stock.index).year
stock['weekday'] = pd.to_datetime(stock.index).weekday
stock['dayofyear'] = pd.to_datetime(stock.index).dayofyear
stock = stock.fillna(method="ffill", axis=0)
stock = stock.fillna(method="bfill", axis=0)
stock.index = stock.index.date
# Split the data into training and test sets

# df = stock.copy()

# # Calculate pairwise correlation
# corr_matrix = df.corr()

# # Identify highly correlated columns
# redundant_cols = set()
# for i in range(5,len(corr_matrix.columns)-1):
#     for j in range(i+1, len(corr_matrix.columns)):
#         if corr_matrix.iloc[i,j] > 0.8 and corr_matrix.columns[i] not in redundant_cols:
#             redundant_cols.add(corr_matrix.columns[j])

# # Remove one of the redundant columns
# for col in redundant_cols:
#     df = df.drop(col, axis=1)

# # Print the updated DataFrame
# print(df)

# stock = df.copy()
train_data_index = np.searchsorted(stock.index.values, np.datetime64(start_date))
train_data = stock.iloc[:int(0.9*train_data_index)].copy()
val_data  = stock.iloc[int(0.9*train_data_index)-sequence_length:train_data_index].copy()
test_data = stock.iloc[train_data_index-sequence_length:].copy()
train_data = label_data(train_data)
val_data = label_data(val_data)
test_data = label_data(test_data)
train_data.fillna(0,axis = 0, inplace=True)
val_data.fillna(0,axis = 0, inplace=True)
test_data.fillna(0,axis = 0, inplace=True)

#trian & test data
X_train_data = train_data.iloc[:,:-1]
y_train_data = train_data.iloc[:,-1]
#trian & test data
X_val_data = val_data.iloc[:,:-1]
y_val_data = val_data.iloc[:,-1]
X_test_data = test_data.iloc[:,:-1]
y_test_data = test_data.iloc[:,-1]
print(len(X_test_data), len(X_test_data.columns))
from keras.utils import to_categorical

# Convert targets to one-hot encoding
y_train_onehot = to_categorical(y_train_data, num_classes=3)
y_val_onehot = to_categorical(y_val_data, num_classes=3)

y_train_data_onehot = to_categorical(y_train_data)
y_test_data_onehot = to_categorical(y_test_data)
y_val_data_onehot = to_categorical(y_val_data)

# Normalize the data
normalizer = MinMaxScaler()
X_train_data_normalizer = normalizer.fit_transform(X_train_data)
X_val_data_normalizer = normalizer.fit_transform(X_val_data)
X_test_data_normalizer = normalizer.transform(X_test_data)

# # # Reshape X_train_data_normalizer
X_train_reshaped = X_train_data_normalizer.reshape(X_train_data_normalizer.shape[0], X_train_data_normalizer.shape[1], 1)
X_val_reshaped = X_val_data_normalizer.reshape(X_val_data_normalizer.shape[0], X_val_data_normalizer.shape[1], 1)
X_test_reshaped = X_test_data_normalizer.reshape(X_test_data_normalizer.shape[0], X_test_data_normalizer.shape[1], 1)

def create_sequences(x,y,sequence_length,stride):
    sequence_length  = sequence_length
    X_test_data_normalizer_sequences = []
    y_test_data_sequences = []
    stride = stride
    no_of_rows = len(x)
    no_of_columns = len(x[0])
    for i in range(sequence_length, no_of_rows-1 , stride):
        X_test_data_normalizer_sequences.append(x[i-sequence_length: i])
        y_test_data_sequences.append(y[i-1])
    return np.array(X_test_data_normalizer_sequences),np.array(y_test_data_sequences)
        
X_train_data_normalizer_sequences,y_train_data_sequences = create_sequences(X_train_data_normalizer,y_train_data,sequence_length,stride)
X_test_data_normalizer_sequences,y_test_data_sequences = create_sequences(X_test_data_normalizer,y_test_data,sequence_length,stride)



train_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train_data_normalizer,
    y_train_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
val_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_val_data_normalizer,
    y_val_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)
test_dataset = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_test_data_normalizer,
    y_test_data,
    length = sequence_length,
    sampling_rate=1,
    stride=1,
    start_index=0,
    end_index=None,
    shuffle=False,
    reverse=False,
    batch_size=batch_size
)

1119 239
