In [1]:
import os
# Resolve the dataset folder relative to the directory the kernel was started in,
# then echo both paths (one per line) so the reader can confirm the location.
current_directory = os.getcwd()
dir_base = os.path.join(current_directory, 'Dataset')
print(current_directory, dir_base, sep='\n')

/home/ladans/DNN/Project
/home/ladans/DNN/Project/Dataset


In [2]:
import pandas as pd
import glob
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Embedding, LSTM, Dense, Input, Concatenate
from keras.layers import Flatten, RepeatVector

# Directory containing the per-stock CSV files (one file per stock symbol)
data_dir = dir_base

# Columns kept from every CSV; any other columns in the files are dropped
price_columns = ['Date', 'Open', 'Close']

# Sorted so the files are always read in the same order, whatever the filesystem returns
csv_paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
if not csv_paths:
    # Fail here with a clear message instead of a KeyError from sort_values below
    raise FileNotFoundError(f"No CSV files found in {data_dir!r}")

# Collect one frame per stock and concatenate once at the end: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration.
frames = []
for file_path in csv_paths:
    # The stock symbol is the part of the filename before the first dot
    stock_symbol = os.path.basename(file_path).split('.')[0]

    # Load the CSV
    df = pd.read_csv(file_path)

    # Keep only the needed columns and tag every row with its stock symbol
    df = df[price_columns].assign(Stock_Symbol=stock_symbol)
    frames.append(df)

all_data = pd.concat(frames, ignore_index=True)

# Group each stock's rows together, ordered by the Date column within a stock
all_data = all_data.sort_values(['Stock_Symbol', 'Date']).reset_index(drop=True)

# Encode stock symbols as integer ids (used later as embedding indices)
encoder = LabelEncoder()
all_data['Stock_Index'] = encoder.fit_transform(all_data['Stock_Symbol'])
all_data.head()

2024-11-11 14:44:48.420822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731336288.447426   71655 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731336288.453810   71655 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 14:44:48.484319: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,Date,Open,Close,Stock_Symbol,Stock_Index
0,2012-09-04,95.108574,96.424286,AAPL,0
1,2012-09-05,96.510002,95.747147,AAPL,0
2,2012-09-06,96.167145,96.610001,AAPL,0
3,2012-09-07,96.864288,97.205711,AAPL,0
4,2012-09-10,97.207146,94.677139,AAPL,0


In [3]:
from sklearn.metrics import mean_squared_error
from pyswarm import pso

test_ratio = 0.2  # fraction of each stock's windows held out as the test set


def train_and_evaluate(sequence_length, epochs=5, batch_size=1):
    """Train the LSTM price model for one window length and return its test RMSE.

    Written as a PSO objective: the optimizer passes the particle position as a
    one-element sequence, which is truncated to an integer window length.

    For every stock in ``all_data`` the closing prices are normalized relative
    to the stock's first close (``price / first_price - 1``) and cut into
    sliding windows of ``sequence_length`` values, each paired with the value
    that follows it. Each stock's windows are split in order: the first
    ``1 - test_ratio`` share goes to training and the remainder to testing, so
    every stock appears in both sets and its test windows come after its
    training windows.

    Parameters
    ----------
    sequence_length : sequence of one number, or a number
        Window length; only the first element is used and it is truncated
        with ``int``.
    epochs : int, optional
        Training epochs passed to ``model.fit`` (default 5).
    batch_size : int, optional
        Batch size passed to ``model.fit`` (default 1).

    Returns
    -------
    float
        Root-mean-squared error on the held-out windows, in normalized-price units.

    Raises
    ------
    ValueError
        If the window length is below 1, or if the data yields no training or
        no test windows for this window length.
    """
    # Imported here so this cell does not depend on an extra import elsewhere
    from keras import backend as keras_backend

    sequence_length = int(np.ravel(sequence_length)[0])  # Extract the integer value
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
    print(f"Evaluating sequence length: {sequence_length}")

    # Build the windows stock by stock and split each stock on its own, so the
    # test set is not simply "whichever stocks sort last".
    train_parts, test_parts = [], []
    for stock in all_data['Stock_Symbol'].unique():
        stock_data = all_data[all_data['Stock_Symbol'] == stock]
        prices = stock_data['Close'].to_numpy(dtype=float)

        n_windows = len(prices) - sequence_length
        if n_windows <= 0:
            # Not enough history for even one (window, target) pair
            continue

        normalized_prices = prices / prices[0] - 1  # Normalize
        windows = np.stack(
            [normalized_prices[i: i + sequence_length] for i in range(n_windows)]
        )
        # targets[i] is the value right after window i
        targets = normalized_prices[sequence_length:]
        # Shape (n_windows, 1) matches the model's stock_input of shape (1,)
        indices = np.full((n_windows, 1), stock_data['Stock_Index'].iloc[0])

        split = int(n_windows * (1 - test_ratio))
        train_parts.append((windows[:split], targets[:split], indices[:split]))
        test_parts.append((windows[split:], targets[split:], indices[split:]))

    if not train_parts:
        raise ValueError(
            f"No stock has more than {sequence_length} rows; cannot build any window"
        )

    X_train, y_train, stock_indices_train = (
        np.concatenate(parts) for parts in zip(*train_parts)
    )
    X_test, y_test, stock_indices_test = (
        np.concatenate(parts) for parts in zip(*test_parts)
    )
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            "Train/test split produced an empty set "
            f"(train={len(X_train)}, test={len(X_test)}); check test_ratio"
        )

    # Add the trailing feature axis expected by price_input: (samples, steps, 1)
    X_train = X_train[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    # Model:
    # Number of unique stocks and embedding dimensions
    num_stocks = all_data['Stock_Symbol'].nunique()
    embedding_size = 8

    # This function is called many times by the optimizer; drop the state left
    # behind by previously built models so memory does not keep growing.
    keras_backend.clear_session()

    # Define and compile model
    price_input = Input(shape=(sequence_length, 1), name='price_input')
    stock_input = Input(shape=(1,), name='stock_input')
    stock_embedding = Embedding(input_dim=num_stocks, output_dim=embedding_size)(stock_input)
    stock_embedding = Flatten()(stock_embedding)
    # Repeat the stock vector at every time step so it can be joined to the prices
    stock_embedding = RepeatVector(sequence_length)(stock_embedding)
    merged_input = Concatenate(axis=2)([price_input, stock_embedding])
    lstm_out = LSTM(units=128)(merged_input)
    output = Dense(units=1)(lstm_out)
    model = Model(inputs=[price_input, stock_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train model
    model.fit(
        [X_train, stock_indices_train],
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
    )

    # Predict and calculate RMSE. The square root is taken explicitly because
    # the `squared=False` argument is not accepted by newer scikit-learn releases.
    predicted_prices = model.predict([X_test, stock_indices_test], verbose=0).ravel()
    rmse = float(np.sqrt(mean_squared_error(y_test, predicted_prices)))
    print(f"Sequence length {sequence_length} gives RMSE: {rmse}")
    return rmse
print('done')

done


In [None]:
# Search range for the window length; pso expects one bound per dimension, as lists
lb = [10]
ub = [60]

# Particle-swarm search over the window length, scored by train_and_evaluate's RMSE
optimal_sequence_length, optimal_rmse = pso(
    train_and_evaluate,
    lb,
    ub,
    maxiter=5,
)

# Report the best position found and the objective value at that position
print("Optimal sequence length:", optimal_sequence_length)
print("RMSE for optimal sequence length:", optimal_rmse)
