#### Importing libraries

In [None]:
import pandas as pd # for data manipulation
import glob # for reading files in a random order
import matplotlib.pyplot as plt #for plotting
import matplotlib.dates as mdates
import talipp as ta
from talipp.indicators import ADX
import talib as tb
import math as mth
import numpy as np
# importing the random module
import random
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna
import pickle
import os
from sklearn.metrics import confusion_matrix, classification_report
import gc




##### Accessing the databases

In [None]:
# Location of the single-ticker CSV used throughout this notebook
file_path1 = './SingleExampleTrain/AAPL.csv'

# Load the raw price history into a dataframe
appleDb = pd.read_csv(file_path1)
#print(appleDb.to_string()) - to view the original dataframe

# Parse the Date column; utc=True yields timezone-aware timestamps expressed in UTC
appleDb['Date'] = pd.to_datetime(appleDb['Date'], utc=True)

# Keep only the rows from 2008 onwards. The column is already datetime, so it
# is not parsed a second time. .copy() makes the result an independent frame,
# so the column assignments done by later cells do not act on a filtered slice
# of the original (which is what triggers pandas' chained-assignment warning).
appleDb = appleDb[appleDb['Date'].dt.year >= 2008].copy()

print(appleDb.to_string())



The dataset contains the following stock prices for 
- AAPL which stands for the Apple Inc. company. 
- AAL which stands for the American Airlines Group Inc. for Major Airlines
- APA which stands for the Apache Corporation for independent oil and gas

This dataset contains the following information:

Date: This is the date that the stock market was open (i.e. the day that the stock was traded). It's usually listed in a format like "MM/DD/YYYY" (month/day/year).

Open: This is the price at which the stock opened for trading on that particular day. It's the price that the first trade was made at.

High: This is the highest price that the stock reached during trading on that day.

Low: This is the lowest price that the stock reached during trading on that day.

Close: This is the price at which the stock closed for trading on that particular day. It's the price that the last trade was made at.

Volume: This is the total number of shares that were traded on that particular day. It's a measure of how active the market was for that stock on that day.

Dividends: These are payments that companies sometimes make to their shareholders as a way of distributing profits. They're usually paid out on a regular basis (e.g. quarterly) and are typically a percentage of the company's earnings.

Stock Splits: This is when a company decides to divide its existing shares into multiple shares. For example, if a company had 1 million shares outstanding and decided to do a 2-for-1 stock split, there would then be 2 million shares outstanding, but each share would be worth half as much as before. Companies often do stock splits to make their shares more affordable to individual investors.

#### Converting the prices into logs representation

In [None]:
#checking how all the columns are listed in the dataframe
appleDb.columns


In [None]:
#checking how all the rows are listed in the dataframe
appleDb.index



In [None]:
print(appleDb.to_string())


def changeValuesIntoLogs(df):
    """Replace the Open, High, Low and Close columns with their natural logs.

    ``mth.log`` is applied to every value of the four price columns, one
    column at a time. The frame is modified in place and also returned.
    Other columns are left untouched.
    """
    for priceColumn in ('Open', 'High', 'Low', 'Close'):
        df[priceColumn] = df[priceColumn].apply(mth.log)
    return df

# Replace the Open/High/Low/Close columns of appleDb with their natural logs
appleDb = changeValuesIntoLogs(appleDb)

In [None]:
print("After converting with logs")


In [None]:
print(appleDb.to_string())

#### Graphs Representations

Open Graphs - The opening price of the stock on that day. This helps to identify trends in the stock's price and evaluate whether the stock is undervalued or overvalued.

In [None]:
# #plotting the opening price of the stocks

# # create a figure and axis objects with 3 rows and 1 column
# fig, axs = plt.subplots(3, 1, figsize=(8, 10))

# # plot the open prices for each company on each axis object
# axs[0].plot(appleDb['Date'], appleDb['Open'])
# axs[0].set_title('Apple Db')
# axs[1].plot(americanAirlineDb['Date'], americanAirlineDb['Open'])
# axs[1].set_title('American Airline Db')
# axs[2].plot(apacheCorpDb['Date'], apacheCorpDb['Open'])
# axs[2].set_title('Apache Corp Db')

# # adjust the space between subplots
# plt.subplots_adjust(hspace=0.5)

# # add a common x-label and y-label to the collage
# fig.text(0.5, 0.04, 'Date', ha='center')
# fig.text(0.04, 0.5, 'Open', va='center', rotation='vertical')

# # display the plot
# plt.show()

High Graphs - The highest price the stock reached during the day. This helps to identify how volatile the stock is and how much potential upside there is.

In [None]:
# #plotting the high price of the stocks

# # create a figure and axis objects with 3 rows and 1 column
# fig, axs = plt.subplots(3, 1, figsize=(8, 10))

# # plot the high prices for each company on each axis object
# axs[0].plot(appleDb['Date'], appleDb['High'])
# axs[0].set_title('Apple Db')
# axs[1].plot(americanAirlineDb['Date'], americanAirlineDb['High'])
# axs[1].set_title('American Airline Db')
# axs[2].plot(apacheCorpDb['Date'], apacheCorpDb['High'])
# axs[2].set_title('Apache Corp Db')

# # adjust the space between subplots
# plt.subplots_adjust(hspace=0.5)

# # add a common x-label and y-label to the collage
# fig.text(0.5, 0.04, 'Date', ha='center')
# fig.text(0.04, 0.5, 'High', va='center', rotation='vertical')

# # display the plot
# plt.show()

Low Graphs -  The lowest price the stock reached during the day. This helps you evaluate how risky the stock is and how much potential downside there is.

In [None]:
# #plotting the low price of the stocks

# # create a figure and axis objects with 3 rows and 1 column
# fig, axs = plt.subplots(3, 1, figsize=(8, 10))

# # plot the low prices for each company on each axis object
# axs[0].plot(appleDb['Date'], appleDb['Low'])
# axs[0].set_title('Apple Db')
# axs[1].plot(americanAirlineDb['Date'], americanAirlineDb['Low'])
# axs[1].set_title('American Airline Db')
# axs[2].plot(apacheCorpDb['Date'], apacheCorpDb['Low'])
# axs[2].set_title('Apache Corp Db')

# # adjust the space between subplots
# plt.subplots_adjust(hspace=0.5)

# # add a common x-label and y-label to the collage
# fig.text(0.5, 0.04, 'Date', ha='center')
# fig.text(0.04, 0.5, 'Low', va='center', rotation='vertical')

# # display the plot
# plt.show()

Closing Graphs - The closing price of the stock on that day. This helps to evaluate how the stock performed over the course of the day and whether it ended up in positive or negative territory.

In [None]:
# #plotting the closing price of the stocks

# # create a figure and axis objects with 3 rows and 1 column
# fig, axs = plt.subplots(3, 1, figsize=(8, 10))

# # plot the close prices for each company on each axis object
# axs[0].plot(appleDb['Date'], appleDb['Close'])
# axs[0].set_title('Apple Db')
# axs[1].plot(americanAirlineDb['Date'], americanAirlineDb['Close'])
# axs[1].set_title('American Airline Db')
# axs[2].plot(apacheCorpDb['Date'], apacheCorpDb['Close'])
# axs[2].set_title('Apache Corp Db')

# # adjust the space between subplots
# plt.subplots_adjust(hspace=0.5)

# # add a common x-label and y-label to the collage
# fig.text(0.5, 0.04, 'Date', ha='center')
# fig.text(0.04, 0.5, 'Closing', va='center', rotation='vertical')

# # display the plot
# plt.show()

Volume Graphs - The total number of shares of the stock that were traded on that day. This helps to evaluate how active the market is for the stock and how much interest there is from other investors.

In [None]:
# #plotting the trading volume of the stocks

# # create a figure and axis objects with 3 rows and 1 column
# fig, axs = plt.subplots(3, 1, figsize=(8, 10))

# # plot the volumes for each company on each axis object
# axs[0].plot(appleDb['Date'], appleDb['Volume'])
# axs[0].set_title('Apple Db')
# axs[1].plot(americanAirlineDb['Date'], americanAirlineDb['Volume'])
# axs[1].set_title('American Airline Db')
# axs[2].plot(apacheCorpDb['Date'], apacheCorpDb['Volume'])
# axs[2].set_title('Apache Corp Db')

# # adjust the space between subplots
# plt.subplots_adjust(hspace=0.5)

# # add a common x-label and y-label to the collage
# fig.text(0.5, 0.04, 'Date', ha='center')
# fig.text(0.04, 0.5, 'Volume', va='center', rotation='vertical')

# # display the plot
# plt.show()

#### Adding extra features into the dataframe to help the AI stock advisor to make better predictions. 

In [None]:
def priceChange(df):
    """Add a 'Price Change' column: each row's Close minus the previous row's Close.

    The first row has no previous row, so its value is NaN. The frame is
    modified in place and also returned.
    """
    todaysClose = df['Close']
    previousClose = todaysClose.shift(1)  # Close of the row directly above
    df['Price Change'] = todaysClose - previousClose
    return df


Adding the Price Change column records, for each day, the difference between that day's closing price and the previous day's closing price. Because the prices were converted to logs earlier, this difference is the daily log return, which closely approximates the percentage change for small moves. This can provide insight into the daily market sentiment of the stock, since the difference indicates both the direction and the magnitude of the price movement for that day.

In [None]:
def priceVolatility(df):
    """Add a 'Price Volatility' column holding each row's High minus its Low.

    The frame is modified in place and also returned.
    """
    dailyHigh = df['High']
    dailyLow = df['Low']
    df['Price Volatility'] = dailyHigh - dailyLow
    return df



The Price Volatility column captures how much the price of a stock moves up and down over a certain period. The more a stock's price fluctuates (changes frequently), the higher its volatility. 

Note that the Volatility means the amount of uncertainty or risk related to the size of changes in a security's value

Volatility can be caused by various factors, such as changes in market conditions, news events, or investor sentiment. High volatility can indicate that the stock is more risky or uncertain, while low volatility can suggest that the stock is more stable and predictable. 

To work out the volatility of a stock, take the difference between its high and low prices for the day. As stated before, the larger the difference, the higher the volatility. 

In [None]:
#calculating the SMA for the closing price of the stocks

def SMA(df,windowValue):
    """Add an 'SMA' column: ``tb.SMA`` of the Close column over ``windowValue`` rows.

    The frame is modified in place and also returned.
    """
    closingPrices = df['Close']
    simpleAverage = tb.SMA(closingPrices, windowValue)
    df['SMA'] = simpleAverage
    return df



In [None]:
def TEMA(df,windowValue):
    """Add an 'EMA' column: ``tb.EMA`` of the Close column over ``windowValue`` rows.

    NOTE: despite the function name, this calls ``tb.EMA`` (not a triple
    EMA) and stores the result under the column name 'EMA'.
    The frame is modified in place and also returned.
    """
    closingPrices = df['Close']
    exponentialAverage = tb.EMA(closingPrices, windowValue)
    df['EMA'] = exponentialAverage
    return df


In [None]:
# # Plot SMA vs Closing Price
# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,8))
# ax1.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax1.plot(appleDb['Date'],appleDb['SMA'], label='SMA')
# ax1.set_title('SMA vs Closing Price')
# ax1.legend()

# # Plot TEMA vs Closing Price
# ax2.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax2.plot(appleDb['Date'],appleDb['TEMA'], label='TEMA')
# ax2.set_title('TEMA vs Closing Price')
# ax2.legend()

# plt.tight_layout()
# plt.show()

Moving Averages: 


Moving averages are a trend-following indicator that smooths out price fluctuations over a given period of time. They can be used to identify trends and potential reversal points in the stock's price.

The Simple Moving Average (SMA) is useful as it is a way to calculate the average price of a stock over a specific period of time. For example, if you want to calculate the SMA over the last 10 days, you would add up the prices of the stock from each of the last 10 days and divide that sum by 10.

The SMA is called "moving" because as each new day's price is added to the calculation, the oldest price is dropped, and the SMA "moves" to reflect the new set of prices.

The SMA can help to smooth out the daily fluctuations in the stock price and give you a better idea of the overall trend. If the SMA is moving up, it means that the stock price is generally increasing, and if the SMA is moving down, it means that the stock price is generally decreasing.

The simple moving average (SMA) is a calculation that helps to identify trends in the price of a stock over a specific time period. It is calculated by adding up the prices over a certain number of days, and then dividing by the number of days.

For example, if you want to calculate the 20-day SMA, you would add up the closing prices for the past 20 days and then divide by 20.

 The closing price is used because it is the final price of a trading day and is considered to be the most important price point for that day. Using closing prices over a period of time helps to reduce the impact of short-term price fluctuations and provides a more accurate picture of the trend over that time period.

The TEMA moving average is a technical analysis indicator that uses a triple exponential moving average to reduce the lag of the standard exponential moving average (EMA).

To calculate the TEMA, you first calculate a single EMA for a given period, then you calculate a second EMA of that first EMA, and then a third EMA of that second EMA. The formula for the TEMA is:

TEMA = 3 * (EMA1 - EMA2) + EMA3

where EMA1 is the first EMA, EMA2 is the second EMA of the first EMA, and EMA3 is the third EMA of the second EMA.

In [None]:
def ADX(df):
    """Add 'ADX5', 'ADX10' and 'ADX20' columns computed with ``tb.ADX``.

    Each column is ``tb.ADX`` of the High, Low and Close columns with
    ``timeperiod`` set to 5, 10 and 20 respectively. The frame is modified
    in place and also returned.

    NOTE: this definition rebinds the module-level name ``ADX`` that the
    imports at the top of the file take from ``talipp.indicators``.
    """
    highs, lows, closes = df['High'], df['Low'], df['Close']
    for period in (5, 10, 20):
        df[f'ADX{period}'] = tb.ADX(highs, lows, closes, timeperiod=period)
    return df


# Plot Various ADX values
# fig, (ax1, ax2,ax3) = plt.subplots(3, 1, figsize=(10,8))
# ax1.plot(appleDb['Date'],appleDb['High'], label='High Price')
# ax1.plot(appleDb['Date'],appleDb['Low'], label='Low Price')
# ax1.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax1.plot(appleDb['Date'],appleDb['ADX20'], label='ADX20')
# ax1.set_title('ADX20')
# ax1.legend()

# ax2.plot(appleDb['Date'],appleDb['High'], label='High Price')
# ax2.plot(appleDb['Date'],appleDb['Low'], label='Low Price')
# ax2.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax2.plot(appleDb['Date'],appleDb['ADX10'], label='ADX10')
# ax2.set_title('ADX10')
# ax2.legend()

# ax3.plot(appleDb['Date'],appleDb['High'], label='High Price')
# ax3.plot(appleDb['Date'],appleDb['Low'], label='Low Price')
# ax3.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax3.plot(appleDb['Date'],appleDb['ADX5'], label='ADX5')
# ax3.set_title('ADX5')
# ax3.legend()

# plt.tight_layout()
# plt.show()

Average Directional Movement Index(Momentum Indicator)

ADX can be used to help measure the overall strength of a trend. The ADX indicator is an average of expanding price range values. The Average Directional Movement Index (ADX) is a technical indicator that is based on the high, low, and close prices of a stock. The ADX helps to measure the strength of a trend, whether it's up or down, and its overall momentum.

The ADX is calculated using a formula that takes into account the price movements over a specific period of time. The indicator is based on the difference between the high and low prices and the current closing price. The ADX is usually plotted on a chart as a line that ranges between 0 and 100, with a higher value indicating a stronger trend.

To calculate the ADX for a stock, you would first need to determine the time period for which you want to calculate the indicator. This can vary depending on your strategy and the volatility of the stock you are analyzing. Once you have determined the time period, you can then use the high, low, and close prices for that period to calculate the ADX.

In [None]:
def ATR(df):
    """Add 'ATR5' and 'ATR10' columns computed with ``tb.ATR``.

    Each column is ``tb.ATR`` of the High, Low and Close columns with
    ``timeperiod`` set to 5 and 10 respectively. The frame is modified in
    place and also returned.
    """
    highs, lows, closes = df['High'], df['Low'], df['Close']
    for period in (5, 10):
        df[f'ATR{period}'] = tb.ATR(highs, lows, closes, timeperiod=period)
    return df

# ax1.plot(appleDb['Date'],appleDb['Close'], label='Closing Price')
# ax1.plot(appleDb['Date'],appleDb['High'], label='High Price')
# ax1.plot(appleDb['Date'],appleDb['Low'], label='Low Price')
# ax1.plot(appleDb['Date'],appleDb['ATR'], label='ATR',color='red')
# ax1.set_title('ATR vs Closing Price vs High Price vs Low Price')
# ax1.legend()


Average True Range (ATR): ATR is a measure of volatility that takes into account the daily price range of the stock. It can help predict potential price movement and can be useful for setting stop-loss orders.

The Average True Range (ATR) is another technical indicator that is commonly used by traders and investors to measure the volatility of a stock. While the ADX measures the strength of a trend, the ATR measures the volatility of the stock price.

The ATR is calculated using the difference between the high and low prices of a stock over a specific period of time. It takes into account any gaps or limit moves that may have occurred during that time period. The ATR is usually expressed in points or as a percentage of the stock price.

One key difference between the ADX and ATR is that the ADX is used to measure the strength of a trend, while the ATR is used to measure the volatility of the stock price. Both are calculated from the high, low, and close prices: the ATR averages the true range, which compares each day's high and low with the previous day's close so that price gaps are captured.

Both the ADX and ATR are useful indicators that can help traders and investors make informed decisions about buying and selling stocks. The ADX can help to identify strong trends, while the ATR can help to identify potential changes in volatility.

In [None]:
def RSI(df):
    """Add an 'RSI' column: ``tb.RSI`` of the Close column with ``timeperiod=10``.

    The frame is modified in place and also returned.
    """
    closingPrices = df['Close']
    relativeStrength = tb.RSI(closingPrices, timeperiod=10)
    df['RSI'] = relativeStrength
    return df


# appleDb[['RSI']].plot(figsize=(12,10),marker='o')
# plt.axhline(y=30, color='green', linestyle='-')
# plt.axhline(y=70, color='red', linestyle='-')
# x = np.arange(0, len(appleDb['RSI']),0.1)
# appleDb['RSI'].tail(10)
# plt.title("Apple RSI")

Relative Strength Index (RSI): an indicator used in finance to help investors and traders understand whether a stock is overbought or oversold. It is a measure of the stock's recent price changes, and is expressed as a number between 0 and 100.

The RSI works by comparing the average price gains of a stock to its average price losses over a specific time period (usually 14 days). If the average gains are higher than the average losses, the RSI will be higher, indicating that the stock is in an uptrend. If the average losses are higher than the average gains, the RSI will be lower, indicating that the stock is in a downtrend.

Traders use the RSI to identify potential buy and sell signals. When the RSI is above 70, it is considered overbought, which means that the stock may be due for a price correction. When the RSI is below 30, it is considered oversold, which means that the stock may be undervalued and due for a price increase.

In [None]:
# appleDb['Upper_BBand'], appleDb['Mid_BBand'], appleDb['Lower_BBand'] = tb.BBANDS(appleDb['Close'], timeperiod =20)
# appleDb[['Close','Mid_BBand','Upper_BBand','Lower_BBand']].plot(figsize= (12,10))
# plt.title("Apple Bollinger Bands")
# plt.show()

def bbands(df):
    """Add upper/mid/lower Bollinger Band columns for periods 5 and 10.

    For each period, ``tb.BBANDS`` is run on the Close column and its three
    outputs are stored as 'Upper_BBand<N>', 'Mid_BBand<N>' and
    'Lower_BBand<N>'. The frame is modified in place and also returned.
    """
    closingPrices = df['Close']
    for period in (5, 10):
        upperBand, middleBand, lowerBand = tb.BBANDS(closingPrices, timeperiod=period)
        df[f'Upper_BBand{period}'] = upperBand
        df[f'Mid_BBand{period}'] = middleBand
        df[f'Lower_BBand{period}'] = lowerBand
    return df

Bollinger Bands is a tool that helps traders understand how much a stock's price tends to change over time. It consists of three lines on a chart: a middle line that represents the average price of the stock over a certain period (usually 20 days), and two additional lines that are drawn above and below the middle line.

These upper and lower lines are drawn at a distance that represents how much the stock's price has varied in the past. The distance is based on the stock's standard deviation, which is a measure of how much its price has fluctuated in the past.

When the stock's price is moving within the Bollinger Bands, it is considered to be trading within a normal range. However, when the stock's price moves above the upper line, it is considered to be overbought, meaning it may be due for a price correction. Conversely, when the stock's price moves below the lower line, it is considered to be oversold, meaning it may be due for a price increase.

Bollinger Bands can be a useful tool for traders to help identify potential buy and sell signals. 

In [None]:
# Calculate the MACD and Signal Line indicators
# appleDb["MACD"], appleDb["macd_signal"], appleDb["macd_hist"] = tb.MACD(appleDb["Close"], fastperiod=12, slowperiod=26, signalperiod=9)
# appleDb[['MACD','macd_signal','macd_hist']].plot(figsize= (12,10))
# plt.title("Apple MACD")
# plt.show()

def MACD(df):
    """Add 'MACD', 'macd_signal' and 'macd_hist' columns computed with ``tb.MACD``.

    ``tb.MACD`` is run on the Close column with fastperiod=5, slowperiod=20
    and signalperiod=9; its three outputs are stored in that order. The
    frame is modified in place and also returned.
    """
    macdLine, signalLine, histogram = tb.MACD(
        df["Close"], fastperiod=5, slowperiod=20, signalperiod=9
    )
    df["MACD"] = macdLine
    df["macd_signal"] = signalLine
    df["macd_hist"] = histogram
    return df

Moving Average Convergence Divergence (MACD): The MACD is a trend-following indicator that calculates the difference between two moving averages of the stock's price. It can help identify changes in the stock's trend and can be used to generate buy or sell signals.

The MACD is a trend-following indicator that measures the difference between a short-term moving average and a long-term moving average. It is typically used with a 12-day and 26-day moving average, and a 9-day signal line.


In [None]:
# # Calculate the On Balance Volume indicator
# appleDb['OBV'] = tb.OBV(appleDb['Close'], appleDb['Volume'])
# appleDb[['OBV','Close']].plot(figsize= (12,10))
# plt.title("Apple OBV")
# plt.show()
# print(appleDb.to_string())

def OBV(df):
    """Add an 'OBV' column: ``tb.OBV`` of the Close and Volume columns.

    The frame is modified in place and also returned.
    """
    closingPrices = df['Close']
    tradedVolume = df['Volume']
    df['OBV'] = tb.OBV(closingPrices, tradedVolume)
    return df


On Balance Volume (OBV)

In [None]:
# # Calculate the Money Flow Index indicator
# appleDb['MFI'] = tb.MFI(appleDb['High'], appleDb['Low'], appleDb['Close'], appleDb['Volume'])
# appleDb[['MFI','Close']].plot(figsize= (12,10))
# plt.title("Apple MFI")
# plt.show()
# print(appleDb.to_string())

def MFI(df):
    """Add an 'MFI' column: ``tb.MFI`` of the High, Low, Close and Volume columns.

    No ``timeperiod`` is passed, so the library's default applies. The frame
    is modified in place and also returned.
    """
    highs, lows = df['High'], df['Low']
    closes, volumes = df['Close'], df['Volume']
    df['MFI'] = tb.MFI(highs, lows, closes, volumes)
    return df


Money Flow Index (MFI)

#### Prediction of Close Price after a certain amount of days



In [None]:
def closePriceAfter(df):
    """Add 'ClosePriceAfter<N>Days' columns for N in 10, 30, 60, 120 and 365.

    Each column is the Close column shifted up by N rows, so a row holds the
    Close found N rows further down the frame. The last N rows of each
    column are NaN because no later row exists. The frame is modified in
    place and also returned.
    """
    closingPrices = df['Close']
    for horizon in (10, 30, 60, 120, 365):
        df[f'ClosePriceAfter{horizon}Days'] = closingPrices.shift(-horizon)
    return df

def differencePriceAfter(df):
    """Add 'DifferenceAfter<N>Days' columns for N in 10, 30, 60, 120 and 365.

    Each column is 'ClosePriceAfter<N>Days' minus 'Close', so the
    'ClosePriceAfter<N>Days' columns must already exist. The frame is
    modified in place and also returned.
    """
    currentClose = df['Close']
    for horizon in (10, 30, 60, 120, 365):
        futureClose = df[f'ClosePriceAfter{horizon}Days']
        df[f'DifferenceAfter{horizon}Days'] = futureClose - currentClose
    return df



def obtainingExtraFeatures(df,windowValue):
    """Run every feature and target builder over ``df`` and return the result.

    The builders run in a fixed order: price change, price volatility, the
    two moving averages (which receive ``windowValue``), the remaining
    indicators, and finally the future-close and difference target columns.
    The difference columns must come after the future-close columns because
    they are computed from them.
    """
    df = priceVolatility(priceChange(df))

    # Moving-average builders are the only ones that take the window size.
    for addMovingAverage in (SMA, TEMA):
        df = addMovingAverage(df, windowValue)

    # Every remaining builder takes just the frame.
    for addColumns in (ADX, ATR, RSI, bbands, MACD, OBV, MFI,
                       closePriceAfter, differencePriceAfter):
        df = addColumns(df)

    return df


# Add every engineered feature and target column to appleDb; 10 is the window
# handed to the SMA and EMA builders
appleDb = obtainingExtraFeatures(appleDb,10)

# Display the enriched frame in full
print(appleDb.to_string())

#### Splitting the dataset into training and testing datasets

In [None]:
def splittingTheDataset(database,nameOfPredictedColumn,predictionDays,testSize,randomState,shuffle=True):
    """Split the frame into train/test feature and target sets.

    Parameters:
        database: frame holding the feature columns listed below plus the
            target column.
        nameOfPredictedColumn: name of the target column.
        predictionDays: number of trailing rows to drop before splitting;
            these are the rows whose shifted target cannot be known yet.
            0 keeps every row.
        testSize: passed to ``train_test_split`` as ``test_size``.
        randomState: passed to ``train_test_split`` as ``random_state``.
        shuffle: passed to ``train_test_split``. The default (True) keeps the
            original behaviour of sampling test rows at random, which mixes
            test rows in time between training rows. Pass False for a
            chronological split where the test set is the tail of the frame.

    Returns:
        (features_train, features_test, target_train, target_test)

    Raises:
        ValueError: if ``predictionDays`` is negative.
    """
    if predictionDays < 0:
        raise ValueError("predictionDays must be non-negative")
    # iloc[:-0] is iloc[:0], i.e. an empty frame, so only trim for a positive count.
    if predictionDays:
        database = database.iloc[:-predictionDays]
    print(database)

    featureColumns = [
        'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
        'Price Change', 'Price Volatility', 'SMA', 'EMA',
        'ADX5', 'ADX10', 'ADX20', 'ATR5', 'ATR10', 'RSI',
        'Upper_BBand5', 'Mid_BBand5', 'Lower_BBand5',
        'Upper_BBand10', 'Mid_BBand10', 'Lower_BBand10',
        'MACD', 'macd_signal', 'macd_hist', 'OBV', 'MFI',
    ]
    inputFeatureDatabase = database[featureColumns]

    # Double brackets keep the target as a one-column frame rather than a Series.
    targetOutputVariable = database[[nameOfPredictedColumn]]

    # The train/test proportions are set by testSize (e.g. 0.2 gives an 80/20 split).
    inputFeatureDatabase_Train, inputFeatureDatabase_Test, targetOutputVariable_Train, targetOutputVariable_Test = train_test_split(
        inputFeatureDatabase,
        targetOutputVariable,
        test_size=testSize,
        random_state=randomState,
        shuffle=shuffle,
    )

    return inputFeatureDatabase_Train, inputFeatureDatabase_Test, targetOutputVariable_Train, targetOutputVariable_Test

def obtainRecentRecords(database):
    """Return the rows of ``database`` whose Date falls in 2010 or later.

    The Date column of the passed frame is first overwritten in place with
    timezone-aware UTC datetimes; the returned frame is the filtered subset.
    """
    database['Date'] = pd.to_datetime(database['Date'], utc=True)

    # Year of every record, used to build the keep/drop mask
    recordYears = pd.to_datetime(database['Date']).dt.year
    isRecent = recordYears >= 2010

    return database[isRecent]

# Keep only the rows dated 2010 or later, then display the result in full
appleDb = obtainRecentRecords(appleDb)
print(appleDb.to_string())

The train_test_split function splits the input data into two subsets: one for training the model and the other for testing the model. The function returns four outputs, which are:

inputFeatureDatabase_Train: the feature data for training the model.
inputFeatureDatabase_Test: the feature data for testing the model.
targetOutputVariable_Train: the target data (labels) for training the model.
targetOutputVariable_Test: the target data (labels) for testing the model.
So, inputFeatureDatabase_Train and targetOutputVariable_Train are used to train the model, while inputFeatureDatabase_Test and targetOutputVariable_Test are used to evaluate the performance of the model on unseen data.

The random_state=42 parameter in the train_test_split() function sets the random seed for the data splitting process. This ensures that the data is split in the same way every time the code is run, which is useful for reproducibility and debugging purposes. The value 42 is an arbitrary number and can be any non-negative integer.

#### Defining the parameters for the AI Model 

The parameters provided below are used to configure a machine learning model that uses the LightGBM library for gradient boosting. Here's an explanation of each parameter:

task: The task parameter specifies whether the model should be used for training ('train'), prediction ('predict'), or other tasks.

boosting: The boosting parameter specifies the type of boosting algorithm to use. In this case, 'gbdt' stands for Gradient Boosting Decision Tree, which is a popular algorithm for supervised learning tasks like regression and classification.

objective: The objective parameter specifies the loss function to be optimized during training. In this case, 'regression' indicates that the model is being trained to perform a regression task, where the goal is to predict a continuous numerical value.

num_leaves: The num_leaves parameter controls the number of leaves (or nodes) in each decision tree of the gradient boosting algorithm. Increasing this value can lead to more complex models with better performance, but also increases the risk of overfitting.

learning_rate: The learning_rate parameter controls the step size at each iteration of the gradient boosting algorithm. A smaller learning rate can lead to more accurate models, but also requires more iterations to converge.

metric: The metric parameter specifies the evaluation metric(s) to be used during training. In this case, {'l2', 'l1'} indicates that the mean squared error (l2) and mean absolute error (l1) will be used to evaluate the model's performance during training.

verbose: The verbose parameter controls the amount of output printed during training. A value of -1 means that no output will be printed.

Overall, these parameters are used to configure a gradient boosting model that is optimized for a regression task, using a decision tree as the base estimator. The specific values for each parameter may need to be tuned for optimal performance on a given dataset.

In [None]:
# loading data into lightgbm - wrapping the training and testing data in the Dataset objects that LightGBM uses for training and evaluation.
def changingIntoLightGBMformat(inputFeatureDatasetTrain, targetOutputVariable_Train, inputFeatureDatasetTest, targetOutputVariable_Test):
    """Build the LightGBM training and evaluation datasets.

    Returns a pair ``(trainingSet, evaluationSet)``. The evaluation set is
    created with ``reference=trainingSet``.
    """
    trainingSet = lgb.Dataset(inputFeatureDatasetTrain, label=targetOutputVariable_Train)
    evaluationSet = lgb.Dataset(
        inputFeatureDatasetTest,
        label=targetOutputVariable_Test,
        reference=trainingSet,
    )
    return trainingSet, evaluationSet

# fitting the model

def buildModel(params, lgb_ToTrainModel, lgb_evalualtePerformanceDuringTraining, early_stopping_rounds):
    """Train a LightGBM model with early stopping and return it.

    Parameters:
        params: LightGBM parameter dictionary.
        lgb_ToTrainModel: ``lgb.Dataset`` used for training.
        lgb_evalualtePerformanceDuringTraining: ``lgb.Dataset`` evaluated
            after each boosting round.
        early_stopping_rounds: number of rounds without improvement on the
            evaluation set after which training stops.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    # Early stopping is requested through the callback API: the
    # early_stopping_rounds keyword of lgb.train was removed in LightGBM 4,
    # while the callback is accepted by both older and newer releases.
    stopWhenStalled = lgb.early_stopping(stopping_rounds=early_stopping_rounds)
    model = lgb.train(
        params,
        train_set=lgb_ToTrainModel,
        valid_sets=[lgb_evalualtePerformanceDuringTraining],
        callbacks=[stopWhenStalled],
    )
    return model

#predicting the output values
def prediction(model, inputFeatureDatabase_Test):
    """Return ``model.predict`` evaluated on the given feature data."""
    return model.predict(inputFeatureDatabase_Test)


#accuracy check - using the mean squared error and root mean squared error
def calculateMSERmse(targetOutputVariable_Test, predictedOutputValues):
    """Return ``(mse, rmse)`` for the true targets against the predictions.

    The MSE comes from ``mean_squared_error``; the RMSE is its square root.
    """
    meanSquaredError = mean_squared_error(targetOutputVariable_Test, predictedOutputValues)
    rootMeanSquaredError = meanSquaredError ** 0.5
    return meanSquaredError, rootMeanSquaredError





lgb.Dataset(inputFeatureDatabase_Train, targetOutputVariable_Train) is creating a LightGBM dataset object called lgb_ToTrainModel using the training data, inputFeatureDatabase_Train and targetOutputVariable_Train. This dataset will be used to train the LightGBM model.
lgb.Dataset(inputFeatureDatabase_Test, targetOutputVariable_Test, reference=lgb_ToTrainModel) is creating another LightGBM dataset object called lgb_evalualtePerformanceDuringTraining using the testing data, inputFeatureDatabase_Test and targetOutputVariable_Test. This dataset will be used to evaluate the performance of the LightGBM model during training.

Why minimising the loss was chosen rather than maximising the prediction:

Maximizing the prediction means trying to make the model's predictions as accurate as possible, regardless of the cost of making a mistake. In other words, the focus is on optimizing the model's ability to make correct predictions, without considering the cost of getting things wrong. This approach is often used when the consequences of making a mistake are not severe, or when the cost of getting things right is much higher than the cost of getting things wrong.

On the other hand, minimizing the loss means trying to make the model's predictions as accurate as possible, while also minimizing the cost of making a mistake. In this approach, the focus is on optimizing the model's ability to balance the trade-off between making correct predictions and avoiding mistakes that have high costs. This approach is often used when the cost of making a mistake is high, such as in medical diagnosis or financial forecasting.

#### Training the model 

#### Building the Model for 30 Day Prediction

In [None]:
#Model for 30 Day Prediction
def objective(trial, lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days, inputFeatureDatabase_TestModel1_30Days, targetOutputVariable_TestModel1_30Days):
    """Optuna objective for the 30-day model: train one candidate, return its test MSE.

    Only ``num_leaves`` (5-50) and ``learning_rate`` (0.01-0.1) are sampled
    from the trial; every other LightGBM setting is fixed. The candidate is
    trained through ``buildModel`` with 30 early-stopping rounds.
    """
    # Values proposed by Optuna for this trial
    sampledLeaves = trial.suggest_int('num_leaves', 5, 50)
    sampledLearningRate = trial.suggest_float('learning_rate', 0.01, 0.1)

    trialParams = {
        'task': 'train',
        'boosting': 'gbdt',
        'objective': 'regression',
        'num_leaves': sampledLeaves,
        'learning_rate': sampledLearningRate,
        'metric': {'l2', 'l1'},
        'verbose': -1,
    }

    # Fit the candidate and predict on the held-out features
    candidateModel = buildModel(
        trialParams,
        lgb_ToTrainModel1_30Days,
        lgb_evalualtePerformanceDuringTrainingModel1_30Days,
        30,
    )
    candidatePredictions = prediction(candidateModel, inputFeatureDatabase_TestModel1_30Days)

    # Only the MSE is reported back to the study; the RMSE is not used here
    trialMse, _ = calculateMSERmse(targetOutputVariable_TestModel1_30Days, candidatePredictions)
    return trialMse

# Prepare the training and testing data for model 1 - 30 days
(
    inputFeatureDatabase_TrainModel1_30Days,
    inputFeatureDatabase_TestModel1_30Days,
    targetOutputVariable_TrainModel1_30Days,
    targetOutputVariable_TestModel1_30Days,
) = splittingTheDataset(appleDb, 'DifferenceAfter30Days', 30, 0.2, 42)

lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days = changingIntoLightGBMformat(
    inputFeatureDatabase_TrainModel1_30Days,
    targetOutputVariable_TrainModel1_30Days,
    inputFeatureDatabase_TestModel1_30Days,
    targetOutputVariable_TestModel1_30Days,
)

# Search num_leaves / learning_rate for the lowest test MSE over 100 trials
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(
        trial,
        lgb_ToTrainModel1_30Days,
        lgb_evalualtePerformanceDuringTrainingModel1_30Days,
        inputFeatureDatabase_TestModel1_30Days,
        targetOutputVariable_TestModel1_30Days,
    ),
    n_trials=100,
    show_progress_bar=False,
)

best_paramsForModel1 = study.best_params
print("Best Parameters:", best_paramsForModel1)

# Re-run the model with the best parameters.
# study.best_params holds only the values Optuna tuned (num_leaves and
# learning_rate), so the fixed settings used inside objective() are restated
# here; otherwise the refit would silently fall back to LightGBM's defaults
# for metric and verbosity and could differ from the best trial.
final_paramsForModel1 = {
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'verbose': -1,
    **best_paramsForModel1,
}
# The 30 passed to buildModel is the early-stopping patience (rounds without
# improvement on the evaluation set before training stops), not a number of epochs.
best_modelForModel1 = buildModel(final_paramsForModel1, lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days, 30)
predicted_outputForModel1 = prediction(best_modelForModel1, inputFeatureDatabase_TestModel1_30Days)

# Calculate the error with the best model
best_mseForModel1, best_rmseForModel1 = calculateMSERmse(targetOutputVariable_TestModel1_30Days, predicted_outputForModel1)

print("Best MSE:", best_mseForModel1)
print("Best RMSE:", best_rmseForModel1)


#save the model (the file content is a binary pickle despite the .txt name)
filename = "model1.txt"
# The with-block closes the file handle even if pickling fails
with open(filename, 'wb') as modelFile:
    pickle.dump(best_modelForModel1, modelFile)


    

In [None]:
def objective(
    trial,
    lgb_ToTrainModel2_60Days,
    lgb_evalualtePerformanceDuringTrainingModel2_60Days,
    inputFeatureDatabase_TestModel2_60Days,
    targetOutputVariable_TestModel2_60Days,
):
    """Optuna objective for the 60-day model: fit one candidate and score it.

    Samples ``num_leaves`` (integer, 5-50) and ``learning_rate`` (float,
    0.01-0.1) from ``trial``, trains a candidate through ``buildModel`` and
    returns the MSE (the first value returned by ``calculateMSERmse``) of its
    predictions on the supplied test features. The study minimizes this value.
    """
    # Draw the tunable values first, in the same order as the search space.
    sampledLeaves = trial.suggest_int('num_leaves', 5, 50)
    sampledRate = trial.suggest_float('learning_rate', 0.01, 0.1)

    candidateParams = dict(
        task='train',
        boosting='gbdt',
        objective='regression',
        num_leaves=sampledLeaves,
        learning_rate=sampledRate,
        metric={'l2', 'l1'},
        verbose=-1,
    )

    # Fit the candidate and predict on the held-out features.
    candidateModel = buildModel(
        candidateParams,
        lgb_ToTrainModel2_60Days,
        lgb_evalualtePerformanceDuringTrainingModel2_60Days,
        30,
    )
    candidatePredictions = prediction(candidateModel, inputFeatureDatabase_TestModel2_60Days)

    # Only the MSE is used as the objective value; the RMSE is discarded.
    candidateMse, _candidateRmse = calculateMSERmse(
        targetOutputVariable_TestModel2_60Days, candidatePredictions
    )
    return candidateMse

# Prepare the training and testing data for model 2 - 60 days
inputFeatureDatabase_TrainModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TrainModel2_60Days, targetOutputVariable_TestModel2_60Days  = splittingTheDataset(appleDb, 'DifferenceAfter60Days', 60, 0.2, 42)

lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel2_60Days, targetOutputVariable_TrainModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TestModel2_60Days)

# Hyperparameter search: 100 trials, each minimizing the MSE returned by objective().
# NOTE(review): every trial is scored on the same test split that produces the
# "Best MSE/RMSE" printed below, so those figures are optimistic.
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TestModel2_60Days), n_trials=100)

best_paramsForModel2 = study.best_params
print("Best Parameters:", best_paramsForModel2)

# Re-run the model with the best parameters.
# NOTE(review): study.best_params holds only the sampled values (num_leaves,
# learning_rate); the fixed settings used inside objective() are not included.
best_modelForModel2 = buildModel(best_paramsForModel2, lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days, 30)
predicted_outputForModel2 = prediction(best_modelForModel2, inputFeatureDatabase_TestModel2_60Days)

# Calculate the error with the best model
best_mseForModel2, best_rmseForModel2 = calculateMSERmse(targetOutputVariable_TestModel2_60Days, predicted_outputForModel2)

# Print intermediate results
print("Best MSE:", best_mseForModel2)
print("Best RMSE:", best_rmseForModel2)

# Save the model; the with-block guarantees the file is flushed and closed.
filename = "model2.txt"
with open(filename, 'wb') as modelFile:
    pickle.dump(best_modelForModel2, modelFile)

#### Model for 120 days prediction

In [None]:
def objective(
    trial,
    lgb_ToTrainModel3_120Days,
    lgb_evalualtePerformanceDuringTrainingModel3_120Days,
    inputFeatureDatabase_TestModel3_120Days,
    targetOutputVariable_TestModel3_120Days,
):
    """Optuna objective for the 120-day model: fit one candidate and score it.

    Samples ``num_leaves`` (integer, 5-50) and ``learning_rate`` (float,
    0.01-0.1) from ``trial``, trains a candidate through ``buildModel`` and
    returns the MSE (the first value returned by ``calculateMSERmse``) of its
    predictions on the supplied test features. The study minimizes this value.
    """
    # Draw the tunable values first, in the same order as the search space.
    sampledLeaves = trial.suggest_int('num_leaves', 5, 50)
    sampledRate = trial.suggest_float('learning_rate', 0.01, 0.1)

    candidateParams = dict(
        task='train',
        boosting='gbdt',
        objective='regression',
        num_leaves=sampledLeaves,
        learning_rate=sampledRate,
        metric={'l2', 'l1'},
        verbose=-1,
    )

    # Fit the candidate and predict on the held-out features.
    candidateModel = buildModel(
        candidateParams,
        lgb_ToTrainModel3_120Days,
        lgb_evalualtePerformanceDuringTrainingModel3_120Days,
        30,
    )
    candidatePredictions = prediction(candidateModel, inputFeatureDatabase_TestModel3_120Days)

    # Only the MSE is used as the objective value; the RMSE is discarded.
    candidateMse, _candidateRmse = calculateMSERmse(
        targetOutputVariable_TestModel3_120Days, candidatePredictions
    )
    return candidateMse

# Prepare the training and testing data for model 3 - 120 days
inputFeatureDatabase_TrainModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TrainModel3_120Days, targetOutputVariable_TestModel3_120Days  = splittingTheDataset(appleDb, 'DifferenceAfter120Days', 120, 0.2, 42)

lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel3_120Days, targetOutputVariable_TrainModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TestModel3_120Days)

# Hyperparameter search: 100 trials, each minimizing the MSE returned by objective().
# NOTE(review): every trial is scored on the same test split that produces the
# "Best MSE/RMSE" printed below, so those figures are optimistic.
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TestModel3_120Days), n_trials=100)

best_paramsForModel3 = study.best_params
print("Best Parameters:", best_paramsForModel3)

# Re-run the model with the best parameters.
# NOTE(review): study.best_params holds only the sampled values (num_leaves,
# learning_rate); the fixed settings used inside objective() are not included.
best_modelForModel3 = buildModel(best_paramsForModel3, lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days, 30)
predicted_outputForModel3 = prediction(best_modelForModel3, inputFeatureDatabase_TestModel3_120Days)

# Calculate the error with the best model
best_mseForModel3, best_rmseForModel3 = calculateMSERmse(targetOutputVariable_TestModel3_120Days, predicted_outputForModel3)

# Print intermediate results
print("Best MSE:", best_mseForModel3)
print("Best RMSE:", best_rmseForModel3)

# Save the model; the with-block guarantees the file is flushed and closed.
filename = "model3.txt"
with open(filename, 'wb') as modelFile:
    pickle.dump(best_modelForModel3, modelFile)

#### Building the model for Prediction 365 days

In [None]:
def objective(
    trial,
    lgb_ToTrainModel4_365Days,
    lgb_evalualtePerformanceDuringTrainingModel4_365Days,
    inputFeatureDatabase_TestModel4_365Days,
    targetOutputVariable_TestModel4_365Days,
):
    """Optuna objective for the 365-day model: fit one candidate and score it.

    Samples ``num_leaves`` (integer, 5-50) and ``learning_rate`` (float,
    0.01-0.1) from ``trial``, trains a candidate through ``buildModel`` and
    returns the MSE (the first value returned by ``calculateMSERmse``) of its
    predictions on the supplied test features. The study minimizes this value.
    """
    # Draw the tunable values first, in the same order as the search space.
    sampledLeaves = trial.suggest_int('num_leaves', 5, 50)
    sampledRate = trial.suggest_float('learning_rate', 0.01, 0.1)

    candidateParams = dict(
        task='train',
        boosting='gbdt',
        objective='regression',
        num_leaves=sampledLeaves,
        learning_rate=sampledRate,
        metric={'l2', 'l1'},
        verbose=-1,
    )

    # Fit the candidate and predict on the held-out features.
    candidateModel = buildModel(
        candidateParams,
        lgb_ToTrainModel4_365Days,
        lgb_evalualtePerformanceDuringTrainingModel4_365Days,
        30,
    )
    candidatePredictions = prediction(candidateModel, inputFeatureDatabase_TestModel4_365Days)

    # Only the MSE is used as the objective value; the RMSE is discarded.
    candidateMse, _candidateRmse = calculateMSERmse(
        targetOutputVariable_TestModel4_365Days, candidatePredictions
    )
    return candidateMse

# Prepare the training and testing data for model 4 - 365 days
inputFeatureDatabase_TrainModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TrainModel4_365Days, targetOutputVariable_TestModel4_365Days  = splittingTheDataset(appleDb, 'DifferenceAfter365Days', 365, 0.2, 42)

lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel4_365Days, targetOutputVariable_TrainModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TestModel4_365Days)

study = optuna.create_study(direction='minimize')
# n_trials is the number of hyperparameter combinations Optuna tries.
# More trials explore more of the search space and may find better values, but
# each trial trains and evaluates a model, so the search takes longer.
# Fewer trials run faster but give Optuna fewer chances to find a good setting.
# NOTE(review): every trial is scored on the same test split that produces the
# "Best MSE/RMSE" printed below, so those figures are optimistic.
study.optimize(lambda trial: objective(trial, lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TestModel4_365Days), n_trials=100)

best_paramsForModel4 = study.best_params
print("Best Parameters:", best_paramsForModel4)

# Re-run the model with the best parameters.
# NOTE(review): study.best_params holds only the sampled values (num_leaves,
# learning_rate); the fixed settings used inside objective() are not included.
best_modelForModel4 = buildModel(best_paramsForModel4, lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days, 30)
predicted_outputForModel4 = prediction(best_modelForModel4, inputFeatureDatabase_TestModel4_365Days)

# Calculate the error with the best model
best_mseForModel4, best_rmseForModel4 = calculateMSERmse(targetOutputVariable_TestModel4_365Days, predicted_outputForModel4)

# Print intermediate results
print("Best MSE:", best_mseForModel4)
print("Best RMSE:", best_rmseForModel4)

# Save the 365-day model. The previous code pickled best_modelForModel3 here,
# so model4.txt silently contained the 120-day model.
filename = "model4.txt"
with open(filename, 'wb') as modelFile:
    pickle.dump(best_modelForModel4, modelFile)

Each of the cells above makes predictions on the test set using the trained model. The predicted values are referred to below as `y_pred`.
It then calculates the mean squared error (MSE) and the root mean squared error (RMSE) between the true target values (`y_test`) and the predicted values (`y_pred`).
Finally, it prints the MSE and RMSE.
In simple terms, the code checks how well the trained model performs on the test set. MSE and RMSE measure how close the predicted values are to the true values: the lower the MSE or RMSE, the better the model performs.

In [None]:
# visualizing in a plot
def plottingTheDifferenceBetweenPredictionAndActualValues(targetOutputVariable_Test, predictedOutputValues, title="Prediction of Closing Price After 30 Days"):
    """Plot the actual test targets against the model's predictions.

    Parameters
    ----------
    targetOutputVariable_Test : sized sequence / DataFrame
        True target values; its length defines the x axis (0..len-1).
    predictedOutputValues : sequence
        Predicted values, plotted against the same x axis.
    title : str, optional
        Figure title. Defaults to the 30-day title that used to be
        hard-coded, so existing two-argument calls are unaffected.
    """
    # Samples are plotted by position, not by date.
    x_ax = range(len(targetOutputVariable_Test))
    plt.figure(figsize=(9, 4))
    plt.plot(x_ax, targetOutputVariable_Test, label="original")
    plt.plot(x_ax, predictedOutputValues, label="predicted")
    plt.title(title)

    plt.ylabel('Price')
    plt.legend(loc='best',fancybox=True, shadow=True)
    plt.grid(True)
    plt.show()

plottingTheDifferenceBetweenPredictionAndActualValues(targetOutputVariable_TestModel1_30Days, predicted_outputForModel1)

In [None]:
# plotting feature importance
def featureImportancePlot(model,title):
    """Draw LightGBM's feature-importance bar chart for ``model``.

    ``title`` becomes the axes title. The chart is shown immediately and
    nothing is returned.
    """
    importanceFigure, importanceAxes = plt.subplots(figsize=(8, 6))
    lgb.plot_importance(model, ax=importanceAxes, height=.70)
    importanceAxes.set_title(title)
    # Widen the left margin so feature names on the y-axis are not clipped.
    importanceFigure.subplots_adjust(left=0.3)
    plt.show()


featureImportancePlot(
    best_modelForModel1,
    'Feature Importance for 30 Days Prediction',
)

#### Loading the model and testing on recent data

In [None]:
def splittingTheDatasetOfTheUpdatedInfo(database,nameOfPredictedColumn,predictionDays):
    """Split a feature-enriched frame into model inputs and the target column.

    Parameters
    ----------
    database : pandas.DataFrame
        Frame containing every feature column listed below plus
        ``nameOfPredictedColumn``.
    nameOfPredictedColumn : str
        Name of the target column to return.
    predictionDays : int
        Number of trailing rows to drop before splitting. 0 keeps every row.

    Returns
    -------
    (inputFeatureDatabase, targetOutputVariable) : tuple of DataFrames
        The feature columns and the single-column target frame, row-aligned.

    Raises
    ------
    ValueError
        If ``predictionDays`` is negative.
    """
    if predictionDays < 0:
        raise ValueError("predictionDays must be non-negative, got %r" % (predictionDays,))
    # iloc[:-0] would return an EMPTY frame, so only trim for a positive horizon.
    if predictionDays > 0:
        database = database.iloc[:-predictionDays]
    print(database)
    inputFeatureDatabase = database[['Open','High','Low','Close','Volume','Dividends','Stock Splits','Price Change','Price Volatility','SMA','EMA','ADX5','ADX10','ADX20',
                                     'ATR5','ATR10','RSI', 'Upper_BBand5','Mid_BBand5','Lower_BBand5','Upper_BBand10','Mid_BBand10','Lower_BBand10','MACD','macd_signal','macd_hist',
                                     'OBV','MFI']]

    targetOutputVariable = database[[nameOfPredictedColumn]]

    return inputFeatureDatabase, targetOutputVariable

def gettingRecentPredictions(filename,file_path1,predictionDays,predictDays,windowValue):
    """Load a pickled model and predict on a freshly prepared CSV.

    Parameters
    ----------
    filename : str
        Path of the pickled model file.
    file_path1 : str
        Path of the CSV to read; it must contain a ``Date`` column.
    predictionDays : str
        Despite its name, this is the NAME of the target column (for example
        ``'DifferenceAfter30Days'``); it is forwarded as the target-column
        argument of ``splittingTheDatasetOfTheUpdatedInfo``.
    predictDays : int
        Number of trailing rows dropped before predicting.
    windowValue : int
        Forwarded to ``obtainingExtraFeatures``.

    Returns
    -------
    (predictions, df)
        ``predictions`` has one entry per row of ``df`` except the trimmed
        tail, so ``predictions[i]`` corresponds to ``df.iloc[i]``. ``df`` is
        the full prepared frame (not trimmed).
    """
    # NOTE(review): unpickling runs arbitrary code; only load model files that
    # this notebook produced itself.
    with open(filename, 'rb') as modelFile:
        loadedModel1 = pickle.load(modelFile)

    df = pd.read_csv(file_path1)

    # utc=True normalizes timezone-aware timestamps to UTC.
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Keep rows from 2008 onwards, then run the same preparation as training.
    df = df[df['Date'].dt.year >= 2008]
    df = changeValuesIntoLogs(df)
    df = obtainingExtraFeatures(df,windowValue)
    df = obtainRecentRecords(df)

    # The target frame is not needed for prediction, only the features.
    lastestInput, _targetOutput = splittingTheDatasetOfTheUpdatedInfo(df,predictionDays,predictDays)

    predictions = loadedModel1.predict(lastestInput)

    # Print the whole array without permanently changing numpy's global
    # print options for the rest of the notebook.
    with np.printoptions(threshold=np.inf):
        print(predictions)
    return predictions,df

def determineWorthOfInvestment(df,predictions,actualClosePrice,differencePrice,whenToPredict):
    """Print the context of one prediction and return that prediction.

    ``predictions`` is expected to be row-aligned with the START of ``df``
    (``predictions[i]`` belongs to ``df.iloc[i]``), which is what
    ``gettingRecentPredictions`` returns: it predicts on ``df`` minus its
    trailing rows. The previous code read the context from the END of ``df``,
    so the printed date and prices belonged to a different day than the
    returned prediction whenever ``df`` was longer than ``predictions``.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared frame with ``Date``, ``Close`` and the two columns named below.
    predictions : sequence
        Model outputs aligned with the first ``len(predictions)`` rows of ``df``.
    actualClosePrice : str
        Column holding the realized later close price.
    differencePrice : str
        Column holding the realized difference (the model target).
    whenToPredict : int
        1 selects the most recent prediction, 2 the one before it, and so on.

    Returns
    -------
    The selected prediction, i.e. ``predictions[-whenToPredict]``.

    Raises
    ------
    ValueError
        If ``whenToPredict`` is not between 1 and ``len(predictions)``.
    """
    predictionCount = len(predictions)
    if not 1 <= whenToPredict <= predictionCount:
        raise ValueError(
            "whenToPredict must be between 1 and %d, got %r" % (predictionCount, whenToPredict)
        )

    print(df)
    # Absolute row position shared by df and predictions.
    rowPosition = predictionCount - whenToPredict
    selectedPrediction = predictions[rowPosition]

    # Context of the day the prediction was made for
    current_close_price = df['Close'].iloc[rowPosition]
    current_After_closePrice = df[actualClosePrice].iloc[rowPosition]
    differencePriceAmount = df[differencePrice].iloc[rowPosition]
    print("Date:",df['Date'].iloc[rowPosition] )
    print("Price:", current_close_price)
    print("After Price:", current_After_closePrice)
    print("Actual Diff:", differencePriceAmount)

    # Informational only: the threshold is applied in getPredictionValue.
    print("Threshold:", 0.02)
    print("Predicted Diff :", selectedPrediction)

    return selectedPrediction

    
def getPredictionValue(predictionValue, threshold=0.02):
    """Turn a predicted price difference into a trading signal.

    Parameters
    ----------
    predictionValue : float
        Predicted difference produced by one of the models.
    threshold : float, optional
        Half-width of the neutral band. Defaults to 0.02, the value that was
        previously hard-coded.

    Returns
    -------
    str
        "BUY" when the prediction is strictly above ``threshold``, "SELL" when
        it is strictly below ``-threshold``, otherwise "HOLD" (this includes
        values exactly on either boundary).
    """
    if predictionValue > threshold:
        return "BUY"
    if predictionValue < -threshold:
        return "SELL"
    return "HOLD"


# Score the updated AAPL file with each of the four saved horizon models.
# latestDataset is overwritten by every call; the last one (365 days) wins.
updatedAppleCsv = './SingleExampleUpdated/AAPL.csv'
indicatorWindow = 10

predictions30Days, latestDataset = gettingRecentPredictions(
    "./model1.txt", updatedAppleCsv, 'DifferenceAfter30Days', 30, indicatorWindow
)
predictions60Days, latestDataset = gettingRecentPredictions(
    "./model2.txt", updatedAppleCsv, 'DifferenceAfter60Days', 60, indicatorWindow
)
predictions120Days, latestDataset = gettingRecentPredictions(
    "./model3.txt", updatedAppleCsv, 'DifferenceAfter120Days', 120, indicatorWindow
)
predictions365Days, latestDataset = gettingRecentPredictions(
    "./model4.txt", updatedAppleCsv, 'DifferenceAfter365Days', 365, indicatorWindow
)
print(latestDataset)

#### Stating whether to invest or not 

In [None]:
# Pick the most recent prediction (whenToPredict=1) of every horizon model.
worth30DayModel = determineWorthOfInvestment(
    latestDataset, predictions30Days, 'ClosePriceAfter30Days', 'DifferenceAfter30Days', 1
)
worth60DayModel = determineWorthOfInvestment(
    latestDataset, predictions60Days, 'ClosePriceAfter60Days', 'DifferenceAfter60Days', 1
)
worth120DayModel = determineWorthOfInvestment(
    latestDataset, predictions120Days, 'ClosePriceAfter120Days', 'DifferenceAfter120Days', 1
)
worth365DayModel = determineWorthOfInvestment(
    latestDataset, predictions365Days, 'ClosePriceAfter365Days', 'DifferenceAfter365Days', 1
)

# Translate each predicted difference into a signal, shortest horizon first.
for horizonWorth in (worth30DayModel, worth60DayModel, worth120DayModel, worth365DayModel):
    predictedValue = getPredictionValue(horizonWorth)
    print("Investing as predicted value is: ", predictedValue)


    

#### Accessing multiple dataframes


In [None]:
# Specify the directory path where your files are located
directory = './TrainingDatasets'

# Train and save one 30-day model per CSV file found in the directory.
for filename in os.listdir(directory):
    # Only CSV files are treated as datasets (adjust for other formats).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)  # Create the full file path
    df_name = os.path.splitext(filename)[0]  # Dataset name, used as model-file prefix

    # Load the dataframe from the CSV file
    df = pd.read_csv(file_path)
    # Convert Date column to datetime object
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Filter rows based on condition - starting from the year 2008
    df = df[df['Date'].dt.year >= 2008]

    df = changeValuesIntoLogs(df)
    df = obtainingExtraFeatures(df, 10)
    # Use THIS file's frame. The previous code passed appleDb here and to
    # splittingTheDataset, so every ticker's model was trained on Apple data.
    df = obtainRecentRecords(df)

    # -------------------------------------------------------------------Model for 30 Day Prediction ------------------------------------------------------------------------

    # Prepare the training and testing data for model 1 - 30 days
    inputFeatureDatabase_TrainModel1_30Days, inputFeatureDatabase_TestModel1_30Days, targetOutputVariable_TrainModel1_30Days, targetOutputVariable_TestModel1_30Days  = splittingTheDataset(df, 'DifferenceAfter30Days', 30, 0.2, 42)

    lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel1_30Days, targetOutputVariable_TrainModel1_30Days, inputFeatureDatabase_TestModel1_30Days, targetOutputVariable_TestModel1_30Days)

    # objective is whichever definition was executed last in the notebook;
    # all four definitions in this file have the same body.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days, inputFeatureDatabase_TestModel1_30Days, targetOutputVariable_TestModel1_30Days), n_trials=30, show_progress_bar=False)

    best_paramsForModel1 = study.best_params

    # Re-run the model with the best parameters
    best_modelForModel1 = buildModel(best_paramsForModel1, lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days, 30)
    predicted_outputForModel1 = prediction(best_modelForModel1, inputFeatureDatabase_TestModel1_30Days)

    # Calculate the error with the best model
    best_mseForModel1, best_rmseForModel1 = calculateMSERmse(targetOutputVariable_TestModel1_30Days, predicted_outputForModel1)

    # Save the model; the with-block guarantees the file is closed.
    model_filename = df_name + "model1.txt"
    with open(model_filename, 'wb') as modelFile:
        pickle.dump(best_modelForModel1, modelFile)

    # Release the large per-file objects before starting the next file.
    del df, lgb_ToTrainModel1_30Days, lgb_evalualtePerformanceDuringTrainingModel1_30Days
    gc.collect()


        

In [None]:
# Specify the directory path where your files are located
directory = './TrainingDatasets'

# Train and save one 60-day model per CSV file found in the directory.
for filename in os.listdir(directory):
    # Only CSV files are treated as datasets (adjust for other formats).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)  # Create the full file path
    df_name = os.path.splitext(filename)[0]  # Dataset name, used as model-file prefix

    # Load the dataframe from the CSV file
    df = pd.read_csv(file_path)
    # Convert Date column to datetime object
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Filter rows based on condition - starting from the year 2008
    df = df[df['Date'].dt.year >= 2008]

    df = changeValuesIntoLogs(df)
    df = obtainingExtraFeatures(df, 10)
    # Use THIS file's frame. The previous code passed appleDb here and to
    # splittingTheDataset, so every ticker's model was trained on Apple data.
    df = obtainRecentRecords(df)

    # ----------------------------------------------------------------------------60 day model---------------------------------------------------------------------------------

    # Prepare the training and testing data for model 2 - 60 days
    inputFeatureDatabase_TrainModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TrainModel2_60Days, targetOutputVariable_TestModel2_60Days  = splittingTheDataset(df, 'DifferenceAfter60Days', 60, 0.2, 42)

    lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel2_60Days, targetOutputVariable_TrainModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TestModel2_60Days)

    # objective is whichever definition was executed last in the notebook;
    # all four definitions in this file have the same body.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days, inputFeatureDatabase_TestModel2_60Days, targetOutputVariable_TestModel2_60Days), n_trials=30, show_progress_bar=False)

    best_paramsForModel2 = study.best_params

    # Re-run the model with the best parameters
    best_modelForModel2 = buildModel(best_paramsForModel2, lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days, 30)
    predicted_outputForModel2 = prediction(best_modelForModel2, inputFeatureDatabase_TestModel2_60Days)

    # Calculate the error with the best model
    best_mseForModel2, best_rmseForModel2 = calculateMSERmse(targetOutputVariable_TestModel2_60Days, predicted_outputForModel2)

    # Save the model; the with-block guarantees the file is closed.
    model_filename = df_name + "model2.txt"
    with open(model_filename, 'wb') as modelFile:
        pickle.dump(best_modelForModel2, modelFile)

    # Release the large per-file objects before starting the next file.
    del df, lgb_ToTrainModel2_60Days, lgb_evalualtePerformanceDuringTrainingModel2_60Days
    gc.collect()


In [None]:
# Specify the directory path where your files are located
directory = './TrainingDatasets'

# Train and save one 120-day model per CSV file found in the directory.
for filename in os.listdir(directory):
    # Only CSV files are treated as datasets (adjust for other formats).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)  # Create the full file path
    df_name = os.path.splitext(filename)[0]  # Dataset name, used as model-file prefix

    # Load the dataframe from the CSV file
    df = pd.read_csv(file_path)
    # Convert Date column to datetime object
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Filter rows based on condition - starting from the year 2008
    df = df[df['Date'].dt.year >= 2008]

    df = changeValuesIntoLogs(df)
    df = obtainingExtraFeatures(df, 10)
    # Use THIS file's frame. The previous code passed appleDb here and to
    # splittingTheDataset, so every ticker's model was trained on Apple data.
    df = obtainRecentRecords(df)

    # -------------------------------------------------------------------------120 day model---------------------------------------------------------------------------------------

    # Prepare the training and testing data for model 3 - 120 days
    inputFeatureDatabase_TrainModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TrainModel3_120Days, targetOutputVariable_TestModel3_120Days  = splittingTheDataset(df, 'DifferenceAfter120Days', 120, 0.2, 42)

    lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel3_120Days, targetOutputVariable_TrainModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TestModel3_120Days)

    # objective is whichever definition was executed last in the notebook;
    # all four definitions in this file have the same body.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days, inputFeatureDatabase_TestModel3_120Days, targetOutputVariable_TestModel3_120Days), n_trials=30, show_progress_bar=False)

    best_paramsForModel3 = study.best_params

    # Re-run the model with the best parameters
    best_modelForModel3 = buildModel(best_paramsForModel3, lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days, 30)
    predicted_outputForModel3 = prediction(best_modelForModel3, inputFeatureDatabase_TestModel3_120Days)

    # Calculate the error with the best model
    best_mseForModel3, best_rmseForModel3 = calculateMSERmse(targetOutputVariable_TestModel3_120Days, predicted_outputForModel3)

    # Save the model; the with-block guarantees the file is closed.
    model_filename = df_name + "model3.txt"
    with open(model_filename, 'wb') as modelFile:
        pickle.dump(best_modelForModel3, modelFile)

    # Release the large per-file objects before starting the next file.
    del df, lgb_ToTrainModel3_120Days, lgb_evalualtePerformanceDuringTrainingModel3_120Days
    gc.collect()


In [None]:
# Specify the directory path where your files are located
directory = './TrainingDatasets'

# Train and save one 365-day model per CSV file found in the directory.
for filename in os.listdir(directory):
    # Only CSV files are treated as datasets (adjust for other formats).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)  # Create the full file path
    df_name = os.path.splitext(filename)[0]  # Dataset name, used as model-file prefix

    # Load the dataframe from the CSV file
    df = pd.read_csv(file_path)
    # Convert Date column to datetime object
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Filter rows based on condition - starting from the year 2008
    df = df[df['Date'].dt.year >= 2008]

    df = changeValuesIntoLogs(df)
    df = obtainingExtraFeatures(df, 10)
    # Use THIS file's frame. The previous code passed appleDb here and to
    # splittingTheDataset, so every ticker's model was trained on Apple data.
    df = obtainRecentRecords(df)

    # -------------------------------------------------------------------365 day model----------------------------------------------------------------------------------------

    # Prepare the training and testing data for model 4 - 365 days
    inputFeatureDatabase_TrainModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TrainModel4_365Days, targetOutputVariable_TestModel4_365Days  = splittingTheDataset(df, 'DifferenceAfter365Days', 365, 0.2, 42)

    lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days = changingIntoLightGBMformat(inputFeatureDatabase_TrainModel4_365Days, targetOutputVariable_TrainModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TestModel4_365Days)

    # objective is whichever definition was executed last in the notebook;
    # all four definitions in this file have the same body.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days, inputFeatureDatabase_TestModel4_365Days, targetOutputVariable_TestModel4_365Days), n_trials=30, show_progress_bar=False)

    best_paramsForModel4 = study.best_params

    # Re-run the model with the best parameters
    best_modelForModel4 = buildModel(best_paramsForModel4, lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days, 30)
    predicted_outputForModel4 = prediction(best_modelForModel4, inputFeatureDatabase_TestModel4_365Days)

    # Calculate the error with the best model
    best_mseForModel4, best_rmseForModel4 = calculateMSERmse(targetOutputVariable_TestModel4_365Days, predicted_outputForModel4)

    # Save the MODEL. The previous code pickled best_mseForModel4 (a number),
    # so the saved file could not be used for prediction.
    model_filename = df_name + "model4.txt"
    with open(model_filename, 'wb') as modelFile:
        pickle.dump(best_modelForModel4, modelFile)

    # Release the large per-file objects before starting the next file.
    del df, lgb_ToTrainModel4_365Days, lgb_evalualtePerformanceDuringTrainingModel4_365Days
    gc.collect()

#### Testing on new data

In [None]:
# Specify the directory path where your files are located
directory = './UpdatedDatasets'
# Latest predicted difference per dataset name, one dict per horizon model;
# used below to find the best stock for each model.
worth30DayModel_dict = {}
worth60DayModel_dict = {}
worth120DayModel_dict = {}
worth365DayModel_dict = {}

# Iterate over the file list and perform operations on each file
for filename in os.listdir(directory):
    # Only CSV files are treated as datasets (adjust for other formats).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)  # Create the full file path
    df_name = os.path.splitext(filename)[0]  # Get the name of the dataframe

    # Load the dataframe from the CSV file (only used for the preview below)
    df = pd.read_csv(file_path)

    # Model files for this dataset. The training cells save them as
    # "<name>modelN.txt" in the working directory; the previous ".pkl" suffix
    # pointed at files that are never written.
    model1 = "./" + df_name + "model1.txt"
    model2 = "./" + df_name + "model2.txt"
    model3 = "./" + df_name + "model3.txt"
    model4 = "./" + df_name + "model4.txt"

    # Get the latest data and compute the extra features (technical indicators).
    # latestDataset is overwritten by every call; each call prepares the same file.
    predictions30Days, latestDataset = gettingRecentPredictions(model1,file_path,'DifferenceAfter30Days',30,10)
    predictions60Days, latestDataset = gettingRecentPredictions(model2,file_path,'DifferenceAfter60Days',60,10)
    predictions120Days, latestDataset = gettingRecentPredictions(model3,file_path,'DifferenceAfter120Days',120,10)
    predictions365Days, latestDataset = gettingRecentPredictions(model4,file_path,'DifferenceAfter365Days',365,10)

    # determineWorthOfInvestment takes five arguments; the realized close-price
    # column was missing before, which raised TypeError on the first file.
    worth30DayModel_dict[df_name] = determineWorthOfInvestment(latestDataset,predictions30Days,'ClosePriceAfter30Days','DifferenceAfter30Days',1)
    worth60DayModel_dict[df_name] = determineWorthOfInvestment(latestDataset,predictions60Days,'ClosePriceAfter60Days','DifferenceAfter60Days',1)
    worth120DayModel_dict[df_name] = determineWorthOfInvestment(latestDataset,predictions120Days,'ClosePriceAfter120Days','DifferenceAfter120Days',1)
    worth365DayModel_dict[df_name] = determineWorthOfInvestment(latestDataset,predictions365Days,'ClosePriceAfter365Days','DifferenceAfter365Days',1)

    print(df.head())

# The dataset with the largest predicted difference wins for each horizon.
best_df_30DayModel = max(worth30DayModel_dict, key=worth30DayModel_dict.get)
best_df_60DayModel = max(worth60DayModel_dict, key=worth60DayModel_dict.get)
best_df_120DayModel = max(worth120DayModel_dict, key=worth120DayModel_dict.get)
best_df_365DayModel = max(worth365DayModel_dict, key=worth365DayModel_dict.get)

print("30 Day Model: Best DF is ", best_df_30DayModel, " with value ", worth30DayModel_dict[best_df_30DayModel])
print("60 Day Model: Best DF is ", best_df_60DayModel, " with value ", worth60DayModel_dict[best_df_60DayModel])
print("120 Day Model: Best DF is ", best_df_120DayModel, " with value ", worth120DayModel_dict[best_df_120DayModel])
print("365 Day Model: Best DF is ", best_df_365DayModel, " with value ", worth365DayModel_dict[best_df_365DayModel])