<a href="https://colab.research.google.com/github/itsmuditt/Stock_Price_Prediction/blob/main/DL_Stock_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## *`Imports and Data Call through 'yfinance' API`*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pip
import time
import datetime

In [None]:
# The entered value is passed to yfinance as the symbol to download and is reused in CSV file
# names, so whitespace accidentally typed around it is stripped.
company = input("Enter the name of company you want Prediction of: ").strip()

In [None]:
# Date window requested from the price API (start first, then end).
# NOTE(review): the original author marked these dates as "not accurate" — confirm the intended range.
start_date = datetime.datetime(year=2010, month=1, day=1)
end_date = datetime.datetime(year=2023, month=10, day=21)

In [None]:
import yfinance as yf

# Download the daily price history for the requested symbol over [start_date, end_date).
# auto_adjust=False is passed explicitly: the preprocessing section selects the 'Adj Close'
# column, which yfinance only returns when prices are not auto-adjusted (recent releases
# default to auto_adjust=True and would drop that column).
# NOTE(review): recent yfinance releases may also return multi-level columns even for a single
# symbol — confirm the CSV written below still has a flat 'Date' / 'Adj Close' / 'Volume' header.
data = yf.download(company, start=start_date, end=end_date, auto_adjust=False)

In [None]:
# Persist the downloaded price history to '<company>.csv'; the preprocessing section reads it back from disk.
data.to_csv(f'{company}.csv')



---



## *`Data Preprocessing`*

In [None]:
# Reload the raw prices saved above.
# parse_dates=True only asks pandas to parse the *index*; since no index column is set here the
# 'Date' column was left as plain strings. Naming the column makes the parsing actually happen.
prices_df = pd.read_csv(f'{company}.csv', parse_dates=['Date'])

In [None]:
# Keep only the columns used downstream. `.copy()` makes close_df an independent frame so that
# adding columns to it does not trigger pandas' SettingWithCopyWarning.
close_df = prices_df[['Date', 'Adj Close', 'Volume']].copy()

# 'Next Day' is the following row's adjusted close (the value the model will learn to predict).
close_df['Next Day'] = close_df['Adj Close'].shift(-1)

# The last row has no following day, so its 'Next Day' is NaN: drop it.
# Dropping by position (rather than by the label len-1, in place) also works when the index is
# not a clean 0..n-1 range and keeps the cell safe on an empty frame.
close_df = close_df.drop(close_df.index[-1:])

In [None]:
# Day-over-day move: the next row's adjusted close minus the current row's adjusted close.
close_df['Price_Diff'] = close_df['Next Day'].sub(close_df['Adj Close'])

# The output name is fixed (it does not depend on `company`); later sections read this exact file.
close_df.to_csv('prices_infosys_v2.csv')

```
Dataset for prices has been created and saved.
```

## Computing News Sentiment scores according to Stock Market Dates



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the sentiment scores and the price dataset written by the preprocessing section.
# index_col=False tells pandas not to use the first CSV column as the index.
financial_df = pd.read_csv('financial_scores.csv', index_col=False)
prices_df = pd.read_csv('prices_infosys_v2.csv', index_col=False)

In [None]:
# Inspect the column names of the sentiment data (the next cell drops 'Unnamed: 0').
financial_df.columns

In [None]:
# Remove the 'Unnamed: 0' column from the sentiment data.
financial_df = financial_df.drop(columns=['Unnamed: 0'])

In [None]:
# Bring both 'Date' columns to the same 'dd-mm-YYYY' string form so that they can be compared
# for equality when the two datasets are aligned.

# Sentiment data: parsed with an explicit 'dd-mm-YYYY' format, then rendered back as text.
sentiment_dates = pd.to_datetime(financial_df['Date'], format='%d-%m-%Y')
financial_df['Date'] = sentiment_dates.dt.strftime('%d-%m-%Y')

# Price data: parsed with pandas' default format inference, then rendered in the same form.
price_dates = pd.to_datetime(prices_df['Date'])
prices_df['Date'] = price_dates.dt.strftime('%d-%m-%Y')


In [None]:
def rearrange_dates(df_a, df_b):
    """Collapse sentiment rows onto the dates that appear in the price data.

    Both frames are walked once, in row order. At each step the *current* sentiment row is
    compared against the *next* price date:

    * if the next sentiment row falls on the next price date, the current row's score is
      emitted unchanged;
    * otherwise the current row and every following sentiment row up to (not including) the
      one matching the next price date are merged into a single count-weighted average,
      emitted under the current row's date.

    Parameters
    ----------
    df_a : pandas.DataFrame
        Sentiment data; the columns 'Date', 'sentiment_score' and 'count' are read.
    df_b : pandas.DataFrame
        Price data; only its 'Date' column is read.

    Returns
    -------
    pandas.DataFrame
        Columns 'sentiment_score' and 'Date', one row per emitted score.

    Notes
    -----
    * Dates are compared for equality only, so both 'Date' columns must use the same format.
    * The first rows of both frames are treated as the same day without being checked.
    * The sentiment row that is current when either frame runs out is not emitted.
    * NOTE(review): if a price date never appears in ``df_a``, the merge step consumes the
      rest of ``df_a`` into one average — confirm the inputs rule this out.
    """
    sentiment_rows = df_a.iterrows()
    price_rows = df_b.iterrows()

    scores = []
    dates = []

    # iterrows() yields (index, row) pairs; None marks an exhausted iterator.
    current = next(sentiment_rows, None)
    next(price_rows, None)  # the first price row is only stepped over, never inspected

    for _ in range(len(df_a)):
        upcoming = next(sentiment_rows, None)
        upcoming_price = next(price_rows, None)
        if upcoming is None or upcoming_price is None:
            break

        row = current[1]
        next_price_date = upcoming_price[1]['Date']

        if upcoming[1]['Date'] == next_price_date:
            # The following sentiment row already lands on the next price date.
            scores.append(row['sentiment_score'])
            dates.append(row['Date'])
        else:
            # Merge this row with the rows in between into a count-weighted mean.
            weighted_total = row['sentiment_score'] * row['count']
            total_count = row['count']
            while upcoming is not None and upcoming[1]['Date'] != next_price_date:
                weighted_total = weighted_total + upcoming[1]['sentiment_score'] * upcoming[1]['count']
                total_count = total_count + upcoming[1]['count']
                upcoming = next(sentiment_rows, None)
            scores.append(weighted_total / total_count)
            dates.append(row['Date'])

        # Continue from the sentiment row that matched the next price date (or None at the end).
        current = upcoming

    return pd.DataFrame({'sentiment_score': scores, 'Date': dates})

In [None]:
# Align the sentiment scores with the price dates and save the result for the next section.
rearranged_df = rearrange_dates(financial_df, prices_df)
rearranged_df.to_csv('rearranged.csv')



---
### Normalize sentiment scores and Adj Close to compute Impact
---



In [None]:
import pandas as pd

In [None]:
# Reload the price dataset and the date-aligned sentiment scores from disk.
# Note: from here on `financial_df` holds 'rearranged.csv', not the raw 'financial_scores.csv'.
prices_df = pd.read_csv('prices_infosys_v2.csv', index_col=False)
financial_df = pd.read_csv('rearranged.csv', index_col=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler configured for the output range [-1, 1]; used below to rescale the scores.
scaler_fin = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Rescale the date-aligned sentiment scores to [-1, 1].
# 'rearranged.csv' (loaded above as financial_df) stores the scores under 'sentiment_score';
# it has no 'financial_score' column, so selecting that name raised a KeyError.
financial_df['scaled_financial_scores'] = scaler_fin.fit_transform(financial_df[['sentiment_score']])

In [None]:
# Scale the day-over-day price moves into [-1, 1] by dividing by the largest absolute move.
largest_move = prices_df['Price_Diff'].abs().max()
prices_df['scaled_diff'] = prices_df['Price_Diff'] / largest_move

In [None]:
# Impact = |scaled price move x scaled sentiment score|, carrying the sign of the price move.
# The two frames are combined on their row index.
# NOTE(review): this assumes row i of both frames refers to the same date — confirm.
impact_magnitude = (prices_df['scaled_diff'] * financial_df['scaled_financial_scores']).abs()
falling = prices_df['scaled_diff'] < 0
prices_df['Impact'] = impact_magnitude
prices_df.loc[falling, 'Impact'] = prices_df.loc[falling, 'Impact'] * -1

In [None]:
# Attach the unscaled sentiment score to the price rows (matched on the row index).
prices_df = prices_df.assign(financial_scores = financial_df['sentiment_score'])

In [None]:
# Attach the [-1, 1]-scaled sentiment score to the price rows (matched on the row index).
prices_df = prices_df.assign(scaled_financial_scores = financial_df['scaled_financial_scores'])

In [None]:
# Save the combined price + sentiment dataset; the LSTM section loads this file.
prices_df.to_csv('final_v2.csv')



---



## LSTM Model

In [None]:
!pip install tensorflow --quiet

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LeakyReLU
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the combined dataset and discard the two index columns that the earlier
# to_csv/read_csv round trips left behind.
df = pd.read_csv('final_v2.csv').drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [None]:
# Preview the first five rows of the modelling dataset.
df.head(5)

Unnamed: 0,Date,Adj Close,Volume,Next Day,Price_Diff,scaled_diff,Impact,financial_scores,scaled_financial_scores
0,2010-01-04,239.741455,4069264,240.544266,0.802811,0.006381,0.000729,0.138043,0.114222
1,2010-01-05,240.544266,6895528,237.034332,-3.509933,-0.027896,-0.01363,0.207873,0.488592
2,2010-01-06,237.034332,6817288,231.707504,-5.326828,-0.042337,-0.015643,0.185656,0.369487
3,2010-01-07,231.707504,10892600,226.123688,-5.583817,-0.044379,-0.00567,0.092908,-0.127754
4,2010-01-08,226.123688,12649312,228.459061,2.335373,0.018561,0.000367,0.120427,0.019778


In [None]:
# Model inputs: adjusted close and sentiment score per row; target: the next row's adjusted close.
data = df[['Adj Close', 'financial_scores']].values
target = df['Next Day'].values

# How far to look in future
predict_days = 1
# number of past days
timestep = 7

# The first `train_size` windows are used for training, the remaining ones for testing.
train_size = int(len(df) * 0.9)

# Normalize the input data.
# The scaler is fitted only on the rows that feed training windows (the last training window
# ends at row train_size + timestep - 2), so the value range of the test period does not leak
# into training. The fitted scaler is then applied to every row.
train_rows = min(len(data), train_size + timestep - 1)
scalerX = MinMaxScaler(feature_range=(0, 1))
scalerX.fit(data[:train_rows])
X_scaled = scalerX.transform(data)

# Sliding windows: each sample holds `timestep` consecutive scaled rows, and its target is the
# 'Next Day' value of the last row in the window.
dataX = []
targetY = []

for i in range(timestep, len(data) - predict_days + 1):
    dataX.append(X_scaled[i - timestep:i, 0:data.shape[1]])
    targetY.append(target[i - 1:i + predict_days - 1])

# Same rule for the targets: fit on the training targets only, transform all of them.
scalerY = MinMaxScaler(feature_range=(0, 1))
scalerY.fit(targetY[:train_size])
Y_scaled = scalerY.transform(targetY)

# Split the windows into training and testing sets, keeping their original order.
trainX = dataX[:train_size]
testX = dataX[train_size:]

trainY = Y_scaled[:train_size]
testY = Y_scaled[train_size:]

trainX, trainY = np.array(trainX), np.array(trainY)
testX, testY = np.array(testX), np.array(testY)


# dates = df['Date']
# test_dates = dates[train_size:]

In [None]:
# Define the LSTM Model
model = Sequential()
model.add(LSTM(units=7, activation=LeakyReLU(alpha=0.3), input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True))
model.add(LSTM(units=4, activation=LeakyReLU(alpha=0.2), use_bias=True, return_sequences=True))
model.add(LSTM(units=2, activation=LeakyReLU(alpha=0.3), use_bias=True, return_sequences=False))
model.add(Dense(units=1))

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Callback that writes the model to "model_test4.h5" whenever the monitored validation loss
# reaches a new minimum (save_best_only=True, mode='min'); verbose=1 logs each save.
checkpoint = ModelCheckpoint("model_test4.h5",
                             monitor='val_loss',
                             save_best_only=True,
                             mode='min',
                             verbose=1)

In [None]:
import tensorflow.keras.backend as K

def root_mean_squared_error(y_true, y_pred):
    """Loss function: square root of the mean squared difference between y_pred and y_true."""
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error)
    return K.sqrt(mean_squared)


In [None]:
from tensorflow.keras.optimizers import Adam

# Compile the Model with a custom learning rate
# The loss is the notebook's own root_mean_squared_error function, not a built-in Keras loss.
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=root_mean_squared_error)

In [None]:
# Train the Model
# shuffle=False keeps the samples in their original order; validation_split=0.1 holds out part
# of the training data to compute val_loss, which the checkpoint callback monitors.
history1 = model.fit(trainX, trainY, epochs=320, batch_size=4, shuffle=False, verbose=1, validation_split=0.1, callbacks=[checkpoint])

Epoch 1/320
Epoch 1: val_loss improved from inf to 0.23408, saving model to model_test4.h5
Epoch 2/320
  1/690 [..............................] - ETA: 33s - loss: 0.4446

  saving_api.save_model(


Epoch 2: val_loss improved from 0.23408 to 0.16229, saving model to model_test4.h5
Epoch 3/320
Epoch 3: val_loss did not improve from 0.16229
Epoch 4/320
Epoch 4: val_loss improved from 0.16229 to 0.15347, saving model to model_test4.h5
Epoch 5/320
Epoch 5: val_loss improved from 0.15347 to 0.11811, saving model to model_test4.h5
Epoch 6/320
Epoch 6: val_loss improved from 0.11811 to 0.09848, saving model to model_test4.h5
Epoch 7/320
Epoch 7: val_loss did not improve from 0.09848
Epoch 8/320
Epoch 8: val_loss did not improve from 0.09848
Epoch 9/320
Epoch 9: val_loss did not improve from 0.09848
Epoch 10/320
Epoch 10: val_loss did not improve from 0.09848
Epoch 11/320
Epoch 11: val_loss did not improve from 0.09848
Epoch 12/320
Epoch 12: val_loss did not improve from 0.09848
Epoch 13/320
Epoch 13: val_loss did not improve from 0.09848
Epoch 14/320
Epoch 14: val_loss did not improve from 0.09848
Epoch 15/320
Epoch 15: val_loss did not improve from 0.09848
Epoch 16/320
Epoch 16: val_los

In [None]:
# Save the model as it stands after training; this is a separate file from the
# "model_test4.h5" checkpoint, which is only written when val_loss improves.
model.save('model_test320_4_1.h5')

In [None]:
# Access the training history.
# Only `history1` is produced by this notebook; the previous version also read `history2` and
# `history3`, which are never defined and raised a NameError on a fresh kernel. If training is
# resumed with further `model.fit` calls, append their History objects to this list.
histories = [history1]
training_loss = [loss for h in histories for loss in h.history['loss']]
validation_loss = [loss for h in histories for loss in h.history['val_loss']]

# Create an array of epoch numbers for the x-axis
epochs = range(1, len(training_loss) + 1)

# Plot training and validation loss
plt.figure(figsize=(16, 8))
plt.plot(epochs, training_loss, 'b', label='Training Loss')
plt.plot(epochs, validation_loss, 'r', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Predict on the test windows, then map the scaled outputs back through the target scaler.
predictions_scaled = model.predict(testX)
predictions = scalerY.inverse_transform(predictions_scaled)

In [None]:
# Map the scaled test targets back through the target scaler for comparison with `predictions`.
y_inverse = scalerY.inverse_transform(testY)

In [None]:
# Plot predicted vs. actual prices for the test windows (x-axis is the sample position).
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xticks(range(0, len(y_inverse), 16))
ax.plot(predictions, 'r', label='Predicted Prices')
ax.plot(y_inverse, 'b', label='Actual Prices')
ax.set_title('Predicted and Actual Prices')
ax.set_xlabel('Dates')
ax.set_ylabel('Prices')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Error metrics on the inverse-transformed (price-unit) values.

# Mean Absolute Error
mae = mean_absolute_error(y_inverse, predictions)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error and its square root
mse = mean_squared_error(y_inverse, predictions)
print(f"Mean Squared Error: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Mean Absolute Percentage Error: mean of |error / actual|, expressed in percent
relative_error = np.abs((y_inverse - predictions) / y_inverse)
mape = np.mean(relative_error) * 100
print(f"Mean Absolute Percentage Error: {mape} %")

In [None]:
# The same error metrics, computed on the scaled targets and scaled predictions.

# Mean Absolute Error
mae = mean_absolute_error(testY, predictions_scaled)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error and its square root
mse = mean_squared_error(testY, predictions_scaled)
print(f"Mean Squared Error: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Mean Absolute Percentage Error: mean of |error / actual|, expressed in percent.
# NOTE(review): the denominators here are scaled values, which can be at or near zero.
relative_error = np.abs((testY - predictions_scaled) / testY)
mape = np.mean(relative_error) * 100
print(f"Mean Absolute Percentage Error: {mape} %")

## Testing the Model

In [None]:
!pip install tensorflow --quiet

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LeakyReLU
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the combined dataset from the mounted Drive folder and discard the two leftover
# index columns in a single step.
df = pd.read_csv(
    '/content/drive/MyDrive/Dataset/Stock Price Prediction/Prices/final_v2.csv'
).drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [None]:
# Inputs for this evaluation run: adjusted close and volume; target: the 'Next Day' column.
# NOTE(review): the training section above uses 'financial_scores' and timestep = 7, while this
# cell uses 'Volume' and timestep = 14 — confirm these match the model that gets loaded below.
data = df[['Adj Close', 'Volume']].values
target = df['Next Day'].values

# Normalize the input data
scalerX = MinMaxScaler(feature_range=(0, 1))
X_scaled = scalerX.fit_transform(data)

# How far to look in future
predict_days = 1
# number of past days
timestep = 14

# Sliding windows: every sample is `timestep` consecutive scaled rows; its target is the
# 'Next Day' value belonging to the last row of the window.
window_ends = range(timestep, len(data) - predict_days + 1)
dataX = [X_scaled[end - timestep:end, 0:data.shape[1]] for end in window_ends]
targetY = [target[end - 1:end + predict_days - 1] for end in window_ends]

scalerY = MinMaxScaler(feature_range=(0, 1))
Y_scaled = scalerY.fit_transform(targetY)

# Split the dataset into training and testing sets
train_size = int(len(df) * 0.9)
trainX, testX = dataX[:train_size], dataX[train_size:]
trainY, testY = Y_scaled[:train_size], Y_scaled[train_size:]

trainX, trainY = np.array(trainX), np.array(trainY)
testX, testY = np.array(testX), np.array(testY)

# dates = df['Date']
# test_dates = dates[train_size:]

In [None]:
import tensorflow.keras.backend as K

def root_mean_squared_error(y_true, y_pred):
    """Loss function: square root of the mean squared difference between y_pred and y_true."""
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error)
    return K.sqrt(mean_squared)

In [None]:
import time
# NOTE(review): the reason for this 8-second pause is not evident from the notebook — confirm
# whether it is still needed.
time.sleep(8)
import tensorflow as tf
# The training section compiles the model with the notebook's own `root_mean_squared_error`
# loss. Keras cannot resolve a custom loss by name when loading a saved model, so the function
# defined in the cell above is supplied through `custom_objects`.
model = tf.keras.models.load_model(
    'model_test4.h5',
    custom_objects={'root_mean_squared_error': root_mean_squared_error},
)

In [None]:
# Predict on the test windows with the loaded model, then map the scaled outputs back
# through the target scaler.
predictions_scaled1 = model.predict(testX)
predictions1 = scalerY.inverse_transform(predictions_scaled1)

In [None]:
# Map the scaled test targets back through the target scaler for comparison with `predictions1`.
y_inverse = scalerY.inverse_transform(testY)

In [None]:
# Plot predicted vs. actual prices for the test windows (x-axis is the sample position).
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xticks(range(0, len(y_inverse), 16))
ax.plot(predictions1, 'r', label='Predicted Prices')
ax.plot(y_inverse, 'b', label='Actual Prices')
ax.set_title('Predicted and Actual Prices')
ax.set_xlabel('Dates')
ax.set_ylabel('Prices')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [None]:
# Error metrics on the inverse-transformed (price-unit) values for the loaded model.

# Mean Absolute Error
mae = mean_absolute_error(y_inverse, predictions1)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error and its square root
mse = mean_squared_error(y_inverse, predictions1)
print(f"Mean Squared Error: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Mean Absolute Percentage Error: mean of |error / actual|, expressed in percent
relative_error = np.abs((y_inverse - predictions1) / y_inverse)
mape = np.mean(relative_error) * 100
print(f"Mean Absolute Percentage Error: {mape} %")

In [None]:
# The same error metrics for the loaded model, computed on the scaled targets and predictions.

# Mean Absolute Error
mae = mean_absolute_error(testY, predictions_scaled1)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error and its square root
mse = mean_squared_error(testY, predictions_scaled1)
print(f"Mean Squared Error: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Mean Absolute Percentage Error: mean of |error / actual|, expressed in percent.
# NOTE(review): the denominators here are scaled values, which can be at or near zero.
relative_error = np.abs((testY - predictions_scaled1) / testY)
mape = np.mean(relative_error) * 100
print(f"Mean Absolute Percentage Error: {mape} %")