In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Import data into a pandas DataFrame.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df_pnj = pd.read_csv('/content/PNJ.csv')

In [None]:
# Get information from dataset: column names, dtypes and non-null counts
df_pnj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125309 entries, 0 to 125308
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Ticker         125309 non-null  object 
 1   Date/Time      125309 non-null  object 
 2   Open           125309 non-null  float64
 3   High           125309 non-null  float64
 4   Low            125309 non-null  float64
 5   Close          125309 non-null  float64
 6   Volume         125309 non-null  int64  
 7   Open Interest  125309 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 7.6+ MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_pnj.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Open Interest
count,125309.0,125309.0,125309.0,125309.0,125309.0,125309.0
mean,72.182052,72.37259,72.042287,72.179371,3031.830451,0.0
std,9.900263,9.995786,9.830847,9.899401,7248.952968,0.0
min,44.43,44.53,44.04,44.14,1.0,0.0
25%,64.6,64.7,64.57,64.6,220.0,0.0
50%,72.8,73.01,72.64,72.83,1000.0,0.0
75%,80.16,80.35,80.06,80.16,3130.0,0.0
max,100.16,100.16,100.16,100.16,580730.0,0.0


In [None]:
# Change type of Date/Time column, keep the last price of day and rename
def changeDateTime(df, close_time='14:46:00'):
    """Reduce an intraday frame to one row per kept bar and rename the date column.

    The frame is modified in place and nothing is returned:
      1. 'Date/Time' is parsed to datetimes.
      2. Every row whose time-of-day differs from ``close_time`` is dropped.
      3. The index is reset to 0..n-1.
      4. 'Date/Time' is truncated to a plain date and renamed to 'Date'.

    Args:
        df: DataFrame with a 'Date/Time' column parseable by ``pd.to_datetime``.
        close_time: Time-of-day (anything ``pd.to_datetime`` accepts) of the bar
            to keep. Defaults to '14:46:00', the value this notebook used.

    Note:
        Not idempotent. After one call the 'Date/Time' column no longer exists,
        so calling it again on the same frame raises KeyError.
    """
    df['Date/Time'] = pd.to_datetime(df['Date/Time'])
    # Only bars stamped exactly at `close_time` survive; all other rows are removed.
    keep_time = pd.to_datetime(close_time).time()
    off_close_rows = df.loc[df['Date/Time'].dt.time != keep_time].index
    df.drop(off_close_rows, inplace=True)
    df.reset_index(drop=True, inplace=True)
    # Convert to date only
    df['Date/Time'] = df['Date/Time'].dt.date
    # Rename the new date column
    df.rename(columns={'Date/Time': 'Date'}, inplace=True)

# Apply the conversion to df_pnj in place (the function returns nothing).
# NOTE: not re-runnable — a second run raises KeyError because 'Date/Time' has been renamed to 'Date'.
changeDateTime(df_pnj)

In [None]:
#Get necessary data
def prepareData(df, train_ratio=0.85):
  """Extract the modelling inputs from a daily price frame.

  Args:
    df: DataFrame containing at least the 'Date' and 'Close' columns.
    train_ratio: Fraction of rows (0 < train_ratio <= 1) assigned to the
      training split. Defaults to 0.85.

  Returns:
    A tuple ``(data, dataset, training_data_len)``:
      data: independent copy of the 'Date' and 'Close' columns.
      dataset: 'Close' values as a NumPy array of shape (n_rows, 1).
      training_data_len: ``ceil(n_rows * train_ratio)`` as an int.

  Raises:
    ValueError: if ``train_ratio`` is outside (0, 1].
  """
  if not 0 < train_ratio <= 1:
    raise ValueError(f"train_ratio must be in (0, 1], got {train_ratio!r}")
  # Explicit copy so later edits to `data` never touch (or warn about) `df`.
  data = df[['Date', 'Close']].copy()
  dataset = df['Close'].values.reshape(-1, 1)
  # Round up so the training split is never empty for a non-empty frame.
  training_data_len = int(np.ceil(len(dataset) * train_ratio))
  return data, dataset, training_data_len

In [None]:
# Unpack the Date/Close frame, the (n, 1) Close array and the number of training rows
PNJ_data, PNJ_dataset,PNJ_train_len=prepareData(df_pnj)

In [None]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
# Fit the min/max on the training rows only, so no information from the
# held-out period leaks into the features the model is trained on.
scaler.fit(PNJ_dataset[:PNJ_train_len])
# Transform the whole series with the training-derived scale; held-out values
# may therefore fall slightly outside [0, 1].
scaled_data = scaler.transform(PNJ_dataset)

In [None]:
# Create the training data set
def createTrainSet(scaled_data, training_len, window=100):
  """Build sliding-window training samples from the head of a scaled series.

  Args:
    scaled_data: 2-D array; only column 0 is used.
    training_len: Number of leading rows that form the training split.
    window: Number of consecutive past values in each input sample.
      Defaults to 100.

  Returns:
    ``(x_train, y_train)`` where ``x_train`` has shape
    (n_samples, window, 1) and ``y_train`` has shape (n_samples,);
    ``y_train[k]`` is the value immediately following window ``x_train[k]``.

  Raises:
    ValueError: if ``window`` is not positive, or the training split has
      ``window`` rows or fewer (no sample could be formed).
  """
  if window < 1:
    raise ValueError(f"window must be a positive integer, got {window!r}")
  # Create the scaled training data set
  train_data = scaled_data[0:int(training_len), :]
  if len(train_data) <= window:
    raise ValueError(
        f"need more than {window} training rows to build a sample, "
        f"got {len(train_data)}")

  # Split the data into x_train and y_train data sets
  x_train = []
  y_train = []
  for i in range(window, len(train_data)):
      x_train.append(train_data[i-window:i, 0])
      y_train.append(train_data[i, 0])

  # Convert the x_train and y_train to numpy arrays
  x_train, y_train = np.array(x_train), np.array(y_train)

  # Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
  x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
  return x_train, y_train

In [None]:
# Build the training windows and next-step targets from the first PNJ_train_len scaled rows
x_train,y_train=createTrainSet(scaled_data,PNJ_train_len)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import set_random_seed

# Fix the random seeds so weight initialisation and batch shuffling are
# repeatable across Restart & Run All.
set_random_seed(42)

# Build the LSTM model: two stacked LSTM layers followed by a small dense head
model = Sequential()
# First LSTM returns the full sequence so the second LSTM can consume it
model.add(LSTM(128, return_sequences=True, input_shape= (x_train.shape[1], 1)))
# Second LSTM returns only its final output
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
# Single output unit: the next (scaled) closing price
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; keep the History object so the loss curve can be inspected
# later instead of leaking its repr as cell output
history = model.fit(x_train, y_train, batch_size=6, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7d3ac1b2e710>

In [None]:
# Create the testing data set
def createTestSet(dataset,scaled_data,training_len, window=100):
  """Build sliding-window test samples for the rows after the training split.

  Args:
    dataset: 2-D array of unscaled values; rows from ``training_len`` onward
      are returned unchanged as the targets.
    scaled_data: 2-D array of scaled values; only column 0 is used for inputs.
    training_len: Number of leading rows that belong to the training split.
    window: Number of consecutive past values in each input sample.
      Defaults to 100.

  Returns:
    ``(x_test, y_test)`` where ``x_test`` has shape (n_samples, window, 1)
    and ``y_test`` is ``dataset[training_len:, :]``. Sample ``k`` holds the
    ``window`` scaled values immediately preceding row ``training_len + k``.

  Raises:
    ValueError: if ``window`` is not positive, if fewer than ``window`` rows
      precede the split, or if no rows remain after the split.
  """
  training_len = int(training_len)
  if window < 1:
    raise ValueError(f"window must be a positive integer, got {window!r}")
  if training_len < window:
    # A negative slice start would silently wrap around to the end of the array.
    raise ValueError(
        f"training_len ({training_len}) must be at least window ({window})")
  if training_len >= len(scaled_data):
    raise ValueError(
        f"no test rows: training_len ({training_len}) covers all "
        f"{len(scaled_data)} rows")

  # Include the last `window` training rows so the first test sample has a full history
  test_data = scaled_data[training_len - window: , :]
  # Targets stay in the original (unscaled) units
  y_test = dataset[training_len:, :]
  x_test = np.array(
      [test_data[i-window:i, 0] for i in range(window, len(test_data))])

  # Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
  x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1 ))
  return x_test, y_test

def predictValue(x_test,model,scaler):
  """Run the model on ``x_test`` and map its outputs back through the scaler.

  Args:
    x_test: Input passed unchanged to ``model.predict``.
    model: Object exposing ``predict``.
    scaler: Object exposing ``inverse_transform``.

  Returns:
    ``scaler.inverse_transform(model.predict(x_test))``.
  """
  scaled_output = model.predict(x_test)
  return scaler.inverse_transform(scaled_output)


In [None]:
# Build the held-out windows, then predict and map the outputs back to the original price scale
x_test,y_test=createTestSet(PNJ_dataset,scaled_data,PNJ_train_len)
prediction=predictValue(x_test,model,scaler)



In [None]:
# Root mean squared error (RMSE) between predicted and actual closing prices
squared_errors = np.square(prediction - y_test)
rmse = np.sqrt(squared_errors.mean())
rmse

1.4753164303821127

In [None]:
# Make validation data
# Take an explicit copy of the held-out rows so adding a column below does not
# write into a slice of PNJ_data (the cause of SettingWithCopyWarning).
valid = PNJ_data[PNJ_train_len:].copy()
# Flatten the (n, 1) prediction array to 1-D before storing it as a column
valid['Prediction'] = prediction.ravel()
# Fill NaN values with 0 (assigned back rather than inplace on a column selection)
valid['Prediction'] = valid['Prediction'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid['Prediction'] = prediction
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid['Prediction'].fillna(0, inplace=True)


In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Create a line plot of the actual 'Close' values over the validation dates
fig = px.line(valid, x="Date", y="Close", title='Close vs Prediction')

# Overlay the model's 'Prediction' as a second line on the same axes
fig.add_trace(go.Scatter(x=valid["Date"], y=valid["Prediction"], mode='lines', name='Prediction'))

# Show the plot
fig.show()


In [None]:
# Preview the first 10 validation rows: actual Close next to the model's Prediction
valid.head(10)

Unnamed: 0,Date,Close,Prediction
599,2020-07-27,52.1,58.128113
600,2020-07-28,54.0,55.432644
601,2020-07-29,51.5,54.396862
602,2020-07-30,51.5,53.14761
603,2020-07-31,51.6,52.405125
604,2020-08-03,52.8,52.154961
605,2020-08-04,53.5,52.63829
606,2020-08-05,54.48,53.440617
607,2020-08-06,55.18,54.438843
608,2020-08-07,54.78,55.39349


In [None]:
# Calculate the difference from the first validation date
# (movement relative to the first row, not day-over-day change)
real_diff=valid['Close']-valid['Close'].iloc[0]
pred_diff=valid['Prediction']-valid['Prediction'].iloc[0]

In [None]:
# Plot the Real Movement vs Prediction Movement
from plotly.subplots import make_subplots
# Two stacked panels: real movement on top, predicted movement below
fig=make_subplots(rows=2, cols=1)
fig.add_trace(go.Bar(x=valid['Date'],y=real_diff,name='Real'),row=1,col=1)
fig.add_trace(go.Bar(x=valid['Date'],y=pred_diff,name='Prediction'),row=2,col=1)

# Fixed 800x800 canvas instead of automatic sizing
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    title='Real Movement vs Prediction Movement'
)
fig.show()