# Data Preprocessing for CNN-LSTM

In [1]:
# Mount Google Drive inside the Colab runtime so that the project files
# stored under /content/drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/My Drive/P2_TimeForecast/P2_CodeBase"

# install import-ipynb for importing helper modules
!pip install import-ipynb
import import_ipynb

# importing helper functions for data preprocessing and model visualization
import plotlib_helper as gplot
import datapreprocess_helper as dataprep

/content/drive/My Drive/P2_TimeForecast/P2_CodeBase
Collecting import-ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=d3e148ba1a8bd5cf27a747c6a920674749f2c90e19f441c451bc5ade45a19ebe
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3
importing Jupyter notebook from plotlib_helper.ipynb
importing Jupyter notebook from datapreprocess_helper.ipynb


## Data Preprocessing

In [3]:
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Project root on the mounted Drive; reused later when writing outputs.
path = "/content/drive/My Drive/P2_TimeForecast/P2_CodeBase/"

# Read the raw stock-price CSV, treating 'NA', '?' and '-' cells as missing.
filepath = os.path.join(path, "data/CSC215_P2_Stock_Price.csv")
df = pd.read_csv(filepath, na_values=['NA', '?', '-'])

# Column dtypes and (rows, columns) in a single print, followed by a banner
# and a preview of the first rows; blank lines are emitted via the strings.
print(df.dtypes, df.shape)
print("\n----------------------- DataFrame before Preprocessing ------------------------\n")
print(df.head(), end="\n\n")

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj_Close    float64
Volume         int64
dtype: object (4392, 7)

----------------------- DataFrame before Preprocessing ------------------------

        Date      Open      High       Low     Close  Adj_Close   Volume
0  2000/3/27  3.812500  4.156250  3.812500  4.125000   4.125000  3675600
1  2000/3/28  4.125000  4.125000  4.000000  4.015625   4.015625  1077600
2  2000/3/29  4.000000  4.031250  3.953125  4.000000   4.000000   437200
3  2000/3/30  4.000000  4.000000  3.843750  3.843750   3.843750  1883600
4  2000/3/31  3.734375  3.734375  3.390625  3.390625   3.390625  7931600



### Removing Date and Adj_Close columns

In [4]:
# Drop rows containing missing values, then remove the Date and Adj_Close
# columns.
# `df` is reassigned from a chained expression instead of being mutated in
# place, and errors='ignore' is passed to drop(), so this cell can be re-run
# safely: a second execution no longer raises KeyError because the columns
# are already gone.
df = (
    df.dropna(axis=0)
      .drop(columns=['Date', 'Adj_Close'], errors='ignore')
)
print(df[0:5])
print()
print(f"Shape of dataframe {df.shape}")

       Open      High       Low     Close   Volume
0  3.812500  4.156250  3.812500  4.125000  3675600
1  4.125000  4.125000  4.000000  4.015625  1077600
2  4.000000  4.031250  3.953125  4.000000   437200
3  4.000000  4.000000  3.843750  3.843750  1883600
4  3.734375  3.734375  3.390625  3.390625  7931600

Shape of dataframe (4392, 5)


### Checking for null values

In [5]:
# Count the missing values remaining in each column.
df.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

### Normalization of numeric columns 
All numeric input columns are z-score normalized. The target column `Output` (y), a copy of the raw `Close` price, is left unscaled.

In [6]:
# Keep an unscaled copy of Close as the prediction target. The guard stops a
# re-run of this cell from overwriting the target with the Close values that
# the loop below has already normalized in `df`.
if 'Output' not in df.columns:
  df['Output'] = df['Close']

# Normalize every column except the target.
# The name is compared with != rather than `not in 'Output'`: on a string,
# `in` is a substring test, so a column whose name happens to be a fragment
# of "Output" (e.g. "Out") would silently be left un-normalized.
for col in df.columns:
  if col != 'Output':
    dataprep.encode_numeric_zscore(df, col)

print(f"Shape of Preprocessed DataFrame: {df.shape}")

print("----------------------- DataFrame after Preprocessing ------------------------")
print()
print(df.head())
print()

Shape of Preprocessed DataFrame: (4392, 6)
----------------------- DataFrame after Preprocessing ------------------------

       Open      High       Low     Close    Volume    Output
0 -0.894209 -0.885022 -0.892306 -0.884364  1.104812  4.125000
1 -0.883762 -0.886056 -0.885975 -0.888021 -0.497301  4.015625
2 -0.887941 -0.889159 -0.887558 -0.888543 -0.892217  4.000000
3 -0.887941 -0.890194 -0.891251 -0.893768 -0.000263  3.843750
4 -0.896820 -0.898986 -0.906551 -0.908920  3.729366  3.390625



In [7]:
# Write the preprocessed frame to CSV (without the index column) so other
# notebooks can load it directly.
data_write = os.path.join(path, "data/P2_Preprocessed_CNN_LSTM.csv")
df.to_csv(path_or_buf=data_write, index=False)

print("Successfully saved CNN,LSTM preprocessed dataset")

Successfully saved CNN,LSTM preprocessed dataset
