# Preprocessing data for LSTM

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import math
from keras.src.preprocessing.sequence import TimeseriesGenerator


### Steps
1. Load data <br>
2. Drop the `Date` column <br>
3. Normalize data <br>
4. Split into samples <br>
5. Reshape data 

## 1. Load data, remove Date column and convert to numpy format

In [None]:
# NOTE(review): read_csv() is called without a file path or buffer, so this
# cell raises TypeError until the location of the CSV is filled in.
df = pd.read_csv()
# Remove the Date column, then convert the remaining columns to a NumPy array.
df = df.drop(columns=["Date"])
data = df.to_numpy()

In [12]:
# Small hand-written sample: four rows with three feature columns each.
# This rebinds `data`, replacing whatever an earlier cell assigned to it.
data = np.array(
    [
        [0.10, 1.0, 112],
        [0.20, 0.9, 321],
        [0.30, 0.8, 453],
        [0.42, 0.9, 512],
    ]
)

# One label per row of `data`.
target = np.array([1, 0, 1, 1])

### Split data into training and testing sets

In [23]:
def train_test_split(data, test_size):
    """Split an array into a leading training part and a trailing test part.

    The rows are not shuffled: the first rows form the training set and the
    remaining rows form the test set, so the original order is preserved.
    The split is made along the first axis, which works for arrays of any
    dimensionality (a 1-D target vector as well as a 2-D feature matrix).

    Args:
        data: NumPy array to split along its first axis.
        test_size: Fraction of the rows, between 0 and 1, used for testing
            (e.g. 0.3 keeps the last 30% of the rows for the test set).

    Returns:
        A tuple ``(train, test)`` of slices of ``data``.

    Raises:
        ValueError: If ``test_size`` is not within the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            f"test_size must be a fraction between 0 and 1, got {test_size!r}"
        )

    data_len = data.shape[0]
    # Round up so that a fractional row count goes to the training set.
    split_index = math.ceil(data_len * (1 - test_size))

    # Slicing the first axis needs no special case for 1-D input; inspecting
    # data.shape[1] would raise IndexError for a 1-D array.
    return data[:split_index], data[split_index:]


In [24]:
# Fraction of the rows held out for testing.
test_size = 0.3

# Features and labels are split with the same fraction so that their rows
# end up in matching train/test partitions.
X_train, X_test = train_test_split(data, test_size=test_size)
y_train, y_test = train_test_split(target, test_size=test_size)


[[1.00e-01 1.00e+00 1.12e+02]
 [2.00e-01 9.00e-01 3.21e+02]
 [3.00e-01 8.00e-01 4.53e+02]]


## 2. Normalize the data

A min-max scaler is used to normalize the data. The scaler is fit on the training data only, and the fitted scaler is then applied to the test data, so that no information from the test set influences the scaling. The target variable does not need to be scaled.

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training and the test rows.
min_max_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

In [None]:
# Reshape the data into the 3-D layout (n, t, f)

# n = number of samples: LSTMs don't like sequences of more than 200-400 time
#     steps, so longer series should be split into several samples
n = 1
# t = number of timesteps per sample; derived from the data instead of being
#     hard-coded, so the reshape stays valid whatever the number of rows is
t = data.shape[0] // n
# f = number of input features (columns of `data`)
f = data.shape[1]

# reshape() returns the reshaped array and does not modify `data` in place,
# so the result is bound to a name; the bare name on the last line makes the
# notebook display it.
data_3d = data.reshape(n, t, f)
data_3d

## Create samples and reshape data

In [27]:

# Build a generator of (window, target) batches from `data` and `target`.
# Per the Keras documentation: `length` is the span of each output sequence in
# timesteps, `sampling_rate` is the period between the timesteps kept inside a
# sequence, and `batch_size` is the number of sequences per batch.
# NOTE(review): length=10 looks larger than the 4-row sample array defined
# in this notebook - confirm the intended input before running.
data_gen = TimeseriesGenerator(
    data,
    target,
    length=10,
    sampling_rate=2,
    batch_size=2,
)
data_gen

NameError: name 'TimeseriesGenerator' is not defined