In [None]:
"""
What? Prepare data for LSTM

https://machinelearningmastery.com
"""

In [21]:
# Import python modules
from pandas import Series
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from math import sqrt
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
from pandas import DataFrame

### Prepare Numeric Data: Normalize Series Data

In [None]:
"""
Normalization is a rescaling of the data from the original range so that all values are within the range of 0 and 1.
"""

In [5]:
# Contrived series: ten evenly spaced values from 10 to 100.
data = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
series = Series(data)
print(series)

# Reshape the 1-D values into a single-feature column (n_samples, 1),
# which is the layout the scaler is fitted on below.
values = series.values
values = values.reshape((len(values), 1))

# Fit the scaler so it learns the observed minimum and maximum.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(values)
# data_min_ / data_max_ hold one entry per feature; take the single feature
# so that %f receives a scalar, and apply the format with the % operator
# (passing the tuple as a second print() argument leaves the placeholders unfilled).
print("Min: %f, Max: %f" % (scaler.data_min_[0], scaler.data_max_[0]))

# Rescale the data into the [0, 1] range and show it.
normalized = scaler.transform(values)
print(normalized)

# Map the normalized values back to their original scale and show them.
inversed = scaler.inverse_transform(normalized)
print(inversed)

0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64
Min: %f, Max: %f  % (array([10.]), array([100.]))
[[0.        ]
 [0.11111111]
 [0.22222222]
 [0.33333333]
 [0.44444444]
 [0.55555556]
 [0.66666667]
 [0.77777778]
 [0.88888889]
 [1.        ]]
[[ 10.]
 [ 20.]
 [ 30.]
 [ 40.]
 [ 50.]
 [ 60.]
 [ 70.]
 [ 80.]
 [ 90.]
 [100.]]


### Prepare Numeric Data: Standardize Series Data

In [None]:
"""
Standardizing a dataset involves rescaling the distribution of values so that the mean of observed values is 0
and the standard deviation is 1.
"""

In [8]:
# Contrived series of nine unordered values.
data = [1.0, 5.5, 9.0, 2.6, 8.8, 3.0, 4.1, 7.9, 6.3]
series = Series(data)
print(series)

# Prepare data for standardization: reshape the 1-D values into a
# single-feature column (n_samples, 1).
values = series.values
values = values.reshape((len(values), 1))

# Fit the standardization so the scaler learns the mean and variance.
scaler = StandardScaler()
scaler = scaler.fit(values)
# mean_ / var_ hold one entry per feature; index the single feature so that
# %f and sqrt() receive plain scalars, and apply the format with the % operator
# (passing the tuple as a second print() argument leaves the placeholders unfilled).
print("Mean: %f, StandardDeviation: %f" % (scaler.mean_[0], sqrt(scaler.var_[0])))

# Standardize the dataset (zero mean, unit standard deviation) and show it.
standardized = scaler.transform(values)
print(standardized)

# Map the standardized values back to their original scale and show them.
inversed = scaler.inverse_transform(standardized)
print(inversed)

0    1.0
1    5.5
2    9.0
3    2.6
4    8.8
5    3.0
6    4.1
7    7.9
8    6.3
dtype: float64
Mean: %f, StandardDeviation: %f (array([5.35555556]), 2.7125679146074897)
[[-1.60569456]
 [ 0.05325007]
 [ 1.34354035]
 [-1.01584758]
 [ 1.26980948]
 [-0.86838584]
 [-0.46286604]
 [ 0.93802055]
 [ 0.34817357]]
[[1. ]
 [5.5]
 [9. ]
 [2.6]
 [8.8]
 [3. ]
 [4.1]
 [7.9]
 [6.3]]


### Prepare Categorical Data: One Hot Encode

In [14]:
# Example categorical data with three distinct labels.
data = ["cold", "cold", "warm", "cold", "hot", "hot", "warm", "cold", "warm", "hot"]
values = array(data)
print(values)

# Integer encode: map each distinct label to an integer code.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# Binary (one-hot) encode the integer codes.
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `sparse_output`). Using the default sparse result and converting it
# with .toarray() yields the same dense array on both old and new versions.
onehot_encoder = OneHotEncoder()
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

# Invert the first example: the position of the 1 in the row is the integer
# code, which the label encoder maps back to the original label.
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[0 0 2 0 1 1 2 0 2 1]
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
['cold']


### Pre & post-sequence Padding

In [None]:
"""
The pad_sequences() function in the Keras deep learning library can be used to pad variable-length sequences.
"""

In [16]:
# Three integer sequences of lengths 4, 3 and 1.
sequences = [
    [1, 2, 3, 4],
    [1, 2, 3],
    [1],
]
# Pad to the longest sequence; "pre" (the default) puts the zeros in front.
padded = pad_sequences(sequences, padding="pre")
print(padded)

[[1 2 3 4]
 [0 1 2 3]
 [0 0 0 1]]


In [18]:
# Pad the previously defined `sequences` again, this time appending the
# zeros after the values instead of in front of them.
padded = pad_sequences(
    sequences,
    padding="post",
)
print(padded)

[[1 2 3 4]
 [1 2 3 0]
 [1 0 0 0]]


### Pre & post-Sequence Truncation

In [19]:
# Cut every sequence down to two items; "pre" (the default) drops items
# from the front, keeping the last two values of each sequence.
truncated = pad_sequences(sequences, maxlen=2, truncating="pre")
print(truncated)

[[3 4]
 [2 3]
 [0 1]]


In [20]:
# Cut every sequence down to two items, dropping items from the end so
# the first two values of each sequence are kept.
truncated = pad_sequences(
    sequences,
    maxlen=2,
    truncating="post",
)
print(truncated)

[[1 2]
 [1 2]
 [0 1]]


### Pandas shift() Function

In [None]:
"""
A key function to help transform time series data into a supervised learning problem is the Pandas shift() function.
"""

In [23]:
# Build a one-column frame "t" holding the integers 0 through 9.
df = DataFrame({"t": list(range(10))})
print(df)

   t
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9


In [25]:
# Build a one-column frame "t" holding the integers 0 through 9.
df = DataFrame({"t": list(range(10))})
# Shift forward by one row: "t-1" holds the previous row's value of "t",
# so the first row has no predecessor and becomes NaN.
lagged = df["t"].shift(periods=1)
df["t-1"] = lagged
print(df)

   t  t-1
0  0  NaN
1  1  0.0
2  2  1.0
3  3  2.0
4  4  3.0
5  5  4.0
6  6  5.0
7  7  6.0
8  8  7.0
9  9  8.0


In [26]:
# Build a one-column frame "t" holding the integers 0 through 9.
df = DataFrame({"t": list(range(10))})
# Shift backward by one row: "t+1" holds the next row's value of "t",
# so the last row has no successor and becomes NaN.
lead = df["t"].shift(periods=-1)
df["t+1"] = lead
print(df)

   t  t+1
0  0  1.0
1  1  2.0
2  2  3.0
3  3  4.0
4  4  5.0
5  5  6.0
6  6  7.0
7  7  8.0
8  8  9.0
9  9  NaN
