#실습 예제
Sunspots 데이터셋을 활용하여 window_dataset 만들기

In [1]:
import csv
import tensorflow as tf
import numpy as np
import urllib

url = 'https://storage.googleapis.com/download.tensorflow.org/data/Sunspots.csv'
urllib.request.urlretrieve(url, 'sunspots.csv')

('sunspots.csv', <http.client.HTTPMessage at 0x7f44a387f310>)

In [2]:
with open('sunspots.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    i = 0
    for row in reader:
        print(row)
        i+=1
        if i > 5:
            break

['0', '1749-01-31', '96.7']
['1', '1749-02-28', '104.3']
['2', '1749-03-31', '116.7']
['3', '1749-04-30', '92.8']
['4', '1749-05-31', '141.7']
['5', '1749-06-30', '139.2']


In [3]:
train_data = []
with open('sunspots.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # 첫 줄은 header이므로 skip 합니다.
    next(reader)
    for row in reader:
        train_data.append(float(row[2]))

In [4]:
train_data[:5]

[96.7, 104.3, 116.7, 92.8, 141.7]

In [5]:
train_data = np.asarray(train_data)
print(f"Before  reshape: {train_data.shape}")
train_data = np.expand_dims(train_data, axis=1)
print(f"After   reshape: {train_data.shape}")

Before  reshape: (3235,)
After   reshape: (3235, 1)


In [6]:
dataset = tf.data.Dataset.from_tensor_slices(train_data)

In [7]:
i = 0
for data in dataset:
    print(data)
    i += 1
    if i > 5:
        break

tf.Tensor([96.7], shape=(1,), dtype=float64)
tf.Tensor([104.3], shape=(1,), dtype=float64)
tf.Tensor([116.7], shape=(1,), dtype=float64)
tf.Tensor([92.8], shape=(1,), dtype=float64)
tf.Tensor([141.7], shape=(1,), dtype=float64)
tf.Tensor([139.2], shape=(1,), dtype=float64)


In [8]:
window_size = 20+1
dataset = dataset.window(window_size, shift=1, drop_remainder=True)

In [9]:
dataset = dataset.flat_map(lambda w: w.batch(window_size + 1))

In [10]:
dataset = dataset.map(lambda x:(x[:-1],x[-1:]))

In [11]:
for train, label in dataset.take(2):
    print(f'train: {train.shape}')
    print(f'label: {label.shape}')

train: (20, 1)
label: (1, 1)
train: (20, 1)
label: (1, 1)


# tf.data: Tensorflow 입력 파이프라인 빌드
https://www.tensorflow.org/guide/data#time_series_windowing

In [12]:
# list(tf.data.Dataset.range(1000).as_numpy_iterator())
range_ds = tf.data.Dataset.range(1, 1000)

In [13]:
batches = range_ds.batch(10, drop_remainder=False)

In [14]:
for batch in batches.take(3):
    print(batch.numpy())

[ 1  2  3  4  5  6  7  8  9 10]
[11 12 13 14 15 16 17 18 19 20]
[21 22 23 24 25 26 27 28 29 30]


In [15]:
def dense_1_step(batch):
    # Shift features and labels one step relative to each other.
    return batch[:-1], batch[1:]

In [16]:
predict_dense_1_step = batches.map(dense_1_step)

In [17]:
for feature, label in predict_dense_1_step.take(3):
    print(feature.numpy(), '=>', label.numpy())

[1 2 3 4 5 6 7 8 9] => [ 2  3  4  5  6  7  8  9 10]
[11 12 13 14 15 16 17 18 19] => [12 13 14 15 16 17 18 19 20]
[21 22 23 24 25 26 27 28 29] => [22 23 24 25 26 27 28 29 30]


In [18]:
batches = range_ds.batch(15, drop_remainder=True)

In [19]:
def label_next_5_steps(batch):
    # Shift features and labels one step relative to each other.
    return batch[:-5], batch[-5:]

In [20]:
predict_5_steps = batches.map(label_next_5_steps)

In [21]:
for features, label in predict_5_steps.take(3):
  print(features.numpy(), " => ", label.numpy())

[ 1  2  3  4  5  6  7  8  9 10]  =>  [11 12 13 14 15]
[16 17 18 19 20 21 22 23 24 25]  =>  [26 27 28 29 30]
[31 32 33 34 35 36 37 38 39 40]  =>  [41 42 43 44 45]


In [22]:
feature_length = 10
label_length = 3

features = range_ds.batch(feature_length, drop_remainder=True)
labels = range_ds.batch(feature_length).skip(1).map(
    lambda labels: labels[:label_length])

predicted_steps = tf.data.Dataset.zip((features, labels))

for features, label in predicted_steps.take(5):
    print(features.numpy(), '=>', label.numpy())

[ 1  2  3  4  5  6  7  8  9 10] => [11 12 13]
[11 12 13 14 15 16 17 18 19 20] => [21 22 23]
[21 22 23 24 25 26 27 28 29 30] => [31 32 33]
[31 32 33 34 35 36 37 38 39 40] => [41 42 43]
[41 42 43 44 45 46 47 48 49 50] => [51 52 53]


In [23]:
windows = range_ds.window(5, shift=1)
for sub_ds in windows.take(5):
    print(sub_ds)

<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>


In [24]:
for x in windows.flat_map(lambda x: x.batch(window_size)).take(5):
    print(x.numpy())

[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]


In [25]:
def make_window_dataset(ds, window_size=5, shift=1, stride=1):
  windows = ds.window(window_size, shift=shift, stride=stride)

  def sub_to_batch(sub):
    return sub.batch(window_size, drop_remainder=True)

  windows = windows.flat_map(sub_to_batch)
  return windows

In [26]:
ds = make_window_dataset(range_ds, 10, 1, 1)
for example in ds.take(10):
    print(example.numpy())

[ 1  2  3  4  5  6  7  8  9 10]
[ 2  3  4  5  6  7  8  9 10 11]
[ 3  4  5  6  7  8  9 10 11 12]
[ 4  5  6  7  8  9 10 11 12 13]
[ 5  6  7  8  9 10 11 12 13 14]
[ 6  7  8  9 10 11 12 13 14 15]
[ 7  8  9 10 11 12 13 14 15 16]
[ 8  9 10 11 12 13 14 15 16 17]
[ 9 10 11 12 13 14 15 16 17 18]
[10 11 12 13 14 15 16 17 18 19]


# My data

In [27]:
import pandas as pd

In [28]:
data_path = '/content/drive/MyDrive/Colab_Architecture/preprocessed/ASOS_108_20090101_20161231.csv'
df = pd.read_csv(data_path)
df.drop(df.columns[0], axis=1, inplace=True)

In [29]:
_X = np.empty((100, 13))

In [30]:
_y = df['ta'].to_numpy()
_X = df.drop('ta', axis=1).to_numpy()
_X.shape, _y.shape

((70127, 13), (70127,))

In [40]:
feature_length = 30
target_length = 1

X = tf.data.Dataset.from_tensor_slices(_X)
y = tf.data.Dataset.from_tensor_slices(_y)
y = y.skip(feature_length)

## Multi-Variate Time Series

In [68]:
dX = make_window_dataset(X, feature_length).batch(32)
dy = make_window_dataset(y, target_length, shift=feature_length).batch(32)
ds = tf.data.Dataset.zip((dX, dy))

In [69]:
for window in dy:
    items = [item.numpy() for item in window]
    print(np.array(items).shape)
    break

(32, 1)


In [70]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(20, input_shape=[feature_length, 13], 
                         return_sequences=True),
    tf.keras.layers.LSTM(1, input_shape=[feature_length, 13])
])

In [71]:
model.compile(optimizer = 'adam',
              loss = 'mse',
              metrics = ['accuracy'],)

In [72]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 30, 20)            2720      
_________________________________________________________________
lstm_5 (LSTM)                (None, 1)                 88        
Total params: 2,808
Trainable params: 2,808
Non-trainable params: 0
_________________________________________________________________


In [73]:
model.fit(ds, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f4450bedad0>

In [77]:
_X[np.newaxis, :feature_length].shape

(1, 30, 13)

In [75]:
model.predict(_X[np.newaxis, :feature_length])

array([[0.4272872]], dtype=float32)

## Single Variate Time Series

In [82]:
y = tf.data.Dataset.from_tensor_slices(_y)

In [84]:
batches = y.batch(10, drop_remainder=True)

In [88]:
def label_next_1_steps(batch):
  return (batch[:-1],   # Take the first 5 steps
          batch[-1:])   # take the remainder

In [91]:
predict_1_steps = batches.map(label_next_1_steps)

In [92]:
predict_1_steps

<MapDataset shapes: ((9,), (1,)), types: (tf.float64, tf.float64)>

In [95]:
for features, label in predict_1_steps.take(5):
    print(features.numpy(), '=>', label.numpy())

[-7.6 -7.8 -8.1 -8.5 -8.8 -9.  -9.  -9.4 -9.3] => [-8.6]
[-7.8 -6.7 -5.3 -4.6 -3.8 -3.1 -2.9 -3.1 -3.4] => [-3.9]
[-3.8 -4.  -4.3 -4.4 -4.8 -4.9 -5.  -5.3 -5.3] => [-5.4]
[-5.8 -6.4 -6.6 -6.2 -4.2 -2.8 -0.6 -0.2  0.4] => [0.6]
[ 0.4  0.1 -0.6 -1.2 -1.9 -2.1 -2.2 -2.5 -2.7] => [-3.3]


In [None]:
# 여기서부터 하기
model = tf.keras.Sequential()