# 미세먼지 농도 예측

* AirKorea 미세먼지 데이터 활용  
 - 춘천시 석사동 지역의 두달간 미세먼지농도 데이터를 수집
 - 24시간 미세먼지 측정값을 이용하여 5시간 동안 미세먼지농도를 예측   

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it
# (the google.colab module is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# Notebook-wide matplotlib defaults: 8x6 inch figures, grid lines disabled.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

## 미세먼지 dataset

지역 : 석사동
기간 : 20200616 ~ 20200717
관측값 : 미세먼지(PM10), 초미세먼지(PM2.5), 오존(O3), 이산화질소(NO2), 일산화탄소(CO), 아황산가스(SO2)

예측 : 직전 24시간의 측정값으로 이후 5시간의 값을 예측
윈도우 : 입력 : 24시간 데이터, 출력 :  5시간  데이터


In [None]:
# Location of the PM data CSV on the mounted Google Drive.
csv_path = '/content/drive/My Drive/DeepLearning/data/pmdata_20200617_0717.csv'
# Alternative relative path, left disabled (presumably for runs outside Colab).
#csv_path = './data/pmdata_20200617_0717.csv'
# Load the whole file into a DataFrame; no columns are parsed as dates here.
df = pd.read_csv(csv_path)

In [None]:
# Show the first rows of the loaded data
df.head()


In [None]:
# Print a summary of the DataFrame
df.info()

In [None]:
# Show descriptive statistics of the data
df.describe()

In [None]:
# Use the DateTime column as the row index. The column itself stays in df,
# and its values are used exactly as read from the CSV (no datetime
# conversion is applied here).
df.index = df['DateTime']

# Quick check of the re-indexed frame and its (rows, columns) shape
print(df.head())
print(df.shape)

In [None]:
# Count missing values per column, then drop every row that contains one
na = df.isnull().sum()
print("NA = ", na)
df.dropna(inplace=True)  # modifies df in place
print(df.shape)  # shape after the rows with missing values were removed


In [None]:
# Sort rows by the index (set from the DateTime column) in ascending order
df = df.sort_index(ascending=True)
print(df)

데이터의 처음 500개는 학습 데이터 세트로, 나머지는 검증(validation) 데이터 세트로 구성

In [None]:
# Row position that separates the training rows (before it) from the
# validation rows (from it onward)
TRAIN_SPLIT = 500

재현성을 보장하기 위해 시드 설정.

In [None]:
# Fix TensorFlow's global random seed so repeated runs are comparable
tf.random.set_seed(13)

### 3개 특징으로 예측

In [None]:
# Pairwise scatter-plot matrix of the data, with KDE curves on the diagonal
pd.plotting.scatter_matrix(df, s=60, diagonal='kde')

### (1) 3개의 특징으로 구성된 데이터 생성

In [None]:
# Columns used as model inputs; PM10 comes first, so it is column 0 of the
# feature array built from this list
features_considered = ['PM10', 'O3', 'NO2']

In [None]:
# Keep only the selected feature columns, indexed by the DateTime column
features = df[features_considered]
features.index = df['DateTime']
features.head()

In [None]:
def create_time_steps(length):
  """Return the negative time offsets [-length, ..., -1] as a list.

  The result has `length` entries and is empty when `length` is 0 or
  negative.
  """
  return [*range(-length, 0)]

In [None]:
def show_plot(plot_data, delta, title):
  """Draw a history series plus up to two future points on the current figure.

  Args:
    plot_data: Indexable collection of up to three array-like items, in the
      order history, true future, model prediction. The first item must
      provide `.shape` and `.flatten()`.
    delta: X position at which the non-history items are drawn; any falsy
      value is treated as 0.
    title: Title of the plot.

  Returns:
    The `plt` module, so the caller can continue with e.g. `.show()`.
  """
  series_labels = ['History', 'True Future', 'Model Prediction']
  series_styles = ['.-', 'rx', 'go']
  history_axis = create_time_steps(plot_data[0].shape[0])
  future = delta if delta else 0

  plt.title(title)
  for idx, series in enumerate(plot_data):
    if idx == 0:
      # The history is drawn as a connected line over the negative offsets.
      plt.plot(history_axis, series.flatten(), series_styles[idx],
               label=series_labels[idx])
      continue
    # Every other entry is drawn as a large marker at the future position.
    plt.plot(future, series, series_styles[idx], markersize=10,
             label=series_labels[idx])
  plt.legend()
  # Leave room on the right of the future marker.
  plt.xlim([history_axis[0], (future + 5) * 2])
  plt.xlabel('Time-Step')
  return plt

In [None]:
# Plot every feature column in its own subplot
features.plot(subplots=True)

### (2) 표준화 

In [None]:
# Per-column mean and standard deviation, computed from the first
# TRAIN_SPLIT rows only.
dataset = features.values
data_mean = np.mean(dataset[:TRAIN_SPLIT], axis=0)
data_std = np.std(dataset[:TRAIN_SPLIT], axis=0)

In [None]:
# Standardize every column of the full dataset with the training-split
# mean and standard deviation
dataset = (dataset-data_mean)/data_std

###  3개 특징으로 다단계(Multi-Step model) 예측

직전 24시간 데이터로 이후 5시간의 PM10 농도를 예측

In [None]:
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
  """Cut a time series into (history window, future label) pairs.

  For every position `i` in `[start_index + history_size, end_index)` one
  sample is produced: the input is `dataset[i - history_size:i:step]` and the
  label is taken from `target` starting at `i`.

  Args:
    dataset: Array indexed by time step along its first axis (must support
      indexing with a `range`, as NumPy arrays do).
    target: Sequence of label values aligned with `dataset`.
    start_index: First row that may be used as history.
    end_index: Exclusive upper bound for `i`. `None` means "as far as the
      data allows". A value larger than `len(dataset) - target_size` is
      clamped to that limit, because windows past it would have incomplete
      labels.
    history_size: Number of past time steps covered by each input window.
    target_size: Number of future steps in each label (or, with
      `single_step=True`, the offset of the single label from `i`).
    step: Stride used when sampling the history window.
    single_step: If True each label is the single value
      `target[i + target_size]`; otherwise it is `target[i:i + target_size]`.

  Returns:
    Tuple `(data, labels)` of NumPy arrays with one entry per window.
  """
  first = start_index + history_size

  # Last usable bound: beyond it the label slice would run off the end of the
  # series (truncated, ragged labels) or, in single-step mode, raise IndexError.
  limit = len(dataset) - target_size
  if end_index is None or end_index > limit:
    end_index = limit

  positions = range(first, end_index)
  data = [dataset[range(i - history_size, i, step)] for i in positions]

  if single_step:
    labels = [target[i + target_size] for i in positions]
  else:
    labels = [target[i:i + target_size] for i in positions]

  return np.array(data), np.array(labels)

In [None]:
# Window configuration: 24 past steps as input, 5 future steps as output,
# sampling every step of the history.
past_history = 24
future_target = 5
STEP = 1

# Column 0 of the standardized data (PM10) is the prediction target.
# Training windows are taken from the rows before TRAIN_SPLIT ...
x_train, y_train = multivariate_data(
    dataset, dataset[:, 0],
    start_index=0, end_index=TRAIN_SPLIT,
    history_size=past_history, target_size=future_target,
    step=STEP, single_step=False)
# ... and validation windows from TRAIN_SPLIT up to the end of the data.
x_val, y_val = multivariate_data(
    dataset, dataset[:, 0],
    start_index=TRAIN_SPLIT, end_index=None,
    history_size=past_history, target_size=future_target,
    step=STEP, single_step=False)

In [None]:
# Shapes of one input window and of its label vector
print(f'window of past history : {x_train[0].shape}')
print(f'\n Target PM10 predict : {y_train[0].shape}')

In [None]:
BATCH_SIZE = 50
BUFFER_SIZE = 500

# Training pipeline: cache, shuffle, batch, then repeat indefinitely.
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
)

# Validation pipeline: batched in the original order, repeated indefinitely.
val_data = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .repeat()
)

In [None]:
def multi_step_plot(history, true_future, prediction):
  """Plot a history window, its true future values and, optionally, a prediction.

  Args:
    history: 2-D window indexed as `history[:, 0]`; only column 0 is drawn.
    true_future: Sequence of true future values, drawn as blue dots.
    prediction: Array of predicted values, drawn as red dots. It is skipped
      when `prediction.any()` is false, so an all-zero array such as
      `np.array([0])` serves as "no prediction".
  """
  plt.figure(figsize=(12, 6))

  past_axis = create_time_steps(len(history))
  # X positions of the future points, scaled by the module-level STEP.
  future_axis = np.arange(len(true_future)) / STEP

  plt.plot(past_axis, np.array(history[:, 0]), label='History')
  plt.plot(future_axis, np.array(true_future), 'bo', label='True Future')

  has_prediction = prediction.any()
  if has_prediction:
    plt.plot(future_axis, np.array(prediction), 'ro',
             label='Predicted Future')

  plt.legend(loc='upper left')
  plt.show()

In [None]:
# Plot the first sample of one training batch. np.array([0]) is passed as the
# prediction so that multi_step_plot skips the prediction series.
for x, y in train_data.take(1):
  multi_step_plot(x[0], y[0], np.array([0]))

In [None]:
# Plot the first sample of one validation batch. np.array([0]) is passed as
# the prediction so that multi_step_plot skips the prediction series.
for x, y in val_data.take(1):
  multi_step_plot(x[0], y[0], np.array([0]))

In [None]:
# Two stacked LSTM layers with dropout in between, followed by a dense layer
# that emits one value per predicted step.
multi_step_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32,
                         return_sequences=True,
                         input_shape=x_train.shape[-2:]),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(16, activation='relu'),
    tf.keras.layers.Dense(future_target),  # output for the 5-hour span
])

# RMSprop with gradient values clipped to 1.0, trained on mean absolute error.
multi_step_model.compile(
    optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0),
    loss='mae')

In [None]:
# Run the model on one validation batch and print the shape of its output
# (in cell order this happens before the model is fitted).
for x, y in val_data.take(1):
  pred = multi_step_model.predict(x)
  print(pred.shape)

In [None]:
# Number of training batches per epoch, and number of epochs.
EVALUATION_INTERVAL = 100
EPOCHS = 10

# train_data and val_data both repeat indefinitely, so steps_per_epoch and
# validation_steps are what bound each epoch and each validation pass.
# The returned history object holds the per-epoch loss values.
multi_step_history = multi_step_model.fit(train_data, epochs=EPOCHS,
                                            steps_per_epoch=EVALUATION_INTERVAL,
                                            validation_data=val_data,
                                            validation_steps=50)

In [None]:
def plot_train_history(history, title):
  """Plot training and validation loss per epoch on a new figure.

  Args:
    history: Object whose `.history` mapping has 'loss' and 'val_loss'
      sequences (for example the value returned by `model.fit`).
    title: Title of the plot.
  """
  metrics = history.history
  train_loss, val_loss = metrics['loss'], metrics['val_loss']
  epoch_axis = range(len(train_loss))

  plt.figure()
  curves = (
      (train_loss, 'b', 'Training loss'),
      (val_loss, 'r', 'Validation loss'),
  )
  for values, colour, label in curves:
    plt.plot(epoch_axis, values, colour, label=label)
  plt.title(title)
  plt.legend()

  plt.show()

In [None]:
# Plot the training and validation loss curves recorded during fitting
plot_train_history(multi_step_history,
                   'Multi Step Training and validation loss')

In [None]:
# For three validation batches, plot the first sample of each batch together
# with the model's prediction for that sample
for x, y in val_data.take(3):
  multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])