# 日毎のUFO出現数（発見数）を予測

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM, Dropout, Dense
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


In [None]:
# Load the formatted US-only sightings CSV, using its first column as the index.
df = pd.read_csv("../datasets/scrubbed_only_us_formatted.csv", index_col=0)
# Preview the first rows.
df.head()

In [None]:
# Show the dtype of every column as loaded from the CSV.
df.dtypes

In [None]:
# Convert the "date" column to pandas datetimes so it can be grouped by day.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Number of rows per calendar day (freq="D"), keyed by the "date" column.
# Presumably one row is one reported sighting -- confirm against the dataset.
dayly_count = df.groupby(pd.Grouper(key="date", freq="D")).size()
# Preview the first days of the series.
dayly_count.head()

In [None]:
# Persist the per-day counts as CSV.
dayly_count.to_csv('../datasets/scrubbed_only_us_dayly_count.csv')

In [None]:
# Plot the whole daily series on a very wide figure (50 x 5).
dayly_count.plot(figsize=(50, 5))

## データセット生成

In [None]:
# Daily counts as a float32 column vector: one row per day, a single column.
dataset = dayly_count.values.astype('float32').reshape(-1, 1)
# Display the resulting shape.
dataset.shape

In [None]:
def gen_dataset(dataset, lag_max):
    """Build lag-feature windows and next-step targets from column 0.

    For every start index ``s`` in ``range(len(dataset) - lag_max)`` the
    feature row is ``dataset[s:s + lag_max, 0]`` (the lag variables) and the
    target is the value right after that window, ``dataset[s + lag_max, 0]``.

    Args:
        dataset: 2-D array indexed as ``dataset[row, 0]``.
        lag_max: Number of consecutive past values used as features.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked windows and the
        matching targets.
    """
    starts = range(len(dataset) - lag_max)
    # Lag variables: the `lag_max` values beginning at each start index.
    X = [dataset[s:s + lag_max, 0] for s in starts]
    # Target variable: the value immediately following each window.
    y = [dataset[s + lag_max, 0] for s in starts]
    return np.array(X), np.array(y)

In [None]:
# Use the previous 7 daily values as the features for each sample.
lag_max = 7
# X: lag windows, y: the value following each window (see gen_dataset).
X, y = gen_dataset(dataset, lag_max)

In [None]:
# Length of the test period: the final `test_length` samples are held out.
test_length = 365

# Chronological split of the lag features (training part, then test part).
X_train_0, X_test_0 = X[:-test_length, :], X[-test_length:, :]

# The same split applied to the targets.
y_train_0, y_test_0 = y[:-test_length], y[-test_length:]

# Targets reshaped to single-column 2-D arrays.
y_train, y_test = y_train_0.reshape(-1, 1), y_test_0.reshape(-1, 1)


In [None]:
# Scale the targets into [0, 1].
# The scaler is fitted on the training targets only; the test targets are
# transformed with those same statistics. Re-fitting on the test targets would
# leak test information and make the later inverse_transform of the model's
# predictions (and the pickled scaler) use the wrong min/max.
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

from pickle import dump
# Persist the fitted target scaler; the context manager closes the file handle.
with open('../scalers/scaler_y.pkl', 'wb') as scaler_file:
    dump(scaler_y, scaler_file)

In [None]:
# Scale the lag features into [0, 1]: fit on the training rows, then apply the
# same fitted scaler to the test rows.
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_train_0 = scaler_X.fit_transform(X_train_0)
X_test_0 = scaler_X.transform(X_test_0)

from pickle import dump
# Persist the fitted feature scaler; the context manager closes the file handle.
with open('../scalers/scaler_X.pkl', 'wb') as scaler_file:
    dump(scaler_X, scaler_file)

In [None]:
# Insert a length-1 middle axis: (samples, lags) -> (samples, 1, lags).
X_train = np.expand_dims(X_train_0, axis=1)
X_test = np.expand_dims(X_test_0, axis=1)
# Sanity-check the resulting shapes.
print('X_train:',X_train.shape)
print('X_test:',X_test.shape)

In [None]:
# plot_model is not among the notebook's top-level imports; import it here so
# the visualization call below does not raise NameError.
from tensorflow.python.keras.utils.vis_utils import plot_model

# Model definition: one LSTM layer (300 units) -> dropout -> single linear output.
# The input shape is taken from X_train's second and third axes.
model = Sequential()
model.add(LSTM(300,input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))
# Compile with mean squared error loss and the Adam optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')
# Visualize the model, including layer shapes.
plot_model(model,show_shapes=True)

In [None]:
# EarlyStopping configuration: monitors val_loss with min_delta=0.0 and
# patience=2.
early_stopping =  EarlyStopping(monitor='val_loss',
                                min_delta=0.0,
                                patience=2)
# Run training: up to 1000 epochs, batch size 128, with validation_split=0.2
# providing the validation data for val_loss. shuffle=False is passed so the
# samples are not shuffled.
history = model.fit(X_train, y_train,
                    epochs=1000,
                    batch_size=128,
                    validation_split=0.2,
                    callbacks=[early_stopping] ,
                    verbose=1, 
                    shuffle=False)

In [None]:
# Output the training results: model summary, then the loss curves.
model.summary()
# Training vs. validation loss per epoch, taken from the fit history.
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='valid Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Predict the (scaled) target variable for the test data.
y_test_pred_scaled = model.predict(X_test)
# Map the predictions back to the original count scale.
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled)
# Keep the unscaled ground truth under its own name: `y_test` stays scaled, so
# re-running this cell does not inverse-transform already-unscaled values.
y_test_actual = scaler_y.inverse_transform(y_test)
# Combine the test targets and the predictions side by side.
df_test = pd.DataFrame(np.hstack((y_test_actual, y_test_pred)),
                       columns=['y','predict'])
# Print the evaluation metrics.
print('RMSE:')
print(np.sqrt(mean_squared_error(y_test_actual, y_test_pred)))
print('MAE:')
print(mean_absolute_error(y_test_actual, y_test_pred))
# NOTE(review): if the test period contains days with a count of 0, MAPE
# divides by (near-)zero and can become extremely large -- interpret with care.
print('MAPE:')
print(mean_absolute_percentage_error(y_test_actual, y_test_pred))
# Plot actual vs. predicted values.
df_test.plot(kind='line')

In [None]:
# Display the raw (still scaled) model predictions for the test data.
y_test_pred_scaled

In [None]:
# Display the table of actual ('y') and predicted ('predict') values.
df_test

In [None]:
# Save the trained model to an .h5 file.
model.save('../models/only_us_daily_count.h5')