In [97]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

from sklearn import preprocessing, metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers import RepeatVector,TimeDistributed, BatchNormalization
from numpy import array
from keras.models import Sequential, load_model
#import utils_paths
import re
from tqdm import tqdm
import os

In [128]:
INPUT_DIR = ".\\datas\\"

def read_data():
    cal = pd.read_csv(f"{INPUT_DIR}calendar.csv")
    stv = pd.read_csv(f"{INPUT_DIR}sales_train_validation.csv")
    ste = pd.read_csv(f"{INPUT_DIR}sales_train_evaluation.csv")
    ss = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")
    sellp = pd.read_csv(f"{INPUT_DIR}sell_prices.csv")
    
    return cal, stv, ste, ss, sellp

In [129]:
def reduce_mem_usage(df, verbose=True):
    """
    目的：メモリサイズの削減
    df: メモリを削減したい DataFrame (pandas.DataFrame)
    verbose: 実行時に、メモリ削減の情報を出力するかどうかを指定(bool)

    ■ 基本思想
    【前提知識】
    pandas で作成したデータフレームのうち数値データは、特に dtype を指定しない場合
    int64 または float64 でデータを作成するので、
    実際のデータよりもこの型が大きいと余計なメモリサイズを確保してしまう。

    【処理内容】
    (1) 入力された DataFrame の column の型を全てチェック(for loop)
    (2) その型が大きい数値データ(int16~int64, float16~float64)ならば、
        そのデータフレームの最大値・最小値をチェック。
        現在処理中のカラムを、上記の最大値・最小値を表せる必要最低限の型に変換する。
        int と floatに分けて処理。

    ────────────────────────────────────────────────────────────────────────
    【変更履歴】
    2020/06/06:
    ■ 35行目
    ifのネストが深かったので、リファクタ。
    Early Continueを入れたので可読性が向上(したはず)。

    ■ 46行目・71行目(置き換え・追加)
    説明変数(関数?)で置き換え。
    columnのtypeがintであるか否かを判定する関数を噛ませている。
    (返り値はbool値)
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    # main loop    
    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics: 
            continue # Early continue if column type is not numeric
        
        c_min = df[col].min()
        c_max = df[col].max()

        if IsInt(col_type):
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df


def IsInt(col_type):
    return str(col_type)[:3] == 'int'

In [101]:
NUM_ITEMS = 30490
OUTPUT_PATH = ".\\datas\\training_datas\\"

In [102]:
def train_data_from_csv_generator(num=NUM_ITEMS):
    for i in range(num):
        df = pd.read_csv(OUTPUT_PATH + "train_data" + str(i) +".csv")
        df = reduce_mem_usage(df, verbose=False)
        array = df.values
        array = array / (np.max(array) - np.min(array)) #正規化して出力(そのままだと一瞬でLossがNaNになる(勾配爆発？))
        yield array

In [103]:
train_generator = train_data_from_csv_generator(num=10) 

In [104]:
x_shape = next(train_generator).shape

In [179]:
"""
Create Training Datas & Labels
"""

train_generator = train_data_from_csv_generator(num=1) 
x_shape = next(train_generator).shape
timesteps = 28
#timesteps=10
delay = 1
#num_samples=10
num_samples = 1000 # ローカルのマシンだとこのサイズですらメモリが限界になる。
#num_samples = NUM_ITEMS

train_generator = train_data_from_csv_generator(num=num_samples) 

len_sequence, num_features = x_shape
sample_batchsize = len_sequence-timesteps+1 - delay

X_train = np.zeros((sample_batchsize*num_samples, timesteps, num_features))
Y_train = np.zeros((sample_batchsize*num_samples, timesteps, 1))

for i, array in enumerate(train_generator):
    for j in range(sample_batchsize - timesteps + 1 -delay):
        X_train[i*sample_batchsize+j, 0: timesteps] = array[j:j+timesteps]
        Y_train[i*sample_batchsize+j, 0: timesteps] = array[j+timesteps:j+2*timesteps , num_features-1].reshape(timesteps, 1)

In [180]:
X_train.shape

(819000, 28, 8)

In [181]:
Y_train.shape

(819000, 28, 1)

In [174]:
# やっぱり突っ込む前に正規化しておく必要がある模様(これをしないと一瞬でlossがnanになる)
X_train = X_train / (np.max(X_train) - np.min(X_train))

In [182]:
n_out_seq_length =28
num_y = 1
batch_size = 900

model = Sequential()

model.add(LSTM(128, activation='relu', batch_input_shape=(batch_size, timesteps, num_features), return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(RepeatVector(timesteps))
model.add(LSTM(32, activation='relu', return_sequences=True, stateful=True))
model.add(BatchNormalization())
#model.add(Dropout(0.1))  
model.add(TimeDistributed(Dense(delay)))   # num_y means the shape of y,in some problem(like translate), it can be many.
                                            #In that case, you should set the  activation= 'softmax'
model.compile(optimizer='adam', loss='mse')

In [183]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_42 (LSTM)               (900, 128)                70144     
_________________________________________________________________
batch_normalization_21 (Batc (900, 128)                512       
_________________________________________________________________
repeat_vector_22 (RepeatVect (900, 28, 128)            0         
_________________________________________________________________
lstm_43 (LSTM)               (900, 28, 32)             20608     
_________________________________________________________________
batch_normalization_22 (Batc (900, 28, 32)             128       
_________________________________________________________________
time_distributed_20 (TimeDis (900, 28, 1)              33        
Total params: 91,425
Trainable params: 91,105
Non-trainable params: 320
_________________________________________________________________


In [184]:
history = model.fit(X_train, Y_train, epochs=1, batch_size=batch_size, validation_split=0.1)

Train on 737100 samples, validate on 81900 samples
Epoch 1/1


In [185]:
history.history.items()

dict_items([('val_loss', [0.05453742369190677]), ('loss', [0.005783255722267685])])

### Evaluation は 6/10 以降にやる。
→ sklearn validation の KFold とか使えばよいのでは？<br />
あとはGPU対応させてGoogleColaboratoryとかも活用する

In [111]:
from sklearn.model_selection import train_test_split

In [115]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.1)

In [116]:
x_train.shape

(7533, 10, 8)

In [117]:
y_test.shape

(837, 10, 1)

In [118]:
history = model.fit(x_train, y_train, epochs=1, batch_size=)

Epoch 1/1


InvalidArgumentError: Incompatible shapes: [27] vs. [10]
	 [[{{node training_2/Adam/gradients/loss_7/time_distributed_8_loss/mul_grad/Mul}}]]