In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [2]:
INPUT_DIR = ".\\datas\\"

def read_data():
    cal = pd.read_csv(f"{INPUT_DIR}calendar.csv")
    stv = pd.read_csv(f"{INPUT_DIR}sales_train_validation.csv")
    ste = pd.read_csv(f"{INPUT_DIR}sales_train_evaluation.csv")
    ss = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")
    sellp = pd.read_csv(f"{INPUT_DIR}sell_prices.csv")
    
    return cal, stv, ste, ss, sellp

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    目的：メモリサイズの削減
    df: メモリを削減したい DataFrame (pandas.DataFrame)
    verbose: 実行時に、メモリ削減の情報を出力するかどうかを指定(bool)

    ■ 基本思想
    【前提知識】
    pandas で作成したデータフレームのうち数値データは、特に dtype を指定しない場合
    int64 または float64 でデータを作成するので、
    実際のデータよりもこの型が大きいと余計なメモリサイズを確保してしまう。

    【処理内容】
    (1) 入力された DataFrame の column の型を全てチェック(for loop)
    (2) その型が大きい数値データ(int16~int64, float16~float64)ならば、
        そのデータフレームの最大値・最小値をチェック。
        現在処理中のカラムを、上記の最大値・最小値を表せる必要最低限の型に変換する。
        int と floatに分けて処理。

    ────────────────────────────────────────────────────────────────────────
    【変更履歴】
    2020/06/06:
    ■ 35行目
    ifのネストが深かったので、リファクタ。
    Early Continueを入れたので可読性が向上(したはず)。

    ■ 46行目・71行目(置き換え・追加)
    説明変数(関数?)で置き換え。
    columnのtypeがintであるか否かを判定する関数を噛ませている。
    (返り値はbool値)
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    # main loop    
    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics: 
            continue # Early continue if column type is not numeric
        
        c_min = df[col].min()
        c_max = df[col].max()

        if IsInt(col_type):
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df


def IsInt(col_type):
    return str(col_type)[:3] == 'int'

In [4]:
cal, stv, ste, ss, sellp = read_data()

In [5]:
dfs = [cal, stv, ste, ss, sellp]
for df in dfs:
    df = reduce_mem_usage(df)

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Mem. usage decreased to 96.13 Mb (78.8% reduction)
Mem. usage decreased to  2.09 Mb (84.5% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)


In [6]:
train_sales = ste
calendar = cal
sell_prices = pd.read_csv(f"{INPUT_DIR}sell_prices.csv")
# pd.pivot() を使うときに、メモリサイズを削減したものだとエラーになる模様。なので再度読み直し。対応策はないのか？
submission_file = ss

In [7]:
from sklearn.preprocessing import LabelEncoder

def transform(data):
    
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        data[feature].fillna('unknown', inplace = True)
        
    cat = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    for feature in cat:
        encoder = LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    
    return data
#calendarの何もない日をunknownで埋めて、label encoderによって処理しやすい形に変える
# one-hot encodingのほうが良いのでは？

In [8]:
days = range(1, 1970)
time_series_columns = [f'd_{i}' for i in days]

event_snap_columns = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']

transfer_cal = pd.DataFrame(calendar[event_snap_columns].values.T,
                            index=event_snap_columns,
                            columns=time_series_columns)
transfer_cal = transfer_cal.fillna(0)

# event_name_1_se = transfer_cal.loc['event_name_1'].apply(lambda x: x if re.search("^\d+$", str(x)) else np.nan).fillna(10)
# event_name_2_se = transfer_cal.loc['event_name_2'].apply(lambda x: x if re.search("^\d+$", str(x)) else np.nan).fillna(10)
# #↑event nameが入っているものをnanにした後、10にしている。
# # ここはプロットに使っていただけなので削除

In [9]:
transfer_cal

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
event_name_1,0,0,0,0,0,0,0,0,SuperBowl,0,...,0,0,0,0,0,0,0,0,0,NBAFinalsEnd
event_type_1,0,0,0,0,0,0,0,0,Sporting,0,...,0,0,0,0,0,0,0,0,0,Sporting
event_name_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Father's day
event_type_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Cultural
snap_CA,0,0,0,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
snap_TX,0,0,0,1,0,1,0,1,1,1,...,0,1,1,1,0,1,0,0,0,0
snap_WI,0,0,0,0,1,1,0,1,1,0,...,0,1,1,0,1,1,0,0,0,0


In [10]:
"""
※注意※

ここで、使用メモリを減らすためにcalenderの範囲が減らされている。
増やすと単純に精度向上が可能？
-> もしフルに使うと、ローカルのメモリが死ぬ

"""

calendar['date'] = pd.to_datetime(calendar['date'])
#calendar = calendar[calendar['date']>= '2014-3-15']  #reduce memory
#calendar = calendar[calendar["date"] <= "2016-05-22"] #eliminate evaluate date
#使うデータを少なくします -> TrainingDataのステップ数が800になるよう設定
calendar= transform(calendar)
# Attempts to convert events into time series data.
transfer_cal = pd.DataFrame(calendar[event_snap_columns + ["date", "d"]].values.T,
                            index=event_snap_columns + ["date", "d"])
transfer_cal

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
event_name_1,30,30,30,30,30,30,30,30,26,30,...,30,30,30,30,30,30,30,30,30,16
event_type_1,4,4,4,4,4,4,4,4,3,4,...,4,4,4,4,4,4,4,4,4,3
event_name_2,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
event_type_2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
snap_CA,0,0,0,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
snap_TX,0,0,0,1,0,1,0,1,1,1,...,0,1,1,1,0,1,0,0,0,0
snap_WI,0,0,0,0,1,1,0,1,1,0,...,0,1,1,0,1,1,0,0,0,0
date,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,2011-02-02 00:00:00,2011-02-03 00:00:00,2011-02-04 00:00:00,2011-02-05 00:00:00,2011-02-06 00:00:00,2011-02-07 00:00:00,...,2016-06-10 00:00:00,2016-06-11 00:00:00,2016-06-12 00:00:00,2016-06-13 00:00:00,2016-06-14 00:00:00,2016-06-15 00:00:00,2016-06-16 00:00:00,2016-06-17 00:00:00,2016-06-18 00:00:00,2016-06-19 00:00:00
d,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969


In [11]:
price_fea = calendar[['wm_yr_wk','date']].merge(sell_prices, on = ['wm_yr_wk'], how = 'left')
#販売価格に対して、calendarを結合した。
price_fea['id'] = price_fea['item_id']+'_'+price_fea['store_id']+'_evaluation'
df = price_fea.pivot('id','date','sell_price')
#各商品を一行にして、商品の値段の移り変わりを表した。

In [12]:
df.columns = ["d_" + str(d) for d in range(1, 1970)]

In [13]:
df = df.reset_index()

In [14]:
df

Unnamed: 0,id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,FOODS_1_001_CA_1_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
1,FOODS_1_001_CA_2_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
2,FOODS_1_001_CA_3_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
3,FOODS_1_001_CA_4_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
4,FOODS_1_001_TX_1_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30486,HOUSEHOLD_2_516_TX_3_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30487,HOUSEHOLD_2_516_WI_1_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30488,HOUSEHOLD_2_516_WI_2_evaluation,,,,,,,,,,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94


In [15]:
price_df = df.fillna(0)

In [16]:
price_df

Unnamed: 0,id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,FOODS_1_001_CA_1_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
1,FOODS_1_001_CA_2_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
2,FOODS_1_001_CA_3_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
3,FOODS_1_001_CA_4_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
4,FOODS_1_001_TX_1_evaluation,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30486,HOUSEHOLD_2_516_TX_3_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30487,HOUSEHOLD_2_516_WI_1_evaluation,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94
30488,HOUSEHOLD_2_516_WI_2_evaluation,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94,5.94


In [17]:
ste

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [18]:
ste = ste.rename(columns={"id": "total_id"})

In [19]:
mod_ste = ste.drop(columns=["item_id", "dept_id", "cat_id", "store_id", "state_id"])
mod_ste

Unnamed: 0,total_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0,0,2,2,0,3,1,4,1,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,5,0,1,1,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,0,6,0,2,2,4,1,8,5,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [20]:
encoder = LabelEncoder()
item_attribute_df = pd.DataFrame(ste["total_id"])
for col in ["item_id", "dept_id", "cat_id", "store_id", "state_id"]:
    item_attribute_df[col] = pd.Series(encoder.fit_transform(ste[col]), name=col)

item_attribute_df

Unnamed: 0,total_id,item_id,dept_id,cat_id,store_id,state_id
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0
...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2
30486,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2
30487,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2
30488,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2


In [21]:
type(price_df[price_df["id"] == "HOBBIES_1_001_CA_1_evaluation"])

pandas.core.frame.DataFrame

In [22]:
transfer_cal.transpose()

Unnamed: 0,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,date,d
0,30,4,4,2,0,0,0,2011-01-29,d_1
1,30,4,4,2,0,0,0,2011-01-30,d_2
2,30,4,4,2,0,0,0,2011-01-31,d_3
3,30,4,4,2,1,1,0,2011-02-01,d_4
4,30,4,4,2,1,0,1,2011-02-02,d_5
...,...,...,...,...,...,...,...,...,...
1964,30,4,4,2,0,1,1,2016-06-15,d_1965
1965,30,4,4,2,0,0,0,2016-06-16,d_1966
1966,30,4,4,2,0,0,0,2016-06-17,d_1967
1967,30,4,4,2,0,0,0,2016-06-18,d_1968


In [23]:
def train_data_generator(feature, cal):
    """
    6/7 未完成
    for文の中にもう一つfor文を作成し、
    所望の長さ(シーケンス)のデータをジェネレートするようにしたい
    
    -> 別のセルで、keras用のgeneratorクラスを作成することにした。
    これ自体は、トレーニングデータ作成に使えるので残す。

    feature: データ点列 × データ数のデータフレーム
    cal: カレンダーのデータフレーム

    引数は特徴量を追加した段階で足すかもしれない。

    """

    length = feature.shape[1]
    columns = feature.columns
    for i in range(0, length):
        feature_df =  pd.concat([cal.reset_index(drop=True), feature[columns[i]]], axis=1)
        yield feature_df

In [24]:
price_df
mod_ste
item_attribute_df
transfer_cal

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
event_name_1,30,30,30,30,30,30,30,30,26,30,...,30,30,30,30,30,30,30,30,30,16
event_type_1,4,4,4,4,4,4,4,4,3,4,...,4,4,4,4,4,4,4,4,4,3
event_name_2,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
event_type_2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
snap_CA,0,0,0,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
snap_TX,0,0,0,1,0,1,0,1,1,1,...,0,1,1,1,0,1,0,0,0,0
snap_WI,0,0,0,0,1,1,0,1,1,0,...,0,1,1,0,1,1,0,0,0,0
date,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,2011-02-02 00:00:00,2011-02-03 00:00:00,2011-02-04 00:00:00,2011-02-05 00:00:00,2011-02-06 00:00:00,2011-02-07 00:00:00,...,2016-06-10 00:00:00,2016-06-11 00:00:00,2016-06-12 00:00:00,2016-06-13 00:00:00,2016-06-14 00:00:00,2016-06-15 00:00:00,2016-06-16 00:00:00,2016-06-17 00:00:00,2016-06-18 00:00:00,2016-06-19 00:00:00
d,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969


In [35]:
col = "HOBBIES_1_001_CA_1_evaluation"
days = transfer_cal.transpose().shape[0]
cal_category = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
output_category = ["price", "sale", "item_id", "dept_id", "cat_id", "store_id", "state_id", 'event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']


price_abstract = price_df[price_df["id"] == col].drop(columns=["id"]).transpose().reset_index(drop=True)
mod_ste_abstract = mod_ste[mod_ste["total_id"] == col].drop(columns=["total_id"]).transpose().reset_index(drop=True)

id_info_abstract = pd.DataFrame(item_attribute_df[item_attribute_df["total_id"] == col].drop(columns=["total_id"]).values * np.ones((days,1)))

cal_abstract = transfer_cal.transpose()[cal_category]

tmp_df = pd.concat([price_abstract, mod_ste_abstract, id_info_abstract, cal_abstract], axis=1)
tmp_df.columns = output_category

tmp_df = tmp_df.fillna(0)


In [39]:
def train_df_generator(price_df, mod_ste, item_attribute_df, transfer_cal):
    
    days = transfer_cal.transpose().shape[0]
    cal_category = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    output_category = ["price", "sale", "item_id", "dept_id", "cat_id", "store_id", "state_id", 'event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']

    id_list = item_attribute_df["total_id"]

    for col in id_list: 
        price_abstract = price_df[price_df["id"] == col].drop(columns=["id"]).transpose().reset_index(drop=True)
        mod_ste_abstract = mod_ste[mod_ste["total_id"] == col].drop(columns=["total_id"]).transpose().reset_index(drop=True)

        id_info_abstract = pd.DataFrame(item_attribute_df[item_attribute_df["total_id"] == col].drop(columns=["total_id"]).values * np.ones((days, 1)))

        cal_abstract = transfer_cal.transpose()[cal_category]

        tmp_df = pd.concat([price_abstract, mod_ste_abstract, id_info_abstract, cal_abstract], axis=1)
        tmp_df.columns = output_category

        tmp_df = tmp_df.fillna(0)

        yield tmp_df


In [40]:
tdg = train_df_generator(price_df, mod_ste, item_attribute_df, transfer_cal)

In [43]:
next(tdg)

Unnamed: 0,price,sale,item_id,dept_id,cat_id,store_id,state_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,0.00,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1,0.00,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
2,0.00,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
3,0.00,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,1,1,0
4,0.00,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2.97,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,1,1
1965,2.97,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1966,2.97,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1967,2.97,0.0,1439.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0


In [44]:
OUTPUT_PATH = ".\\datas\\training_datas\\"
tdg = train_df_generator(price_df, mod_ste, item_attribute_df, transfer_cal)

# Create CSV Files Correspoding to Each id
for i, df in enumerate(tdg):
    df.to_csv(OUTPUT_PATH + "train_data" + str(i) + ".csv", index=False)

### Create Training Data & Prediction Input Data

In [4]:
NUM_ITEMS = 30490
OUTPUT_PATH = ".\\datas\\training_datas\\"

In [7]:
def train_data_from_csv_generator(num=NUM_ITEMS):
    for i in range(num):
        df = pd.read_csv(OUTPUT_PATH + "train_data" + str(i) +".csv")
        df = reduce_mem_usage(df, verbose=False)
        df = df.fillna(0)
        array = df.values
        #array = array / (np.max(array) - np.min(array)) #正規化して出力(そのままだと一瞬でLossがNaNになる(勾配爆発？))
        yield array

In [30]:
train_generator = train_data_from_csv_generator(num=1) 
next(train_generator).shape

(1969, 14)

In [6]:
"""
Create Training Datas & Labels
"""

train_generator = train_data_from_csv_generator(num=1) 
x_shape = next(train_generator).shape
timesteps = 28
#timesteps=10
delay = 1
num_samples=10
#num_samples = 1000 # ローカルのマシンだとこのサイズですらメモリが限界になる。
#num_samples = NUM_ITEMS

train_generator = train_data_from_csv_generator(num=num_samples) 

len_sequence, num_features = x_shape
sample_batchsize = len_sequence-timesteps+1 - delay

X_train = np.zeros((sample_batchsize*num_samples, timesteps, num_features))
Y_train = np.zeros((sample_batchsize*num_samples, timesteps, 1))

for i, array in enumerate(train_generator):
    for j in range(sample_batchsize - timesteps + 1 -delay):
        X_train[i*sample_batchsize+j, 0: timesteps] = array[j:j+timesteps]
        Y_train[i*sample_batchsize+j, 0: timesteps] = array[j+timesteps:j+2*timesteps , num_features-1].reshape(timesteps, 1)

In [8]:
def CreateTrainingData(timesteps=28, delay=1, num_samples=30490):
    x_shape = next(train_data_from_csv_generator(num=1)).shape

    train_generator = train_data_from_csv_generator(num=num_samples)

    len_sequence, num_features = x_shape
    sample_batchsize = len_sequence-timesteps+1 - delay

    X_train = np.zeros((sample_batchsize*num_samples, timesteps, num_features))
    Y_train = np.zeros((sample_batchsize*num_samples, timesteps, 1))

    for i, array in enumerate(train_generator):
        for j in range(sample_batchsize - timesteps + 1 -delay):
            X_train[i*sample_batchsize+j, 0: timesteps] = array[j:j+timesteps]
            Y_train[i*sample_batchsize+j, 0: timesteps] = array[j+timesteps:j+2*timesteps , num_features-1].reshape(timesteps, 1)

    return X_train, Y_train    

In [31]:
X_train, Y_train = CreateTrainingData(num_samples=10)

In [32]:
print(X_train.shape)
print(Y_train.shape)

(19410, 28, 14)
(19410, 28, 1)


In [11]:
"""
Evaluation用のデータ生成関数
"""

def GenerateInputForPrediction(num_samples=30490):
    TIMESTEPS = 28

    train_generator = train_data_from_csv_generator(num=1) 
    x_shape = next(train_generator).shape
    num_features = x_shape[1]

    #X_test = np.zeros((num_samples, TIMESTEPS, num_features))

    train_generator = train_data_from_csv_generator(num=num_samples)

    for i, array in enumerate(train_generator):
        #X_test[i] = array[-TIMESTEPS:]
        yield array[-TIMESTEPS:]

#    return X_test

In [13]:
X_test = GenerateInputForPrediction(10)
next(X_test).shape

(28, 14)

In [14]:
def MinMaxScaling(array):
    max_array = np.max(array)
    min_array = np.min(array)
    normalized_array = array - min_array / (max_array - min_array)
    return normalized_array, max_array, min_array

def Inverse_MinMaxScaling(normalized_array, max_array, min_array):
    array = normalized_array*(max_array - min_array) + min_array
    return array

In [15]:
def normalize(X):
    mean = np.mean(X, axis=(1,2), keepdims=True)
    sd = np.std(X, axis=(1,2), keepdims=True)
    return (X-mean)/(sd+1e-7)

In [20]:
from sklearn import preprocessing, metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers import RepeatVector,TimeDistributed, BatchNormalization
from numpy import array
from keras.models import Sequential, load_model
from keras.optimizers import Adam, RMSprop
#import utils_paths
import re
from tqdm import tqdm
import os

def build_model():
    timesteps = 28
    #timesteps=10
    delay = 1
    #num_samples=10

    n_out_seq_length = 28
    num_y = 1

    train_generator = train_data_from_csv_generator(num=1) 
    x_shape = next(train_generator).shape

    len_sequence, num_features = x_shape

    model = Sequential()

    model.add(LSTM(128, activation='relu', batch_input_shape=(None, timesteps, num_features), return_sequences=False))
    model.add(BatchNormalization())
    model.add(RepeatVector(timesteps))
    model.add(LSTM(32, activation='relu', return_sequences=True))
    model.add(BatchNormalization())
    #model.add(Dropout(0.1))  
    model.add(TimeDistributed(Dense(delay)))   # num_y means the shape of y,in some problem(like translate), it can be many.
                                                #In that case, you should set the  activation= 'softmax'
    
    RMSpropOptimizer = RMSprop(lr=0.001, clipvalue=0.5)
    model.compile(optimizer=RMSpropOptimizer, loss='mean_squared_error', metrics=["accuracy"])

    return model

In [21]:
model = build_model()







In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               73216     
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 28, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 28, 32)            20608     
_________________________________________________________________
batch_normalization_2 (Batch (None, 28, 32)            128       
_________________________________________________________________
time_distributed_1 (TimeDist (None, 28, 1)             33        
Total params: 94,497
Trainable params: 94,177
Non-trainable params: 320
_________________________________________________________________


In [23]:
X_train, X_max, X_min = MinMaxScaling(X_train)
#Y_train, Y_max, Y_min = MinMaxScaling(Y_train)

In [25]:
batch_size = 1000
history = model.fit(X_train, Y_train, epochs=10, batch_size=batch_size, validation_split=0.1)

Train on 873450 samples, validate on 97050 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
open("model.json" ,"w").write(model.to_json())
model.save_weights("weight.hdf5")

In [None]:
model_from_json(open("model.json", "w").read(model.to_json()))
model.load_weights("weight.hdf5")

In [225]:
X_test, X_test_max, X_test_min = MinMaxScaling(X_test)

In [226]:
prediction = model.predict(X_test)

In [228]:
prediction.shape

(10, 28, 1)