<a href="https://colab.research.google.com/github/kaggler-KyotoUni/kaggle-m5forecasting/blob/master/FeatureCreation_For_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [0]:
INPUT_DIR = "./drive/My Drive/kaggle/m5-forecasting/datas"

def read_data():
    cal = pd.read_csv(f"{INPUT_DIR}/calendar.csv")
    stv = pd.read_csv(f"{INPUT_DIR}/sales_train_validation.csv")
    ste = pd.read_csv(f"{INPUT_DIR}/sales_train_evaluation.csv")
    ss = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")
    sellp = pd.read_csv(f"{INPUT_DIR}/sell_prices.csv")
    
    return cal, stv, ste, ss, sellp

In [0]:
def reduce_mem_usage(df, verbose=True):
    """
    目的：メモリサイズの削減
    df: メモリを削減したい DataFrame (pandas.DataFrame)
    verbose: 実行時に、メモリ削減の情報を出力するかどうかを指定(bool)

    ■ 基本思想
    【前提知識】
    pandas で作成したデータフレームのうち数値データは、特に dtype を指定しない場合
    int64 または float64 でデータを作成するので、
    実際のデータよりもこの型が大きいと余計なメモリサイズを確保してしまう。

    【処理内容】
    (1) 入力された DataFrame の column の型を全てチェック(for loop)
    (2) その型が大きい数値データ(int16~int64, float16~float64)ならば、
        そのデータフレームの最大値・最小値をチェック。
        現在処理中のカラムを、上記の最大値・最小値を表せる必要最低限の型に変換する。
        int と floatに分けて処理。

    ────────────────────────────────────────────────────────────────────────
    【変更履歴】
    2020/06/06:
    ■ 35行目
    ifのネストが深かったので、リファクタ。
    Early Continueを入れたので可読性が向上(したはず)。

    ■ 46行目・71行目(置き換え・追加)
    説明変数(関数?)で置き換え。
    columnのtypeがintであるか否かを判定する関数を噛ませている。
    (返り値はbool値)
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    # main loop    
    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics: 
            continue # Early continue if column type is not numeric
        
        c_min = df[col].min()
        c_max = df[col].max()

        if IsInt(col_type):
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df


def IsInt(col_type):
    return str(col_type)[:3] == 'int'

In [0]:
cal, stv, ste, ss, sellp = read_data()

In [5]:
dfs = [cal, stv, ste, ss, sellp]
for df in dfs:
    df = reduce_mem_usage(df)

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Mem. usage decreased to 96.13 Mb (78.8% reduction)
Mem. usage decreased to  2.09 Mb (84.5% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)


In [0]:

train_sales = ste
calendar = cal
sell_prices = pd.read_csv(f"{INPUT_DIR}/sell_prices.csv")
# pd.pivot() を使うときに、メモリサイズを削減したものだとエラーになる模様。なので再度読み直し。対応策はないのか？
submission_file = ss

In [0]:
def transform(data):
    
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        data[feature].fillna('unknown', inplace = True)
        
    cat = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    for feature in cat:
        data[feature] = pd.get_dummies(data[feature])
    
    return data


In [0]:
days = range(1, 1970)
time_series_columns = [f'd_{i}' for i in days]

event_snap_columns = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']

transfer_cal = pd.DataFrame(calendar[event_snap_columns].values.T,
                            index=event_snap_columns,
                            columns=time_series_columns)
transfer_cal = transfer_cal.fillna(0)


In [9]:
"""
※注意※

ここで、使用メモリを減らすためにcalenderの範囲が減らされている。
増やすと単純に精度向上が可能？
-> もしフルに使うと、ローカルのメモリが死ぬ

"""

calendar['date'] = pd.to_datetime(calendar['date'])
#calendar = calendar[calendar['date']>= '2014-3-15']  #reduce memory
#calendar = calendar[calendar["date"] <= "2016-05-22"] #eliminate evaluate date
#使うデータを少なくします -> TrainingDataのステップ数が800になるよう設定
calendar= transform(calendar)
# Attempts to convert events into time series data.
transfer_cal = pd.DataFrame(calendar[event_snap_columns + ["date", "d"]].values.T,
                            index=event_snap_columns + ["date", "d"])
transfer_cal

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
event_name_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
event_type_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
event_name_2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
event_type_2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
snap_CA,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1
snap_TX,1,1,1,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,1,1,1
snap_WI,1,1,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,1,1
date,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,2011-02-02 00:00:00,2011-02-03 00:00:00,2011-02-04 00:00:00,2011-02-05 00:00:00,2011-02-06 00:00:00,2011-02-07 00:00:00,2011-02-08 00:00:00,2011-02-09 00:00:00,2011-02-10 00:00:00,2011-02-11 00:00:00,2011-02-12 00:00:00,2011-02-13 00:00:00,2011-02-14 00:00:00,2011-02-15 00:00:00,2011-02-16 00:00:00,2011-02-17 00:00:00,2011-02-18 00:00:00,2011-02-19 00:00:00,2011-02-20 00:00:00,2011-02-21 00:00:00,2011-02-22 00:00:00,2011-02-23 00:00:00,2011-02-24 00:00:00,2011-02-25 00:00:00,2011-02-26 00:00:00,2011-02-27 00:00:00,2011-02-28 00:00:00,2011-03-01 00:00:00,2011-03-02 00:00:00,2011-03-03 00:00:00,2011-03-04 00:00:00,2011-03-05 00:00:00,2011-03-06 00:00:00,2011-03-07 00:00:00,2011-03-08 00:00:00,2011-03-09 00:00:00,...,2016-05-11 00:00:00,2016-05-12 00:00:00,2016-05-13 00:00:00,2016-05-14 00:00:00,2016-05-15 00:00:00,2016-05-16 00:00:00,2016-05-17 00:00:00,2016-05-18 00:00:00,2016-05-19 00:00:00,2016-05-20 00:00:00,2016-05-21 00:00:00,2016-05-22 00:00:00,2016-05-23 00:00:00,2016-05-24 00:00:00,2016-05-25 00:00:00,2016-05-26 00:00:00,2016-05-27 00:00:00,2016-05-28 00:00:00,2016-05-29 00:00:00,2016-05-30 00:00:00,2016-05-31 00:00:00,2016-06-01 00:00:00,2016-06-02 00:00:00,2016-06-03 00:00:00,2016-06-04 00:00:00,2016-06-05 00:00:00,2016-06-06 00:00:00,2016-06-07 00:00:00,2016-06-08 00:00:00,2016-06-09 00:00:00,2016-06-10 00:00:00,2016-06-11 00:00:00,2016-06-12 00:00:00,2016-06-13 00:00:00,2016-06-14 00:00:00,2016-06-15 00:00:00,2016-06-16 00:00:00,2016-06-17 00:00:00,2016-06-18 00:00:00,2016-06-19 00:00:00
d,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,d_35,d_36,d_37,d_38,d_39,d_40,...,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941,d_1942,d_1943,d_1944,d_1945,d_1946,d_1947,d_1948,d_1949,d_1950,d_1951,d_1952,d_1953,d_1954,d_1955,d_1956,d_1957,d_1958,d_1959,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969


In [10]:
calendar

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,0,0,0,0,1,1,1
1,2011-01-30,11101,Sunday,2,1,2011,d_2,0,0,0,0,1,1,1
2,2011-01-31,11101,Monday,3,1,2011,d_3,0,0,0,0,1,1,1
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,0,0,0,0,0,0,1
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,0,0,0,0,1,0,0
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,0,0,0,0,1,1,1
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,0,0,0,0,1,1,1
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,0,0,0,0,1,1,1


In [11]:
pd.get_dummies(stv["cat_id"])

Unnamed: 0,FOODS,HOBBIES,HOUSEHOLD
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
30485,1,0,0
30486,1,0,0
30487,1,0,0
30488,1,0,0


In [12]:
pd.get_dummies(stv["store_id"])

Unnamed: 0,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
30485,0,0,0,0,0,0,0,0,0,1
30486,0,0,0,0,0,0,0,0,0,1
30487,0,0,0,0,0,0,0,0,0,1
30488,0,0,0,0,0,0,0,0,0,1


In [13]:
stv.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1874,d_1875,d_1876,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,3,1,3,1,2,2,0,1,1,1,1,0,0,0,0,0,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,3,4,2,1,4,1,3,5,0,6,6,0,0,0,0,3,1,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,3,2,2,2,3,1,0,0,0,0,1,0,4,4,0,1,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4


In [14]:
pd.get_dummies(stv["dept_id"])

Unnamed: 0,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
30485,0,0,1,0,0,0,0
30486,0,0,1,0,0,0,0
30487,0,0,1,0,0,0,0
30488,0,0,1,0,0,0,0


In [15]:
pd.get_dummies(stv["item_id"])

Unnamed: 0,FOODS_1_001,FOODS_1_002,FOODS_1_003,FOODS_1_004,FOODS_1_005,FOODS_1_006,FOODS_1_008,FOODS_1_009,FOODS_1_010,FOODS_1_011,FOODS_1_012,FOODS_1_013,FOODS_1_014,FOODS_1_015,FOODS_1_016,FOODS_1_017,FOODS_1_018,FOODS_1_019,FOODS_1_020,FOODS_1_021,FOODS_1_022,FOODS_1_023,FOODS_1_024,FOODS_1_025,FOODS_1_026,FOODS_1_027,FOODS_1_028,FOODS_1_029,FOODS_1_030,FOODS_1_031,FOODS_1_032,FOODS_1_033,FOODS_1_034,FOODS_1_035,FOODS_1_036,FOODS_1_037,FOODS_1_038,FOODS_1_039,FOODS_1_040,FOODS_1_041,...,HOUSEHOLD_2_477,HOUSEHOLD_2_478,HOUSEHOLD_2_479,HOUSEHOLD_2_480,HOUSEHOLD_2_481,HOUSEHOLD_2_482,HOUSEHOLD_2_483,HOUSEHOLD_2_484,HOUSEHOLD_2_485,HOUSEHOLD_2_486,HOUSEHOLD_2_487,HOUSEHOLD_2_488,HOUSEHOLD_2_489,HOUSEHOLD_2_490,HOUSEHOLD_2_491,HOUSEHOLD_2_492,HOUSEHOLD_2_493,HOUSEHOLD_2_494,HOUSEHOLD_2_495,HOUSEHOLD_2_496,HOUSEHOLD_2_497,HOUSEHOLD_2_498,HOUSEHOLD_2_499,HOUSEHOLD_2_500,HOUSEHOLD_2_501,HOUSEHOLD_2_502,HOUSEHOLD_2_503,HOUSEHOLD_2_504,HOUSEHOLD_2_505,HOUSEHOLD_2_506,HOUSEHOLD_2_507,HOUSEHOLD_2_508,HOUSEHOLD_2_509,HOUSEHOLD_2_510,HOUSEHOLD_2_511,HOUSEHOLD_2_512,HOUSEHOLD_2_513,HOUSEHOLD_2_514,HOUSEHOLD_2_515,HOUSEHOLD_2_516
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30487,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30488,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
pd.get_dummies(stv["state_id"])

Unnamed: 0,CA,TX,WI
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
30485,0,0,1
30486,0,0,1
30487,0,0,1
30488,0,0,1


# エンコーディングの方針
item_id はラベルエンコーディングしてEmbedding layerにぶち込む -> 余裕があれば最後にやる。最初はこの特徴量をそもそも入れない。<br />
ほかのやつはone-hot encodingにする。<br />
<br />
ジェネレータも、圧縮ファイルから随時読み込む形に書き換える。


In [0]:
calendar = cal

In [18]:
required_columns = ["d", "event_name_1", "event_name_2", "event_type_1", "event_type_2", "snap_CA", "snap_TX", "snap_WI"]
calendar[required_columns].max()

d               d_999
event_name_1        1
event_name_2        1
event_type_1        1
event_type_2        1
snap_CA             1
snap_TX             1
snap_WI             1
dtype: object

In [19]:
cal["event_name_1"].unique()

array([0, 1], dtype=uint8)

In [0]:
# one-hot encodingしたものがあった
cal = pd.read_csv(f"{INPUT_DIR}/cal_dummies.csv")

In [21]:
cal.head()

Unnamed: 0.1,Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,snap_CA,snap_TX,snap_WI,event_name_1_Chanukah End,event_name_1_Christmas,event_name_1_Cinco De Mayo,event_name_1_ColumbusDay,event_name_1_Easter,event_name_1_Eid al-Fitr,event_name_1_EidAlAdha,event_name_1_Father's day,event_name_1_Halloween,event_name_1_IndependenceDay,event_name_1_LaborDay,event_name_1_LentStart,event_name_1_LentWeek2,event_name_1_MartinLutherKingDay,event_name_1_MemorialDay,event_name_1_Mother's day,event_name_1_NBAFinalsEnd,event_name_1_NBAFinalsStart,event_name_1_NewYear,event_name_1_OrthodoxChristmas,event_name_1_OrthodoxEaster,event_name_1_Pesach End,event_name_1_PresidentsDay,event_name_1_Purim End,event_name_1_Ramadan starts,event_name_1_StPatricksDay,event_name_1_SuperBowl,event_name_1_Thanksgiving,event_name_1_ValentinesDay,event_name_1_VeteransDay,event_type_1_Cultural,event_type_1_National,event_type_1_Religious,event_type_1_Sporting,event_name_2_Cinco De Mayo,event_name_2_Easter,event_name_2_Father's day,event_name_2_OrthodoxEaster,event_type_2_Cultural,event_type_2_Religious
0,0,2011-01-29,11101,Saturday,1,1,2011,d_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2011-01-30,11101,Sunday,2,1,2011,d_2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,2011-01-31,11101,Monday,3,1,2011,d_3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,2011-02-01,11101,Tuesday,4,2,2011,d_4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,2011-02-02,11101,Wednesday,5,2,2011,d_5,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
cal = cal.drop(columns=["Unnamed: 0", "date", "wm_yr_wk", "weekday", "wday", "month", "year"])

In [0]:
"""
zipからデータ読み出し。
展開しないのでディスク容量も圧迫せず済む
"""
import zipfile
import shutil
import os

NUM_ITEMS = 30490
DATA_PATH = "./drive/My Drive/kaggle/m5-forecasting/datas/training_datas.zip"

def df_csv_generator(num=NUM_ITEMS, datapath=DATA_PATH):
    with zipfile.ZipFile(datapath) as myzip:
        filelist = myzip.namelist()

        for i, f_name in enumerate(filelist):

            if i == 0:
                continue

            if i > num:
                break

            df = pd.read_csv(myzip.extract(f_name, "./extract_dir"))
            df = reduce_mem_usage(df, verbose=False)
            df = df.fillna(0)
            shutil.rmtree("./extract_dir")
            yield df

In [0]:
dfg = df_csv_generator()

In [110]:
next(dfg)

Unnamed: 0,price,sale,item_id,dept_id,cat_id,store_id,state_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,6.441406,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1,6.441406,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
2,6.441406,1.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
3,6.441406,3.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,1,1,0
4,6.441406,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,6.679688,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,1,1
1965,6.679688,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1966,6.679688,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0
1967,6.679688,0.0,1537.0,3.0,1.0,0.0,0.0,30,4,4,2,0,0,0


In [26]:
drop_columns = ["item_id", "dept_id", "cat_id", "store_id", "state_id", "event_name_1", "event_name_2", "event_type_1", "event_type_2", "snap_CA", "snap_TX", "snap_WI"]
test_df.drop(columns=drop_columns)

Unnamed: 0,price,sale
0,0.000000,0.0
1,0.000000,0.0
2,0.000000,0.0
3,0.000000,0.0
4,0.000000,0.0
...,...,...
1964,8.382812,0.0
1965,8.382812,0.0
1966,8.382812,0.0
1967,8.382812,0.0


In [27]:
pd.concat([cal.drop(columns=["d"]), test_df.drop(columns=drop_columns)], axis=1)

Unnamed: 0,snap_CA,snap_TX,snap_WI,event_name_1_Chanukah End,event_name_1_Christmas,event_name_1_Cinco De Mayo,event_name_1_ColumbusDay,event_name_1_Easter,event_name_1_Eid al-Fitr,event_name_1_EidAlAdha,event_name_1_Father's day,event_name_1_Halloween,event_name_1_IndependenceDay,event_name_1_LaborDay,event_name_1_LentStart,event_name_1_LentWeek2,event_name_1_MartinLutherKingDay,event_name_1_MemorialDay,event_name_1_Mother's day,event_name_1_NBAFinalsEnd,event_name_1_NBAFinalsStart,event_name_1_NewYear,event_name_1_OrthodoxChristmas,event_name_1_OrthodoxEaster,event_name_1_Pesach End,event_name_1_PresidentsDay,event_name_1_Purim End,event_name_1_Ramadan starts,event_name_1_StPatricksDay,event_name_1_SuperBowl,event_name_1_Thanksgiving,event_name_1_ValentinesDay,event_name_1_VeteransDay,event_type_1_Cultural,event_type_1_National,event_type_1_Religious,event_type_1_Sporting,event_name_2_Cinco De Mayo,event_name_2_Easter,event_name_2_Father's day,event_name_2_OrthodoxEaster,event_type_2_Cultural,event_type_2_Religious,price,sale
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.0
4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.382812,0.0
1965,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.382812,0.0
1966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.382812,0.0
1967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.382812,0.0


In [28]:
ste = ste.rename(columns={"id": "total_id"})
mod_ste = ste.drop(columns=["item_id", "dept_id", "cat_id", "store_id", "state_id"])
mod_ste

Unnamed: 0,total_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,d_35,d_36,d_37,d_38,d_39,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,3,0,1,1,1,3,0,1,1,0,0,0,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,2,2,1,2,1,1,1,0,1,1,1,0,0,1,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,...,3,0,1,0,5,4,1,0,1,3,7,2,0,0,1,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,2,1,1,0,1,1,2,2,2,4,1,0,2,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0,0,2,2,0,3,1,4,1,0,0,3,4,4,0,0,1,0,1,1,7,7,3,6,3,3,7,12,4,2,7,5,12,5,3,3,7,6,4,...,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,2,2,0,0,0,2,0,0,1,1,1,0,0,1,0,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,5,0,1,1,3,1,1,0,4,2,0,1,2,1,1,0,0,0,0,3,1,1,1,2,0,1,0,1,1,1,3,3,4,4,...,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,0,6,0,2,2,4,1,8,5,2,7,5,3,5,20,8,10,3,3,4,7,2,3,5,6,3,4,1,2,5,1,2,2,2,1,9,8,1,6,...,0,0,2,1,0,2,0,1,0,0,1,0,0,0,1,1,0,2,1,1,0,0,1,0,0,1,0,3,3,1,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,1,0,0,1,0,3,1,3,1,3,0,1,2,1,0,2,1,1,2,0,2,1,1,0,2,1,1,1,1,4,6,0,1,1,1,0


In [0]:
# 各商品の属性(食品、地域など)を記載したデータフレーム作成(item_idだけ除外)
item_attribute_df = pd.DataFrame(ste["total_id"])

dummy_list = ["dept_id", "cat_id", "store_id", "state_id"]
for col in dummy_list:
    item_attribute_df = pd.concat([item_attribute_df, pd.get_dummies(ste[col])], axis=1)

item_cat_df = item_attribute_df.drop(columns=["total_id"])

In [30]:
item_cat_df

Unnamed: 0,FOODS_1,FOODS_2,FOODS_3,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS,HOBBIES,HOUSEHOLD,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3,CA,TX,WI
0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
30486,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
30487,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
30488,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [0]:
def attribute_df_generator(item_cat_df):
    for i in range(item_cat_df.shape[0]):
        yield item_cat_df.loc[i]

In [32]:
atg = attribute_df_generator(item_cat_df)
(next(atg).values * np.ones((1969, 1))).shape

(1969, 23)

In [0]:
drop_columns = ["item_id", "dept_id", "cat_id", "store_id", "state_id", "event_name_1", "event_name_2", "event_type_1", "event_type_2", "snap_CA", "snap_TX", "snap_WI"]

def train_df_generator(item_cat_df, cat, num=30490):
    dfg = df_csv_generator()
    adg = attribute_df_generator(item_cat_df)
    for i, df in enumerate(dfg):
        if i >= num:
            break
        tmp_df = df.drop(columns=drop_columns)
        tmp_cat_df = pd.DataFrame((next(adg).values * np.ones((1969, 1))).astype(int))
        ret_df = pd.concat([cal.drop(columns=["d"]), tmp_cat_df, tmp_df], axis=1)
        yield ret_df

In [0]:
tdg = train_df_generator(item_cat_df, cal, 10)

In [35]:
next(tdg).shape

(1969, 68)

In [0]:
import os

In [37]:
os.getcwd()

'/content'

In [38]:
os.path.isdir("./training_datas")

True

In [0]:
os.mkdir("./training_datas")

In [134]:
OUTPUT_PATH = "./training_datas/"
tdg = train_df_generator(item_cat_df, cal)
shape_list = []
for i, df in enumerate(tdg):
     if i / 1000 == i // 1000:
         print(i)
     #df.to_csv(OUTPUT_PATH + "train_data" + str(i) + ".csv", index=False, compression="zip")
     df.to_pickle(OUTPUT_PATH + "train_data" + str(i) + ".zip")
     shape_list.append(df.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000


In [118]:
import pickle

with zipfile.ZipFile("./training_datas/train_data0.zip") as zip:
    for info in zip.infolist():
        if info.is_dir():
            continue
        data = pickle.loads(zip.read(info.filename))
        print("\n", data)


       snap_CA  snap_TX  snap_WI  ...  22     price  sale
0           0        0        0  ...   0  0.000000   0.0
1           0        0        0  ...   0  0.000000   0.0
2           0        0        0  ...   0  0.000000   0.0
3           1        1        0  ...   0  0.000000   0.0
4           1        0        1  ...   0  0.000000   0.0
...       ...      ...      ...  ...  ..       ...   ...
1964        0        1        1  ...   0  8.382812   0.0
1965        0        0        0  ...   0  8.382812   0.0
1966        0        0        0  ...   0  8.382812   0.0
1967        0        0        0  ...   0  8.382812   0.0
1968        0        0        0  ...   0  8.382812   0.0

[1969 rows x 68 columns]


In [0]:
listdir = os.listdir("./training_datas")

In [129]:
# 以下二つは、作成したpickleファイルからジェネレートする際に必要

listdir.sort()
listdir

['train_data0.zip',
 'train_data1.zip',
 'train_data2.zip',
 'train_data3.zip',
 'train_data4.zip',
 'train_data5.zip',
 'train_data6.zip',
 'train_data7.zip',
 'train_data8.zip',
 'train_data9.zip']

In [130]:
for i in range(len(listdir)):
    with zipfile.ZipFile("./training_datas/" + listdir[i]) as zip:
        for info in zip.infolist():
            if info.is_dir():
                continue
            data = pickle.loads(zip.read(info.filename))
            print("\n", data)


       snap_CA  snap_TX  snap_WI  ...  22     price  sale
0           0        0        0  ...   0  0.000000   0.0
1           0        0        0  ...   0  0.000000   0.0
2           0        0        0  ...   0  0.000000   0.0
3           1        1        0  ...   0  0.000000   0.0
4           1        0        1  ...   0  0.000000   0.0
...       ...      ...      ...  ...  ..       ...   ...
1964        0        1        1  ...   0  8.382812   0.0
1965        0        0        0  ...   0  8.382812   0.0
1966        0        0        0  ...   0  8.382812   0.0
1967        0        0        0  ...   0  8.382812   0.0
1968        0        0        0  ...   0  8.382812   0.0

[1969 rows x 68 columns]

       snap_CA  snap_TX  snap_WI  ...  22     price  sale
0           0        0        0  ...   0  0.000000   0.0
1           0        0        0  ...   0  0.000000   0.0
2           0        0        0  ...   0  0.000000   0.0
3           1        1        0  ...   0  0.000000   0.0
4

In [139]:
!zip -r training_datas.zip ./training_datas

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
  adding: training_datas/train_data26021.zip (stored 0%)
  adding: training_datas/train_data11559.zip (stored 0%)
  adding: training_datas/train_data491.zip (stored 0%)
  adding: training_datas/train_data11065.zip (stored 0%)
  adding: training_datas/train_data27532.zip (stored 0%)
  adding: training_datas/train_data24038.zip (stored 0%)
  adding: training_datas/train_data16760.zip (stored 0%)
  adding: training_datas/train_data20995.zip (stored 0%)
  adding: training_datas/train_data17258.zip (stored 0%)
  adding: training_datas/train_data8467.zip (stored 0%)
  adding: training_datas/train_data3207.zip (stored 0%)
  adding: training_datas/train_data27206.zip (stored 0%)
  adding: training_datas/train_data18108.zip (stored 0%)
  adding: training_datas/train_data19354.zip (stored 0%)
  adding: training_datas/train_data28384.zip (stored 0%)
  adding: training_datas/train_data12495.zip (stored 0%)
  adding: training_datas/train_data28232.zip (

In [0]:
%cd ..

/content


In [0]:
!ls

drive  sample_data  training_datas


In [0]:
!rm -r training_datas/

In [61]:
with zipfile.ZipFile("./training_datas.zip") as myzip:
    filelist = myzip.namelist()
    df = pd.read_csv(myzip.extract(filelist[10]))

print(df.shape)
print(len(filelist))

UnicodeDecodeError: ignored

In [197]:
for i in range(len(listdir[10])):
    with zipfile.ZipFile("./training_datas/" + listdir[i]) as zip:
        filelist = zip.namelist()
        print(filelist)
        data = pickle.loads(zip.read(filelist[0]))
        print(data.shape)
        print(data)
        # for info in zip.infolist():
        #     if info.is_dir():
        #         continue
        #     data = pickle.loads(zip.read(info.filename))
        #     print("\n", data)

['./training_datas/train_data0.zip']
(1969, 68)
      snap_CA  snap_TX  snap_WI  ...  22     price  sale
0           0        0        0  ...   0  0.000000   0.0
1           0        0        0  ...   0  0.000000   0.0
2           0        0        0  ...   0  0.000000   0.0
3           1        1        0  ...   0  0.000000   0.0
4           1        0        1  ...   0  0.000000   0.0
...       ...      ...      ...  ...  ..       ...   ...
1964        0        1        1  ...   0  8.382812   0.0
1965        0        0        0  ...   0  8.382812   0.0
1966        0        0        0  ...   0  8.382812   0.0
1967        0        0        0  ...   0  8.382812   0.0
1968        0        0        0  ...   0  8.382812   0.0

[1969 rows x 68 columns]
['./training_datas/train_data1.zip']
(1969, 68)
      snap_CA  snap_TX  snap_WI  ...  22     price  sale
0           0        0        0  ...   0  0.000000   0.0
1           0        0        0  ...   0  0.000000   0.0
2           0        0 

In [191]:
    listdir = os.listdir("./training_datas")
    listdir.sort()
    listdir[:10]

['train_data0.zip',
 'train_data1.zip',
 'train_data10.zip',
 'train_data100.zip',
 'train_data1000.zip',
 'train_data10000.zip',
 'train_data10001.zip',
 'train_data10002.zip',
 'train_data10003.zip',
 'train_data10004.zip']

In [0]:
listdir = os.listdir(DATA_PATH)
listdir[:10]

In [11]:
os.path.isdir("./drive/My Drive/kaggle/m5-forecasting/datas/training_datas_onehot/training_datas")

True

In [12]:
os.path.isdir(DATA_PATH)

True

# データ生成後は以下を実行するだけでOK
→ .py にして一部はライブラリ化する？

In [0]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

def reduce_mem_usage(df, verbose=True):
    """
    目的：メモリサイズの削減
    df: メモリを削減したい DataFrame (pandas.DataFrame)
    verbose: 実行時に、メモリ削減の情報を出力するかどうかを指定(bool)

    ■ 基本思想
    【前提知識】
    pandas で作成したデータフレームのうち数値データは、特に dtype を指定しない場合
    int64 または float64 でデータを作成するので、
    実際のデータよりもこの型が大きいと余計なメモリサイズを確保してしまう。

    【処理内容】
    (1) 入力された DataFrame の column の型を全てチェック(for loop)
    (2) その型が大きい数値データ(int16~int64, float16~float64)ならば、
        そのデータフレームの最大値・最小値をチェック。
        現在処理中のカラムを、上記の最大値・最小値を表せる必要最低限の型に変換する。
        int と floatに分けて処理。

    ────────────────────────────────────────────────────────────────────────
    【変更履歴】
    2020/06/06:
    ■ 35行目
    ifのネストが深かったので、リファクタ。
    Early Continueを入れたので可読性が向上(したはず)。

    ■ 46行目・71行目(置き換え・追加)
    説明変数(関数?)で置き換え。
    columnのtypeがintであるか否かを判定する関数を噛ませている。
    (返り値はbool値)
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    # main loop    
    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics: 
            continue # Early continue if column type is not numeric
        
        c_min = df[col].min()
        c_max = df[col].max()

        if IsInt(col_type):
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df


def IsInt(col_type):
    return str(col_type)[:3] == 'int'

In [0]:
"""
zipからデータ読み出し。
展開しないのでディスク容量も圧迫せず済む
6/17: 追記

myzip.extract(f_name) が、カレントディレクトリに圧縮ファイルを展開してしまう。
→ バグにつながっていた。

【対処法】
ファイル展開用のフォルダを作成し、そこへ展開した後、生成されたcsvは削除するコードを追加。

※ このコードの考え方を、ReccurentTrainGeneratorへ応用

"""
import pickle
import zipfile
NUM_ITEMS = 30490
DATA_PATH = "./drive/My Drive/kaggle/m5-forecasting/datas/training_datas_onehot/training_datas"

def train_data_from_pickle_generator(num=NUM_ITEMS, datapath=DATA_PATH):
    listdir = os.listdir(datapath)
    #listdir.sort()

    for i in range(len(listdir)):

        if i > num:
            break

        with zipfile.ZipFile(datapath + "/" + listdir[i]) as tmpzip:
            filelist = tmpzip.namelist()

            df = pickle.loads(tmpzip.read(filelist[0]))
            df = reduce_mem_usage(df, verbose=False)
            df = df.fillna(0)
            array = df.values
        
            yield array

In [0]:
tpg = train_data_from_pickle_generator()

In [13]:
next(tpg)

array([[0.  , 0.  , 0.  , ..., 1.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 1.  , 2.96, 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 2.96, 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 2.96, 0.  ]], dtype=float16)

In [0]:
from keras.utils import Sequence
from keras.models import Sequential

"""
model.fit_generatorを使うためのユーザ定義関数
※ generator を使わないとメモリが死ぬ
"""
class ReccurentTrainGenerator(Sequence):
    def _resetindices(self):
        """
        バッチ生成用のインデックスをランダムに出力
        """
        self.num_called = 0

        all_idx = np.random.permutation(np.arange(self.num_batches))
        remain_idx = np.random.choice(np.arange(self.num_batches),
                                      size=(self.steps_per_epoch*self.batch_size-len(all_idx)),
                                      replace=False)
        
        self.indices = np.hstack([all_idx, remain_idx]).reshape(self.steps_per_epoch, self.batch_size)

    def __init__(self, DataPath, batch_size, InputSteps=28, OutputSteps=28, delay=1, normalize_factor=None, sample_indices=np.arange(0, 30490)):
        """
        【入力】
        InputTensor: 入力データ(説明変数) データ数("HOBBIES_1_..."などに対応) × データ点数(時系列方向のデータ数) × 特徴量数 のndarray
                     ※ 正解ラベルも、この時系列データからとるのでこれだけ入力すればOK
                     ※ 今回は、引数 DataPathの先に圧縮してあるファイル(pickle)を都度解凍して利用する 
        batch_size: バッチサイズ(例えば、timestepが5として、時刻0~4までのデータ、1~5までのデータ、...、10~14までのデータ、
                                をひとまとめにして1データとみなすとする。RNNの場合はこのサイズがバッチサイズに対応する。)
        InputSteps: リカレント層に食わせるデータを、何ステップ前までのデータにするか
        OutputSteps: リカレント層からの出力(予測ステップ数)の設定値
        delay: 目的変数をどの程度遅らせるか？(予測ステップのスタート位置をどの程度後ろにずらすか)
        normalize_factor: 正規化する際のスケーリングをどの程度にするか

        6/16: 正解ラベル作成について、ラベルが間違っている可能性あり。
        6/17: 要素数を選択できるようにする (引数 num_samplesでも作る？)
        【構成案】
        sklearn.model_selection.KFold などで得たインデックスを流用できる形にする。
        ⇒ インデックスの配列をself.sample_indicesに突っ込み、その組を並び替える形で使用
        → _resetindicesなども修正対象
        """
        # データファイル名リストの取得
        self.datapath = DataPath
        self.listdir = os.listdir(DataPath)
        self.sample_indices = sample_indices

        with zipfile.ZipFile(self.datapath + "/" + self.listdir[sample_indices[0]]) as tmpzip:
            filelist = tmpzip.namelist()

            df = pickle.loads(tmpzip.read(filelist[0]))
            df = reduce_mem_usage(df, verbose=False)
            df = df.fillna(0)

        # 現在のエポックでバッチ生成の対象となっているデータ系列
        self.now_data = df.values

        # 各種パラメータ
        self.num_datas = len(self.sample_indices)
        self.len_sequence = df.shape[0]
        self.num_features = df.shape[1]
        self.batch_size = batch_size
        self.input_steps = InputSteps
        self.output_steps = OutputSteps
        self.delay = delay 

        # 各データ系列に対し、バッチサイズいくつ作れるか計算するのに必要な値
        self.len_requied_per_batch = InputSteps + (batch_size-1) + (delay-1) + OutputSteps # 訓練データと正解データを作るために必要なサイズ 
        self.num_batches = self.len_sequence - self.len_requied_per_batch + 1              # 作れるバッチの数

        # 1エポック当たりのステップ数
        self.steps_per_epoch = int(np.ceil(self.len_sequence / float(batch_size)))
        
        # バッチ生成用の乱数初期化
        self._resetindices()

        # データ取得用インデックス生成
        self.data_idx = self._reset_dataset_indices(self.num_datas)
        self.num_epoch = 0

        self.normalize_factor = normalize_factor


    def __len__(self):
        """
        1エポックあたりのステップ数をリターン
        """
        return self.steps_per_epoch

    def __getitem__(self, idx):
        """
        データをバッチにまとめて出力
        """
        indices_temp = self.indices[idx] # indices は (steps_per_epoch, batchsize)の array

        batch_x = np.array([self.now_data[i:i+self.input_steps] for i in indices_temp])
        batch_y = np.array([self.now_data[i+self.input_steps+(self.delay-1):i+self.input_steps+(self.delay-1)+self.output_steps, -1] for i in indices_temp]).reshape(self.batch_size, self.output_steps, 1)

        if self.num_called == (self.steps_per_epoch-1):
            self._resetindices()
        else:
            self.num_called += 1

        if self.normalize_factor:
            batch_x = batch_x / self.normalize_factor
            batch_y = batch_y / self.normalize_factor

        return batch_x, batch_y

    def on_epoch_end(self):
        """
        Epoch 終了ごとにデータセットを入れ替える
        (データセット："HOBBIES_1_..."などに対応)
        """
        if self.num_epoch == self.num_datas:
            self.num_epoch = 0
            self.data_idx = self._reset_dataset_indices(self.num_datas)
        else:
            self.num_epoch += 1
        
        next_data_idx = self.data_idx[self.num_epoch]

        with zipfile.ZipFile(self.datapath + "/" + self.listdir[self.sample_indices[next_data_idx]]) as tmpzip:
            filelist = tmpzip.namelist()

            tmp_df = pickle.loads(tmpzip.read(filelist[0]))
            tmp_df = reduce_mem_usage(tmp_df, verbose=False)
            tmp_df = tmp_df.fillna(0)
            
            self.nowdata = tmp_df.values

    def _reset_dataset_indices(self, num_datas):
        """
        Epoch毎に入れ替えるデータのインデックスをランダムにするためのメソッド
        """
        return np.random.permutation(np.arange(num_datas))

In [0]:
# 動作未チェック

from sklearn import preprocessing, metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers import RepeatVector,TimeDistributed, BatchNormalization
from numpy import array
from keras.models import Sequential, load_model
from keras.optimizers import Adam, RMSprop
import re
from tqdm import tqdm
import os

"""
仮のモデル
ハイパーパラメータを引数にとれるよう改造すべき？
※ チューニングができるように
"""
def build_model():
    timesteps = 28
    delay = 1

    n_out_seq_length = 28
    num_y = 1

    train_generator = train_data_from_pickle_generator(num=1)
#    train_generator = train_data_from_pickle_generator(num=1, datapath="./training_datas") 
    x_shape = next(train_generator).shape
    print(x_shape)

    len_sequence, num_features = x_shape

    model = Sequential()

    model.add(LSTM(128, activation='relu', batch_input_shape=(None, timesteps, num_features), return_sequences=False))
    model.add(BatchNormalization())
    model.add(RepeatVector(28))
    model.add(LSTM(32, activation='relu', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))  
    model.add(TimeDistributed(Dense(delay, activation="relu")))   # num_y means the shape of y,in some problem(like translate), it can be many.
                                                #In that case, you should set the  activation= 'softmax'
    
    #RMSpropOptimizer = RMSprop(lr=0.001, clipvalue=0.5)
    #model.compile(optimizer=RMSpropOptimizer, loss='mean_squared_error', metrics=["accuracy"])
    model.compile(optimizer="adam", loss='mean_squared_error', metrics=["accuracy"])


    return model

In [19]:
model = build_model()
model.summary()

(1969, 68)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               100864    
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 28, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 28, 32)            20608     
_________________________________________________________________
batch_normalization_2 (Batch (None, 28, 32)            128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 28, 32)            0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 28, 1) 

In [0]:
DATA_PATH = "./drive/My Drive/kaggle/m5-forecasting/datas/training_datas_onehot/training_datas"
#DATA_PATH = "./training_datas.zip"
#DATA_PATH = "./training_datas"
RTG = ReccurentTrainGenerator(DataPath=DATA_PATH, batch_size=128, InputSteps=28, sample_indices=np.arange(0,1000))
Validation_RTG = ReccurentTrainGenerator(DataPath=DATA_PATH, batch_size=128, InputSteps=28, sample_indices=np.arange(9000,10000))

In [32]:
from keras.callbacks import EarlyStopping 
 
# Early-stopping: patienceはもう少し大きくとる？
early_stopping = EarlyStopping(patience=5, verbose=1) 

history = model.fit_generator(RTG, epochs=500, verbose=1, validation_data=Validation_RTG,callbacks=[early_stopping])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 00019: early stopping


In [57]:
RTG.__getitem__(1)[0].shape

(128, 28, 14)

In [59]:
RTG.now_data.shape

(1969, 14)

# クロスバリデーションのテスト

In [0]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
CV_gen = kfold.split(np.arange(0,1000))

In [35]:
from keras.callbacks import EarlyStopping 

History = []

# 1000サンプルでクロスバリデーションテスト(kfold.splitの引数に、0～999が順に入った配列を代入)
for train_cv_idx, valid_cv_idx in kfold.split(np.arange(0,1000)):
    X_CV_train_gen = ReccurentTrainGenerator(DataPath=DATA_PATH, batch_size=128, InputSteps=28, sample_indices=train_cv_idx)
    X_CV_valid_gen = ReccurentTrainGenerator(DataPath=DATA_PATH, batch_size=128, InputSteps=28, sample_indices=valid_cv_idx)

    model = build_model() #カテゴリごとのモデルを作る時も、同様にfor文内で再度モデルをビルドすればよいかもしれない。
 
    # Early-stopping: patienceはもう少し大きくとる？
    early_stopping = EarlyStopping(patience=5, verbose=1) 

    history = model.fit_generator(X_CV_train_gen, epochs=500, verbose=1, validation_data=X_CV_valid_gen, callbacks=[early_stopping])
    History.append(history)

(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 00024: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 00007: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 00013: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 00015: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 00006: early stopping


# ハイパーパラメータチューニング用の関数

In [0]:
from sklearn.model_selection import KFold

DATA_PATH = "./drive/My Drive/kaggle/m5-forecasting/datas/training_datas_onehot/training_datas"

def objective(hyperparameters, datapath=DATA_PATH, num_samples=30490):
    """
    hyperparameters:
    LSTM units
    LSTM activation

    # ハイパーパラメータは、build_modelの引数に渡す。
    後ほど実装。(今は引数なし)

    """
    batch_size = 128
    epochs = 500

    
    kfold = KFold(n_splits=5)
    History = []

    for train_cv_idx, valid_cv_idx in kfold.split(np.arange(0, num_samples)):

        X_CV_train_gen = ReccurentTrainGenerator(DataPath=datapath, batch_size=128, InputSteps=28, sample_indices=train_cv_idx)
        X_CV_valid_gen = ReccurentTrainGenerator(DataPath=datapath, batch_size=128, InputSteps=28, sample_indices=valid_cv_idx)

        model = build_model() # 引数にハイパーパラメータを入れられるようにする

        early_stopping = EarlyStopping(patience=5, verbose=1) 

        history = model.fit_generator(X_CV_train_gen, epochs=epochs, verbose=1, validation_data=X_CV_valid_gen, callbacks=[early_stopping])
        History.append(history)

    scores = [History[i].history["val_loss"][-1] for i in range(len(History))]
    mean_score = np.mean(scores)

    return mean_score


In [42]:
mean_score = objective("hyperparameters", num_samples=1000)

(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 00010: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 00014: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 00010: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 00008: early stopping
(1969, 68)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 00006: early stopping
