In [31]:
import sys
import os
import pathlib
import gc
from datetime import datetime
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import numpy as np
import math
import random
import pickle
import time
import psutil
import warnings

# function fixing random seeds

In [16]:
def seed_everything(seed=0):
    """Seed the random number generators used in this notebook.

    Seeds both Python's built-in ``random`` module and NumPy's legacy
    global generator with the same value.

    Args:
        seed (int): seed value handed to each generator. Defaults to 0.
    """
    # Same order as before: stdlib generator first, then NumPy.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Seed value for this notebook; applied right away through seed_everything(),
# which seeds both `random` and NumPy.
SEED = 42
seed_everything(SEED)    

# constant variables for helper functions

In [17]:
# CPU count reported by psutil; run_df_in_multiprocess() uses it as the upper
# bound for the number of worker processes.
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N_CORES: {N_CORES}")

N_CORES: 16


#  constant variables for data import

In [18]:
# Change these variables according to the dataset you refer to.
# _DATA_DIR: directory (relative to the project root) holding the source pickle files.
# _OUTPUT_DIR: directory (relative to the project root) the exported CSV is written to.
# The commented-out line below points at the "sample" subdirectory instead.
# _DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic", "sample"])
_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])
_OUTPUT_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])
print(f"_DATA_DIR: {_DATA_DIR}")
# File names of the competition CSV files.
_CALENDAR_CSV_FILE = "calendar.csv"
_SAMPLE_SUBMISSION_CSV_FILE = "sample_submission.csv"
_SALES_TRAIN_VALIDATION_CSV_FILE = "sales_train_validation.csv"
_SELL_PRICES_CSV_FILE = "sell_prices.csv"

# File names of the feature pickles read by read_data_by_store().
# NOTE(review): "clearned" looks like a typo of "cleaned", but the string must match
# the name of the pickle written upstream - confirm before renaming.
BASE = "clearned_base_grid_for_darker_magic.pkl"
PRICE = "base_grid_with_sales_price_features_for_darker_magic.pkl"
CALENDAR = "base_grid_with_calendar_features_for_darker_magic.pkl"

LAGS = "base_grid_with_lag_features_for_28_days.pkl"
MEAN_ENC = "base_grid_with_mean_encoded_ids_means_stds_for_darker_magic.pkl"


# Alternative file names: feature pickles produced by the reference source.
# BASE = 'grid_part_1.pkl'
# PRICE = 'grid_part_2.pkl'
# CALENDAR = 'grid_part_3.pkl'
# LAGS = 'lags_df_28.pkl'
# MEAN_ENC = 'mean_encoding_df.pkl'

_DATA_DIR: data/M5_Three_shades_of_Dark_Darker_magic


# model hyperparameters and constant variables for training and test

In [19]:
VER = 1                          # Our model version (also embedded in the exported CSV file name)
SEED = 42                        # We want all things to be as deterministic as possible
# Re-seed here so this cell gives the same RNG state even when run on its own.
seed_everything(SEED)

# LIMITS and constants
TARGET      = 'sales'            # Our target column name
START_DAY_TRAIN = 0                  # We can skip some rows (NaNs / faster training)
END_DAY_TRAIN   = 1913               # End day of our train set
PREDICTION_HORIZON_DAYS = 28                 # Prediction horizon
# Whether to use pretrained models: set this to True after completing model training.
# USE_AUX = True  
USE_AUX = False

# FEATURES to remove.
# These features lead to overfitting or have values not present in the test set.
# read_data_by_store() excludes them when it builds the feature list.
REMOVE_FEATURES = ['id','state_id','store_id', 'date','wm_yr_wk','d',TARGET]
# Mean/std encoding columns selected from the MEAN_ENC pickle.
MEAN_STD_FEATURES   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std'] 

# AUX (pretrained) models path; get_base_test() reads per-store test pickles from here.
PRETRAINED_MODEL_DIR = 'trained_model'

# SPLITS for lags creation
SHIFT_DAYS  = 28
N_LAGS     = 15
# Lag offsets SHIFT_DAYS .. SHIFT_DAYS + N_LAGS - 1, i.e. 28..42 with the values above.
LAGS_SPLIT = [col for col in range(SHIFT_DAYS, SHIFT_DAYS + N_LAGS)]
# Left empty in this notebook.
ROLLING_SPLIT = []

# function nicely displaying a head of Pandas DataFrame

In [20]:
import IPython

def display(*dfs, head=True):
    """Show each given DataFrame through IPython's rich display.

    Args:
        *dfs: objects to show; each must offer ``.head()`` when ``head`` is true.
        head (bool): when true (default) only ``df.head()`` is shown,
            otherwise the object is shown as-is.
    """
    for frame in dfs:
        if head:
            shown = frame.head()
        else:
            shown = frame
        IPython.display.display(shown)

# function processing df in multiprocess

In [21]:
def run_df_in_multiprocess(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in worker processes.

    The per-element results are joined column-wise with ``pd.concat(axis=1)``
    in the order of ``t_split``.

    Args:
        func: callable taking one element of ``t_split`` and returning a
            pandas object accepted by ``pd.concat``. It is sent to worker
            processes by ``Pool.map``, so it must be picklable.
        t_split: sized sequence of work items, one task per element.

    Returns:
        pandas.DataFrame: the column-wise concatenation of all results, or an
        empty DataFrame when ``t_split`` is empty.
    """
    # Nothing to process: Pool(0) and pd.concat([]) would both raise ValueError.
    if len(t_split) == 0:
        return pd.DataFrame()

    # Imported locally because the notebook's import cell never brings Pool
    # into scope (the original body raised NameError when called).
    from multiprocessing import Pool

    # Never start more workers than there are tasks.
    num_cores = min(N_CORES, len(t_split))
    print(f"num_cores: {num_cores}")

    pool = Pool(num_cores)
    try:
        results = pool.map(func, t_split)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()
    return pd.concat(results, axis=1)

# other helper functions

In [22]:
def get_memory_usage():
    """Simple "memory profiler" for checking the current memory usage.

    Returns:
        The first field of ``psutil``'s ``memory_info()`` for the current
        process, divided by 2**30 and rounded to two decimals.
    """
    this_process = psutil.Process(os.getpid())
    used_bytes = this_process.memory_info()[0]
    return np.round(used_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024-based) prefixes.

    Args:
        num: size to format (e.g. a byte count).
        suffix (str): unit suffix appended after the prefix. Defaults to 'B'.

    Returns:
        str: ``num`` scaled down by 1024 until its magnitude is below 1024,
        with one decimal place, followed by the prefix and ``suffix``
        (e.g. ``"1.5KiB"``). Values too large for 'Zi' are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    # Ran out of prefixes: report the remainder in 'Yi'.
    return "%.1f%s%s" % (num, 'Yi', suffix)


def merge_by_concat(df1, df2, merge_on):
    """Join ``df2`` onto ``df1`` by concatenation so that ``df1`` keeps its dtypes.

    Only the key columns of ``df1`` go through the left merge with ``df2``;
    the columns the merge adds are then appended to ``df1`` with
    ``pd.concat(axis=1)``.

    Args:
        df1: left DataFrame; its own columns are passed through unchanged.
        df2: right DataFrame providing the additional columns.
        merge_on: list of key column names present in both frames.

    Returns:
        pandas.DataFrame: ``df1`` followed by the non-key columns of the merge.
    """
    key_frame = df1[merge_on]
    joined = key_frame.merge(df2, on=merge_on, how='left')
    # Everything the merge produced that is not a key column.
    added_columns = [name for name in joined.columns if name not in merge_on]
    return pd.concat([df1, joined[added_columns]], axis=1)


def get_base_test():
    """Recombine the per-store test sets after training.

    For every id in ``STORE_IDS`` the pickle
    ``<PRETRAINED_MODEL_DIR>/test_dataset_<store_id>.pkl`` is read, a
    ``store_id`` column is set to that id, and all frames are stacked
    row-wise with a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame: the stacked test set, or an empty DataFrame when
        ``STORE_IDS`` is empty.
    """
    # Collect first and concatenate once: growing a DataFrame with pd.concat
    # inside the loop copies all earlier rows on every iteration.
    per_store_frames = []
    for store_id in STORE_IDS:
        test_pkl_path = os.path.sep.join([PRETRAINED_MODEL_DIR, 'test_dataset_'+store_id+'.pkl'])
        # NOTE: read_pickle unpickles arbitrary objects; only load trusted files.
        temp_df = pd.read_pickle(test_pkl_path)
        temp_df['store_id'] = store_id
        per_store_frames.append(temp_df)

    if not per_store_frames:
        return pd.DataFrame()

    return pd.concat(per_store_frames).reset_index(drop=True)



##### Helper to make dynamic rolling lags #####
def make_lag(lag_day):
    """Build one lag feature column from the global ``base_test`` frame.

    Args:
        lag_day (int): number of rows to shift ``TARGET`` by within each ``id`` group.

    Returns:
        pandas.DataFrame: a single float16 column named ``sales_lag_<lag_day>``,
        indexed like ``base_test``.
    """
    # NOTE(review): relies on a module-level `base_test` with 'id' and TARGET
    # columns; presumably the result of get_base_test() - confirm at the call site.
    col_name = 'sales_lag_'+str(lag_day)
    # Compute the Series directly instead of assigning a new column onto a
    # slice of base_test, which triggered SettingWithCopyWarning.
    lagged = base_test.groupby(['id'])[TARGET].shift(lag_day).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(lag_day):
    """Build one shifted rolling-mean feature column from the global ``base_test`` frame.

    Args:
        lag_day: indexable pair ``(shift_day, roll_wind)``: the shift applied to
            ``TARGET`` within each ``id`` group, then the rolling window size.

    Returns:
        pandas.DataFrame: a single column named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>``, indexed like ``base_test``.
    """
    # NOTE(review): relies on a module-level `base_test` with 'id' and TARGET
    # columns; presumably the result of get_base_test() - confirm at the call site.
    shift_day = lag_day[0]
    roll_wind = lag_day[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute the Series directly instead of assigning a new column onto a
    # slice of base_test, which triggered SettingWithCopyWarning.
    # The lambda keeps both the shift and the rolling window inside each id group.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)
##### Helper to make dynamic rolling lags #####

# function importing data

In [23]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to reduce its memory usage.

    https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754

    Each int16/int32/int64/float16/float32/float64 column is converted to the
    first candidate dtype whose range strictly contains the column's min and
    max. Other columns are left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: when true, print the resulting memory usage and the reduction.

    Returns:
        The same ``df`` object, with downcast columns.

    Raises:
        None
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    # Process column by column.
    for col in df.columns:
        col_type = df[col].dtypes
        # Only columns whose dtype is one of the listed numeric types are touched.
        if col_type not in numerics:
            continue

        # Choose a smaller dtype based on the column's min and max.
        c_min, c_max = df[col].min(), df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits: keep the column as float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def read_csv_data(directory, file_name):
    """Read one CSV file from the data directory and downcast its numeric columns.

    Args:
        directory: base directory; ``_DATA_DIR`` and ``file_name`` are appended to it.
        file_name: name of the CSV file to read.

    Returns:
        pandas.DataFrame: the file's contents after ``reduce_mem_usage``.
    """
    print('Reading files...')
    csv_path = os.path.sep.join([str(directory), _DATA_DIR, file_name])
    df = reduce_mem_usage(pd.read_csv(csv_path))
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('{} has {} rows and {} columns'.format(file_name, n_rows, n_cols))

    return df


def read_data_by_store(store):
    """Assemble the feature grid for a single store from the feature pickles.

    The BASE, PRICE and CALENDAR pickles are read from
    ``<two levels above the current directory>/_DATA_DIR`` and concatenated
    column-wise, then filtered to rows whose ``store_id`` equals ``store``.
    The ``MEAN_STD_FEATURES`` columns of MEAN_ENC and the LAGS columns are
    appended for the same row index. Every call re-reads all pickles.

    Args:
        store: value compared against the ``store_id`` column.

    Returns:
        tuple: ``(df, features)``. ``df`` has the columns
        ``['id', 'd', TARGET] + features``, only rows with
        ``d >= START_DAY_TRAIN``, and a reset index. ``features`` is the list
        of column names not in ``REMOVE_FEATURES``.
    """

    # Earlier variant that read the pickles relative to the working directory:
#     # Read and concat basic feature
#     df = pd.concat([pd.read_pickle(BASE),
#                     pd.read_pickle(PRICE).iloc[:,2:],
#                     pd.read_pickle(CALENDAR).iloc[:,2:]],
#                     axis=1)

    # Read and concat the basic features.
    # NOTE(review): .iloc[:, 2:] presumably drops key columns already present
    # in BASE - confirm against the pickles.
    parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
    df = pd.concat([pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, BASE])),
                    pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, PRICE])).iloc[:,2:],
                    pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, CALENDAR])).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read lags and mean encoding features separately and drop items that we don't need.
    # As our feature grids are aligned (same row index in every pickle),
    # we can use the index to keep only the necessary rows.
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, MEAN_ENC]))[MEAN_STD_FEATURES]
    df2 = df2[df2.index.isin(df.index)]
    
    # NOTE(review): .iloc[:, 3:] presumably skips leading key/target columns - confirm.
    df3 = pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, LAGS])).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit 
    
    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit 
    
    # Create features list
    features = [col for col in list(df) if col not in REMOVE_FEATURES]
    # Put the key columns and the target first, followed by the features.
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_DAY_TRAIN].reset_index(drop=True)
    
    return df, features

# import the feature data from the pickle files (one grid per store)

In [24]:
STORE_IDS = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']

# Still defined (empty) as before; nothing in this cell fills it.
all_stores_wo_store_id_df = pd.DataFrame()

# Collect the per-store grids and concatenate once after the loop. Calling
# pd.concat inside the loop copies the growing frame on every iteration.
store_grids = []

for store_id in STORE_IDS:
    # Get grid for current store
    grid_df, features_columns = read_data_by_store(store_id)
    # todo: when using aws forecast per store_id SEPARATELY, add store_id to the dataframe
    #     grid_df["store_id"] = store_id

    print(f"features_columns: {features_columns}")

    store_grids.append(grid_df)

# Row-wise stack; as before, each store's 0..n-1 index is kept (not reset).
all_stores_df = pd.concat(store_grids, axis=0)
# Drop the extra references so the per-store frames can be freed.
del store_grids


features_columns: ['item_id', 'dept_id', 'cat_id', 'release', 'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm', 'price_nunique', 'item_nunique', 'price_momentum', 'price_momentum_m', 'price_momentum_y', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end', 'enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std', 'enc_item_id_mean', 'enc_item_id_std', 'sales_lag_28', 'sales_lag_29', 'sales_lag_30', 'sales_lag_31', 'sales_lag_32', 'sales_lag_33', 'sales_lag_34', 'sales_lag_35', 'sales_lag_36', 'sales_lag_37', 'sales_lag_38', 'sales_lag_39', 'sales_lag_40', 'sales_lag_41', 'sales_lag_42', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14', 'rolling_mean_30', 'rolling_std_30', 'rolling_mean_60', 'rolling_std_60', 'rolling_mean_180', 'rolling_std_180', 'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14', 'rolling_mean_tmp_

In [32]:
# Quick check of the combined grid: column dtypes first, then the frame itself.
print(f"all_stores_df.dtypes: {all_stores_df.dtypes}")
print(f"all_stores_df: {all_stores_df}")

all_stores_df.dtypes: id                        category
d                            int16
sales                      float64
item_id                   category
dept_id                   category
cat_id                    category
release                      int16
sell_price                 float16
price_max                  float16
price_min                  float16
price_std                  float16
price_mean                 float16
price_norm                 float16
price_nunique              float16
item_nunique                 int16
price_momentum             float16
price_momentum_m           float16
price_momentum_y           float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                          int8
tm_w                          int8
tm_m                          int

In [26]:
# todo: add timestamp for aws forecast

# export result to the local

In [33]:
# Project root: two levels above the current working directory.
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent

# Disabled: submission-building code kept from the forecasting notebook.
# # Reading competition sample submission and merging our predictions
# # As we have predictions only for "_validation" data we need to do fillna() for "_evaluation" items
# submission_df = read_csv_data(parent_dir, _SAMPLE_SUBMISSION_CSV_FILE)
# submission_ids_df = submission_df[["id"]]
# display(submission_ids_df)
# my_submission_df = submission_ids_df.merge(all_preds, on=['id'], how='left').fillna(0)

# The export file name carries the model version; the S3 upload cell reuses it.
_EXPORT_FILE_NAME = 'aws_forecast_data_v'+str(VER)+'.csv'
print(f"csv data export start: {datetime.now()}")
# Write the combined grid under _OUTPUT_DIR without the index column.
all_stores_df.to_csv(os.path.sep.join([str(parent_dir), _OUTPUT_DIR, _EXPORT_FILE_NAME]), index=False)
print('csv data export finished. Size:', all_stores_df.shape)
print(f"csv data export end: {datetime.now()}")




csv data export start: 2020-06-27 03:37:21.299022
csv data export finished. Size: (46881677, 75)
csv data export end: 2020-06-27 04:05:36.341850


# export result to S3

In [34]:
import sagemaker
# import boto3

# Project root: two levels above the current working directory.
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
# The CSV was written under _OUTPUT_DIR by the export cell, so the upload path
# must be built from _OUTPUT_DIR as well. Using _DATA_DIR here only worked
# while both constants happened to hold the same value.
local_path = os.path.sep.join([str(parent_dir), _OUTPUT_DIR, _EXPORT_FILE_NAME])
print(f"local_path: {local_path}")

# Fail early with a clear message instead of a less obvious upload error.
if not os.path.isfile(local_path):
    raise FileNotFoundError(f"exported csv not found: {local_path}")

# role = sagemaker.get_execution_role()
bucket='sagemaker-m5-forecasting-okada'
data_directory = 'accuracy/aws_forecast/'
data_path = data_directory
data_location = 's3://{}/{}'.format(bucket, data_path)
print(f"data_location: {data_location}")

# https://stackoverflow.com/questions/56799763/uploading-a-dataframe-to-aws-s3-bucket-from-sagemaker
print("file export start")
sagemaker.s3.S3Uploader.upload(
    local_path=local_path,
    desired_s3_uri=data_location
)
# Lists the objects under data_location; the returned listing is not shown
# because this is not the last expression of the cell.
sagemaker.s3.S3Downloader.list(data_location)
print("file export finished")

# csv_s3_uri = sagemaker.s3.S3Uploader.upload_string_as_file_body(
#     body=all_stores_df.to_csv(index=False),
#     desired_s3_uri=data_location
# )
# print("file export start")
# sagemaker.s3.S3Downloader.list(csv_s3_uri)
# print("file export finished")

local_path: /home/ec2-user/SageMaker/data/M5_Three_shades_of_Dark_Darker_magic/aws_forecast_data_v1.csv
data_location: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/
file export start
file export finished
