In [1]:
import s3fs
import numpy as np
import pandas as pd
import sys
import os
import pathlib
import gc

# function for nicely displaying the head of a pandas DataFrame

In [2]:
import IPython

def display(*dfs, head=True):
    """Render each given frame with IPython's rich display.

    Args:
        *dfs: Objects to show; each must provide ``.head()`` when ``head``
            is true.
        head: When true (the default) only ``df.head()`` is rendered,
            otherwise the object itself is rendered.
    """
    for frame in dfs:
        if head:
            shown = frame.head()
        else:
            shown = frame
        IPython.display.display(shown)

# parameters

In [3]:
# S3 location of the exported forecast result parts.
bucket_name = 'sagemaker-m5-forecasting-okada'
path = 'accuracy/aws_forecast/result'

# Alternative run (custom domain); swap with the two assignments below to use it.
# prefix = "m5_accuracy_base_renamed_custom_domain_df_result_2020-08-01T06-33-26Z_part"
# _EXPORT_FILE_NAME = 'submission_aws_forecast_custom_domain_validation.csv'

# Active run (retail domain): common file-name prefix of the result parts and
# the name of the submission file written at the end of the notebook.
prefix = "m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part"
_EXPORT_FILE_NAME = 'submission_aws_forecast_retail_domain_validation.csv'

# functions for importing data

In [4]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce the memory usage of the given dataframe by downcasting its
    numeric columns to the smallest dtype whose range holds their values.
    Adapted from https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched; every other column is left as is. The dataframe is
    modified in place and also returned.

    Note that float columns may be downcast to float16, which keeps only
    about three significant decimal digits, so stored values can be rounded.

    Args:
        df: Dataframe whose numeric columns are downcast in place.
        verbose: When True, print the final memory usage and the percentage
            saved.

    Returns:
        df, whose memory usage is reduced (the same object that was passed in).

    Raises:
        None
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, smallest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:  # process column by column
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # Pick the most compact dtype based on the column's min and max.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max is NaN for an empty column): keep the dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN min/max.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting frame to avoid dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def read_csv_data(directory, file_name):
    """
    Read ``<directory>/<_DATA_DIR>/<file_name>`` as CSV and shrink its dtypes.

    The module-level ``_DATA_DIR`` is looked up when the function is called,
    so it has to be defined before the first call.

    Args:
        directory: Base directory; converted with ``str()`` before joining.
        file_name: Name of the CSV file inside the data directory.

    Returns:
        The loaded dataframe after ``reduce_mem_usage`` has been applied.
    """
    print('Reading files...')
    csv_path = os.path.sep.join([str(directory), _DATA_DIR, file_name])
    df = reduce_mem_usage(pd.read_csv(csv_path))
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('{} has {} rows and {} columns'.format(file_name, n_rows, n_cols))

    return df


# other helper functions

In [5]:
def merge_by_concat(df1, df2, merge_on):
    """
    Left-join df2 onto df1 via concatenation so that df1's dtypes are not lost.

    Only the key columns of df1 are merged with df2; the columns this adds
    are then concatenated next to the untouched df1.

    Args:
        df1: Left dataframe; its columns are carried over unchanged.
        df2: Right dataframe providing the additional columns.
        merge_on: List of key column names present in both dataframes.

    Returns:
        A new dataframe holding df1's columns followed by the non-key
        columns of df2.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
    # merge() hands back a fresh RangeIndex. When the left join kept exactly one
    # row per df1 row, restore df1's index so the column-wise concat below lines
    # rows up positionally instead of producing a misaligned union of labels.
    if len(merged_gf) == len(df1):
        merged_gf.index = df1.index
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1

# import forecast result from s3

In [6]:
# https://stackoverflow.com/questions/37703634/how-to-import-a-text-file-on-aws-s3-into-pandas-without-writing-to-disk

num_result_files = 28

# Read every result part, keep only the columns needed downstream, and
# concatenate once at the end. Concatenating inside the loop would re-copy the
# growing frame on every iteration.
part_frames = []
for i in range(num_result_files):
    file_name = prefix + str(i) + ".csv"
    url = 's3://{}/{}/{}'.format(bucket_name, path, file_name)
    print(f"url: {url}")
    df = pd.read_csv(url)
    df = df[["item_id", "p50", "date"]]
    part_frames.append(df)

# The per-part row index is kept as is (no ignore_index), so index labels
# repeat across parts.
result_df = pd.concat(part_frames, axis=0)
result_df = result_df.rename(columns={'item_id': 'id'})

display(result_df)
print(len(result_df))


url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part0.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part1.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part2.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part3.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part4.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part5.csv
url: s3://sagemaker-m5-forecasting-okada/accuracy/aws_forecast/result/m5_accuracy_base_renamed_retail_domain_df_2020-08-02T06-45-17Z_part6.csv

Unnamed: 0,id,p50,date
0,foods_3_721_tx_2_validation,0.0,2016-05-23T00:00:00Z
1,foods_3_721_tx_2_validation,0.0,2016-05-24T00:00:00Z
2,foods_3_721_tx_2_validation,0.0,2016-05-25T00:00:00Z
3,foods_3_721_tx_2_validation,0.0,2016-05-26T00:00:00Z
4,foods_3_721_tx_2_validation,0.0,2016-05-27T00:00:00Z


853720


In [7]:
# Sanity check: print every forecast row of a single series.
print(result_df.loc[result_df["id"] == "foods_3_721_tx_2_validation"])

                             id  p50                  date
0   foods_3_721_tx_2_validation  0.0  2016-05-23T00:00:00Z
1   foods_3_721_tx_2_validation  0.0  2016-05-24T00:00:00Z
2   foods_3_721_tx_2_validation  0.0  2016-05-25T00:00:00Z
3   foods_3_721_tx_2_validation  0.0  2016-05-26T00:00:00Z
4   foods_3_721_tx_2_validation  0.0  2016-05-27T00:00:00Z
5   foods_3_721_tx_2_validation  0.0  2016-05-28T00:00:00Z
6   foods_3_721_tx_2_validation  0.0  2016-05-29T00:00:00Z
7   foods_3_721_tx_2_validation  0.0  2016-05-30T00:00:00Z
8   foods_3_721_tx_2_validation  0.0  2016-05-31T00:00:00Z
9   foods_3_721_tx_2_validation  0.0  2016-06-01T00:00:00Z
10  foods_3_721_tx_2_validation  0.0  2016-06-02T00:00:00Z
11  foods_3_721_tx_2_validation  0.0  2016-06-03T00:00:00Z
12  foods_3_721_tx_2_validation  0.0  2016-06-04T00:00:00Z
13  foods_3_721_tx_2_validation  0.0  2016-06-05T00:00:00Z
14  foods_3_721_tx_2_validation  0.0  2016-06-06T00:00:00Z
15  foods_3_721_tx_2_validation  0.0  2016-06-07T00:00:0

In [8]:
id_list = result_df['id'].unique()
print(id_list)
print(len(id_list))

num_prediction_days = 28
column_list = ["F" + str(i) for i in range (1, num_prediction_days + 1)]
print(column_list)

# Timestamps of the forecast horizon, one per F-column, formatted exactly like
# the strings in result_df["date"] (e.g. "2016-05-23T00:00:00Z"). They are
# derived from num_prediction_days so the two can never get out of sync.
first_prediction_date = "2016-05-23"
date_list = (
    pd.date_range(first_prediction_date, periods=num_prediction_days, freq="D")
    .strftime("%Y-%m-%dT%H:%M:%SZ")
    .tolist()
)

output_df = pd.DataFrame(id_list ,columns = ["id"])

# Long -> wide: for each day, take that day's p50 per id and attach it as
# column F<n>. The inner join drops any id that has no row for that day.
for day, date in zip(column_list, date_list):
    one_day_preduction = result_df[result_df["date"] == date]
    one_day_preduction = one_day_preduction.rename(columns={'p50': day})
    output_df = pd.merge(output_df, one_day_preduction[["id", day]], on="id", how="inner")

print(output_df)
print(f"output_df.shape: {output_df.shape}")

['foods_3_721_tx_2_validation' 'household_2_225_ca_1_validation'
 'foods_1_169_ca_4_validation' ... 'hobbies_2_079_ca_3_validation'
 'hobbies_2_140_wi_3_validation' 'foods_3_537_wi_2_validation']
30490
['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']
                                    id   F1   F2   F3   F4   F5   F6   F7  \
0          foods_3_721_tx_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1      household_2_225_ca_1_validation  2.0  2.0  1.0  1.0  2.0  2.0  2.0   
2          foods_1_169_ca_4_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3        hobbies_2_066_wi_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4      household_1_137_wi_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                                ...  ...  ...  ...  ...  ...  ...  ...   
30485      foods_3_756_tx_3_validation  7.0  6.0  6.0  7.0  7.0  9.0  8

In [9]:
# Upper-case the ids, then turn "VALIDATION" back into lower-case "validation",
# giving ids such as FOODS_3_721_TX_2_validation.
output_df["id"] = output_df["id"].str.upper().str.replace("VALIDATION", "validation")
print(output_df)

                                    id   F1   F2   F3   F4   F5   F6   F7  \
0          FOODS_3_721_TX_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1      HOUSEHOLD_2_225_CA_1_validation  2.0  2.0  1.0  1.0  2.0  2.0  2.0   
2          FOODS_1_169_CA_4_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3        HOBBIES_2_066_WI_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4      HOUSEHOLD_1_137_WI_2_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                                ...  ...  ...  ...  ...  ...  ...  ...   
30485      FOODS_3_756_TX_3_validation  7.0  6.0  6.0  7.0  7.0  9.0  8.0   
30486  HOUSEHOLD_2_069_WI_1_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
30487    HOBBIES_2_079_CA_3_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
30488    HOBBIES_2_140_WI_3_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
30489      FOODS_3_537_WI_2_validation  1.0  1.0  1.0  0.0  1.0  1.0  1.0   

        F8   F9  ...  F19  F20  F21  F22  F23  F24  F25  F26  F27  F28  
0 

# file export 

In [10]:
# Directory two levels above the current working directory.
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
print(f"parent_dir: {parent_dir}")
_SAMPLE_SUBMISSION_CSV_FILE = "sample_submission.csv"
_OUTPUT_DIR = os.path.sep.join(["data", "aws_forecast"])
# Also read by read_csv_data(), so it must be defined before the call below.
_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])

# Reading competition sample submission and merging our predictions
submission_df = read_csv_data(parent_dir, _SAMPLE_SUBMISSION_CSV_FILE)
submission_ids_df = submission_df[["id"]]
# Left join keeps every id of the sample submission; ids without a forecast in
# output_df get 0 in every F-column.
my_submission_df = submission_ids_df.merge(output_df, on=['id'], how='left').fillna(0)
print(f"my_submission_df: {my_submission_df}")

print("csv data export start")
export_dir = os.path.sep.join([str(parent_dir), _OUTPUT_DIR])
# Create the output directory if needed so to_csv does not fail on a fresh checkout.
os.makedirs(export_dir, exist_ok=True)
my_submission_df.to_csv(os.path.sep.join([export_dir, _EXPORT_FILE_NAME]), index=False)
print('csv data export finished. Size:', my_submission_df.shape)

parent_dir: /home/ec2-user/SageMaker
Reading files...
Mem. usage decreased to  2.09 Mb (84.5% reduction)
sample_submission.csv has 60980 rows and 29 columns
my_submission_df:                                   id   F1   F2   F3   F4   F5   F6   F7   F8  \
0      HOBBIES_1_001_CA_1_validation  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0   
1      HOBBIES_1_002_CA_1_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2      HOBBIES_1_003_CA_1_validation  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3      HOBBIES_1_004_CA_1_validation  1.0  1.0  1.0  1.0  1.0  2.0  3.0  1.0   
4      HOBBIES_1_005_CA_1_validation  0.0  1.0  1.0  1.0  1.0  1.0  2.0  0.0   
...                              ...  ...  ...  ...  ...  ...  ...  ...  ...   
60975    FOODS_3_823_WI_3_evaluation  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
60976    FOODS_3_824_WI_3_evaluation  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
60977    FOODS_3_825_WI_3_evaluation  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
60978    FOODS_3_826_WI_3