In [1]:
import numpy as np
import pandas as pd
import datetime
import io
import boto3
import pyarrow.parquet as pq
import s3fs
import math
import yaml

In [2]:
# realized sales
def to_uri(bucket, key):
    """
    Build the S3 URI of an object from its bucket name and key
    :param bucket: (string) name of the S3 bucket
    :param key: (string) S3 key
    :return: (string) URI of the form s3://<bucket>/<key>
    """
    uri = f's3://{bucket}/{key}'
    return uri

def read_multipart_parquet_s3(bucket, dir_path, prefix_filename='part-'):
    """
    Load a multipart (split) parquet dataset hosted on a S3 bucket and return it as one pandas dataframe.
    NOTE(review): <prefix_filename> is part of the signature but is never used by the body below, so no
    file-name filtering is done here; the folder URI is handed as-is to pyarrow's ParquetDataset.
    :param bucket: (string) S3 source bucket
    :param dir_path: (string) full path to the folder that contains all parts within this S3 bucket
    :param prefix_filename: (string) Optional. Default is 'part-'. Currently ignored (see note above)
    :return: (pandas DataFrame) data loaded
    """
    s3 = s3fs.S3FileSystem()
    location = to_uri(bucket, dir_path)

    dataset = pq.ParquetDataset(location, filesystem=s3)
    table = dataset.read()

    return table.to_pandas(date_as_object=False)

def keys(bucket_name, prefix='/', delimiter='/'):
    """
    Iterate over the keys of the objects of a S3 bucket that match a prefix
    :param bucket_name: (string) name of the S3 bucket
    :param prefix: (string) Optional. Key prefix passed to the bucket filter. When it starts with <delimiter>,
    its first character is removed beforehand. Default is '/', which is therefore reduced to an empty prefix
    :param delimiter: (string) Optional. Leading marker looked for at the start of <prefix>. Default is '/'
    :return: (generator of string) key of every object returned by the filter
    """
    if prefix.startswith(delimiter):
        # Only one character is dropped, whatever the length of the delimiter
        prefix = prefix[1:]

    s3_bucket = boto3.resource('s3').Bucket(bucket_name)
    matching_objects = s3_bucket.objects.filter(Prefix=prefix)
    return (s3_object.key for s3_object in matching_objects)

In [3]:
def date_to_week_id(date):
    """
    Convert a date, or a pandas column of dates, into the Decathlon week id: an integer YYYYWW made of the
    ISO year followed by the two-digit ISO week number. A Sunday is counted with the week of the next Monday.
    :param date: (str, pd.Timestamp, datetime.date or pd.Series) the date or pandas column of dates
    :return: (int or pd.Series) the week id or pandas column of week ids (named 'week_id')
    """
    assert isinstance(date, (str, pd.Timestamp, pd.Series, datetime.date))

    one_day = pd.Timedelta(1, unit='D')

    def iso_week_id(day):
        # isocalendar() gives (ISO year, ISO week, ISO weekday); year * 100 + week is the YYYYWW integer
        iso_year, iso_week = day.isocalendar()[:2]
        return int(iso_year) * 100 + int(iso_week)

    if isinstance(date, pd.Series):
        days = pd.to_datetime(date)
        # dayofweek == 6 is Sunday: move those rows to the next Monday to land in the right ISO week
        sundays = days.dt.dayofweek == 6
        shifted = days.mask(sundays, days + one_day)
        return shifted.apply(iso_week_id).rename('week_id')

    day = pd.Timestamp(date)
    if day.dayofweek == 6:  # Sunday, same shift as for the column case
        day = day + one_day
    return iso_week_id(day)

In [4]:
def read_yml(file_path):
    """
    Read a yaml file, stored locally or on S3, and return its content as a python dictionary
    :param file_path: (string or path-like) full path to the yaml file. A path starting with the 's3://' or
    's3a://' scheme is opened through s3fs, any other path is opened as a local file
    :return: (dict) data loaded, i.e. whatever yaml.safe_load parses from the file
    """
    # Look at the whole URI scheme instead of the first two characters: a local file whose name merely
    # starts with "s3" (e.g. "s3_config.yaml") must not be sent to S3.
    location = str(file_path)

    if location.startswith(('s3://', 's3a://')):
        with s3fs.S3FileSystem().open(location, 'r') as f:
            return yaml.safe_load(f)

    with open(file_path) as f:
        return yaml.safe_load(f)

Create holiday calendar dataframe

In [5]:
# Daily calendar: one row per day from 2016-01-01 to 2022-12-31, each tagged with its week id
ts = pd.DataFrame({'date': pd.date_range(start='2016-01-01', end='2022-12-31')})
# date_to_week_id accepts a whole column, so the per-row lambda is not needed
ts['week_id'] = date_to_week_id(ts['date'])

In [6]:
# Load the holiday configuration from cn_holidays.yaml (path relative to the working directory).
# The next cell reads it as cn_holidays[<year>]['holiday'] and cn_holidays[<year>]['tradeoff'].
cn_holidays = read_yml('cn_holidays.yaml')

In [7]:
# Years of the holiday configuration that are loaded.
# NOTE(review): the calendar in `ts` runs until 2022-12-31 but 2022 is not listed here, so no weekday
# holiday / tradeoff day is taken into account for 2022 - confirm whether cn_holidays has a 2022 entry.
HOLIDAY_YEARS = [2016, 2017, 2018, 2019, 2020, 2021]

holidays = []
tradeoff = []
for year in HOLIDAY_YEARS:
    holidays += cn_holidays[year]['holiday']
    tradeoff += cn_holidays[year]['tradeoff']

# Lookup sets of midnight pd.Timestamp values. The raw YAML entries are presumably datetime.date objects
# or date strings (verify against cn_holidays.yaml); testing `Timestamp in <list of raw values>` depends on
# how pandas compares a Timestamp with those types (a Timestamp and a datetime.date are never equal since
# pandas 2.0) and scans the whole list for every day of the calendar.
holiday_days = set(pd.to_datetime(holidays).normalize())
tradeoff_days = set(pd.to_datetime(tradeoff).normalize())


def if_holiday(date):
    """
    Label a calendar day for the weekly holiday counts
    :param date: (pd.Timestamp, datetime.date or str) the day to label; any time of day is ignored
    :return: (str or float) 'num_holiday_weekend' for a Saturday or Sunday that is not a tradeoff day
    (presumably a make-up working day, confirm with the yaml), 'num_holiday_weekday' for a day listed as
    holiday, np.nan for any other day
    """
    day = pd.Timestamp(date).normalize()
    if day.weekday() >= 5 and day not in tradeoff_days:
        return 'num_holiday_weekend'
    if day in holiday_days:
        return 'num_holiday_weekday'
    return np.nan


ts['if_holiday'] = ts['date'].apply(if_holiday)

#### Option 1: count holidays per week, weekdays and weekends separately

In [8]:
# One row per week and one column per holiday label, holding the number of days with that label;
# a week without any day of a given label gets 0.
calendar_gap = (
    ts.pivot_table(index='week_id', columns='if_holiday', values='date', aggfunc='count')
    .reset_index()
    .loc[:, ['week_id', 'num_holiday_weekend', 'num_holiday_weekday']]
    .fillna(0)
)

In [9]:
# Preview the first two weekly rows of the Option 1 table
calendar_gap.head(2)

if_holiday,week_id,num_holiday_weekend,num_holiday_weekday
0,201553,1.0,1.0
1,201601,2.0,0.0


In [10]:
# save the result
# NOTE(review): the "Option 2" section further down writes to this very same path, so running the whole
# notebook leaves only the Option 2 table on disk.
calendar_gap.to_parquet('additional_datasource/calendar_gap.parquet')

#### Option 2: calculate holidays per week together

In [89]:
# Number of labelled days per week, weekend and weekday holidays pooled together.
# count() ignores the NaN labels (days that are not holidays) and always returns integers, so there is
# nothing left to fill afterwards.
calendar_gap = (
    ts.groupby('week_id')['if_holiday']
    .count()
    .reset_index(name='num_holidays')
)

In [90]:
# save the result
# NOTE(review): same path as the save of the "Option 1" section above, whose file is overwritten here;
# the `calendar_gap` name is also reused for a table with different columns.
calendar_gap.to_parquet('additional_datasource/calendar_gap.parquet')

In [91]:
# Preview the first two weekly rows of the Option 2 table
calendar_gap.head(2)

Unnamed: 0,week_id,num_holidays
0,201553,2
1,201601,2
