In [None]:
%load_ext autoreload
%autoreload 2

import json
import s3fs
import pandas as pd
import os
import sys

sys.path.insert(0,'..')

import src.config as cf
import src.utils as ut

# Send outbound HTTP and HTTPS traffic through the proxy below. The addresses
# listed in no_proxy (the link-local 169.254.169.254 address and localhost)
# are contacted directly instead.
os.environ.update({
    'HTTP_PROXY': "http://proxy-internet-aws-eu.subsidia.org:3128",
    'HTTPS_PROXY': "http://proxy-internet-aws-eu.subsidia.org:3128",
    'no_proxy': "169.254.169.254,127.0.0.1",
})

# Params

In [None]:
# Selects the configuration file loaded in the next cell: "prod" picks
# ../conf/prod.yml, any other value picks ../conf/dev.yml.
RUN_ENV = "dev"
# pandas frequency alias handed to `asfreq` when each series is put on a
# regular grid ('W' is the weekly alias).
freq = 'W'
# Number of trailing observations removed from each series to build the
# training set.
prediction_length = 16
# Week identifier of the cutoff (presumably YYYYWW - TODO confirm); it selects
# the input parquet and names the output folder on S3.
cutoff_week_id = 201922

In [None]:
# Environment-specific settings: only "prod" loads the production file, every
# other value of RUN_ENV falls back to the dev file.
if RUN_ENV == "prod":
    config_file = "../conf/prod.yml"
else:
    config_file = "../conf/dev.yml"
config = cf.ProgramConfiguration(config_file, "../conf/functional.yml")

role = config.get_global_role_arn()

bucket = config.get_train_bucket_input()
prefix = "test_deepAR"  # Your S3 test dir

# S3 locations, laid out as <bucket>/<prefix>/data and <bucket>/<prefix>/output.
s3_data_path = "{}/{}/data".format(bucket, prefix)
s3_output_path = "{}/{}/output".format(bucket, prefix)

# Load Refined Data

In [None]:
#prefix = 'specific/full_scope' # To del after data modification

# Key of the refined parquet dataset for the selected cutoff week, relative to
# the bucket.
train_data_cutoff_key = "{}/full_scope/train_data_cutoff/train_data_cutoff_{}".format(
    prefix, cutoff_week_id
)
train_data_cutoff = ut.read_parquet_S3(bucket, train_data_cutoff_key)

# Format data for DeepAR

In [None]:
# Sorted unique model identifiers present in the cutoff data.
l_model = train_data_cutoff['model'].sort_values().unique()
# Models that have a row in the week returned by
# ut.get_last_week_id(cutoff_week_id); only these are forecast for the future.
l_model_test = train_data_cutoff \
    .loc[train_data_cutoff['week_id'] == ut.get_last_week_id(cutoff_week_id), 'model'] \
    .sort_values() \
    .unique()

nb_ts = l_model.shape[0]
cpt = 0

# Hash-based membership test instead of scanning the numpy array for every model.
active_models = set(l_model_test)

train_ts = []  # Time series without their last `prediction_length` values
val_ts = []    # All time series, complete
test_ts = []   # Time series to forecast for the future (= active in the last week)

# Models whose series is too short to leave any training data once the last
# `prediction_length` points are removed.
short_models = []

# A single groupby pass replaces one boolean-mask scan of the whole frame per
# model. Groups come out sorted by model, the same order as `l_model`; rows
# whose model is missing are ignored by groupby.
for cpt, (m, model_rows) in enumerate(train_data_cutoff.groupby('model'), start=1):
    # Put the series on a regular `freq` grid; periods without a row become NaN.
    ts = model_rows.set_index('date').asfreq(freq)

    # An explicit row count also behaves correctly when prediction_length is 0
    # (ts[:-0] would be empty).
    n_train = max(len(ts) - prediction_length, 0)
    if n_train > 0:
        train_ts.append(ts.iloc[:n_train])
    else:
        # An empty training series cannot be exported (it has no start date),
        # so it is left out instead of failing later at export time.
        short_models.append(m)

    val_ts.append(ts)
    if m in active_models:
        test_ts.append(ts)

    if cpt % 500 == 0:
        print(str(cpt), '/', str(nb_ts))

if short_models:
    print(len(short_models), 'model(s) with at most', prediction_length,
          'observations were left out of train_ts')

# Export to S3 in JSONL format

In [None]:
def series_to_obj(ts, cat=None):
    """Convert one time series frame into a DeepAR-style record.

    Parameters
    ----------
    ts : pandas.DataFrame
        Non-empty frame holding a 'model' column and a 'y' column; its index
        provides the start timestamp of the series.
    cat : optional
        Categorical features, stored under the "cat" key when not None.

    Returns
    -------
    dict
        {"model": int, "start": str, "target": list} plus "cat" when given.
        Missing target values are written as the string "NaN" so that the
        record serialises to valid JSON (a float NaN would be dumped as the
        bare, non-standard token NaN).

    Raises
    ------
    IndexError
        If `ts` has no rows.
    """
    obj = {"model": int(ts['model'].values[0]),
           "start": str(ts.index[0]),
           "target": ["NaN" if pd.isna(value) else value for value in ts['y']]}
    if cat is not None:
        obj["cat"] = cat
    return obj

def series_to_jsonline(ts, cat=None):
    """Serialise one time series to a single JSON document (one JSONL line).

    `ts` and `cat` are forwarded unchanged to `series_to_obj`; the resulting
    dict is returned as a JSON string without a trailing newline.
    """
    record = series_to_obj(ts, cat)
    return json.dumps(record)

In [None]:
fs = s3fs.S3FileSystem()


def _write_jsonlines(s3_path, series_list):
    """Write every series of `series_list` to `s3_path`, one JSON document per line.

    The file is opened in binary mode, so each line and its newline are
    encoded to UTF-8 before being written.
    """
    with fs.open(s3_path, 'wb') as fp:
        for ts in series_list:
            fp.write(series_to_jsonline(ts).encode("utf-8"))
            fp.write('\n'.encode("utf-8"))


# One file per split, all under <s3_data_path>/cutoff_<cutoff_week_id>/:
# train.json, val.json and test.json.
for split_name, split_series in (("train", train_ts), ("val", val_ts), ("test", test_ts)):
    _write_jsonlines(
        "{}/cutoff_{}/{}.json".format(s3_data_path, cutoff_week_id, split_name),
        split_series,
    )