# Data Pre-Processing and Feature Engineering 
### For Time Series model
This notebook processes the created data and creates features for the regression model. The following data sources are used: <br>
dim_product: product fact table <br>
streaming: aggregated daily streaming data (2018/6/1 - 2020/7/2) <br>
Note: New features can be added as a series or categorical variable. Ideally, the preprocessing and feature engineering should be completed in an ETL tool before the data arrives in SageMaker. 

#### Import packages and set environment parameters

In [9]:
!pip install tqdm

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


In [28]:
%matplotlib inline

import sys
from urllib.request import urlretrieve
import zipfile
from dateutil.parser import parse
import json
from random import shuffle
import random
import datetime
import os
from tqdm import tqdm
import pickle

import boto3
import s3fs
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
# Seed both Python's and NumPy's global random generators so that the
# train/validation split is the same from run to run.
random.seed(42)
np.random.seed(42)

In [6]:
# SageMaker session, execution role and region used by the rest of the notebook.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

# S3 locations. Note that s3_bucket holds "<bucket>/<key prefix>"; copy_to_s3
# splits the full s3:// path itself, so only the part before the first "/"
# is used as the bucket name there.
s3_bucket = 'wmg-streaming-prediction-dev/streaming_data_processed'
s3_prefix = 'ts_data'
s3_data_path = "s3://%s/%s/data" % (s3_bucket, s3_prefix)
s3_output_path = "s3://%s/%s/output" % (s3_bucket, s3_prefix)

# Container image of the built-in DeepAR forecasting algorithm for this region.
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


#### Read in streaming data and reference data for pre-processing

In [10]:
# Use the processed regression-model features as a reference so that this model applies the
# same filters and is trained on the same set of songs as the regression model.
ref_data_name = 's3://wmg-streaming-prediction-dev/streaming_data_processed/df_final_feature_v2.csv'
ref_data = pd.read_csv(ref_data_name, index_col = False)
# Remapping of selected major_genre_code values onto other genre codes.
# NOTE(review): map() applies this dict once, so 2727 becomes 2710 even though 2710 itself
# is remapped to 2744 - confirm that this is intended.
equiv = {2697: 2714, 2698: 2699, 2700:2699, 2707: 1000,2717: 2744, 2719: 1015685, 2727: 2710, 2728: 1000, 2738: 1000, 2743: 2767, 2750: 1001, 
        2752: 1000, 2756: 2755, 2768: 2744, 2774: 2744, 3933: 2699, 1000127: 2734,1000128: 2734, 1000129: 2734, 1000130: 2734, 1000328: 1000,
        1000386: 1001, 1008945:1000, 1009585:1001, 1009905:1001, 1011382:1001, 1011385:2744,  1013145:2767, 1014465: 1001, 1014965:1001, 1006386: 1001,
        2705: 2732, 2771: 2767, 2710: 2744, 2724: 2744}
# Codes without an entry in `equiv` keep their original value. The result is assigned back
# explicitly: fillna(inplace=True) on a column selection may act on a temporary copy and
# leave the frame unchanged.
ref_data['genre_mapped'] = ref_data['major_genre_code'].map(equiv).fillna(ref_data['major_genre_code'])
# Keep songs with non-zero volume on each of day 0, 1 and 2 ...
ref_data = ref_data[(ref_data['day0_volume'] != 0) & (ref_data['day1_volume'] != 0) & (ref_data['day2_volume'] != 0)]
# ... and more than 200 first-week streams. copy() makes the result an independent frame,
# so columns added to ref_data in later cells are not written to a slice of the original.
ref_data = ref_data[(ref_data['first_week_streams']>200)].copy()

In [11]:
# Read the raw streaming file. The full history of every song is used for training.
# NOTE(review): no `header`/`names` argument is passed, so pandas treats the first line of
# the file as a header row before the column names are overwritten below - confirm that the
# file really starts with a header line.
raw = pd.read_csv('s3://wmg-streaming-prediction-dev/product-key-agg/rna.gz', compression='gzip', 
                  # names = col_names,
                  # sep='\x01',
                   quotechar='"')

# Column names used by the rest of the notebook.
raw.columns = ['product_key', 'ISRC', 'release_date', 'stream_date', 'streams']

In [12]:
# Keep only the songs that are also present in the regression-model reference data,
# so that both models work on the same set of songs.
filtered_data = raw[raw['product_key'].isin(ref_data['product_key'])]
del raw # drop the raw frame to reduce the risk of an out-of-memory error

In [13]:
# Check that every song has streaming data starting at its release date: per product_key,
# compare the earliest release_date with the earliest stream_date.
# The cell only displays the result (True when both columns are equal for all songs);
# no exception is raised when they differ.
check_date = filtered_data.groupby('product_key').agg({'release_date': 'min', 'stream_date':'min'})
check_date['release_date'].equals(check_date['stream_date'])

True

In [14]:
# Normalise the stream date to a fixed-width 'YYYY/MM/DD HH:MM:SS' string and order the rows
# as time series (by song, then by date). assign() returns a new frame, which avoids adding
# a column to a filtered slice of another frame (SettingWithCopyWarning).
filtered_data = (
    filtered_data
    .assign(stream_date_datetime=pd.to_datetime(filtered_data['stream_date']).dt.strftime('%Y/%m/%d %H:%M:%S'))
    .sort_values(['product_key','stream_date_datetime'], ascending=True)
)

#### Create time series input

In [15]:
# Create one daily time series per song, following the DeepAR input format.
# The frame is grouped once by product_key instead of being scanned with a boolean mask
# for every song; sort=False keeps the songs in their order of first appearance, which is
# the order of prod_key_list.
timeseries = []
prod_key_list = filtered_data['product_key'].unique()
for prod_key, song_rows in tqdm(filtered_data.groupby('product_key', sort=False), total=len(prod_key_list)):
    # series of stream counts indexed by date and named after the product key (as a string)
    ts_i = song_rows.set_index('stream_date_datetime')['streams'].rename(str(prod_key))
    ts_i.index = pd.to_datetime(ts_i.index)
    # asfreq inserts the calendar days missing between the first and last date (as NaN)
    timeseries.append(ts_i.asfreq(freq = 'D'))

100%|██████████| 29855/29855 [07:37<00:00, 65.27it/s]


In [16]:
del filtered_data # run this command if OOM error
## Keep only the series with more than 60 observations for training;
## the shorter series are set aside in cutted_ts.
cutted_ts  = [ts for ts in timeseries if len(ts) <= 60]
timeseries = [ts for ts in timeseries if len(ts) > 60]

In [17]:
# Hold out a sample of songs for testing.
# The split is stratified on major_genre_code so that each genre code is represented in
# both the training and the testing set.
# NOTE(review): the feature frame is built by position (every column except position 12) -
# presumably this leaves out the target column; confirm against the reference file layout.
X = pd.concat([ref_data.iloc[:, :12], ref_data.iloc[:, 13:]], axis = 1)
y = ref_data.first_week_streams
# random_state pins the split explicitly instead of relying on whatever state the global
# NumPy generator happens to be in when this cell runs.
x_train, x_test, y_train, y_test = train_test_split(X,y,
                                                    train_size=.8, 
                                                    random_state=42,
                                                    stratify=ref_data.major_genre_code)
# Keep training songs for modeling and test songs for testing (series names are strings).
modeling_list = list(x_train['product_key'].apply(str))
cutoff_list = list(x_test['product_key'].apply(str))
# Sets give constant-time membership tests; the lists above are kept for later use.
modeling_keys = set(modeling_list)
cutoff_keys = set(cutoff_list)
# Modeling series are cast to float with missing days filled with 0 streams.
timeseries_modeling = [ts.astype(float).fillna(0) for ts in timeseries if ts.name in modeling_keys]
timeseries_cutoff = [ts for ts in timeseries if ts.name in cutoff_keys]

In [29]:
# Prepare the held-out series (float values, missing days filled with 0) and build their
# categorical features (genre code, collaboration flag).
timeseries_cutoff = [ts.astype(float).fillna(0) for ts in timeseries_cutoff]

# 'genre_coded' is created by a cell further down in the notebook. Derive it here with the
# same enumeration of the unique mapped genres when it does not exist yet, so that this
# cell also works when the notebook is run from top to bottom.
if 'genre_coded' not in ref_data.columns:
    genre_map = {genre: code for code, genre in enumerate(ref_data['genre_mapped'].unique())}
    ref_data['genre_coded'] = ref_data['genre_mapped'].map(genre_map)

ts_cat_cutoff = []
for ts in tqdm(timeseries_cutoff):
    prod_key = ts.name 
    # first reference row of this song; the series name is the product key as a string
    cat_code = ref_data[ref_data['product_key'] == int(prod_key)][['genre_coded', 'has_collaboration']]
    cat_code_array = cat_code.iloc[0].rename(prod_key)
    ts_cat_cutoff.append(cat_code_array) 

100%|██████████| 5604/5604 [00:10<00:00, 549.60it/s]


In [30]:
# Save the held-out series and their categorical features to local files.
# Note: both files are binary pickles although they carry a .txt extension.
with open("ts_cut_off.txt", "wb") as fp:   # pickle the held-out time series
    pickle.dump(timeseries_cutoff, fp)
with open("ts_cat_cut_off.txt", "wb") as fp:   # pickle the matching categorical features
    pickle.dump(ts_cat_cutoff, fp)

In [18]:
# helper functions to save files to local and s3
def write_dicts_to_file(path, data):
    """Write an iterable of JSON-serialisable objects to `path`, one JSON document per line.

    The file is opened in binary mode (and truncated); every record is encoded
    as UTF-8 and followed by a newline.
    """
    line_break = "\n".encode('utf-8')
    with open(path, 'wb') as out_file:
        for record in data:
            encoded = json.dumps(record).encode("utf-8")
            out_file.write(encoded + line_break)


def series_to_obj(ts, cat=None):
    """Convert a series into a dict with the keys "start", "target" and, optionally, "cat".

    "start" is the string form of the first index entry, "target" the values of
    the series as a list. "cat" is only added when `cat` is not None.
    """
    first_index_entry = ts.index[0]
    record = {"start": str(first_index_entry), "target": [value for value in ts]}
    if cat is None:
        return record
    record["cat"] = cat
    return record

def series_to_jsonline(ts, cat=None):
    """Return the series_to_obj() representation of `ts` (and `cat`) as a JSON string."""
    record = series_to_obj(ts, cat)
    return json.dumps(record)


# Module-level S3 resource handle; copy_to_s3 uses it to look up buckets.
s3 = boto3.resource('s3')
def copy_to_s3(local_file, s3_path, override=False):
    """Upload a local file to S3 unless an object with the same key already exists.

    Args:
        local_file: path of the local file to upload.
        s3_path: destination of the form "s3://<bucket>/<key>".
        override: when False (default) an existing object is left untouched and
            a message is printed; when True the object is overwritten.

    Raises:
        AssertionError: if `s3_path` does not start with "s3://".
    """
    assert s3_path.startswith('s3://')
    split = s3_path.split('/')
    bucket = split[2]
    path = '/'.join(split[3:])
    buk = s3.Bucket(bucket)

    # The prefix filter also returns longer keys that merely start with `path`
    # (e.g. "<path>.bak"); only an object with exactly this key is the same file.
    already_exists = any(obj.key == path for obj in buk.objects.filter(Prefix=path))
    if already_exists:
        if not override:
            # Report the actual target; the bucket and key parsed above are used
            # rather than any notebook-level variable.
            print('File s3://{}/{} already exists.\nSet override to upload anyway.\n'.format(bucket, path))
            return
        print('Overwriting existing file')
    with open(local_file, 'rb') as data:
        print('Uploading file to {}'.format(s3_path))
        buk.put_object(Key=path, Body=data)

#### Create Genre as a Categorical Feature

In [20]:
# Encode every distinct mapped genre as a consecutive integer (in order of first
# appearance), following the DeepAR input format.
genre_map = {genre: code for code, genre in enumerate(ref_data['genre_mapped'].unique())}
ref_data['genre_coded'] = ref_data['genre_mapped'].map(genre_map)

# One categorical vector (genre code, collaboration flag) per modeling series.
ts_cat_modeling = []
for series in tqdm(timeseries_modeling):
    key = series.name
    # rows of the reference data that belong to this song; the first one is used
    song_rows = ref_data.loc[ref_data['product_key'] == int(key), ['genre_coded', 'has_collaboration']]
    ts_cat_modeling.append(song_rows.iloc[0].rename(key))

100%|██████████| 22284/22284 [00:40<00:00, 546.66it/s]


In [21]:
# Parameters used to create the training and testing files.
# Frequency string of the series ('D' is also the frequency passed to asfreq when the series are built).
freq = 'D'
# Forecast horizon: we predict 7 days.
prediction_length = 7 
# We also use 7 days as context length; this is the number of state updates accomplished before making predictions.
context_length = 7 

#### Create training and testing data with genre feature

In [22]:
# Training records: every modeling series without its last 2 * prediction_length
# observations, together with its categorical features.
training_data_new_features = [
    dict(
        start=str(series.index[0]),
        target=series[:-2 * prediction_length].tolist(),
        cat=ts_cat_modeling[position].tolist(),
    )
    for position, series in enumerate(timeseries_modeling)
]

In [24]:
num_test_windows = 2
# Test records: for each window k = 1..num_test_windows, every modeling series is cut
# by k * prediction_length observations at its end and paired with its categorical features.
# NOTE(review): for k = 2 the target is ts[:-prediction_length*2], i.e. the same slice
# that is used as the training target, so that window contains no observations beyond the
# training data - confirm that this is intended.
test_data_new_features = [
    {
        "start": str(ts.index[0]),
        "target": ts[:-prediction_length*k].tolist(),
        "cat": ts_cat_modeling[i].tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for i, ts in enumerate(timeseries_modeling)
]

In [25]:
# check data consistency, quality check before saving
def check_dataset_consistency(train_dataset, test_dataset=None):
    """Check that all records share the feature layout of the first training record.

    The first element of `train_dataset` decides which optional fields are
    expected: if it has 'dynamic_feat', every record must have the same number
    of dynamic features and each of them must be as long as the record's
    'target'; if it has 'cat', every record must have the same number of
    categorical values. Fields absent from the first training record are not
    checked at all.

    Args:
        train_dataset: non-empty sequence of record dicts.
        test_dataset: optional sequence of record dicts checked the same way.

    Raises:
        AssertionError: if a record deviates; the message names the dataset
            and the position of the offending record.
        IndexError: if `train_dataset` is empty.
    """
    d = train_dataset[0]
    has_dynamic_feat = 'dynamic_feat' in d
    num_dynamic_feat = len(d['dynamic_feat']) if has_dynamic_feat else None
    has_cat = 'cat' in d
    num_cat = len(d['cat']) if has_cat else None

    def check_ds(ds, ds_name):
        # ds_name is only used to make the assertion messages traceable
        for i, d in enumerate(ds):
            where = '{} record {}'.format(ds_name, i)
            if has_dynamic_feat:
                assert 'dynamic_feat' in d, "{}: missing 'dynamic_feat'".format(where)
                assert num_dynamic_feat == len(d['dynamic_feat']), (
                    "{}: expected {} dynamic features, found {}".format(
                        where, num_dynamic_feat, len(d['dynamic_feat'])))
                for f in d['dynamic_feat']:
                    assert len(d['target']) == len(f), (
                        "{}: dynamic feature of length {} does not match target length {}".format(
                            where, len(f), len(d['target'])))
            if has_cat:
                assert 'cat' in d, "{}: missing 'cat'".format(where)
                assert len(d['cat']) == num_cat, (
                    "{}: expected {} 'cat' values, found {}".format(where, num_cat, len(d['cat'])))

    check_ds(train_dataset, 'train')
    if test_dataset is not None:
        check_ds(test_dataset, 'test')
        
# Quality check on the training and test records before they are written out.
check_dataset_consistency(training_data_new_features, test_data_new_features)

#### Save files to S3

In [27]:
# Write the training and test records to local files, one JSON document per line.
write_dicts_to_file("train_new_features.json", training_data_new_features)
write_dicts_to_file("test_new_features.json", test_data_new_features)

# S3 locations for this feature set (the output path is defined here but not used in this cell).
s3_data_path_new_features = "s3://{}/{}-new-features/data".format(s3_bucket, s3_prefix)
s3_output_path_new_features = "s3://{}/{}-new-features/output".format(s3_bucket, s3_prefix)

# Upload both files; override=True replaces files that already exist on S3.
print('Uploading to S3 this may take a few minutes depending on your connection.')
copy_to_s3("train_new_features.json", s3_data_path_new_features + "/train/train_new_features.json", override=True)
copy_to_s3("test_new_features.json", s3_data_path_new_features + "/test/test_new_features.json", override=True)

Uploading to S3 this may take a few minutes depending on your connection.
Overwriting existing file
Uploading file to s3://wmg-streaming-prediction-dev/streaming_data_processed/ts_data-new-features/data/train/train_new_features.json
Overwriting existing file
Uploading file to s3://wmg-streaming-prediction-dev/streaming_data_processed/ts_data-new-features/data/test/test_new_features.json
