# Dataset

In this Notebook, I'll extract some features to try to create an initial dataset for training a good model.

In [1]:
import pandas as pd

train_data = pd.read_parquet('inputs/train_data.parquet', engine='pyarrow')
train_data.index = train_data.sku
train_data

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000
...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667


I'll manually define mappings from categorical variables to numbers.

In [2]:
currency_map = {
    'REA': 0,
    'MEX': 1,
    'ARG': 2,
    'DOL': 3
}

listing_type_map = {
    'classic': 0,
    'premium': 1
}

shipping_logistic_type_map = {
    'fulfillment': 0,
    'drop_off': 1,
    'cross_docking': 2
}

shipping_payment_map = {
    'paid_shipping': 0,
    'free_shipping': 1
}

I'll replace the values of the categorical variables with their equivalent in numbers.

In [3]:
train_data['currency'] = train_data['currency'].map(currency_map)
train_data['listing_type'] = train_data['listing_type'].map(listing_type_map)
train_data['shipping_logistic_type'] = train_data['shipping_logistic_type'].map(shipping_logistic_type_map)
train_data['shipping_payment'] = train_data['shipping_payment'].map(shipping_payment_map)
train_data

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,0,0,0,1,1440.000000
1,464801,2021-02-02,0,156.78,0,0,0,1,1440.000000
2,464801,2021-02-03,0,156.78,0,0,0,1,1440.000000
3,464801,2021-02-04,0,156.78,0,0,0,1,1440.000000
4,464801,2021-02-05,1,156.78,0,0,0,1,1440.000000
...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,2,0,1,1,267.710767
37660275,6707,2021-03-31,0,26999.00,2,0,2,1,266.083333
37660276,170355,2021-03-31,0,3400.00,2,0,1,0,0.252633
37660277,246568,2021-03-31,0,6289.00,2,0,0,1,135.416667


Add date features.

In [5]:
from datetime import datetime

start_date = datetime(2021, 2, 1)
unique_dates = train_data['date'].unique().tolist()
dates_dict = {x: (datetime.strptime(x, '%Y-%m-%d') - start_date).days for x in unique_dates}
train_data['day_index'] = train_data['date'].apply(lambda x: dates_dict[x])


holidays = {
    'REA': ['2021-04-01', '2021-04-02', '2021-04-21'],
    'ARG': ['2021-02-15', '2021-02-16', '2021-03-24', '2021-04-01', '2021-04-02'],
    'MXN': ['2021-02-01', '2021-03-15', '2021-04-01', '2021-04-02'],
    'DOL': []
}

train_data['is_holiday'] = train_data.apply(lambda x: 1 if x['date'] in holidays[x['currency']] else 0)
train_data['date'] = pd.to_datetime(train_data['date'])

# 0 = Monday and 6 = Sunday
train_data['day_of_week'] = train_data['date'].apply(lambda x: x.dayofweek)
train_data['day_of_month'] = train_data['date'].apply(lambda x: x.day)

train_data

TypeError: strptime() argument 1 must be str, not int

Load additional data, specific from the items.

In [None]:
from collections import defaultdict
import json

item_details = defaultdict(lambda: None)
domain_map = defaultdict(lambda: None)

with open('inputs/items_static_metadata_full.jl') as f:
    for index, line in enumerate(f):
        data = json.loads(line)
        sku = data['sku']
        item_details[sku] = {
            'item_id': data['item_id'],
            'domain_id': domain_map.setdefault(data['item_domain_id'], len(domain_map))
        }

I get the list of unique items in the dataset

In [None]:
skus = train_data['sku'].unique().tolist()

for index, sku in enumerate(skus):
    pass