In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

## Items (SKU metadata)

In [2]:
%%time

# Load the SKU metadata (JSON Lines file: one record per line).
items = pd.read_json('items_static_metadata_full.jl', lines=True)

# Manually set the domain of three specific SKUs; every other row is left untouched.
items['item_domain_id'] = (
    items['item_domain_id']
    .mask(items['sku'] == 459892, 'MLB-HEADSET_TELEPHONES')
    .mask(items['sku'] == 553503, 'MLB-HAIR_STRAIGHTENERS')
    .mask(items['sku'] == 454273, 'MLB-HEADPHONES')
)
items

CPU times: user 2.86 s, sys: 471 ms, total: 3.33 s
Wall time: 3.4 s


Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,Tênis Masculino Olympikus Cyber Barato Promoçao,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,Suporte Rack Prancha Parede C/ Regulagem Horiz...,MLB,1,,
2,MLM-NECKLACES,69847,5 Collares Plateados Dama Gargantilla Choker -...,MLM,2,,
3,MLM-RINGS,298603,Lindo Anillo De Bella Crepusculo Twilight Prom...,MLM,3,,
4,MLB-WEBCAMS,345949,Webcam Com Microfone Hd 720p Knup Youtube Pc V...,MLB,4,,
...,...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,2 Parafuso Pentalobe Torx iPhone 5 5c 5s 6 6s ...,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,Emblema Circular Wolfsburg Edition 5cm,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,Kit Capa De Sofá Coladinha Jogo 2e3 Lugares Ma...,MLB,660913,,
660914,MLB-SNEAKERS,490874,Calçado Sapatênis Clássico Baltic 234 Ollie,MLB,660914,,


In [3]:
# List the items whose item_domain_id is null.
items[items['item_domain_id'].isnull()]

Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id


In [4]:
# Missing-value count per column of the item metadata.
items.isnull().sum()

item_domain_id            0
item_id                   0
item_title                0
site_id                   0
sku                       0
product_id           631248
product_family_id    577732
dtype: int64

In [5]:
# Number of items per site_id value.
items['site_id'].value_counts()

MLB    336334
MLM    265912
MLA     58670
Name: site_id, dtype: int64

## Test

In [6]:
%%time

# Attach the item metadata to every test row.
# The join key is spelled out: without `on`, merge joins on every column name
# the two frames happen to share, so a future column-name collision would
# silently change the join.
test = pd.read_csv('test_data.csv').merge(items, how='left', on='sku')
test

CPU times: user 1.24 s, sys: 86.8 ms, total: 1.33 s
Wall time: 1.35 s


Unnamed: 0,sku,target_stock,item_domain_id,item_id,item_title,site_id,product_id,product_family_id
0,464801,3,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
1,645793,4,MLB-NEBULIZERS,438135,"Inalador Infantil, 2 Anos De Garantia, G-tech,...",MLB,,MLB9838510
2,99516,8,MLB-ADHESIVE_TAPES,221252,Fita Dupla Face Preta Colar Touch Lcd 3mm Celu...,MLB,,
3,538100,8,MLB-SCHOOL_AND_OFFICE_GLUES,62099,Cola T-7000 Black Pra Uso Em Touch E Celular E...,MLB,,
4,557191,10,MLB-DECORATIVE_VINYLS,168198,Adesivo De Parede Unicornio + 30 Florais,MLB,,
...,...,...,...,...,...,...,...,...
551467,129187,8,MLA-RADIO_BASE_STATIONS,408559,Base Bibanda Yedro M04 Vus Vhf Uhf,MLA,,
551468,6707,30,MLA-PRINTERS,193784,Impresora Multifunción Hp Laserjet 137fnw Con...,MLA,MLA15159034,MLA15159032
551469,170355,3,MLA-WRISTWATCHES,110276,Reloj Hombre Skmei 9178 Rock Calavera En Acero...,MLA,,
551470,246568,2,MLA-HARD_DRIVES_AND_SSDS,456892,Disco Sólido Interno Western Digital Wds250g2...,MLA,MLA15697725,MLA15697724


In [7]:
# Average target_stock over all test rows.
test['target_stock'].mean()

18.06947224881771

In [8]:
# Mean and standard deviation of target_stock for each site.
target_stock_by_site_id = test.groupby('site_id').agg(
    mean_target=('target_stock', 'mean'),
    sd_target=('target_stock', 'std'),
)
target_stock_by_site_id

Unnamed: 0_level_0,mean_target,sd_target
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1
MLA,17.633859,59.890839
MLB,17.223478,71.636813
MLM,19.24382,173.933007


## Preprocessing   

In [9]:
%%time

# Attach the item metadata to every daily record.
# The join key is spelled out: without `on`, merge joins on every column name
# the two frames happen to share, so a future column-name collision would
# silently change the join.
train = pd.read_parquet('train_data.parquet').merge(items, how='left', on='sku')
train['date'] = pd.to_datetime(train['date'])
train

CPU times: user 33.2 s, sys: 14.1 s, total: 47.2 s
Wall time: 44.7 s


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,product_id,product_family_id
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559,Base Bibanda Yedro M04 Vus Vhf Uhf,MLA,,
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784,Impresora Multifunción Hp Laserjet 137fnw Con...,MLA,MLA15159034,MLA15159032
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276,Reloj Hombre Skmei 9178 Rock Calavera En Acero...,MLA,,
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892,Disco Sólido Interno Western Digital Wds250g2...,MLA,MLA15697725,MLA15697724


In [10]:
# Row count per item_domain_id in the merged training data.
train['item_domain_id'].value_counts()

MLB-SUPPLEMENTS            786063
MLM-CELLPHONE_COVERS       592007
MLB-HAIR_TREATMENTS        266634
MLM-T_SHIRTS               254175
MLM-HEADPHONES             247016
                            ...  
MLA-FIRE_STARTER_FLINTS         7
MLA-MINI_BAG_SEALERS            6
MLA-GLOVES_AND_MITTENS          6
MLA-PET_FOOTWEAR                5
MLA-RADIO_BASE_STATIONS         1
Name: item_domain_id, Length: 8408, dtype: int64

### Temporal split

In [11]:
%%time

# Training window: every record dated up to and including 2021-03-01.
data_train = train.loc[train['date'] <= '2021-03-01'].copy()

# 0/1 flags derived from the minutes the listing was active on that day
data_train['is_active'] = (data_train['minutes_active'] > 0).astype(int)
data_train['is_inactive'] = (data_train['minutes_active'] == 0).astype(int)

# replace the absolute price by its row-to-row percentage change within each sku
# (the first row of every sku therefore becomes NaN)
data_train['current_price'] = (
    data_train['current_price'].groupby(data_train['sku']).pct_change()
)

# ISO week number, shifted so that the earliest week in this split is 1
data_train['week'] = data_train['date'].dt.isocalendar()['week'].astype(int)
data_train['week'] -= data_train['week'].min() - 1

# 1 when nothing was sold on that day, else 0
data_train['has_zero_sold'] = data_train['sold_quantity'].eq(0).astype(int)

data_train

CPU times: user 17.1 s, sys: 13.3 s, total: 30.4 s
Wall time: 32.2 s


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,product_id,product_family_id,is_active,is_inactive,week,has_zero_sold
0,464801,2021-02-01,0,,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
1,464801,2021-02-02,0,0.0,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
2,464801,2021-02-03,0,0.0,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
3,464801,2021-02-04,0,0.0,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
4,464801,2021-02-05,1,0.0,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37615964,488932,2021-03-01,0,,ARG,classic,cross_docking,free_shipping,146.083333,MLA-WIRELESS_CHARGERS,233429,Cargador Inalambrico Rapido Samsung S21 S21 Pl...,MLA,,,1,0,5,1
37615995,323246,2021-03-01,0,,ARG,classic,drop_off,paid_shipping,0.000000,MLA-HAIR_SHAMPOOS_AND_CONDITIONERS,299333,Shampoo Solido Y Acondicionador Pack - Diferen...,MLA,,,0,1,5,1
37616026,7706,2021-03-01,0,,ARG,classic,drop_off,paid_shipping,0.000000,MLA-HAIR_SHAMPOOS_AND_CONDITIONERS,299333,Shampoo Solido Y Acondicionador Pack - Diferen...,MLA,,,0,1,5,1
37616057,86446,2021-03-01,0,,ARG,classic,drop_off,paid_shipping,34.533333,MLA-PACKAGING_CONTAINERS,355940,"50 Envases De 50ml Flip Top P/souvenir, Alcoho...",MLA,,,1,0,5,1


In [12]:
%%time

# Validation window: every record dated after 2021-03-01.
data_val = train.loc[train['date'] > '2021-03-01'].copy()

# 0/1 flags derived from the minutes the listing was active on that day
data_val['is_active'] = (data_val['minutes_active'] > 0).astype(int)
data_val['is_inactive'] = (data_val['minutes_active'] == 0).astype(int)

# replace the absolute price by its row-to-row percentage change within each sku
# (the first row of every sku therefore becomes NaN)
data_val['current_price'] = (
    data_val['current_price'].groupby(data_val['sku']).pct_change()
)

# ISO week number, shifted so that the earliest week in this split is 1
data_val['week'] = data_val['date'].dt.isocalendar()['week'].astype(int)
data_val['week'] -= data_val['week'].min() - 1

# 1 when nothing was sold on that day, else 0
data_val['has_zero_sold'] = data_val['sold_quantity'].eq(0).astype(int)

data_val

CPU times: user 12 s, sys: 6.38 s, total: 18.4 s
Wall time: 20.5 s


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,product_id,product_family_id,is_active,is_inactive,week,has_zero_sold
29,464801,2021-03-02,0,,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
30,464801,2021-03-03,0,0.000000,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
31,464801,2021-03-04,0,0.000000,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
32,464801,2021-03-05,0,-0.071803,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
33,464801,2021-03-06,0,0.000000,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,MLB9838512,MLB9838510,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,,ARG,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559,Base Bibanda Yedro M04 Vus Vhf Uhf,MLA,,,1,0,5,1
37660275,6707,2021-03-31,0,,ARG,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784,Impresora Multifunción Hp Laserjet 137fnw Con...,MLA,MLA15159034,MLA15159032,1,0,5,1
37660276,170355,2021-03-31,0,,ARG,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276,Reloj Hombre Skmei 9178 Rock Calavera En Acero...,MLA,,,1,0,5,1
37660277,246568,2021-03-31,0,,ARG,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892,Disco Sólido Interno Western Digital Wds250g2...,MLA,MLA15697725,MLA15697724,1,0,5,1


### Mappings

In [13]:
# sku -> item metadata lookups
sku2item_domain_id = dict(zip(items['sku'], items['item_domain_id']))
sku2site_id = dict(zip(items['sku'], items['site_id']))
sku2item_title = dict(zip(items['sku'], items['item_title']))

# category value -> 1-based integer code, numbered in order of first appearance
site_id2int = {
    site: code for code, site in enumerate(pd.unique(items['site_id']), start=1)
}
item_domain_id2int = {
    domain: code for code, domain in enumerate(pd.unique(items['item_domain_id']), start=1)
}

In [14]:
def build_target_stock_and_inventory_days(dataset, days):
    '''
    Sample one (target_stock, inventory_days) example per sku.

    Adapted, with minor changes, from the workshop code:
    https://www.youtube.com/watch?v=WqIXnWHyMVA

    Only skus with exactly `days` records and at least one sale are used.
    For each of them a non-zero value is drawn at random from the cumulative
    sales curve as `target_stock`; `inventory_days` is the 1-based position
    (in date order) of the first record whose cumulative sales equal it.

    NumPy's global random generator is re-seeded with 42 on every call.
    The notebook-level lookups `sku2item_domain_id`, `sku2site_id`,
    `sku2item_title` and `site_id2int` must already be defined.

    Returns a list of dicts with the keys `sku`, `item_domain_id`, `site_id`
    (integer code), `item_title`, `target_stock` and `inventory_days`.
    '''
    np.random.seed(42)

    print('Sorting records...')
    ordered = dataset.loc[:, ['sku', 'date', 'sold_quantity']].sort_values(['sku', 'date'])

    print('Grouping quantity...')
    # e.g. {0: [0, 0, 0, 1, 1, 0, 0, 3, ..., 0]}
    sales_by_sku = ordered.groupby('sku')['sold_quantity'].agg(list).to_dict()

    result = []
    for sku, daily_sales in sales_by_sku.items():
        # e.g. [0, 0, 0, 1, 2, 2, 2, 5, ..., k]
        cumulative = np.cumsum(daily_sales)

        # skip skus without any sale or without exactly `days` records
        if not (cumulative[-1] > 0) or len(cumulative) != days:
            continue

        # redraw until the sampled cumulative value is non-zero
        stock_target = 0
        while stock_target == 0:
            stock_target = np.random.choice(cumulative)

        # first position at which cumulative sales hit the sampled value
        day_to_stockout = np.flatnonzero(cumulative == stock_target)[0] + 1

        result.append({
            'sku': sku,
            'item_domain_id': sku2item_domain_id[sku],
            'site_id': site_id2int[sku2site_id[sku]],
            'item_title': sku2item_title[sku],
            'target_stock': stock_target,
            'inventory_days': day_to_stockout,
        })

    return result

In [15]:
%%time

# Sample validation targets from the validation split; with days=30 only
# skus that have exactly 30 records (and at least one sale) are kept.
val_dataset = pd.DataFrame(build_target_stock_and_inventory_days(data_val, days=30))

Sorting records...
Grouping quantity...
CPU times: user 37.9 s, sys: 4.23 s, total: 42.1 s
Wall time: 43.6 s


## Feature Engineering

In [16]:
def long_to_wide(df: pd.DataFrame, is_val: bool, features: list, fill_na: bool) -> pd.DataFrame:
    '''
    Pivot a long frame (one row per sku and date) into one row per sku.

    Each entry of `features` is spread into one column per distinct date,
    named `<feature>_<day>`. Day numbers are chosen so that the train and
    validation frames share their column names:

    * validation (`is_val=True`): dates are numbered from 1 and the `_1`
      columns are dropped, leaving `_2 .. _<n_days>`;
    * train (`is_val=False`): dates are numbered from 2, giving
      `_2 .. _<n_days + 1>`.

    The number of days is read from the distinct dates found in `df`, so a
    30-date validation split and a 29-date train split both end at `_30`.

    Parameters
    ----------
    df : long frame holding `sku`, `date` and every column in `features`,
        with at most one row per (sku, date) pair. It is not modified.
    is_val : selects the validation or the train day numbering (see above).
    features : names of the columns to pivot; must not be empty.
    fill_na : when True, every missing value of the wide frame (numeric or
        categorical column alike) is replaced by 0.

    Returns
    -------
    DataFrame indexed by `sku`, columns grouped by feature in the order
    given and by ascending date within each feature.

    Raises
    ------
    ValueError
        If `features` is empty.
    '''
    features = list(features)
    if not features:
        raise ValueError('features must contain at least one column name')

    # select first so that only the needed columns are copied by set_index
    df_long = df[['sku', 'date'] + features].set_index(['sku', 'date'])

    # columns come out as (feature, date): feature-major, dates ascending
    df_wide = df_long.unstack()
    n_days = df_wide.shape[1] // len(features)
    df_wide = df_wide.droplevel(0, axis=1)

    # train numbering is shifted by one so both splits end on the same label
    first_day = 1 if is_val else 2
    day_numbers = range(first_day, first_day + n_days)
    df_wide.columns = [f'{feat}_{day}' for feat in features for day in day_numbers]

    if is_val:
        # drop the first day of every feature so the labels start at 2, as in train
        kept = [f'{feat}_{day}' for feat in features for day in day_numbers if day != 1]
        df_wide = df_wide[kept]

    if fill_na:
        # a single default for every column: no feature can be left without
        # a replacement value, whatever the order of `features`
        df_wide = df_wide.fillna(0)

    return df_wide

In [17]:
# list the distinct values of each categorical feature, to decide how
# rolling windows could be built for them
for col in ('listing_type', 'shipping_logistic_type', 'shipping_payment'):
    print(col, train[col].unique(), '', sep='\n')

listing_type
['classic' 'premium']

shipping_logistic_type
['fulfillment' 'cross_docking' 'drop_off']

shipping_payment
['free_shipping' 'paid_shipping']



In [18]:
%%time

# features pivoted to one column per day
roll_feats = [
    'sold_quantity',
    'minutes_active',
    'current_price',
    'has_zero_sold',
    'listing_type',
    'shipping_logistic_type',
    'shipping_payment',
]

# same pivot twice: first keeping the NaNs, then with them imputed
data_train_wide, data_train_wide_filled_na = (
    long_to_wide(data_train, is_val=False, features=roll_feats, fill_na=impute)
    for impute in (False, True)
)

data_train_wide.shape, data_train_wide_filled_na.shape

CPU times: user 50.8 s, sys: 29.7 s, total: 1min 20s
Wall time: 1min 29s


((639351, 203), (639351, 203))

In [19]:
# check the data format
data_train_wide.head(10)

Unnamed: 0_level_0,sold_quantity_2,sold_quantity_3,sold_quantity_4,sold_quantity_5,sold_quantity_6,sold_quantity_7,sold_quantity_8,sold_quantity_9,sold_quantity_10,sold_quantity_11,sold_quantity_12,sold_quantity_13,sold_quantity_14,sold_quantity_15,sold_quantity_16,sold_quantity_17,sold_quantity_18,sold_quantity_19,sold_quantity_20,sold_quantity_21,sold_quantity_22,sold_quantity_23,sold_quantity_24,sold_quantity_25,sold_quantity_26,sold_quantity_27,sold_quantity_28,sold_quantity_29,sold_quantity_30,minutes_active_2,minutes_active_3,minutes_active_4,minutes_active_5,minutes_active_6,minutes_active_7,minutes_active_8,minutes_active_9,minutes_active_10,minutes_active_11,minutes_active_12,minutes_active_13,minutes_active_14,minutes_active_15,minutes_active_16,minutes_active_17,minutes_active_18,minutes_active_19,minutes_active_20,minutes_active_21,minutes_active_22,...,shipping_logistic_type_10,shipping_logistic_type_11,shipping_logistic_type_12,shipping_logistic_type_13,shipping_logistic_type_14,shipping_logistic_type_15,shipping_logistic_type_16,shipping_logistic_type_17,shipping_logistic_type_18,shipping_logistic_type_19,shipping_logistic_type_20,shipping_logistic_type_21,shipping_logistic_type_22,shipping_logistic_type_23,shipping_logistic_type_24,shipping_logistic_type_25,shipping_logistic_type_26,shipping_logistic_type_27,shipping_logistic_type_28,shipping_logistic_type_29,shipping_logistic_type_30,shipping_payment_2,shipping_payment_3,shipping_payment_4,shipping_payment_5,shipping_payment_6,shipping_payment_7,shipping_payment_8,shipping_payment_9,shipping_payment_10,shipping_payment_11,shipping_payment_12,shipping_payment_13,shipping_payment_14,shipping_payment_15,shipping_payment_16,shipping_payment_17,shipping_payment_18,shipping_payment_19,shipping_payment_20,shipping_payment_21,shipping_payment_22,shipping_payment_23,shipping_payment_24,shipping_payment_25,shipping_payment_26,shipping_payment_27,shipping_payment_28,shipping_payment_29,shipping_payment_30
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,872.65,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,cross_docking,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1306.883333,0.0,706.464617,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,0.0,2.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,12.0,0.0,1.0,0.0,1.0,0.0,2.0,1440.0,1440.0,1440.0,1074.666667,0.0,0.0,0.0,70.621433,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,paid_shipping,paid_shipping,paid_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
6,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,2.0,0.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1427.8732,1440.0,1440.0,1440.0,840.783333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,289.817883,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping
8,1.0,2.0,18.0,14.0,2.0,5.0,13.0,59.0,6.0,7.0,2.0,2.0,5.0,1.0,13.0,11.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1258.816667,0.0,0.0,0.0,0.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping,free_shipping
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,...,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,fulfillment,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping,paid_shipping


In [20]:
%%time

# same pivot twice for the validation split: first keeping the NaNs,
# then with them imputed
data_val_wide, data_val_wide_filled_na = (
    long_to_wide(data_val, is_val=True, features=roll_feats, fill_na=impute)
    for impute in (False, True)
)

data_val_wide.shape, data_val_wide_filled_na.shape

CPU times: user 53.6 s, sys: 33 s, total: 1min 26s
Wall time: 1min 34s


((660916, 203), (660916, 203))

In [21]:
# preview the (end, start) day pairs generated for a window of size w
w = 1
n_windows = 29 - w
[(end, end - w) for end in range(30, 30 - n_windows, -1)]

[(30, 29),
 (29, 28),
 (28, 27),
 (27, 26),
 (26, 25),
 (25, 24),
 (24, 23),
 (23, 22),
 (22, 21),
 (21, 20),
 (20, 19),
 (19, 18),
 (18, 17),
 (17, 16),
 (16, 15),
 (15, 14),
 (14, 13),
 (13, 12),
 (12, 11),
 (11, 10),
 (10, 9),
 (9, 8),
 (8, 7),
 (7, 6),
 (6, 5),
 (5, 4),
 (4, 3),
 (3, 2)]

In [22]:
# preview the (end, start) day pairs generated for a window of size w
w = 5
n_windows = 29 - w
[(end, end - w) for end in range(30, 30 - n_windows, -1)]

[(30, 25),
 (29, 24),
 (28, 23),
 (27, 22),
 (26, 21),
 (25, 20),
 (24, 19),
 (23, 18),
 (22, 17),
 (21, 16),
 (20, 15),
 (19, 14),
 (18, 13),
 (17, 12),
 (16, 11),
 (15, 10),
 (14, 9),
 (13, 8),
 (12, 7),
 (11, 6),
 (10, 5),
 (9, 4),
 (8, 3),
 (7, 2)]

In [23]:
# preview the (end, start) day pairs generated for a window of size w
w = 10
n_windows = 29 - w
[(end, end - w) for end in range(30, 30 - n_windows, -1)]

[(30, 20),
 (29, 19),
 (28, 18),
 (27, 17),
 (26, 16),
 (25, 15),
 (24, 14),
 (23, 13),
 (22, 12),
 (21, 11),
 (20, 10),
 (19, 9),
 (18, 8),
 (17, 7),
 (16, 6),
 (15, 5),
 (14, 4),
 (13, 3),
 (12, 2)]

### LOFO (Leave One Feature Out)

Applying LOFO while building the LGBM model reduced the feature set from 106 to 40 features while still improving the CV score.  

In [30]:
# Features kept after the LOFO selection.
# `<feature>_<day>` columns come from long_to_wide (suffix = day number).
# `roll_*_<end>_<start>` columns are rolling aggregates; for the ones built in
# create_train_val_test the window covers the day columns start..end inclusive.
selected_features_lofo = [
    # lag for sold quantity (sold quantity on individual days)
    'sold_quantity_17', 'sold_quantity_18', 'sold_quantity_19',
    'sold_quantity_20', 'sold_quantity_21', 'sold_quantity_25',
    'sold_quantity_26', 'sold_quantity_27', 'sold_quantity_28',
    'sold_quantity_29', 'sold_quantity_30',
    
    # lag for minutes active (minutes active on individual days)
    'minutes_active_29', 'minutes_active_30',
    
    # rolling mean sold quantity
    'roll_mean_sold_30_11', 'roll_mean_sold_29_10', 'roll_mean_sold_28_9',
    'roll_mean_sold_27_8', 'roll_mean_sold_26_7', 'roll_mean_sold_25_6',
    'roll_mean_sold_24_5',
    
    # rolling mean minutes active
    'roll_mean_minutes_30_11', 'roll_mean_minutes_29_10', 'roll_mean_minutes_26_7',
    'roll_mean_minutes_25_6', 'roll_mean_minutes_24_5',
    
    # rolling mean pct change of price
    'roll_mean_price_30_11', 'roll_mean_price_29_10', 'roll_mean_price_27_8',
    'roll_mean_price_25_6', 'roll_mean_price_24_5',
    
    # rolling sum (count) of days with zero sold
    'roll_sum_zero_sold_30_11', 'roll_sum_zero_sold_24_5',
    
    # rolling sum (count) across different categorical features
    # NOTE(review): the code that builds these columns is not shown here; presumably one count per category value
    'roll_sum_shipping_logistic_type_fulfillment_30_11',
    'roll_sum_shipping_logistic_type_cross_docking_30_11',
    'roll_sum_shipping_logistic_type_fulfillment_28_9',
    'roll_sum_shipping_logistic_type_fulfillment_26_7',
    'roll_sum_shipping_logistic_type_fulfillment_25_6',
    'roll_sum_shipping_logistic_type_fulfillment_24_5',
    'roll_sum_shipping_payment_free_shipping_28_9',
    
    # target stock (this feature was provided in the test set as well)
    'target_stock'
]

# sanity check on the number of selected features
print(len(selected_features_lofo))

40


In [25]:
def _add_window_features(frame, feat, start, end):
    '''
    Add to "frame" (in place) the aggregate(s) of the daily feature "feat"
    over the columns "{feat}_{start}" ... "{feat}_{end}" (both ends included).
    A feature name that is not handled below is ignored.
    '''
    tag = f'{end}_{start}'
    cols = [f'{feat}_{day}' for day in range(start, end + 1)]

    # numeric features summarised by their mean -> prefix of the new column
    mean_prefix = {
        'sold_quantity': 'roll_mean_sold',
        'minutes_active': 'roll_mean_minutes',
        'current_price': 'roll_mean_price',
    }
    # categorical features -> levels whose occurrences are counted
    levels = {
        'listing_type': ['classic', 'premium'],
        'shipping_logistic_type': ['fulfillment', 'cross_docking', 'drop_off'],
        'shipping_payment': ['free_shipping', 'paid_shipping'],
    }

    if feat in mean_prefix:
        frame[f'{mean_prefix[feat]}_{tag}'] = frame[cols].mean(axis=1)
    elif feat == 'has_zero_sold':
        frame[f'roll_sum_zero_sold_{tag}'] = frame[cols].sum(axis=1)
    elif feat in levels:
        window = frame[cols]
        for lvl in levels[feat]:
            frame[f'roll_sum_{feat}_{lvl}_{tag}'] = (window == lvl).sum(axis=1)


def _scale_and_impute(splits, col, shift, scale):
    '''
    Apply (x - shift) / scale to column "col" of every frame in "splits"
    (train first), in place, then fill the remaining NA with the mean of the
    *scaled* train column so the imputed value lives on the scaled axis.
    '''
    # constant (or single-row) column: avoid dividing by 0 / NaN
    if not scale > 0:
        scale = 1.0

    for frame in splits:
        frame[col] = (frame[col] - shift) / scale

    # impute NA (very few ones)
    fill_ = splits[0][col].mean()
    for frame in splits:
        frame[col] = frame[col].fillna(fill_)


def create_train_val_test(
        w,
        n_windows,
        use_undersampling: bool,
        use_standardize: bool,
        use_minmax: bool
    ):
    '''
    Create datasets using rolling windows features, based on window size "w".
    If "n_windows" is passed, keep only the last "n_windows".

    Each window aggregates the daily columns from day "end - w" to day "end"
    (both included, i.e. w + 1 columns); the first window ends at day 30 and
    every following window is shifted one day back.

    Parameters
    ----------
    w : int in [1, 27], distance between the first and last day of a window.
    n_windows : number of windows to build; None means 29 - w.
    use_undersampling : resample the train split with RandomUnderSampler.
    use_standardize : scale every feature with the train mean / std.
    use_minmax : scale every feature with the train min / max.
        When either scaling flag is set the "*_filled_na" wide frames are used
        as input and the result is asserted to contain no NA.

    Returns
    -------
    X_train, X_val, y_train, y_val, X_test
        Train/validation split of "val_dataset" (target is inventory_days - 1)
        and the submission features built from "test".

    Relies on notebook globals: data_train_wide[_filled_na],
    data_val_wide[_filled_na], roll_feats, val_dataset, test and
    selected_features_lofo.
    '''

    # w must be in [1, 27]
    assert w in range(1, 28), 'w must be an integer in [1, 27]'

    if n_windows is None:
        n_windows = 29 - w

    print()
    print(f'------ w = {w} ------')
    print(f'------ n_windows = {n_windows} ------')

    if use_standardize or use_minmax:
        df_roll_train = data_train_wide_filled_na.copy()
        df_roll_val = data_val_wide_filled_na.copy()
    else:
        df_roll_train = data_train_wide.copy()
        df_roll_val = data_val_wide.copy()

    # rolling window for w days
    for offset in range(n_windows):
        end = 30 - offset
        start = end - w
        for feat in roll_feats:
            print(end, start, feat)
            _add_window_features(df_roll_train, feat, start, end)
            _add_window_features(df_roll_val, feat, start, end)

    # bind new features
    val_dataset_ok = val_dataset.set_index('sku').join(df_roll_train)
    sub_dataset_ok = test[['sku', 'target_stock']].set_index('sku').join(df_roll_val)
    print('Binded new features')
    print()

    # choosing features
    feats = selected_features_lofo

    # data for submission (explicit copy: columns may be reassigned below)
    X_test = sub_dataset_ok[feats].copy()

    # split train/val
    X_train, X_val, y_train, y_val = train_test_split(
        val_dataset_ok[feats],
        val_dataset_ok['inventory_days'] - 1, # use target in [0, 29] instead of [1, 30]
        random_state=42
    )

    if use_standardize or use_minmax:
        # own the data before assigning columns, so the scaling below never
        # writes through to (or warns about) the frame the split came from
        X_train = X_train.copy()
        X_val = X_val.copy()

    splits = (X_train, X_val, X_test)

    if use_standardize:
        print('Stardardize...')
        for col in feats:
            _scale_and_impute(
                splits, col,
                shift=X_train[col].mean(),
                scale=X_train[col].std()
            )

    if use_minmax:
        print('MinMax...')
        for col in feats:
            min_ = X_train[col].min()
            max_ = X_train[col].max()
            # scale this column only, with statistics of the train split
            _scale_and_impute(splits, col, shift=min_, scale=max_ - min_)

    if use_undersampling:
        print('Undersampling...')
        print('Before Undersampling:', X_train.shape[0])
        # NOTE(review): RandomUnderSampler must already be imported in the
        # notebook; its import is not visible next to this cell.
        smp = RandomUnderSampler(random_state=2021)
        X_train, y_train = smp.fit_resample(X_train, y_train)
        print('After Undersampling:', X_train.shape[0])

    if use_standardize or use_minmax:
        assert X_train.isnull().sum().sum() == 0
        assert X_val.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    print('Using', X_train.shape[1], 'features')

    return X_train, X_val, y_train, y_val, X_test

### Save datasets

In [26]:
%%time

# window configuration used for the saved datasets
w = 19
n_windows = 7

# unscaled, non-resampled splits
X_train, X_val, y_train, y_val, X_test = create_train_val_test(
    w=w, n_windows=n_windows,
    use_undersampling=False, use_standardize=False, use_minmax=False
)

# feather needs a default index, so the current index is moved into a column
# before each split is written (same files, same order)
for _path, _split in (
    ('X_train.f', X_train),
    ('X_val.f', X_val),
    ('y_train.f', y_train),
    ('y_val.f', y_val),
    ('X_test.f', X_test),
):
    _split.reset_index().to_feather(_path)


------ w = 19 ------
------ n_windows = 7 ------
30 11 sold_quantity
30 11 minutes_active
30 11 current_price
30 11 has_zero_sold
30 11 listing_type
30 11 shipping_logistic_type
30 11 shipping_payment
29 10 sold_quantity
29 10 minutes_active
29 10 current_price
29 10 has_zero_sold
29 10 listing_type
29 10 shipping_logistic_type
29 10 shipping_payment
28 9 sold_quantity
28 9 minutes_active
28 9 current_price
28 9 has_zero_sold
28 9 listing_type
28 9 shipping_logistic_type
28 9 shipping_payment
27 8 sold_quantity
27 8 minutes_active
27 8 current_price
27 8 has_zero_sold
27 8 listing_type
27 8 shipping_logistic_type
27 8 shipping_payment
26 7 sold_quantity
26 7 minutes_active
26 7 current_price
26 7 has_zero_sold
26 7 listing_type
26 7 shipping_logistic_type
26 7 shipping_payment
25 6 sold_quantity
25 6 minutes_active
25 6 current_price
25 6 has_zero_sold
25 6 listing_type
25 6 shipping_logistic_type
25 6 shipping_payment
24 5 sold_quantity
24 5 minutes_active
24 5 current_price
24 5 has

In [27]:
%%time

# window configuration used for the saved datasets
w = 19
n_windows = 7

# standardized splits; the two target splits are discarded (bound to `_`)
X_train_standard, X_val_standard, _, _, X_test_standard = create_train_val_test(
    w=w, n_windows=n_windows,
    use_undersampling=False, use_standardize=True, use_minmax=False
)

# feather needs a default index, so the current index is moved into a column
# before each split is written (same files, same order)
for _path, _split in (
    ('X_train_standard.f', X_train_standard),
    ('X_val_standard.f', X_val_standard),
    ('X_test_standard.f', X_test_standard),
):
    _split.reset_index().to_feather(_path)


------ w = 19 ------
------ n_windows = 7 ------
30 11 sold_quantity
30 11 minutes_active
30 11 current_price
30 11 has_zero_sold
30 11 listing_type
30 11 shipping_logistic_type
30 11 shipping_payment
29 10 sold_quantity
29 10 minutes_active
29 10 current_price
29 10 has_zero_sold
29 10 listing_type
29 10 shipping_logistic_type
29 10 shipping_payment
28 9 sold_quantity
28 9 minutes_active
28 9 current_price
28 9 has_zero_sold
28 9 listing_type
28 9 shipping_logistic_type
28 9 shipping_payment
27 8 sold_quantity
27 8 minutes_active
27 8 current_price
27 8 has_zero_sold
27 8 listing_type
27 8 shipping_logistic_type
27 8 shipping_payment
26 7 sold_quantity
26 7 minutes_active
26 7 current_price
26 7 has_zero_sold
26 7 listing_type
26 7 shipping_logistic_type
26 7 shipping_payment
25 6 sold_quantity
25 6 minutes_active
25 6 current_price
25 6 has_zero_sold
25 6 listing_type
25 6 shipping_logistic_type
25 6 shipping_payment
24 5 sold_quantity
24 5 minutes_active
24 5 current_price
24 5 has