# Experiments

This notebook shows sample experiments we ran to estimate/re-estimate the standard error of e-commerce metrics, e.g., ABV (average basket value), ABS (average basket size), and ASP (average selling price).

Experiment tracking is done via simple data files.

In [None]:
import os, sys
# Append "<current working directory>/src/" to the module search path.
# This depends on the kernel's working directory: the notebook must be run
# from the directory that contains src/ (presumably the repository root, so
# that the project package imports in this cell resolve -- confirm).
sys.path.append(os.path.join(os.getcwd(), 'src/'))

from datetime import datetime
import glob

from oce_ecomm_abv_calculation.sample_statistics.vanilla import VanillaSampleStatistics
from oce_ecomm_abv_calculation.sample_statistics.oneway_bootstrap import OnewayBootstrapStatistics
from oce_ecomm_abv_calculation.utils.experiment_data import consolidate_experiment_data_files

# UCI Online Retail II Dataset

Some end dates used in our experiments:

* `datetime(2009, 12, 8)` - 7 days
* `datetime(2009, 12, 15)` - 14 days
* `datetime(2009, 12, 22)` - 21 days / 3 weeks
* `datetime(2009, 12, 29)` - 28 days / 4 weeks
* `datetime(2010, 1, 5)` - 35 days / 5 weeks
* `datetime(2010, 1, 12)` - 42 days / 6 weeks
* `datetime(2010, 1, 19)` - 49 days / 7 weeks
* `datetime(2010, 1, 26)` - 56 days / 8 weeks
* `datetime(2010, 2, 2)` - 63 days / 9 weeks
* `datetime(2010, 2, 13)` - 74 days (10% of dataset duration)
* `datetime(2010, 3, 1)` - 3 months
* `datetime(2010, 4, 1)` - 4 months
* `datetime(2010, 4, 28)` - 148 days (20% of dataset duration)
* `datetime(2010, 6, 1)` - 6 months
* `datetime(2010, 7, 11)` - 222 days (30% of dataset duration)
* `datetime(2010, 9, 23)` - 296 days (40% of dataset duration)
* `datetime(2010, 12, 6)` - 370 days (50% of dataset duration)
* `datetime(2011, 2, 18)` - 444 days (60% of dataset duration)
* `datetime(2011, 5, 3)` - 518 days (70% of dataset duration)
* `datetime(2011, 7, 16)` - 592 days (80% of dataset duration)
* `datetime(2011, 9, 28)` - 666 days (90% of dataset duration)
* `datetime(2012, 1, 1)` - Full dataset duration

These end dates define the expanding windows, which simulate different experiment durations.

## UCI Online Retail II - ABV

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2009, 12, 1), datetime(2009, 12, 29)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
num_std_error_samples, num_bootstrap_means = 5, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="uci_online_retail_ii_customer_order_view",
    response_col='r_BasketValue',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.             ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
uci_retail_co_abv_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    uci_retail_co_abv_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    uci_retail_co_abv_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.             ")

## UCI Online Retail II - ABS

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2009, 12, 1), datetime(2009, 12, 29)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
num_std_error_samples, num_bootstrap_means = 5, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="uci_online_retail_ii_customer_order_view",
    response_col='r_BasketSize',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.            ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
uci_retail_co_abs_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    uci_retail_co_abs_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    uci_retail_co_abs_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.             ")

## UCI Online Retail II - ASP

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2009, 12, 1), datetime(2009, 12, 29)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
# NOTE(review): only 1 repeat here versus 5 in the ABV/ABS cells -- presumably
# intentional for the item-level view; confirm.
num_std_error_samples, num_bootstrap_means = 1, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="uci_online_retail_ii_customer_order_item_view",
    response_col='r_SellingPrice',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.             ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
uci_retail_coi_asp_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    uci_retail_coi_asp_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    uci_retail_coi_asp_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.              ")

# Olist Brazilian e-Commerce Dataset

Some end dates used in our experiments:

* `datetime(2016, 10, 18)` - first month (with only 2 orders) + 14 days
* `datetime(2016, 11, 1)` - first month (with only 2 orders) + 28 days
* `datetime(2016, 11, 15)` - first month (with only 2 orders) + 42 days / 6 weeks
* `datetime(2016, 11, 29)` - first month (with only 2 orders) + 56 days / 8 weeks
* `datetime(2017, 1, 28)` - 146 days (20% of dataset duration)
* `datetime(2017, 4, 11)` - 219 days (30% of dataset duration)
* `datetime(2017, 6, 23)` - 292 days (40% of dataset duration)
* `datetime(2017, 9, 4)` - 365 days (50% of dataset duration / 1 year)
* `datetime(2017, 11, 16)` - 438 days (60% of dataset duration)
* `datetime(2018, 1, 28)` - 511 days (70% of dataset duration)
* `datetime(2018, 4, 11)` - 584 days (80% of dataset duration)
* `datetime(2018, 6, 23)` - 657 days (90% of dataset duration)
* `datetime(2018, 9, 4)` - Full dataset duration

These end dates define the expanding windows, which simulate different experiment durations.

## Olist Brazilian e-Commerce - ABV

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2016, 9, 4), datetime(2016, 11, 1)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
num_std_error_samples, num_bootstrap_means = 5, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="olist_brazilian_ecommerce_customer_order_view",
    response_col='r_BasketValue',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.                 ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
olist_ecommerce_co_abv_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    olist_ecommerce_co_abv_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    olist_ecommerce_co_abv_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.                  ")

## Olist Brazilian e-Commerce - ABS

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2016, 9, 4), datetime(2016, 11, 1)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
num_std_error_samples, num_bootstrap_means = 5, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="olist_brazilian_ecommerce_customer_order_view",
    response_col='r_BasketSize',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.                 ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
olist_ecommerce_co_abs_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    olist_ecommerce_co_abs_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    olist_ecommerce_co_abs_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.                  ")

## Olist Brazilian e-Commerce - ASP

In [None]:
# Experiment window: pick `end_date` from the list of end dates above.
start_date, end_date = datetime(2016, 9, 4), datetime(2016, 11, 1)

# Number of repeated standard-error estimates, and the number of bootstrap
# means requested for each estimate.
num_std_error_samples, num_bootstrap_means = 5, 500

# Both estimators are configured with the same view, response column and window.
experiment_kwargs = dict(
    dataset="olist_brazilian_ecommerce_customer_order_item_view",
    response_col='r_SellingPrice',
    start_time=start_date,
    end_time=end_date,
)

# Vanilla sample statistics: computed and saved once.
VanillaSampleStatistics(**experiment_kwargs).save_statistics_as_pd_df()
print(f"Vanilla sample statistics saved.                  ")

# Oneway bootstrap statistics: one estimator object, reused for every run.
olist_ecommerce_coi_asp_oneway = OnewayBootstrapStatistics(**experiment_kwargs)

for i in range(1, num_std_error_samples + 1):
    # Each iteration produces one standard-error estimate and saves it
    # before the next run starts.
    olist_ecommerce_coi_asp_oneway.standard_error(
        num_bootstrap_means=num_bootstrap_means,
        verbose=True,
    )
    olist_ecommerce_coi_asp_oneway.save_latest_result_as_pd_df()
    print(f"Oneway bootstrap run {i} saved.                   ")

# Utilities


The following cell consolidates the many parquet files generated by each `save_statistics_as_pd_df()` / `save_latest_result_as_pd_df()` call into a dataframe stored in a single parquet file.

In [None]:
# Selection of the experiment whose result files should be consolidated.
# Method that produced the files, e.g. "vanilla", "oneway"
expt_method = "vanilla"
# Dataset view, e.g. "olist_brazilian_ecommerce_customer_order_view"
dataset = "uci_online_retail_ii_customer_order_view"
# Response column, e.g. "r_BasketValue", "r_BasketSize", "r_SellingPrice"
response_col = "r_BasketValue"
# Directory passed to the consolidation helper as the location of the data files
expt_data_dir_path = "./data/"

# NOTE(review): cleanup_files=True presumably removes the individual files
# once they have been consolidated -- confirm against the helper before
# running this on data that is not backed up.
consolidation_kwargs = dict(
    dir_path=expt_data_dir_path,
    expt_method=expt_method,
    dataset=dataset,
    response_col=response_col,
    cleanup_files=True,
)
consolidate_experiment_data_files(**consolidation_kwargs)