## Install Packages as Needed

In [1]:
%%bash
# Upgrade pip itself first, then install each dependency one at a time.
# No versions are pinned, so every command pulls the latest release.
pip install --upgrade pip
pip install pandas
pip install pyarrow
pip install matplotlib
pip install statsmodels
pip install pycausalimpact



# !!! None of the following cells are required to execute the example Jupyter notebooks !!!

### The following cells cite original data sets and modify them for the examples in the workshop

In [2]:
import pandas as pd
import random
from datetime import datetime, timedelta
import numpy as np

# Seed every random number generator the data-prep cells draw from, so the
# generated files are reproducible from a fresh kernel:
#   - the stdlib `random` module (random dates, random sales floor)
#   - NumPy's global generator (np.random.normal in the sales-effect cell)
random.seed(8675309)
np.random.seed(8675309)

In [3]:
def random_date(start, end):
    """Pick a random point between ``start`` and ``end`` in whole-day steps.

    The result is ``start`` shifted forward by a number of days drawn with
    ``random.randint`` from 0 up to the whole-day difference ``(end - start).days``,
    both bounds included.
    """
    span_days = int((end - start).days)
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)

# test the function
# random_date(datetime(2024,1,1), datetime(2024,2,1))

## Simple Experiment with Regression Data Prep

The original data set is from a [Criteo Uplift Modeling project on Kaggle](https://www.kaggle.com/datasets/arashnic/uplift-modeling), which uses a data set from the paper ["A Large Scale Benchmark for Uplift Modeling"](https://s3.us-east-2.amazonaws.com/criteo-uplift-dataset/large-scale-benchmark.pdf).

In [4]:
# Load the Criteo uplift data, keep only rows where `visit` is 1, and drop the
# feature columns together with `visit` and `exposure`.
xpdf = (
    pd.read_csv('data/criteo-uplift-v2.1.csv')
    .loc[lambda frame: frame['visit'] == 1]
    .drop(columns=[
        'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
        'visit', 'exposure',
    ])
)

# Assign each remaining row a random date between 2024-07-01 and 2024-10-01
# (one random_date call per row), then order the frame chronologically.
xpdf['date'] = [
    random_date(datetime(2024, 7, 1), datetime(2024, 10, 1))
    for _ in range(len(xpdf))
]
xpdf = xpdf.sort_values(by=['date'])
# xpdf.head()

In [5]:
# xpdf.groupby('treatment').mean()

In [6]:
def _cohort_label(treatment_flag):
    """Return 'new_website' when the flag equals 1 as an int, else 'control_old_website'."""
    if int(treatment_flag) == 1:
        return 'new_website'
    return 'control_old_website'


# Translate the numeric treatment flag into a readable cohort name.
xpdf['experiment_cohort'] = xpdf['treatment'].apply(_cohort_label)
xpdf.head()

Unnamed: 0,treatment,conversion,date,experiment_cohort
3943771,1,0,2024-07-01,new_website
9204439,1,0,2024-07-01,new_website
11998400,1,0,2024-07-01,new_website
10096813,1,0,2024-07-01,new_website
9696364,1,0,2024-07-01,new_website


In [7]:
# Write the prepared frame (treatment, conversion, date, experiment_cohort)
# to a Parquet file under data/.
xpdf.to_parquet('data/simple_experiment_criteo_as_timeseries.parquet')

In [8]:
# Release the frame: rebinding to None drops this reference to the data, and
# `del` then removes the name itself. Assigning first means this cell does not
# raise NameError if `xpdf` was never defined or was already deleted.
xpdf = None
del(xpdf)

## Diff in Diff & CausalImpact Examples Data Prep

Data sourced from the following Kaggle competition:
> Florian Knauer, Will Cukierski. (2015). Rossmann Store Sales. Kaggle. https://kaggle.com/competitions/rossmann-store-sales

In [9]:
# Load the Rossmann training data. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk; chunked
# inference is what produces a mixed-type DtypeWarning on this read
# (NOTE(review): the stray output under this cell looks like that warning).
ross = pd.read_csv('data/train.csv', low_memory=False)

# Normalise column names to lower case so later cells can use attribute access.
ross.columns = [c.lower() for c in ross.columns]

# Keep 2015-02-01 through 2015-07-31. `date` is still a string column here, so
# this is a lexicographic comparison -- assumes ISO 'YYYY-MM-DD' values.
ross = ross[(ross.date >= '2015-02-01') & (ross.date < '2015-08-01')]

  ross = pd.read_csv('data/train.csv')


In [10]:
# ross.store.nunique()

In [11]:
def _store_geo_group(x):
    if x > 500:
        return 'all_others'
    elif x % 5 < 2:
        return 'test_holdout'
    else:
        return 'control_baseline'

In [12]:
# Label every row with the geo group derived from its store id.
ross['geo_group'] = ross['store'].map(_store_geo_group)
# ross.head()

In [13]:
# Sum sales per (date, dayofweek, geo_group), pivot the geo groups into
# columns, and return an independent frame ordered by date.
grdf = (
    ross
    .groupby(['date', 'dayofweek', 'geo_group'])['sales']
    .sum()
    .unstack()
    .reset_index()
    .sort_values(by='date')
    .copy()
)
# grdf.head()

In [14]:
# Size the synthetic effect from the last 28 rows of test_holdout sales:
# the effect mean is 30% of their rounded average, and the effect standard
# deviation is 10% of that mean.
avg_sales = round(grdf['test_holdout'].tail(28).mean())
effect_mean = 0.3 * avg_sales
effect_sd = 0.1 * effect_mean

In [15]:
def _holdout_with_effect(row):
    """Return the row's test_holdout sales, reduced from 2015-05-01 onward.

    For affected rows a normal draw (effect_mean, effect_sd) is subtracted and
    the result is floored at a random value in [0, 1000); earlier rows are
    returned unchanged. `date` is compared as a string, so this must run
    before the column is converted to datetime.
    """
    if row['date'] >= '2015-05-01':
        reduced = row['test_holdout'] - np.random.normal(effect_mean, effect_sd)
        floor = 1000 * random.random()
        return max(reduced, floor)
    return row['test_holdout']


grdf['test_holdout'] = grdf.apply(_holdout_with_effect, axis=1)
grdf['date'] = pd.to_datetime(grdf['date'])
grdf['total_sales'] = (
    grdf['all_others'] + grdf['control_baseline'] + grdf['test_holdout']
)

In [16]:
# Write the aggregated frame (per-group sales columns plus total_sales) to a
# Parquet file under data/.
grdf.to_parquet('data/rossmann_sales.parquet')

In [17]:
# Release both frames: rebinding to None drops this reference to the data, and
# `del` then removes the name. Assigning first keeps the cell from raising
# NameError if a name is already gone.
ross = None
del(ross)
grdf = None
del(grdf)