# Variance Reduction

This example notebook shows you one of the most popular approaches to variance reduction in A/B testing — CUPED (Controlled-experiment Using Pre-Experiment Data).

In [None]:
import warnings

# Silence all warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# `category` must be a single Warning subclass. A list is not a valid
# category: the filter is stored as-is and the subclass check performed when
# a warning is emitted fails with TypeError. Register one filter per class.
for warning_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=warning_category)

import sys
import logging
import numpy as np
import pandas as pd

from abacus.auto_ab.abtest import ABTest
from abacus.auto_ab.params import ABTestParams, DataParams, HypothesisParams

logging.basicConfig(level = logging.INFO)

%load_ext autoreload
%autoreload 2

## Variance reduction with no difference between groups

In [None]:
# Load the example dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/ab_data_height.csv')
# Preview the first rows to check the columns referenced by the parameters below.
df.head()

In [None]:
# Column mapping for the dataset: which columns hold ids, group labels,
# the metric under test and the covariate.
data_params = DataParams(
    id_col='id',
    group_col='groups',
    control_name='A',
    treatment_name='B',
    target='height_now',
    covariate='height_prev',  # covariate consumed by CUPED
    is_grouped=True,
)

# Statistical setup of the test.
# NOTE(review): alpha/beta presumably are the type I / type II error rates
# and 'greater' a one-sided alternative — confirm against HypothesisParams.
hypothesis_params = HypothesisParams(
    alpha=0.05,
    beta=0.2,
    alternative='greater',
    metric_type='continuous',
    metric_name='mean',
)

# Bundle both parameter sets into the object the test is built from.
ab_params = ABTestParams(data_params, hypothesis_params)

In [None]:
# Build the test object from the loaded dataframe and the parameter bundle.
ab_test = ABTest(df, ab_params)

Mean and variance before variance reduction:

In [None]:
# Look up the parameter object once, then report mean and variance per group.
# np.var is called with its default ddof=0, i.e. the population variance.
groups_before = ab_test.params.data_params
print('Control mean = {:.3f}'.format(np.mean(groups_before.control)))
print('Treatment mean = {:.3f}'.format(np.mean(groups_before.treatment)))
print('Control var = {:.3f}'.format(np.var(groups_before.control)))
print('Treatment var = {:.3f}'.format(np.var(groups_before.treatment)))

Apply CUPED:

In [None]:
# Keep the cuped() result under its own name so it can be compared with `ab_test` in the cells below.
ab_test1 = ab_test.cuped()

Mean and variance after variance reduction:

In [None]:
# Same summary as before, now for the object returned by cuped().
# np.var is called with its default ddof=0, i.e. the population variance.
groups_after = ab_test1.params.data_params
print('Control mean = {:.3f}'.format(np.mean(groups_after.control)))
print('Treatment mean = {:.3f}'.format(np.mean(groups_after.treatment)))
print('Control var = {:.3f}'.format(np.var(groups_after.control)))
print('Treatment var = {:.3f}'.format(np.var(groups_after.treatment)))

As can be seen, the variance of the experiment data decreased. However, a lower variance alone does not mean that a difference will be detected:

In [None]:
# This cell illustrates the outcome after variance reduction, so the tests
# are run on the CUPED-adjusted object (ab_test1), not on the original one.
print(f"bootstrap_test: {ab_test1.test_boot_confint()}")
print(f"mannwhitney_test: {ab_test1.test_mannwhitney()}")
print(f"welch_test: {ab_test1.test_welch()}")

In [None]:
# Plot the test object that was created before cuped() was called.
ab_test.plot()

In [None]:
# Plot the object returned by cuped() for comparison.
ab_test1.plot()

## Variance reduction with differences between groups

In [None]:
df = pd.read_csv('./data/ab_data_height.csv')

# Increase values of one group. `where` keeps the original value on rows
# where the condition holds (group 'A') and takes the incremented value on
# every other row, so only the non-'A' rows are shifted.
# A locally seeded generator makes the simulated uplift reproducible without
# touching NumPy's global random state.
rng = np.random.default_rng(42)
incs = np.sort(rng.uniform(0.0, 2, df.shape[0]))
# Assign the result back instead of calling `where(..., inplace=True)` on the
# selected column: under pandas Copy-on-Write the chained in-place call works
# on a temporary object and never updates `df`.
df['height_now'] = df['height_now'].where(df['groups'] == 'A',
                                          df['height_now'] + incs)

In [None]:
# Preview the first rows of the dataframe after the increment step.
df.head()

In [None]:
# Column mapping for the dataset; `covariate` names the column used by CUPED.
data_params = DataParams(
    id_col='id',
    group_col='groups',
    control_name='A',
    treatment_name='B',
    target='height_now',
    covariate='height_prev',
    is_grouped=True,
)

# Statistical setup of the test.
# NOTE(review): alpha/beta presumably are the type I / type II error rates
# and 'greater' a one-sided alternative — confirm against HypothesisParams.
hypothesis_params = HypothesisParams(
    alpha=0.05,
    beta=0.2,
    alternative='greater',
    metric_type='continuous',
    metric_name='mean',
)

# Bundle both parameter sets into the object the test is built from.
ab_params = ABTestParams(data_params, hypothesis_params)

In [None]:
# Build the test object from the modified dataframe and the parameter bundle.
ab_test = ABTest(df, ab_params)

Mean and variance before variance reduction:

In [None]:
# Unpack both samples once, then report their mean and variance.
# np.var is called with its default ddof=0, i.e. the population variance.
params_before = ab_test.params.data_params
control_before, treatment_before = params_before.control, params_before.treatment
print('Control mean = {:.3f}'.format(np.mean(control_before)))
print('Treatment mean = {:.3f}'.format(np.mean(treatment_before)))
print('Control var = {:.3f}'.format(np.var(control_before)))
print('Treatment var = {:.3f}'.format(np.var(treatment_before)))

In [None]:
# Baseline results on the unadjusted data, before CUPED is applied.
# Each test is run and reported in turn so output order matches execution order.
boot_result = ab_test.test_boot_confint()
print(f"bootstrap_test: {boot_result}")
mannwhitney_result = ab_test.test_mannwhitney()
print(f"mannwhitney_test: {mannwhitney_result}")
welch_result = ab_test.test_welch()
print(f"welch_test: {welch_result}")

Apply CUPED:

In [None]:
# Keep the cuped() result under its own name so it can be compared with `ab_test` in the cells below.
ab_test1 = ab_test.cuped()

Mean and variance after variance reduction:

In [None]:
# Same summary as before, now for the object returned by cuped().
# np.var is called with its default ddof=0, i.e. the population variance.
params_after = ab_test1.params.data_params
control_after, treatment_after = params_after.control, params_after.treatment
print('Control mean = {:.3f}'.format(np.mean(control_after)))
print('Treatment mean = {:.3f}'.format(np.mean(treatment_after)))
print('Control var = {:.3f}'.format(np.var(control_after)))
print('Treatment var = {:.3f}'.format(np.var(treatment_after)))

As can be seen, the variance of the experiment data decreased, and since there is a real difference between the groups, we are able to detect it:

In [None]:
# Run the tests on the CUPED-adjusted object (ab_test1). Calling them on
# `ab_test` again would only repeat the pre-CUPED results printed earlier.
print(f"bootstrap_test: {ab_test1.test_boot_confint()}")
print(f"mannwhitney_test: {ab_test1.test_mannwhitney()}")
print(f"welch_test: {ab_test1.test_welch()}")

In [None]:
# Plot the test object that was created before cuped() was called.
ab_test.plot()

In [None]:
# Plot the object returned by cuped() for comparison.
ab_test1.plot()