In [1]:
import numpy as np
import pandas as pd

In [2]:
# Synthetic impression/click log: one row per record, many rows per user.
# A seeded Generator makes the frame (and every downstream A/A and A/B result)
# reproducible under Restart Kernel -> Run All.
N_ROWS = 100_000
SEED = 42
rng = np.random.default_rng(SEED)

df = pd.DataFrame(
    {
        "user_id": rng.integers(0, 1000, size=N_ROWS),  # ids in [0, 999]
        "n_imp": rng.integers(1, 101, size=N_ROWS),     # impressions in [1, 100]
    }
)

# Clicks are uniform on [0, ceil(0.3 * (n_imp + 1))), drawn for all rows in one
# vectorized call instead of a Python-level .apply over every row.
click_upper_bound = np.ceil((df["n_imp"].to_numpy() + 1) * 0.3).astype(np.int64)
df["n_click"] = rng.integers(0, click_upper_bound)
df

Unnamed: 0,user_id,n_imp,n_click
0,680,75,12
1,88,94,17
2,601,93,5
3,527,24,7
4,759,58,6
...,...,...,...
99995,810,84,11
99996,670,68,0
99997,23,30,7
99998,961,64,5


In [3]:
from t_ab.aa import AATest
from t_ab.ctr import ImpressionBasedCTRTtest, UserBasedCTRTtest

# Both CTR tests receive the same three column names, in the same order.
# Presumably (user column, impression column, click column) -- confirm against
# the t_ab.ctr constructor signatures.
ctr_columns = ("user_id", "n_imp", "n_click")

ibtest = ImpressionBasedCTRTtest(*ctr_columns)
ubtest = UserBasedCTRTtest(*ctr_columns)

# The A/A runner is built from both tests.
aa_test = AATest(ibtest, ubtest)

In [4]:
from typing import Generator
from sklearn.model_selection import train_test_split


def split_data(values: np.ndarray, n: int, random_state: int = 42) -> Generator[np.ndarray, None, None]:
    """Lazily partition ``values`` into ``n`` disjoint groups of roughly equal size.

    The groups are carved off one at a time with ``train_test_split``: the
    first group takes 1/n of the values, the second takes 1/(n-1) of what is
    left, and so on; whatever remains after ``n - 1`` splits is the last group.
    The same ``random_state`` is passed to every split.

    Args:
        values: Items to partition (e.g. unique ids).
        n: Number of groups to produce; must be at least 1.
        random_state: Seed forwarded to each ``train_test_split`` call.

    Yields:
        One group per iteration, ``n`` groups in total.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not when the function is
            called. ``train_test_split`` may raise its own errors, e.g. when
            there are too few values to fill every group.
    """
    # Without this guard a non-positive n would skip the loop and silently
    # hand back the whole input as a single "group".
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    remaining = values
    for i in range(n - 1):
        # n - i groups are still to be formed, so the next one gets 1/(n - i)
        # of the remaining values.
        group, remaining = train_test_split(
            remaining, train_size=1 / (n - i), random_state=random_state
        )
        yield group
    yield remaining


def dfs_loader(df: pd.DataFrame, col: str, n_groups: int, n_tests: int=1000) -> Generator[list[pd.DataFrame], None, None]:
    """Yield ``n_tests`` random partitions of ``df`` into ``n_groups`` sub-frames.

    The unique values of ``df[col]`` are divided by ``split_data``; each
    sub-frame holds every row whose ``col`` value landed in that group, so all
    rows sharing a ``col`` value stay together. The iteration index (0, 1, ...)
    is passed to ``split_data`` as the random state.

    Args:
        df: Frame to partition.
        col: Column whose unique values are distributed across groups.
        n_groups: Number of sub-frames in each yielded list.
        n_tests: Number of partitions to generate.

    Yields:
        A list of ``n_groups`` DataFrames, one list per partition.
    """
    distinct_keys = df[col].unique()
    for seed in range(n_tests):
        groups = []
        for keys in split_data(distinct_keys, n_groups, seed):
            membership = df[col].isin(keys)
            groups.append(df[membership])
        yield groups

In [5]:
# A/A check: feed 1000 seeded three-way splits of the users to the A/A runner.
aa_test_results = aa_test(
    dfs_loader(df, col="user_id", n_groups=3, n_tests=1000)
)

In [6]:
# Report the A/A outcome for each CTR test: its name, then the
# multiple-testing result and the overall rejection flag on one line.
# NOTE(review): the printed 4-tuple looks like the output of statsmodels'
# multipletests (reject flags, corrected p-values, two corrected alphas) --
# confirm against t_ab.aa.
for test_name, aa_test_result in aa_test_results.items():
    print(test_name)
    print(aa_test_result.multipletests_result, aa_test_result.is_rejected)

n_click / n_imp (Impression-Based)
(array([False, False, False]), array([0.98066104, 0.98066104, 0.90229819]), 0.016952427508441503, 0.016666666666666666) False
n_click / n_imp (User-Based)
(array([False, False, False]), array([0.91907445, 0.69803535, 0.49664311]), 0.016952427508441503, 0.016666666666666666) False


In [7]:
from t_ab.ab import ABTest

# Build the A/B runner from the same impression-based and user-based CTR tests
# (ibtest, ubtest) that were handed to AATest earlier in this notebook.
ab_test = ABTest(ibtest, ubtest)

In [8]:
# Only the first three-way split (the one generated with random state 0) is
# needed for this A/B demonstration, so pull a single item from the generator.
dfs = next(dfs_loader(df, "user_id", 3))
ab_test_results = ab_test(dfs)

In [9]:
from pprint import pprint

# Pretty-print the A/B outcome for each CTR test. Judging by the recorded
# output, each result is a dict keyed by group-index pairs such as '(0, 1)'
# holding per-group statistics and the t-test result -- confirm against t_ab.ab.
for test_name, ab_test_result in ab_test_results.items():
    print(test_name)
    pprint(ab_test_result)

n_click / n_imp (Impression-Based)
{'(0, 1)': CTRTestResult(statistics=(Statistics(mean=0.14748434553765621, std=0.011043581635527823, nobs=333), Statistics(mean=0.14719853699196322, std=0.010148118603448702, nobs=333)), ttest_result=TtestResult(statistic=0.3477436020625978, pvalue=0.7281429525900802, is_rejected=False)),
 '(0, 2)': CTRTestResult(statistics=(Statistics(mean=0.14748434553765621, std=0.011043581635527823, nobs=333), Statistics(mean=0.14796773785685513, std=0.01039992035177221, nobs=334)), ttest_result=TtestResult(statistic=-0.5819564517531648, pvalue=0.5607932439174669, is_rejected=False)),
 '(1, 2)': CTRTestResult(statistics=(Statistics(mean=0.14719853699196322, std=0.010148118603448702, nobs=333), Statistics(mean=0.14796773785685513, std=0.01039992035177221, nobs=334)), ttest_result=TtestResult(statistic=-0.966698856063126, pvalue=0.33404602585415644, is_rejected=False))}
n_click / n_imp (User-Based)
{'(0, 1)': CTRTestResult(statistics=(Statistics(mean=0.14736459018089