In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(
    dict(
        user_id=np.random.randint(0, 1000, size=100000),
        n_imp=np.random.randint(1, 101, size=100000),
    )
)
df["n_click"] = df.n_imp.apply(lambda x: np.random.randint(0, int(np.ceil((x+1)*.3))))
df

Unnamed: 0,user_id,n_imp,n_click
0,245,57,3
1,175,34,10
2,468,81,24
3,274,52,8
4,773,88,1
...,...,...,...
99995,724,26,0
99996,649,17,2
99997,601,17,1
99998,700,33,1


In [3]:
from t_ab.frequentist.aa import AATest
from t_ab.frequentist.ab import ABTest
from t_ab.frequentist.ctr import ImpressionBasedCTRTtest, UserBasedCTRTtest


ibtest = ImpressionBasedCTRTtest("user_id", "n_imp", "n_click")
ubtest = UserBasedCTRTtest("user_id", "n_imp", "n_click")
ab_test = ABTest(ibtest, ubtest)
aa_test = AATest(ab_test)

In [4]:
from typing import Generator
from sklearn.model_selection import train_test_split


def split_data(values: np.ndarray, n: int, random_state: int = 42) -> Generator[np.ndarray, None, None]:
    for i in range(n-1):
        vs, values = train_test_split(values, train_size=1/(n-i), random_state=random_state)
        yield vs
    yield values


def dfs_loader(df: pd.DataFrame, col: str, n_groups: int, n_tests: int=1000) -> Generator[list[pd.DataFrame], None, None]:
    values = df[col].unique()
    for i in range(n_tests):
        yield [df[df[col].isin(vs)] for vs in split_data(values, n_groups, i)]

In [5]:
aa_test_results = aa_test(dfs_loader(df, "user_id", 3, 1000))

In [6]:
aa_test_results

Unnamed: 0_level_0,Unnamed: 1_level_0,pvalues,uniform_pvalues,multipletests_result,is_rejected
metrics_base,metrics,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Impression-Based,n_click / n_imp,"[[0.3037505737563933, 0.3071507265999299, 0.59...","[0.5449291514771972, 0.19671091851551004, 0.22...","([False, False, False], [0.5449291514771972, 0...",False
User-Based,n_click / n_imp,"[[0.24539863401097697, 0.3778377285435336, 0.7...","[0.5645819501878142, 0.43740839069701853, 0.12...","([False, False, False], [0.6834906811418815, 0...",False


In [7]:
for dfs in dfs_loader(df, "user_id", 3):
    ab_test_results = ab_test(dfs)
    break

In [8]:
ab_test_results

Unnamed: 0,metrics_base,metrics,pair,mean_l,std_l,mean_r,std_r,pvalue,is_rejected
0,Impression-Based,n_click / n_imp,"(0, 1)",0.14781,0.010575,0.146977,0.01031,0.303751,False
1,Impression-Based,n_click / n_imp,"(0, 2)",0.14781,0.010575,0.146378,0.010281,0.076663,False
2,Impression-Based,n_click / n_imp,"(1, 2)",0.146977,0.01031,0.146378,0.010281,0.452729,False
3,User-Based,n_click / n_imp,"(0, 1)",0.147835,0.010723,0.146887,0.010325,0.245399,False
4,User-Based,n_click / n_imp,"(0, 2)",0.147835,0.010723,0.146468,0.010112,0.090701,False
5,User-Based,n_click / n_imp,"(1, 2)",0.146887,0.010325,0.146468,0.010112,0.59675,False
