## A/B test online simulation

1. Split test_set customers into two subgroups (A and B).
2. For Group A, generate baseline recommendations.
3. For Group B, generate recommendations based on the new model.
4. Calculate the hit rate for each group (used here as an offline proxy for CTR): a customer counts as a hit if at least one purchased item appears in their top-K recommendations.
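5. Run a one-sided two-proportion z-test to check whether the difference in hit rate between the groups is statistically significant.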

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the test-set sample from disk; the trailing expression shows its
# (rows, columns) shape as the cell output.
test_chunk_path = 'chunk_of_test_set.csv'
chunk_of_test_set = pd.read_csv(test_chunk_path)
chunk_of_test_set.shape

(10000, 5)

In [6]:
# Columns removed before scoring.
# NOTE(review): presumably these were not used as model features - confirm against training code.
drop_cols = ["club_member_status", "fashion_news_frequency"]

# Candidate rows as stored on disk, and the trimmed frame used downstream.
candidates_features = pd.read_csv('candidates_features_chunk.csv')
data = candidates_features.drop(drop_cols, axis="columns")


In [7]:
from catboost import CatBoostClassifier

# Create an empty classifier, then populate it from the saved model file.
model = CatBoostClassifier()
# Last expression of the cell: its return value (the model object) is what the cell displays.
model.load_model("catboost_recommender.cbm")

<catboost.core.CatBoostClassifier at 0x1084178b0>

In [8]:
# Score every candidate row and keep the second probability column
# (index 1 of predict_proba's output).
# "pred_prob" is excluded from the model inputs together with the id/label
# columns, so re-running this cell does not feed the previous run's
# predictions back into the model as an extra column.
non_feature_cols = ["customer_id", "article_id", "label", "pred_prob"]
data["pred_prob"] = model.predict_proba(data.drop(columns=non_feature_cols, errors='ignore'))[:, 1]


In [9]:
# chunk_of_test_set has 10000 rows (see its shape output) and 8937 unique customers.
# Two disjoint groups of n customers each are drawn from the shuffled customer
# list: A (baseline) and B (new model). Customers beyond the first 2 * n of the
# shuffled order are not used.

n = 4000  # customers per group

test_users = chunk_of_test_set['customer_id'].unique()
# Fail loudly instead of silently producing a smaller Group B.
assert len(test_users) >= 2 * n, "not enough unique customers for two groups of size n"

np.random.seed(42)  # fixed seed keeps the A/B assignment reproducible
shuffled = np.random.permutation(test_users)
group_A = shuffled[:n]
group_B = shuffled[n:2 * n]

# Group B: test-set rows and, per customer, the set of article_ids in those rows.
test_B = chunk_of_test_set[chunk_of_test_set['customer_id'].isin(group_B)]
ground_truth_test_B = test_B.groupby('customer_id')['article_id'].apply(set).to_dict()

# Candidate rows (features + pred_prob) for Group B customers.
# .copy() makes this an independent frame, so adding or overwriting a column on
# it later does not trigger pandas' SettingWithCopyWarning.
candidates_B = data[data['customer_id'].isin(group_B)].copy()


In [10]:
# Restrict Group B to customers that actually have candidate rows, then
# rebuild the per-customer sets of test-set article_ids for just those customers.
b_users = candidates_B['customer_id'].unique()
has_candidates = chunk_of_test_set['customer_id'].isin(b_users)
test_B = chunk_of_test_set.loc[has_candidates]

articles_per_customer_B = test_B.groupby('customer_id')['article_id']
ground_truth_test_B = articles_per_customer_B.apply(set).to_dict()

In [16]:
# Number of Group B customers that have a ground-truth article set (displayed as the cell output).
len(ground_truth_test_B)

3675

In [11]:
# New-model recommendations for Group B.

# Score each pre-selected candidate, then rank candidates within each customer;
# the order matters because MAP@k takes ranking into account.
# Id, label and any previously computed "pred_prob" columns are not passed to the model.
features_B = candidates_B.drop(columns=["customer_id", "article_id", "label", "pred_prob"], errors='ignore')

# .assign returns a new frame instead of writing into what may be a slice of
# `data`, which avoids pandas' SettingWithCopyWarning.
candidates_B = candidates_B.assign(pred_prob=model.predict_proba(features_B)[:, 1])

predicted_candidates_B = (
    candidates_B.sort_values(["customer_id", "pred_prob"], ascending=[True, False])
    .groupby("customer_id")["article_id"]
    .apply(lambda x: list(x.head(12)))  # keep the 12 highest-scored articles per customer
    .reset_index(name="predictions")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  candidates_B["pred_prob"] = model.predict_proba(candidates_B.drop(columns=["customer_id", "article_id", "label"], errors='ignore'))[:, 1]


In [15]:
from src.map_at_k1 import map_at_k

# Evaluate the Group B ranking with the project helper map_at_k at k=12.
# NOTE(review): presumably mean average precision at k - confirm in src/map_at_k1.
map12 = map_at_k(ground_truth_test_B, predicted_candidates_B, k=12)
print(f"MAP@12: {map12:.4f}")

MAP@12: 0.0195


In [19]:
def calc_ctr(gr_tr, user_candidates, k=12):
    """Compute a per-user hit indicator (hit rate, used here as a CTR proxy).

    A user is a hit when at least one of their target items appears among the
    first ``k`` recommended items.

    Parameters
    ----------
    gr_tr : dict
        Maps a customer id to a collection of target (purchased) items.
    user_candidates : pandas.DataFrame
        Must have a ``customer_id`` column and a ``predictions`` column holding
        an ordered sequence of recommended items per row.
    k : int, default 12
        Number of leading recommendations taken into account.

    Returns
    -------
    numpy.ndarray
        Integer array with one 0/1 entry per user, in the iteration order of
        ``gr_tr``. Users without a row in ``user_candidates`` count as 0.
    """
    if not gr_tr:
        return np.array([], dtype=int)

    # Build the customer -> recommendations lookup once rather than filtering
    # the whole frame for every user. setdefault keeps the first row when a
    # customer_id occurs more than once.
    recs_by_user = {}
    for customer_id, recs in zip(user_candidates['customer_id'], user_candidates['predictions']):
        recs_by_user.setdefault(customer_id, recs)

    clicks = []
    for user, target_items in gr_tr.items():
        # No recommendations for this user means nothing could have been hit.
        top_k = list(recs_by_user.get(user, ()))[:k]
        hit = int(any(item in top_k for item in target_items))  # 1 if any target is in the top k, else 0
        clicks.append(hit)
    return np.array(clicks, dtype=int)

In [20]:
# Per-user hit indicators for Group B; their mean is the group's hit rate.
ctr_B = calc_ctr(
    gr_tr=ground_truth_test_B,
    user_candidates=predicted_candidates_B,
)
print(f"CTR Group B (new model): {ctr_B.mean():.3f}")

CTR Group B (new model): 0.054


In [21]:
# Calculation for Group A (baseline recommendations)

In [22]:
# Group A: the first 4000 customers of the shuffled order.
group_A = shuffled[:4000]
in_group_A = chunk_of_test_set['customer_id'].isin(group_A)
test_A = chunk_of_test_set.loc[in_group_A]

# Per customer, the set of article_ids found in their test-set rows.
articles_per_customer_A = test_A.groupby('customer_id')['article_id']
ground_truth_test_A = articles_per_customer_A.apply(set).to_dict()

# Baseline list: the 12 articles with the highest mean pred_prob over all candidate rows.
# NOTE(review): this ranks by model scores rather than by sales counts - confirm
# that this is the intended "popular products" baseline.
mean_score_per_article = data.groupby("article_id")["pred_prob"].mean()
top12_global = (
    mean_score_per_article
    .sort_values(ascending=False)
    .head(12)
    .index
    .tolist()
)

# Every Group A customer is given the same recommendation list.
candidates_A = pd.DataFrame({
    "customer_id": list(group_A),
    "predictions": [top12_global] * len(group_A),
})

In [23]:
# Same two metrics as for Group B, now on the baseline recommendations.
# Note that map12 is reassigned here and no longer holds the Group B value.
map12 = map_at_k(ground_truth_test_A, candidates_A, k=12)
print(f"MAP@12: {map12:.4f}")

# Per-user hit indicators for Group A; their mean is the group's hit rate.
ctr_A = calc_ctr(
    gr_tr=ground_truth_test_A,
    user_candidates=candidates_A,
)
print(f"CTR Group A (old model): {ctr_A.mean():.3f}")

MAP@12: 0.0005
CTR Group A (old model): 0.002


In [24]:
#!pip install statsmodels

In [25]:
from statsmodels.stats.proportion import proportions_ztest

# Two-sample z-test for proportions on the per-user hit indicators.
# With alternative='smaller' the alternative hypothesis is that the first
# proportion (Group A) is smaller than the second (Group B).
ab_hit_arrays = (ctr_A, ctr_B)
successes = [hits.sum() for hits in ab_hit_arrays]   # number of users with a hit, per group
nobs = [len(hits) for hits in ab_hit_arrays]         # number of users, per group
print(nobs)

z_stat, p_val = proportions_ztest(count=successes, nobs=nobs, alternative='smaller')
print(f"Z-statistic: {z_stat:.3f}, p-value: {p_val:.4f}")


[4000, 3675]
Z-statistic: -14.048, p-value: 0.0000
