In [1]:
import numpy as np
from read_db.CH import Getch
import pandas as pd
from scipy import stats

In [2]:
# Date window of the experiment; both strings are interpolated into the
# `between` clause of the query below.
begin_date_str, end_date_str = '2022-03-15', '2022-03-21'

# Experiment groups to select; the tuple's repr is interpolated into `in (...)`.
groups_tuple = (0, 3, 1, 2)

# One row per (exp_group, user_id) with like/view counts and the per-user CTR.
q = f"""SELECT  exp_group,
        user_id,
        sum(action = 'like') as likes,
        sum(action = 'view') as views,
        likes/views as ctr
FROM simulator_20220320.feed_actions 
WHERE toDate(time) between '{begin_date_str}' and '{end_date_str}'
    and exp_group in {groups_tuple}
GROUP BY exp_group, user_id
"""

In [3]:
# Run the query through Getch (imported from read_db.CH) and keep the object it
# exposes on `.df` — presumably a pandas DataFrame, judging by the rendered table.
df = Getch(q).df
# Bare last expression so the notebook renders the result.
df

Unnamed: 0,exp_group,user_id,likes,views,ctr
0,3,115383,9,30,0.300000
1,1,18392,7,32,0.218750
2,3,123580,13,48,0.270833
3,2,131473,14,134,0.104478
4,2,32420,26,128,0.203125
...,...,...,...,...,...
40074,0,27030,8,39,0.205128
40075,3,122758,30,198,0.151515
40076,1,132567,45,296,0.152027
40077,3,122190,19,65,0.292308


In [4]:
# Split the per-user frame into one frame per experiment group.
# - A scalar key (not a one-element list) keeps groupby iteration keys scalar
#   and avoids the length-1-list warning / tuple keys of newer pandas versions.
# - groupby sorts its keys by default, so the frames come out in the order
#   exp_group 0, 1, 2, 3, which is what the unpacking below relies on.
# - Each group is copied explicitly because later cells add columns to these
#   frames; without the copy that assignment targets data derived from `df`
#   and may raise SettingWithCopyWarning.
dfs = tuple(group.copy() for _, group in df.groupby('exp_group'))
df0, df1, df2, df3 = dfs

In [5]:
# Sanity check: per-group row counts, their total, and whether the total
# matches the number of rows in the unsplit frame.
a = list(map(len, dfs))
print(a, sum(a), sum(a) == len(df))

[9999, 10079, 9952, 10049] 40079 True


In [6]:
def add_linearized_likes_to_df(df_control, df_target):
    """Add a ``linearized_likes`` column to both frames, in place.

    The control group's overall CTR is ``sum(likes) / sum(views)`` taken over
    ``df_control``. For every row of either frame the new column is
    ``likes - control_ctr * views``.

    Args:
        df_control: frame with ``likes`` and ``views`` columns; also the source
            of the CTR used for both frames.
        df_target: frame with ``likes`` and ``views`` columns.

    Returns:
        None. Both arguments are modified in place.
    """
    control_ctr = df_control['likes'].sum() / df_control['views'].sum()
    # The same control-group CTR is applied to both frames.
    for frame in (df_control, df_target):
        frame['linearized_likes'] = frame['likes'] - control_ctr * frame['views']

In [7]:
# Assumption: the (control, target) pairs are groups (0, 3) and (1, 2).
# Each call adds the `linearized_likes` column to both frames of the pair.
for control_df, target_df in ((df0, df3), (df1, df2)):
    add_linearized_likes_to_df(control_df, target_df)

In [8]:
def print_ttests_diffs(df_control, df_target, alpha=0.05):
    """Print t-tests on CTR and on linearized likes, and compare their p-values.

    Runs ``stats.ttest_ind`` with ``equal_var=False`` twice: on the ``ctr``
    column and on the ``linearized_likes`` column of the two frames. Prints each
    result with whether its p-value is below ``alpha``, then whether the
    linearized p-value is smaller, the difference of the p-values
    (CTR minus linearized) and their ratio (CTR over linearized).

    Args:
        df_control: frame with ``ctr`` and ``linearized_likes`` columns.
        df_target: frame with ``ctr`` and ``linearized_likes`` columns.
        alpha: significance threshold the p-values are compared against.

    Returns:
        None. The report is written to stdout.
    """
    def unequal_var_ttest(column):
        # Same test for both metrics; only the compared column differs.
        return stats.ttest_ind(a=df_control[column], b=df_target[column], equal_var=False)

    ttest_ctr = unequal_var_ttest('ctr')
    ttest_lin = unequal_var_ttest('linearized_likes')

    p_ctr, p_lin = ttest_ctr.pvalue, ttest_lin.pvalue
    diff = p_ctr - p_lin
    ratio = p_ctr / p_lin

    report_lines = [
        f"CTR:               {ttest_ctr}, {p_ctr < alpha}",
        f"Linearized likes:  {ttest_lin}, {p_lin < alpha}",
        f"p-val got smaller: {diff > 0}",
        f"p-vals difference: {diff}",
        f"p-vals ratio:      {ratio:e}",
        "",  # the report ends with a newline, so print() leaves a blank line after it
    ]
    print("\n".join(report_lines))

In [9]:
# Print the t-test comparison for each (control, target) pair: (0, 3), then (1, 2).
for control_df, target_df in ((df0, df3), (df1, df2)):
    print_ttests_diffs(control_df, target_df)

CTR:               Ttest_indResult(statistic=-13.896870721904069, pvalue=1.055849414662529e-43), True
Linearized likes:  Ttest_indResult(statistic=-15.214995460903827, pvalue=5.4914249479690016e-52), True
p-val got smaller: True
p-vals difference: 1.0558494091711042e-43
p-vals ratio:      1.922724e+08

CTR:               Ttest_indResult(statistic=0.7094392041270485, pvalue=0.4780623130874935), False
Linearized likes:  Ttest_indResult(statistic=6.122579994775974, pvalue=9.439432187037644e-10), True
p-val got smaller: True
p-vals difference: 0.47806231214355027
p-vals ratio:      5.064524e+08



Для групп (0,3) качественного изменения не произошло ($\text{True} \to \text{True}$): различие значимо в обоих тестах. Абсолютная разность p-value оказалась малой ($1.05585 \cdot 10^{-43}$), а отношение p-value — большим ($1.9227 \cdot 10^{8}$).

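Для групп (1,2) качественное изменение произошло ($\text{False} \to \text{True}$): по CTR различие не значимо, по линеаризованным лайкам — значимо. Абсолютная разность p-value оказалась большой ($0.47806$), отношение p-value тоже ($5.0645 \cdot 10^{8}$).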

Как и было заявлено, линеаризация не сделала хуже (в первом случае качественно так же, количественно почти так же; во втором качественно и количественно лучше).