In [2]:
import os

import numpy as np
import pandahouse as ph
import pandas as pd
import seaborn as sns
from scipy import stats

# Default size applied to every figure, in inches (width, height).
default_figsize = (11.7, 8.27)
sns.set(rc={'figure.figsize': default_figsize})

In [3]:
# ClickHouse connection settings handed to pandahouse.
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks keep
# the notebook runnable exactly as before when no variable is set.
# NOTE(review): the fallback password is still stored in plain text here —
# presumably a shared course credential; confirm it is safe to publish.
connection = {
    'host': os.environ.get('CLICKHOUSE_HOST', 'https://clickhouse.lab.karpov.courses'),
    'password': os.environ.get('CLICKHOUSE_PASSWORD', 'dpo_python_2020'),
    'user': os.environ.get('CLICKHOUSE_USER', 'student'),
    'database': os.environ.get('CLICKHOUSE_DATABASE', 'simulator_20221020'),
}

In [4]:
# Per-user aggregates for experiment groups 0-3 over 2022-10-03 .. 2022-10-09
# (both dates included): number of likes, number of views and the per-user
# CTR (likes / views). One result row per (exp_group, user_id).
query = '''
select exp_group, 
    user_id,
    sum(action = 'like') as likes,
    sum(action = 'view') as views,
    likes/views as ctr
from simulator_20221020.feed_actions
where toDate(time) between '2022-10-03' and '2022-10-09'
    and exp_group in (0, 1, 2, 3)
group by exp_group, user_id    
'''
# Run the query against ClickHouse and load the result into a DataFrame.
df = ph.read_clickhouse(query, connection=connection)

In [5]:
# Preview the first rows of the per-user data.
df.head()

Unnamed: 0,exp_group,user_id,likes,views,ctr
0,3,115383,9,30,0.3
1,1,18392,7,32,0.21875
2,3,123580,13,48,0.270833
3,2,131473,14,134,0.104478
4,2,32420,26,128,0.203125


In [6]:
# Overall CTR of each control group: CTR_control = sum(likes) / sum(views).
# Each control group is filtered once and reused for both sums.
control_0 = df[df.exp_group == 0]
control_1 = df[df.exp_group == 1]
CTRc_0 = control_0.likes.sum() / control_0.views.sum()
CTRc_1 = control_1.likes.sum() / control_1.views.sum()

In [7]:
# Show the overall CTR of control groups 0 and 1.
CTRc_0, CTRc_1

(0.20823612262916305, 0.20802680490126244)

In [8]:
def lin_likes(x):
    """Return the linearized likes of a single user row.

    linearized_likes = likes - CTR_control * views, where CTR_control is the
    overall CTR of the control group used for the row's experiment group:
    ``CTRc_0`` for groups 0 and 3, ``CTRc_1`` for groups 1 and 2.

    Parameters
    ----------
    x : row-like
        Must expose ``exp_group`` as an attribute and ``likes`` / ``views``
        by key (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The linearized likes value for the row.

    Raises
    ------
    ValueError
        If ``exp_group`` is not one of 0, 1, 2, 3.
    """
    if x.exp_group in (0, 3):
        control_ctr = CTRc_0
    elif x.exp_group in (1, 2):
        control_ctr = CTRc_1
    else:
        # An unknown group used to fall through both branches and fail with
        # an opaque UnboundLocalError at the return statement.
        raise ValueError(f"unexpected exp_group: {x.exp_group!r}")
    return x['likes'] - control_ctr * x['views']

In [9]:
# Compute the linearized likes row by row; the function is passed directly,
# no lambda wrapper is needed.
df['linearized_likes'] = df.apply(lin_likes, axis=1)
df.head()

Unnamed: 0,exp_group,user_id,likes,views,ctr,linearized_likes
0,3,115383,9,30,0.3,2.752916
1,1,18392,7,32,0.21875,0.343142
2,3,123580,13,48,0.270833,3.004666
3,2,131473,14,134,0.104478,-13.875592
4,2,32420,26,128,0.203125,-0.627431


In [10]:
# One sub-frame per experiment group (0, 1, 2, 3).
set_0, set_1, set_2, set_3 = (
    df[df['exp_group'] == group] for group in range(4)
)

In [11]:
# Mean per-user CTR in each experiment group.
df.groupby('exp_group')['ctr'].mean()

exp_group
0    0.215950
1    0.215605
2    0.214419
3    0.232624
Name: ctr, dtype: float64

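Средние "поюзерные" CTR в группах 0, 1 и 2 практически не отличаются; в группе 3 среднее значение заметно выше (0.233 против 0.216 в группе 0).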

In [12]:
# t-test with equal_var=False (Welch's test) of group 0 vs group 3, run once
# per metric; only the p-values are kept.
result_ctr_1, result_lin_likes_1 = (
    stats.ttest_ind(set_0[metric], set_3[metric], equal_var=False).pvalue
    for metric in ('ctr', 'linearized_likes')
)

In [13]:
# p-values of the group 0 vs group 3 tests: (per-user CTR, linearized likes)
result_ctr_1, result_lin_likes_1

(1.055849414662529e-43, 5.4914249479690016e-52)

In [14]:
# t-test with equal_var=False (Welch's test) of group 1 vs group 2, run once
# per metric; only the p-values are kept.
result_ctr_2, result_lin_likes_2 = (
    stats.ttest_ind(set_1[metric], set_2[metric], equal_var=False).pvalue
    for metric in ('ctr', 'linearized_likes')
)

In [15]:
# p-values of the group 1 vs group 2 tests: (per-user CTR, linearized likes)
result_ctr_2, result_lin_likes_2

(0.4780623130874935, 9.439432187037712e-10)

Вывод: в обоих тестах по метрике линеаризованных лайков видно отличие; p-value по этой метрике стало меньше, чем у t-теста по поюзерному CTR.