In [1]:
from _shared import *

## Задача 1. Оценка эксперимента с CUPED

 Оцените эксперимент __«Sending email (correct link)»__ с использованием CUPED.
 
 В качестве ковариаты используйте выручку пользователей за 4 недели до эксперимента.


Данные эксперимента __«Sending email (correct link)»__:
- `2022-05-03/2022-05-03T12_df_sales.csv` и `2022-05-03/experiment_users.csv`.
- Эксперимент проводился с __2022-04-25__ по __2022-05-02__.
- Метрика — средняя выручка с клиента.

В качестве ответа введите p-value с точностью до 4-го знака после точки.

In [2]:
def calculate_theta(metric, covariate):
    """Estimate the CUPED coefficient theta = cov(y, x) / var(x).

    Both moments are estimated with the same (ddof=1) normalisation, so the
    ratio does not depend on whether the inputs are pandas Series, numpy
    arrays or plain lists.

    :param metric: 1-D array-like with the metric values (y).
    :param covariate: 1-D array-like with the covariate values (x),
        aligned element-wise with ``metric``.
    :return: theta as a float.
    """
    # np.cov normalises by N - 1 by default, so the variance must use the same
    # ddof.  A bare ``covariate.var()`` would use ddof=1 for a pandas Series
    # but ddof=0 for a numpy array, silently changing theta with input type.
    covariance = np.cov(metric, covariate)[0, 1]
    variance = np.var(covariate, ddof=1)

    return covariance / variance

In [3]:
# Location template and the two input files: sales log and experiment users.
data_path = './data/{}'
filenames = ('2022-05-03T12_df_sales.csv', 'experiment_users_cuped.csv')

# Experiment window boundaries.
beg_date = pd.Timestamp('2022-04-25')
end_date = pd.Timestamp('2022-05-02')

# Covariate window: the 4 weeks that end where the experiment begins.
end_date_pre = beg_date
beg_date_pre = end_date_pre - pd.Timedelta(weeks=4)

# Users taking part in the experiment, with their group flag.
experiment_users = pd.read_csv(data_path.format(filenames[1]))
experiment_users.head()

Unnamed: 0,user_id,pilot
0,a9a6e8,0
1,23420a,0
2,cbc468,0
3,583c90,0
4,19ce47,0


In [4]:
# Load the sales log (column at position 1 is parsed as datetime) and pass it
# through get_data_subset together with the combined window
# [beg_date_pre, end_date], the experiment user ids and the needed columns.
# NOTE(review): get_data_subset comes from _shared; presumably it filters rows
# by date range / user ids and selects the listed columns - confirm there.
df_sales = get_data_subset(
    df=pd.read_csv(data_path.format(filenames[0]), parse_dates=[1]),
    begin_date=beg_date_pre,
    end_date=end_date,
    user_ids=experiment_users['user_id'],
    columns=['user_id', 'date', 'price']
)

df_sales.head()

Unnamed: 0,user_id,date,price
190426,3561cd,2022-03-28 10:02:00,600
190427,96ae9b,2022-03-28 10:02:45,660
190428,6eea6a,2022-03-28 10:04:34,540
190429,e225fd,2022-03-28 10:05:57,2310
190430,f96d85,2022-03-28 10:10:30,1530


In [5]:
# Revenue per user inside the experiment window [beg_date, end_date).
in_experiment = df_sales['date'].ge(beg_date) & df_sales['date'].lt(end_date)

df_exp = (
    df_sales.loc[in_experiment, ['user_id', 'price']]
    .groupby('user_id')['price']
    .sum()
    .reset_index()
)
df_exp.head()

Unnamed: 0,user_id,price
0,0000e4,840
1,000112,1380
2,0001ff,720
3,00045f,720
4,000470,2280


In [6]:
# Revenue per user inside the pre-experiment window [beg_date_pre, end_date_pre);
# this is the CUPED covariate.
in_pre_period = df_sales['date'].ge(beg_date_pre) & df_sales['date'].lt(end_date_pre)

df_cov = (
    df_sales.loc[in_pre_period, ['user_id', 'price']]
    .groupby('user_id')['price']
    .sum()
    .reset_index()
)
df_cov.head()

Unnamed: 0,user_id,price
0,0000d4,720
1,0000de,1320
2,0000e7,3840
3,000152,780
4,0001ff,720


In [7]:
# Attach experiment revenue and pre-period revenue to every experiment user.
# Left joins keep users without purchases; their missing sums become 0.
df = (
    experiment_users
    .merge(df_exp.rename(columns={'price': 'price_exp'}), how='left', on='user_id')
    .merge(df_cov.rename(columns={'price': 'price_cov'}), how='left', on='user_id')
    .fillna(0.0)
    .loc[:, ['user_id', 'pilot', 'price_exp', 'price_cov']]
)

df.head()

Unnamed: 0,user_id,pilot,price_exp,price_cov
0,a9a6e8,0,930.0,900.0
1,23420a,0,0.0,0.0
2,cbc468,0,0.0,0.0
3,583c90,0,2490.0,7350.0
4,19ce47,0,0.0,0.0


In [8]:
# CUPED adjustment: subtract the part of the metric explained by the covariate.
# The covariate is not mean-centred here; that would only shift every user by
# the same constant and therefore would not change the t-test below.
theta = calculate_theta(metric=df['price_exp'], covariate=df['price_cov'])
df = df.assign(price_cuped=df['price_exp'] - theta * df['price_cov'])

df.head()

Unnamed: 0,user_id,pilot,price_exp,price_cov,price_cuped
0,a9a6e8,0,930.0,900.0,850.405278
1,23420a,0,0.0,0.0,0.0
2,cbc468,0,0.0,0.0,0.0
3,583c90,0,2490.0,7350.0,1839.976437
4,19ce47,0,0.0,0.0,0.0


In [9]:
# Sanity check: the left merges must neither duplicate nor drop experiment users.
n_rows, n_users = len(df), len(experiment_users)
n_rows, n_users, n_rows == n_users

(109367, 109367, True)

In [10]:
# Two-sample t-test on the CUPED-adjusted revenue: control (pilot == 0)
# against pilot (pilot == 1).
control_cuped = df.loc[df['pilot'] == 0, 'price_cuped']
pilot_cuped = df.loc[df['pilot'] == 1, 'price_cuped']

res_1 = stats.ttest_ind(control_cuped, pilot_cuped).pvalue

round(res_1, 4)

0.0539

In [11]:
# Load a table with per-user 'metric' and 'cov' values; it is compared against
# the locally computed price_exp / price_cov in the next cell.
# NOTE(review): presumably a 1000-user reference sample (per the file name) - confirm.
df_metrics = pd.read_csv(data_path.format('df_metrics_1000.csv'))
df_metrics.head()

Unnamed: 0,user_id,pilot,metric,cov
0,cb1bf2,0,0.0,1680.0
1,f09a23,0,0.0,1080.0
2,791598,1,1410.0,0.0
3,fb1f12,0,0.0,810.0
4,f81590,1,600.0,1290.0


In [12]:
# Compare the reference metric/covariate with the locally computed values for
# the users present in both tables; all differences should be zero.
check_columns = ['user_id', 'pilot_x', 'metric', 'cov', 'price_exp', 'price_cov']

df_check = (
    df_metrics
    .merge(df, how='inner', on='user_id')
    .loc[:, check_columns]
    .assign(
        metric_diff=lambda frame: frame['metric'] - frame['price_exp'],
        cov_diff=lambda frame: frame['cov'] - frame['price_cov'],
    )
)

df_check[['metric_diff', 'cov_diff']].describe()

Unnamed: 0,metric_diff,cov_diff
count,1000.0,1000.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


## Задача 2. Функция вычисления CUPED-метрики

Реализуйте функцию `calculate_cuped_metric`.

Обратите внимание, что в этом задании нужно использовать формулу из [лекции](https://lab.karpov.courses/learning/347/module/3348/lesson/29684/83611/390960/) с вычитанием среднего значения ковариаты.

__Внимание!__
>Для вычисления параметра θ нужно оценить ковариацию и дисперсию. Оценивать их можно разными способами, например, есть смещённые и несмещённые оценки. На выборках маленького размера значения разных оценок могут сильно отличаться. Для решения этого задания используйте функции из библиотеки `numpy` с параметрами по умолчанию.

In [13]:
import numpy as np
import pandas as pd


def calculate_theta(metric, covariate):
    """Return the CUPED coefficient theta = cov(y, x) / var(x).

    Both estimators are numpy functions called with their default
    parameters, as the task statement requires: ``np.cov`` normalises by
    N - 1, ``np.var`` normalises by N.

    :param metric: metric values (y).
    :param covariate: covariate values (x), aligned with ``metric``.
    :return: theta.
    """
    covariance_matrix = np.cov(metric, covariate)
    return covariance_matrix[0, 1] / np.var(covariate)


def calculate_cuped_metric(df_metric, df_cov):
    """Compute CUPED-adjusted metric values.

    The covariate is left-joined onto the metric table by ``user_id``; missing
    values after the join are replaced with 0.  The adjusted metric is
    ``metric - theta * (cov - mean(cov))``.

    :param df_metric (pd.DataFrame): metric values during the experiment,
        columns ['user_id', 'metric'].
    :param df_cov (pd.DataFrame): covariate values,
        columns ['user_id', 'cov'].
    :return df: CUPED metric values with columns ['user_id', 'metric'].
    """
    merged = df_metric.merge(df_cov, on='user_id', how='left').fillna(0.0)

    cov_values = merged['cov']
    theta = calculate_theta(metric=merged['metric'], covariate=cov_values)

    # Centre the covariate so the adjustment keeps the metric mean unchanged.
    centered_cov = cov_values - np.mean(cov_values)
    merged['metric'] = merged['metric'] - theta * centered_cov

    return merged.loc[:, ['user_id', 'metric']]

In [14]:
# Toy example for a quick check of calculate_cuped_metric.
# NOTE: this rebinds `df_cov` and `df`, which held the Task 1 tables above.
df_metric = pd.DataFrame({'user_id': [1, 2, 3], 'metric': [2000, 2500, 3000]})
df_cov = pd.DataFrame({'user_id': [1, 2, 3], 'cov': [1100, 1500, 0]})
df = calculate_cuped_metric(df_metric, df_cov)
# Expected result: pd.DataFrame({'user_id': [1, 2, 3], 'metric': [2159.53, 2933.01, 2407.46]})
df.round(2)

Unnamed: 0,user_id,metric
0,1,2159.53
1,2,2933.01
2,3,2407.46


In [15]:
# Solution.
def calculate_cuped_theta(metric, cov):
    """Compute theta = cov(x, y) / var(x) for CUPED.

    :param metric (np.array): metric values during the pilot
    :param cov (np.array): covariate values
    :return: theta
    """
    cross_covariance = np.cov(cov, metric)[0, 1]
    return cross_covariance / cov.var()


def calculate_cuped_metric(df_metric, df_cov):
    """Compute CUPED-adjusted metric values (reference solution).

    The two tables are joined on ``user_id`` with ``pd.merge`` defaults
    (inner join), so only users present in both tables are kept.  The
    adjusted metric is ``metric - theta * (cov - mean(cov))``.

    :param df_metric (pd.DataFrame): table with columns ['user_id', 'metric'].
    :param df_cov (pd.DataFrame): table with columns ['user_id', 'cov'].
    :return: merged table with the adjusted 'metric' and without 'cov'.
    """
    merged = pd.merge(df_metric, df_cov, on='user_id')
    theta = calculate_cuped_theta(merged['metric'].values, merged['cov'].values)

    centered_cov = merged['cov'] - merged['cov'].mean()
    merged['metric'] = merged['metric'] - theta * centered_cov

    return merged.drop(columns='cov')