In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime

## Example of Metrics Calculation

### User Activity

In [2]:
# Read the pre-test user-activity log from CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/activity_pretest.csv")

In [3]:
# Preview the first five rows of the activity log.
data.head(n=5)

Unnamed: 0,userid,dt,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,0


In [4]:
# Number of rows per activity level, listed from the rarest level to the most common.
data["activity_level"].value_counts().sort_values(ascending=True)

20     24520
7      48339
17     48395
8      48396
13     48534
4      48556
15     48599
14     48620
3      48659
1      48732
9      48820
11     48832
19     48901
6      48901
12     48911
16     48934
10     48943
18     48982
2      49074
5      49227
0     909125
Name: activity_level, dtype: int64

In [5]:
# Summary statistics of the other columns for each activity level (first five levels shown).
data.groupby(by="activity_level").describe().head(n=5)

Unnamed: 0_level_0,userid,userid,userid,userid,dt,dt,dt,dt
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
activity_level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,909125,60000,6b953416-72e5-4b6e-b634-41c8d3bf98a4,27,909125,31,2021-10-11,29511
1,48732,33688,3c5297b6-602e-4479-9a97-e2b4cb444f0a,6,48732,31,2021-10-19,1620
2,49074,33761,3d5b7e5d-d7b8-459b-a4f0-33231fc930fd,6,49074,31,2021-10-14,1665
3,48659,33634,fd9d8064-2f3f-47ba-9deb-0a38bc0b1a3d,6,48659,31,2021-10-28,1663
4,48556,33502,dc396a83-174c-4244-8a33-71eae2283eeb,8,48556,31,2021-10-29,1632


In [6]:
# Row counts per (date, activity level), ignoring zero-activity rows.
# After count(), the `userid` column holds the number of rows in each group.
activity = (
    data.loc[data["activity_level"] > 0]
    .groupby(by=["dt", "activity_level"])
    .count()
    .reset_index()
)

In [7]:
# One thin line per activity level: the per-day count stored in `userid`, plotted over time.
(
    alt.Chart(activity)
    .mark_line(size=1)
    .encode(
        x=alt.X("dt:T", axis=alt.Axis(title="date")),
        y=alt.Y("userid:Q", axis=alt.Axis(title="number of users")),
        color="activity_level:N",
        tooltip=["activity_level"],
    )
    .properties(width=600, height=400)
)

In [None]:
# the data set looks suspicious: activity levels 1-19 are almost uniformly distributed, while level 20 occurs only about half as often
## are there bots? suspicious activity? different types of account?
## e.g. some account types may be opened a certain number of times a day without user consent etc
# need to check data validity

### Calculating Daily Active Users

In this dataset, a userid will count towards DAU if their activity_level for that day is not zero.

In [8]:
# Daily active users: for each date, count the rows whose activity_level is non-zero.
activity = (
    data.loc[data["activity_level"] > 0]
    .groupby(by=["dt"])
    .count()
    .reset_index()
)

In [9]:
# Spread of the daily counts (count, mean, std, min, quartiles, max).
activity.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,userid,activity_level
count,31.0,31.0
mean,30673.387097,30673.387097
std,90.968375,90.968375
min,30489.0,30489.0
25%,30608.0,30608.0
50%,30661.0,30661.0
75%,30728.5,30728.5
max,30902.0,30902.0


In [10]:
# Line chart of the daily count stored in `userid`, i.e. daily active users over time.
(
    alt.Chart(activity)
    .mark_line(size=4)
    .encode(
        x=alt.X("dt:T", axis=alt.Axis(title="date")),
        y=alt.Y("userid:Q", axis=alt.Axis(title="number of users")),
    )
    .properties(width=600, height=400, title="Daily Active Users")
)

In [None]:
# a bit too stable for a social network, what actions affect users etc 

### Click-through rate

In [11]:
# Read the pre-test click-through-rate log from CSV; this rebinds `data` to the CTR table.
data = pd.read_csv(filepath_or_buffer="data/ctr_pretest.csv")

In [None]:
# click-through rate (CTR) is the share of ads shown to a user that the user clicked on (clicks / ad impressions)

In [12]:
# Preview the first five rows of the CTR log.
data.head(n=5)

Unnamed: 0,userid,dt,ctr
0,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,34.28
1,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,34.67
2,8028cccf-19c3-4c0e-b5b2-e707e15d2d83,2021-10-01,34.77
3,652b3c9c-5e29-4bf0-9373-924687b1567e,2021-10-01,35.42
4,45b57434-4666-4b57-9798-35489dc1092a,2021-10-01,35.04


In [13]:
# Summary statistics of the numeric columns of the CTR log.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ctr
count,950875.0
mean,33.000242
std,1.731677
min,30.0
25%,31.5
50%,33.0
75%,34.5
max,36.0


In [None]:
## not very realistic, check average industry click-through rate across different platforms
## and see how this compares to competitors

In [14]:
# Average CTR per day.
# `userid` is a string column: pandas >= 2.0 raises a TypeError when asked to
# average it (older versions silently dropped it), so restrict the mean to
# numeric columns. The result has one row per date with columns `dt` and `ctr`.
ctr = data.groupby(['dt']).mean(numeric_only=True).reset_index()

In [15]:
# Line chart of the average daily CTR; the y axis is restricted to 32-34 so small day-to-day changes are visible.
(
    alt.Chart(ctr)
    .mark_line(size=4)
    .encode(
        x=alt.X("dt:T", axis=alt.Axis(title="date")),
        y=alt.Y("ctr:Q", axis=alt.Axis(title="ctr"), scale=alt.Scale(domain=[32, 34])),
        tooltip=["ctr"],
    )
    .properties(width=600, height=400, title="Average Daily CTR")
)

In [None]:
# is this the right test? need good tracking. Check different metrics.
# percent of ads that lead to a purchase, accounts created, lifetime value etc


## A/B Test considerations

In [None]:
# is the idea worth testing? 
# a trade-off between value of learnings and investment 

## Reasons not to do A/B Test:


In [None]:
# too expensive to test- development capacity, monetary impact, time investment etc
# too risky to test on the users: a disruptive change, shaky implementation etc
# too vague idea to test- unclear hypothesis, testing for curiosity only, undefined metrics etc 
# too many tests running already- not practical and dilutes the user experience and trustworthiness of the results

## How to find the right idea to test: Qualitative and Quantitative analysis

In [None]:
# opportunity sizing = analyse the data you have to estimate the impact of the test
# e.g. a change to a smaller part of the app has less impact than a change to the homepage, so it may not be a priority

In [None]:
# investigate the drivers for the behavior you want to change

In [None]:
# Do user research - qualitative analysis (e.g. consumer research) or quantitative analysis of what users actually do

In [None]:
# learn from own iterations and competitor research- learning from blogs and articles, optimized performance etc 
# websites generally have a call to action to encourage users- e.g. subscribe to newsletter, sign up to website etc

## Hypothesis- what do you expect to happen if you introduce a new feature?

In [None]:
# a statement you will either support or reject using the A/B test
# two types of hypothesis:
# 1. base hypothesis- what you expect will happen when you introduce the change you are testing- specific and measurable
## the base hypothesis should identify the independent variable (which you can change and control)
## and the dependent variable (which indicates the impact of the test and should ideally depend on the independent variable)
# should see a change in the metric based on what you are introducing
# in hypothesis testing you cannot prove your base hypothesis directly, so you need to reject the Null hypothesis instead
# 2. null hypothesis- the opposite of the base hypothesis (the change has no effect)

## Need to define hypothesis- what will this feature improve? For users and the product

In [16]:
# for example: users have more ads, are more likely to purchase the product, higher user satisfaction, higher affiliate revenue etc

In [None]:
## ideally have a user researcher who does qualitative data analysis and could guide us on whether those hypotheses are true or not
## this would allow us to estimate the crucial points we need to test, track and evaluate

In [None]:
## e.g for us, H1- the base hypothesis is that with the introduction of tailored ads, users are more likely to click on the ads,
## therefore the CTR will increase per user
# H0- Tailored ads will have no effect on the user engagement with the ads, and will not affect CTR
## will have a success metric based on the hypothesis- it is the dependent variable,
# for our test the dependent variable is CTR. So, this will be our Success Metric
# However, it is very common for companies to have multiple dependent variables,
# depending on how much tracking and data is available or can be built for the test
## for our test, other dependent variables could be affiliate revenue, number of users making purchases,
## number of users becoming customers etc
# usually focus on one success metric for the test

## Need to define guardrail metrics- don't want other metrics to worsen from feature

In [None]:
## apart from the clear dependent variable, we can have other metrics impacted by the change we are introducing
## some of them might be unexpected, some might be unimportant, but some may be very business critical
# usually want to control the test for business-critical metrics.
# We would define the non-inferiority margin for those critical metrics
# how much can it decrease/increase depending on the variable so that we can still consider the test a success?
# retention and engagement metrics are typical guardrail metrics, or other technical metrics like number of errors/crashes
# in our case, it is important not to lose the users, so we would like to keep DAU(daily active users) stable 
## other guardrail metrics could be:
## user retention, WAU, MAU, error rate, crash rate etc 

## Setting up an A/B test

In [None]:
## restrictions include:
# a large number of users may already be engaged in a marketing campaign for the feature, leaving only a small control group for the A/B test
# multiple A/B tests ongoing that already take up a lot of space, so will have to wait until they end
# business restrictions, time limits, user limits, risks etc

In [None]:
# to decide test duration and group size (control group and test group) we need to know:
# minimum detectable effect
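# power (1 - beta), where beta is the type 2 error (false negative) rate
# significance level (alpha)- the type 1 error (false positive) rate; the p value is compared against it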

## Defining minimum detectable effect

In [None]:
# knowing how many users we have, what is currently the mean and variance of our success metric (and guardrail metric)
# and what our business needs are we can define the Minimum Detectable Effect
# MDE is a mix of data analysis, statistics and business acumen
# there is often a tradeoff between precision and practicality
# may want to have the smallest MDE, but with the restriction of our user base
## e.g for big company, 0.01% change to CTR will cause millions more users and more revenue
## but smaller company, would require a bigger increase 
# we want the MDE to be bigger than standard deviation of the metric
# standard deviation indicates how much do the values of the metric deviate from the mean(average)value of that metric 

## Statistical significance- how do we know the test results will repeat?

In [None]:
# can define statistical significance as 1-a (alpha), where alpha is the acceptable level of mistakenly rejecting the Null Hypothesis-
## which means we think we see the impact we expected when there is none. Known as a false positive
# usually set alpha between 1% and 10%, with 5% common- when there is no real effect, 5 times out of 100 we will have a false positive
# when alpha is 5%, we have a statistical significance of 95%
## different types of tests require different levels of significance,
## when testing a different website layout we may be okay with 5% false positives-
## it depends on how long we run the test, how many users we have to expose, and how high a statistical significance we can afford
# but testing a new type of medicine, we might want the highest level of statistical significance-
# very important to have a low amount of false positives for new medicine
# our ability to reach the level of statistical significance depends
## on the metric we target, the length of the experiment and the number of exposed users

## How can we know that we are not rejecting a successful test?- Power (1 - Beta)

In [None]:
## we define the test power (1-b, where b is Beta) via the acceptable level b of mistakenly accepting the Null Hypothesis-
## thinking we don't have an impact when there is an impact. Known as a false negative

In [None]:
## usually b is between 10% and 20%, most commonly b = 20% for digital experiments,
## This means that if there is a real effect and we run the test 100 times, about 20 times out of a hundred we will get a false negative
# when beta is 20%, we have a test power of 80%
## like with alpha, when we decide on a test power, 
## we need to take into account how critical would it be to mistakenly accept the Null Hypothesis.
# it is more common to allow that because the decision would be to not roll out the successful A/B test
# and though that's a missed opportunity and wasted resources, it's less critical than rolling out an unsuccessful A/B test

## Type 1 and type 2 errors

In [None]:
# CTR difference is < 5.5% no change:
# we decide that test is not successful: correct- true negative- probability = 1-a
# we decide test is successful: type 1 error- false positive- probability = a
# ctr difference is > 5.5% positive change:
# we decide test is not successful: type 2 error- false negative- probability = b
# we decide test is successful - correct true positive- probability = 1-b

In [17]:
# statistical significance influences sample size calculation

In [None]:
# one or two tailed test- one tailed: we only test whether ctr in the test group is bigger than ctr in the control group
# two tailed- we test whether ctr in the test group is different from the control group (in either direction)
# the second option requires a two tailed test, which increases the sample size, but is often the more practical option to go with
# if the test turns out to be negative for your success metric, you would want to know that with certainty

## Z Test- sample size calculation

In [None]:
## A Z test is a statistical test to determine whether two population means are different 
## when the variances are known, and the sample size is large 
# large is over 30
# if smaller, t test, but very rare in digital experiments 
# a z test is a hypothesis test in which the z statistic follows a normal distribution
# we will need the z statistic to determine the sample size
# we expect to have independent observations, on a randomly selected dataset, with a sample size of over 30

In [None]:
# in a one tailed test (we want to know if the test statistic is bigger, or smaller, than control)
# all of alpha sits in one tail: if the z statistic is less than the critical value for alpha, we don't reject the null hypothesis, and we do if it is bigger
# in a two tailed test, we divide alpha by 2 between the two tails: if the z statistic lies between the two critical values, we don't reject the null hypothesis
# we do reject it if it is outside that range (below the lower critical value, or above the upper critical value)
# need to find the critical value: if the z statistic falls beyond it we can reject the null hypothesis, if not, we can't
## the critical value for a two tailed test is read from the z table (z value on the left side, + or - value in the table)
## (look up 1 - alpha divided by 2, e.g. 0.975 for alpha = 5%, which gives z = 1.96)
# for power use 1 - b, so 1 - 0.2 = 0.8 for us: look up the probability closest to 0.8 in the table (no need to divide by 2), which gives z = 0.84
# then 1 - that probability = beta

## Types of metrics- Binomial or continuous 

In [None]:
# Binomial metrics take on a value of either 0 or 1. The action happens or not
# Very often it's a rate of something. Conversion rate, proportion of some sort
# In our case it will be CTR. A user either clicks on the ad or doesn't.
# This is also the most common type of distribution to solve for when doing A/B test analysis

In [None]:
# calculate the minimum sample size using a 2 tailed z test
# formula: n (sample size per group) = 2 * p * (1-p) * (Z(1-a/2) + Z(1-b))^2 / mde^2
# from table, z(1-a/2) = 1.96
# z(1-b) = 0.84
# p = pooled proportion = (u1+u2)/2
# u1 (pronounced mu) - (mean) proportion of the control group (mean CTR for us, 0.33)
# u2 - (mean) proportion of the test group (u1 + mde)
# mde(minimum detectable effect) should be an absolute value- percentage point increase we want to see, not relative one
# mde = 0.02
# p = (0.33+0.33+0.02)/2 = 0.34
# n = 2 * p * (1-p) * (Z(1-a/2) + Z(1-b))^2 / mde^2 = 2 * 0.34 * 0.66 * (1.96 + 0.84)^2 / 0.02^2 = 8796.48 = approx 8797
# Therefore sample size is 8797 users in each group, provided the groups are of 50% split

In [None]:
# continuous metrics- the metric takes value on a scale. Usually those are the metrics per user:
# revenue per user, activity per user, number of likes etc 
## could be metrics per day. for example DAU (daily active users).
## However, when we calculate sample size for DAU, our sample becomes number of days rather than number of users needed 
# sometimes it is beneficial to convert continuous metrics to binomial to ease the process of setting up the A/B test
# we could convert DAU to active users out of total users per day.
# This will give us a similar ratio to binomial metrics and allows us to use z test due to bigger population size 
# is a tradeoff, since may be alarming due to number of active users decreasing compared to overall number of users 

In [None]:
# sample size for continuous metrics- use 2 tailed z test again to calculate minimum sample size

In [None]:
# N (sample size per group) = 2 * o^2 * (Z(1-a/2) + Z(1-b))^2 / (u1-u2)^2
# u1 (pronounced mu) - mean of the control group, u2 - mean of the test group
# o (sigma) - standard deviation of the metric
# from table, z(1-a/2) = 1.96
# z(1-b) = 0.84
# o = 91
## u1 = 30673, u2 = 30773 (ideal mean), u2 is 100 daily active users more,
## which is a relative difference of 0.33% between test and control group
# N = 2 * o^2 * (Z(1-a/2) + Z(1-b))^2 / (u1-u2)^2 = 2 * 91^2 * (1.96 + 0.84)^2 / (30673-30773)^2
# = 2 * 8281 * 0.000784 = 12.98 = approx 13
# 13 days in each group, provided the groups are of 50% split- might be too long, and 100 users too much
# change to u1 = 30673, u2 = 30980, which gives 2 * 91^2 * (1.96 + 0.84)^2 / (-307)^2 = 1.38 = approx 2 days
# so then 2 days in each group provided groups are 50% split to assess statistical significance impact