In [1]:
from scipy import stats

## Example of Significance Power Calculator

In [6]:
def binomial_sample_size(metric, mde, alpha, beta): # mde = minimum detectable effect, metric for us is CTR so 0.33
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)

    Z_beta = snd.ppf(1-beta)
    print(Z_beta)

    Z_alpha = snd.ppf(1-alpha/2)
    print(Z_alpha)

    # average of probabilities from both groups
    p = (metric + metric+mde) / 2
    print(p)

    N = (2 * p * 
             (1 - p) * 
             (0.84 + 1.96)**2
             / mde**2)

    return N # difference between amemnding third line to variable or not is we get precise value if put specific values in

In [7]:
binomial_sample_size(metric=0.33, mde=0.02, alpha=0.05, beta=0.2)

0.8416212335729143
1.959963984540054
0.34


8796.479999999998

In [4]:
def continuos_sample_size(metric, mde, sd, alpha, beta):
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)

    Z_beta = snd.ppf(1-beta)
    print(Z_beta)

    Z_alpha = snd.ppf(1-alpha/2)
    print(Z_alpha)

    N = (2 * sd**2 * 
             (Z_beta + Z_alpha)**2
             / mde**2)

    return N

In [5]:
continuos_sample_size(metric=30673, mde=300, sd=91, alpha=0.05, beta=0.2)

0.8416212335729143
1.959963984540054


1.4443682906698845

## Test prep- Finalising the decision on how to run the A/B Test

In [None]:
# 1. Hypothesis is defined
# 2. success and non-inferiority metrics are set and we can track them
# 3. We agreed a desired significance and power levels for the test
# 4. We can reach the signficance and power in a reasonable timeframe (2 days based on no of users)
# 5. We've agreed on a halt criteria for the test during the monitoring stage (if no of crashes is too high, test will stop)
# 6. Duration of the test also includes the ability to measure long term metrics if needed (retention, weekly active users etc)
## 7. We've ensured that there are no other tests/releases that could interfere with the test or the other way around 
## which test could interfere with

## Assignment process

In [None]:
# usually a randomised test assignment is done based on user_id
# There needs to be no pre-test bias between the groups 
## Pre-test bias means that users are not randomly shuffled between the groups 
## and one of the groups end up with a significantly different mean to any tracked metric 
## to avoid pre-test bias companies run pre-assignment analysis, 
## where they 'assign' (but not really) users randomly based on  multiple different seeds 
## and calculate difference in the metrics  
# then company select the seed with no potential bias

## Performance monitoring

In [None]:
## monitor the test performance on critical metrics while it is running 
## to be able to prevent a potential negative impact on the users and the business 
# we don't yet determine if test is successful or not
## we monitor alarming trends, like increase in crashes, drop in engagement, 
## and may want to pause the test and investigate issues
## There could also be a potential negative reaction from users, 
## so it is a good idea to monitor Customer Support issues and social media

## How to deal with an impact from Peeking?

In [None]:
## Peeking is an issue of multiple test result calculations, the more you check the significance of your success metric
## the more likely you are to see a false positive 
# can happen if you check the result too early 
## when we monitor the test, we only look and make decisions based on the critical business metrics (number of errors, crashes),
## not the success metric
# p-hacking is the practice of calculating the p-value up to the point where it reaches statistical significance
## and basing the success of the test on that observation- not a good practice from an ethical standpoint 

## Analysis of test results

In [None]:
# assuming we didn't have any major hiccups and we can analyse the data at the end of the test. These are a few steps:
# 1. Look at pretest bias for the groups, look at overall metrics over time
# 2. Observe whethere there is novelty effect and consider excluding the results from the test 
# 3. Calculate the relative and absolute difference 
# 4. Use our alpha (stastical significance level) and beta (test power) to determine whether test results are signficant or not 
# 5. Break down success metrics on their components- why sucess metrics met 
# 6. Share results with PMs, engineers, URs, and all other relevant stakeholders 

## Presenting your results

In [None]:
# practice your presentation:
# set the scene: feature info, test hypothesis, success metric, duration, group size, any assumptions made
# mention how you chose and calculated the analysis metrics- from work or from take home assignment 
# state the test results and their reliability for each of the main metrics, statistical significance for test cases 
# talk about other interesting finding in the test e.g. data abnormal etc 
# talk about potential future analysis and tests- or improvements 

In [None]:
# how would you improve the results? 
# based on analysis of A/B test performance and the new feature 

## Next steps

In [None]:
# 1. What could you improve in the test set up? more users assigned maybe
# 2. Improve or change test monitoring process- theoretical as test done
## 3. Improve the analysis part. Is there any more data that would be very valuable to look at?
## in our case it could be link clicks, how much less over time do people click the link, how often do those links appear etc

## What happens with scale?

In [None]:
# more users brings almost instant statistical signficance as long as not time sensitive, but also bring new challenges
# may also have to wait if 7 or 21 day retention 
 # challenges include tests having more than one test case, multiple tests running at the same time 
    # different uses may have different experiences, so hard to differentiate between them
    # determine test feature exposure - did users find feature easily? Look at users who went into settings menu
    # might want to test interconnected features
    ## if testing in different markets(+ marketing limitations)- may only be able to have a small control group or not 
    ## since marketing campaign may want a large number of userbase testing the feature  
    # important to consider ethics and user discrmininations 
## you will asked to pull some data to satisy the curiosity of the people in your company and will need to be abe to tell them
## what value of that question will be- could help advise them of better metric to use etc, 
## and if users are seeing things too often etc
# to estimate the effect of that we also need to know our estimated revenue. Whether we will take a big hit or not

## Analysis steps- how does an average A/B test analysis process look like?

In [None]:
## 1. look at the assignments. Does the number of users represent the correct percent of the assignment split?
## Are the number of daily/weekly etc assignments expected?
# 2. Look at the pretest period or sucess metrics qnd non-inferiority metrics,
## to make sure groups evenly distributed across different metrics. Is there any pretest bias or general bias?
## 3. Look at the critical performance metrics. UX critical (crashes, availiability, etc)
## and business critical(revenue, engagement)
# 4. calculate the significance of the results on the Success Metric and Non-Inferiority Margin  
# 5. Observe any temporal effects on the metrics. Is the impact stable or is there a novelty effect?
## 6. Summaarise the results in an understandable, non-technical way, with the possibility to look at the assumptions made 
## and methods used for the analysis 
# 7. Summarise the suggestions for the next steps and potential additional research 
# 8. Share the results with stakeholders and data insights community, if there is one 
