In [1]:
import pandas as pd
import os
import numpy as np

import scipy.stats as stats
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt
import seaborn as sns

#os.getcwd()


In [2]:
ab_data = pd.read_csv('ab_data.csv')

In [3]:
ab_data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
ab_data.shape

(294478, 5)

In [5]:
ab_data['user_id'].value_counts()

637561    2
821876    2
643869    2
938802    2
916765    2
         ..
710897    1
708848    1
665839    1
663790    1
630836    1
Name: user_id, Length: 290584, dtype: int64

In [6]:
ab_data.loc[ab_data['user_id']==643869]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
43290,643869,2017-01-07 06:30:21.865985,treatment,new_page,0
286944,643869,2017-01-21 07:48:57.244089,treatment,old_page,1


In [7]:
ab_data['user_id'].value_counts()

637561    2
821876    2
643869    2
938802    2
916765    2
         ..
710897    1
708848    1
665839    1
663790    1
630836    1
Name: user_id, Length: 290584, dtype: int64

In [8]:
ab_data = ab_data.drop_duplicates(subset=['user_id'], keep=False)

In [9]:
ab_data.shape

(286690, 5)

In [10]:
ab_data['timestamp'] = pd.to_datetime(ab_data['timestamp'])

In [11]:
ab_data['user_id'].value_counts()

630732    1
721461    1
774703    1
731696    1
729649    1
         ..
943462    1
937315    1
935266    1
941409    1
630836    1
Name: user_id, Length: 286690, dtype: int64

In [12]:
ab_data.loc[(ab_data['landing_page']!='new_page') & (ab_data['group']=='treatment')]['converted'].value_counts()

Series([], Name: converted, dtype: int64)

In [13]:
pd.crosstab(ab_data['group'],ab_data['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0,143293
treatment,143397,0


In [14]:
pd.crosstab(ab_data['group'],ab_data['converted'])

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,126073,17220
treatment,126372,17025


In [15]:
pre_test_success = 17220/(126073+17220)

In [16]:
post_test_success = 17025/(17025+126372)

In [17]:
pre_test_success

0.12017335110577627

## Business Problem

Both pre and post conversion rates are around 12%, with post test metrics actually seeming to be a little worse with the new design than the old design.

Your business team is starting to worry that this new design is losing potential customers, wants to know if they should keep this design or revert back to the old button. They say that in order to keep the new button, they would like to see at least a 3% increase in subscribers being attracted. Otherwise, they will return back to the old design.


In [18]:
desired_effect = pre_test_success + 0.03
desired_effect

0.1501733511057763

In [19]:
desired_effect

0.1501733511057763

## Effect Size
Effect size provides valuable information about the strength and importance of observed differences or relationships in a study, helping researchers and practitioners make more informed decisions and interpretations of their findings


In [20]:
# # # article eff size - same as my calc value?
#effect_size_article = sms.proportion_effectsize(pre_treatment_success,post_treatment_success)
effect_size= sms.proportion_effectsize(pre_test_success,desired_effect) # see if there is an effect between expected pre and post amounts
effect_size


-0.08786769827197471

In [21]:
## Min sample size required for test

In [22]:
# https://www.statsmodels.org/dev/generated/statsmodels.stats.power.NormalIndPower.solve_power.html
# https://towardsdatascience.com/ab-testing-with-python-e5964dd66143
required_n = sms.NormalIndPower().solve_power(
    effect_size, 
    power=0.8, # Power is the probability that the test correctly rejects the Null Hypothesis if the Alternative Hypothesis is true. Typical default value is 0.8, saying that there is an 80% chance that the test correctly rejects the null hypothesis.
    alpha=0.05, 
    ratio=1
    )               

In [23]:
required_n = np.ceil(required_n)
print(required_n)


2034.0


Since at the time of experiment control group and treatment group is not unknown, we divide it on the basis of conversion by equally spliting basis the conversion.

In [24]:
random_state = 23
control_sample = ab_data[ab_data['converted'] == 0].sample(int(required_n), random_state=random_state)
treatment_sample = ab_data[ab_data['converted'] == 1].sample(int(required_n), random_state=random_state)


In [25]:
control_sample.shape

(2034, 5)

In [26]:
control_sample.head(3)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
70732,674944,2017-01-04 06:48:50.014598,treatment,new_page,0
74344,681957,2017-01-23 22:27:43.556943,treatment,new_page,0
129595,923701,2017-01-15 03:53:36.554339,treatment,new_page,0


In [27]:
treatment_sample.shape

(2034, 5)

In [28]:
treatment_sample.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
50064,826279,2017-01-17 05:53:42.747994,control,old_page,1
50672,829948,2017-01-03 22:58:26.151483,treatment,new_page,1
120670,654205,2017-01-02 17:38:05.912553,treatment,new_page,1
208440,816890,2017-01-03 07:54:32.096480,treatment,new_page,1
1541,921793,2017-01-15 00:09:38.424409,control,old_page,1


In [29]:
ab_test = pd.concat([control_sample, treatment_sample], axis=0)
ab_test.reset_index(drop=True, inplace=True)

ab_test.head()


Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,674944,2017-01-04 06:48:50.014598,treatment,new_page,0
1,681957,2017-01-23 22:27:43.556943,treatment,new_page,0
2,923701,2017-01-15 03:53:36.554339,treatment,new_page,0
3,749134,2017-01-03 17:17:51.870576,treatment,new_page,0
4,690367,2017-01-12 23:52:40.202761,control,old_page,0


In [30]:
ab_test.shape

(4068, 5)

In [31]:
pd.crosstab(ab_test['group'],ab_test['converted'])

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1010,963
treatment,1024,1071


## Test to see if new design performed differently from the old design

In [32]:
control_results = ab_test[ab_test['group'] == 'control']['converted']
treatment_results = ab_test[ab_test['group'] == 'treatment']['converted']
n_con = control_results.count() # number of converted users
n_treat = treatment_results.count() # number of treatment users
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)

print(f'z statistic two-tailed test: {z_stat:.2f}')
print(f'p-value two-tailed test: {pval:.3f}')


z statistic two-tailed test: -1.47
p-value two-tailed test: 0.140


## Results from two-tailed test:


Because our p-value in the two-tailed test was > our alpha level of 0.05, we cannot conclude that there is a statistically difference in performance between the new and old designs at the level that the business leaders would have liked to see. These results alone may make our team reconsider if they'd like to continue to use the new subscriber button or not.

But, this test does not tell us the second part of the business leader's question - which is if this new button proves to be worse than the old design or not. This is because the two-tailed test does not provide any sort of direction. To get determine if the new design has more converted users than the old design, we will need to run a one-tailed test.



In [33]:
##One-tailed Hypothesis: The new button is better than the old button at attracting subscribers.


In [34]:
# alternative: control < treatment 
# alternative: the new design attracts more subscribers than the old
z_stat, pval = proportions_ztest(successes, nobs=nobs, alternative='smaller')

print(f'z statistic one-tailed test that control < treatment : {z_stat:.2f}')
print(f'p-value one-tailed test that  control < treatment : {pval:.3f}')
print('')

# alternative: control > treatment 
# alternative: the old design attracts more subscribers than the new
# in other words, the new design is losing us subscribers
z_stat, pval = proportions_ztest(successes, nobs=nobs, alternative='larger')

print(f'z statistic one-tailed test that control > treatment : {z_stat:.2f}')
print(f'p-value one-tailed test that  control > treatment : {pval:.3f}')


z statistic one-tailed test that control < treatment : -1.47
p-value one-tailed test that  control < treatment : 0.070

z statistic one-tailed test that control > treatment : -1.47
p-value one-tailed test that  control > treatment : 0.930


## Results from one-tailed test:
Because our p-value in the one-tailed test where control < treatment was > our alpha level of 0.05, we cannot conclude that the new button attracts subscribers better than the old button. So, the new design is not necessarily gaining us any new subscribers.

Also, because our p-value in the one-tailed test where control > tretment was > our alpha level of 0.05, we cannot conclude that the new button deters subscribers compared to the old button - this is good news and shows the business leaders that we aren't losing subscribers because of this new design like they had previously thought.

## Overall Results
Our statistical tests have revealed that the new design is not doing a better job at attracting customers to become subscribers to our site than the old design. Although the new design is not necessarily losing us customers either, business leaders may consider if they would like to revert back to the old design, or continue to try other solutions to attempt to attract more subscribers.