In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ab-testing/ab_data.csv


In [2]:
# Packages imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

%matplotlib inline

## Preparing the Data

In [3]:
# Load the A/B test CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('../input/ab-testing/ab_data.csv')

In [4]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

(294478, 5)

In [5]:
# Column names, non-null counts and dtypes, to check for missing values and types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [6]:
# Preview the first five rows
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [7]:
# Cross-tabulate `group` against `landing_page`, giving the row count for each
# (group, landing_page) pair.
# NOTE(review): presumably control should only see old_page and treatment only
# new_page, so non-zero off-diagonal counts would indicate mismatched rows —
# confirm, and decide whether they need explicit filtering.
pd.crosstab(df['group'], df['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [8]:
# Tally rows per user_id, then count how many user_ids occur on more than one row
user_duplicate = df['user_id'].value_counts()
users_multi = (user_duplicate > 1).sum()
users_multi

3894

In [9]:
# Discard every row of any user_id that appears more than once, so that each
# remaining user contributes exactly one observation
to_drop = user_duplicate.loc[user_duplicate > 1].index
df = df.loc[~df['user_id'].isin(to_drop)]

## Hypothesis

In [10]:
# Hypothesis statements printed with the final decision (text is in Indonesian).
# H0: changing the landing page has no effect on conversion
null_hypothesis = 'Perubahan Landing Page Tidak Berpengaruh Terhadap Converted'
# H1: changing the landing page has an effect on conversion
alternate_hypothesis = 'Perubahan Landing Page Berpengaruh Terhadap Converted'

## Sampling

In [11]:
# Effect size for the difference between two proportions: 13% versus 15%
effect_size = sms.proportion_effectsize(0.13, 0.15)

# Solve the two-sample z-test power equation for the per-group sample size
# (80% power, 5% significance level, equal group sizes), rounded up to a whole user
power_analysis = sms.NormalIndPower()
required_n = ceil(power_analysis.solve_power(effect_size, power=0.8, alpha=0.05, ratio=1))

print(required_n)

4720


In [12]:
# Draw `required_n` rows from each group (fixed seed for reproducibility)
in_control = df['group'] == 'control'
in_treatment = df['group'] == 'treatment'
control_sample = df.loc[in_control].sample(n=required_n, random_state=22)
treatment_sample = df.loc[in_treatment].sample(n=required_n, random_state=22)

# Stack both samples (control rows first) and renumber the index from 0
ab_test = pd.concat([control_sample, treatment_sample], axis=0).reset_index(drop=True)

## Testing the Hypothesis

In [13]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Conversion flags for each arm of the experiment
control_results = ab_test.loc[ab_test['group'] == 'control', 'converted']
treatment_results = ab_test.loc[ab_test['group'] == 'treatment', 'converted']

# Observation and conversion counts, ordered [control, treatment]
n_con, n_treat = control_results.count(), treatment_results.count()
nobs = [n_con, n_treat]
successes = [control_results.sum(), treatment_results.sum()]

# Two-proportion z-test plus a 95% confidence interval for each group's rate
z_stat, pval = proportions_ztest(successes, nobs=nobs)
ci_lower, ci_upper = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

# `p_value` is a formatted display string; the numeric p-value is `pval`
p_value = f'p-value: {pval:.3f}'

print(f'z statistic: {z_stat:.2f}')
print(p_value)
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: -0.34
p-value: 0.732
ci 95% for control group: [0.114, 0.133]
ci 95% for treatment group: [0.116, 0.135]


In [14]:
# Decide at the 5% significance level using the numeric p-value from the z-test.
# `p_value` is a formatted display string ('p-value: ...'), so comparing it with
# the string '0.05' is a lexicographic comparison that is always False and the
# null hypothesis could never be rejected. Compare the float `pval` instead.
if pval < 0.05:
    print('the null hypothesis should be rejected and accept alternate hypothesis')
    print(alternate_hypothesis)
else:
    print('the alternate hypothesis should be rejected and accept null hypothesis')
    print(null_hypothesis)

the alternate hypothesis should be rejected and accept null hypothesis
Perubahan Landing Page Tidak Berpengaruh Terhadap Converted
