In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r'C:\Users\user\Documents\Data Science\bestsellers with categories.csv')
df.head()

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction


# a) Proportion One Population Z-Test

In [3]:
# Fiction = 1, Non-Fiction = 0
df['Genre_Bin'] = np.where(df['Genre'] == 'Fiction', 1, 0)
df.head()

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre,Genre_Bin
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction,0
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction,1
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction,0
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction,1
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction,0


In [4]:
number_fiction  =  np.sum(df['Genre_Bin'])
total = len(df['Genre_Bin'])
fiction_rate = number_fiction/total
print('Proporsi jumlah buku fiksi: ', fiction_rate)

Proporsi jumlah buku fiksi:  0.43636363636363634


**Ho: Proporsi jumlah buku fiksi = 0.5   
Ha: Proporsi jumlah buku fiksi > 0.5**

In [6]:
from statsmodels.stats.proportion import proportions_ztest
z_stats, p_value = proportions_ztest(fiction_rate, total, 0.5, alternative = 'larger')
print('p-value: ', p_value)
print('z-stats: ', z_stats)

p-value:  1.0
z-stats:  -415.8063261233479


In [7]:
df['Genre'].value_counts()

Non Fiction    310
Fiction        240
Name: Genre, dtype: int64

# b) Proportion Two Population Z-Test

In [8]:
# A/B Testing
visitor_data_before = np.random.binomial(1, 0.5, 1000)
visitor_data_after = np.random.binomial(1, 0.7, 1000)
visitor_data_before = np.where(visitor_data_before == 1, 'buying', 'no buying')
visitor_data_after = np.where(visitor_data_after == 1, 'buying', 'no buying')

In [12]:
df_before = pd.DataFrame({'layout':'old layout', 'conversion': visitor_data_before})
df_after = pd.DataFrame({'layout':'new layout', 'conversion': visitor_data_after})

In [13]:
df_full = df_before.append(df_after)
df_full

Unnamed: 0,layout,conversion
0,old layout,buying
1,old layout,no buying
2,old layout,buying
3,old layout,no buying
4,old layout,no buying
...,...,...
995,new layout,buying
996,new layout,buying
997,new layout,buying
998,new layout,buying


In [14]:
pd.crosstab(df_full['layout'], df_full['conversion'])

conversion,buying,no buying
layout,Unnamed: 1_level_1,Unnamed: 2_level_1
new layout,734,266
old layout,509,491


**Ho: p-before = p-after   
Ha: p-after > p-before**

In [15]:
# Conversion Rate Before 
n_success_before = len(df_full[(df_full['layout'] == 'old layout') & (df_full['conversion'] == 'buying')])
n_before = len(df_full[(df_full['layout'] == 'old layout')])
success_rate_before = n_success_before/n_before

print('Conversion rate before: ', success_rate_before)

Conversion rate before:  0.509


In [16]:
# Conversion Rate After
n_success_after = len(df_full[(df_full['layout'] == 'new layout') & (df_full['conversion'] == 'buying')])
n_after = len(df_full[(df_full['layout'] == 'new layout')])
success_rate_after = n_success_after/n_after

print('Conversion rate after: ', success_rate_after)

Conversion rate after:  0.734


In [17]:
# Total number buying (success)
number_success = np.array([n_success_before, n_success_after])

# Total Traffic (all visitor = not buying + buying)
total = np.array([n_before, n_after])

In [19]:
from statsmodels.stats.proportion import proportions_ztest
z_stat, p_value = proportions_ztest(number_success, total, alternative = 'smaller')

print('p-value: ', p_value)

p-value:  1.6411909747198188e-25


# c) Chi-Squared Test for Independence betweet Two Categorical Variables

In [20]:
df_c = pd.DataFrame(data = [[207, 282, 241], [234, 242, 232]],
                   columns = ['Banana', 'Mango', 'Pineapple'], 
                   index = ['Male Customer', 'Female Customer'])
df_c

Unnamed: 0,Banana,Mango,Pineapple
Male Customer,207,282,241
Female Customer,234,242,232


**Ho: Two variables are independent  
Ha: Two variables are dependent**

In [26]:
from scipy.stats import chi2_contingency

data = [[207, 282, 241], [234, 242, 232]]
stat, pvalue, dof, expected = chi2_contingency(data)

# Conclusion
alpha = 0.05
print('p-value: ', pvalue)
if pvalue <= alpha:
    print('Dependent : Reject Ho')
else:
    print('Independent : Accpet Ho')

p-value:  0.1031971404730939
Independent : Accpet Ho
