In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# Adjust Dataset for bootstrap

In [None]:
# Two options to start with, choose either one, but one at a time.
# Both options must assign to `query`, because the lines below print and execute `query`.
# NOTE(review): `s` is not imported or defined in any cell visible in this notebook -
# it has to exist in the kernel before this cell runs; confirm where it comes from.
# NOTE(review): the paths below are absolute paths on one machine; they will not resolve elsewhere.
# Option 1: importing data with outliers
query = s.parse_query_file('/Users/leon.zhao/Desktop/SQL/simplification_test_summary')

# Option 2: importing data without outliers
# query = s.parse_query_file('/Users/leon.zhao/Desktop/SQL/simplification_test_summary_no_outliers')

# Printing the query to check
print(query)
# execute_query returns a pair that is unpacked into the result rows and the column names
rows, colnames = s.execute_query(query)

In [None]:
# Build the working DataFrame from the query result (one column per entry
# in `colnames`) and preview its first rows.
df = pd.DataFrame(rows, columns=colnames)
df.head()

In [None]:
# Choosing the desired metric and taking out the zeros
#
# For each platform, `rev_output` receives two frames:
#   '<plat>_rev'          - PLAT, VARIANT_NAME, the three revenue components and
#                           their row-wise total IAP_REVENUE (kept as the LAST column,
#                           which later cells rely on via .iloc[:, -1])
#   '<plat>_rev_no_zeros' - the same frame restricted to rows whose IAP_REVENUE != 0

revenue_components = ['BUMP_REVENUE', 'PROMOTION_REVENUE', 'SUB_REVENUE']

platform = {}
rev_output = {}
for plat in ['AD','iOS']:
    df1 = df.loc[df['PLAT'] == plat]
    platform[plat] = df1

    # .copy() gives an independent frame, so adding IAP_REVENUE below does not
    # write into a slice of `df` (avoids SettingWithCopyWarning).
    plat_rev = df1.loc[:, ['PLAT', 'VARIANT_NAME'] + revenue_components].copy()

    # Sum only the revenue columns. The previous positional slice (.iloc[:, 1:])
    # also included the string column VARIANT_NAME, which raises on pandas >= 2.0
    # and only worked on older versions because non-numeric columns were dropped silently.
    plat_rev['IAP_REVENUE'] = plat_rev[revenue_components].sum(axis=1)

    rev_output[plat + '_rev'] = plat_rev
    rev_output[plat + '_rev_no_zeros'] = plat_rev[plat_rev['IAP_REVENUE'] != 0.0]


In [None]:
# List of the frame names held in `rev_output`, in insertion order
# (these are the dictionary keys, e.g. one per platform/zero-filter combination).

df_names = list(rev_output.keys())

In [None]:
# Describing the sliced data within each variant
#
# For every frame in `rev_output` and every variant, print summary statistics and
# store the frame's last column (IAP_REVENUE) for that variant in `variant_dic`
# under the key '<frame name>_<variant>_iap'.

variant_dic = {}
# The loop variable is deliberately NOT called `df`: reusing that name replaced the
# source DataFrame with a string, so earlier cells could not be re-run afterwards.
for frame_name in df_names:
    frame = rev_output[frame_name]
    for variant_name in ['Control', 'Subs_1d_3d', 'Subs_bump_3d', 'Subs_1d_3d_7d_14d']:
        # Filter once and reuse the slice for both the printout and the stored series.
        variants = frame.loc[frame['VARIANT_NAME'] == variant_name]
        print(frame_name+'_'+variant_name)
        print(variants.describe())
        variant_dic[frame_name+'_'+variant_name+'_iap'] = variants.iloc[:,-1]

# NOTE(review): `iap` is overwritten on every pass and is not read by any later cell
# visible in this notebook; kept only so the set of names this cell defines is unchanged.
for key, value in variant_dic.items():
    iap = pd.DataFrame(value)

In [None]:
# Building a dictionary with the same keys as `variant_dic`, where each value is
# a numpy array holding only the entries that are strictly greater than zero.

variant_dic_nonzero = {
    variant: np.array([amount for amount in value_series.values if amount > 0])
    for variant, value_series in variant_dic.items()
}

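# Computing the following for each platform and variant
1. User penetration (share of bucketed users with non-zero IAP revenue)
2. Number of users bucketed
3. Total sum of the key metric (IAP revenue) within the bucket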

In [None]:
# `p` maps '<platform>_rev_<variant>' to a 3-tuple:
#   (non-zero row count / total row count, total row count, sum of the IAP series)
p = {}
proportion = [['AD_rev', 'AD_rev_no_zeros'],['iOS_rev', 'iOS_rev_no_zeros']]
variant_name = ['Control', 'Subs_1d_3d', 'Subs_bump_3d', 'Subs_1d_3d_7d_14d']
for all_rows_prefix, nonzero_prefix in proportion:
    for v_name in variant_name:
        all_rows_key = all_rows_prefix + '_' + v_name
        all_rows = variant_dic[all_rows_key + '_iap']
        nonzero_rows = variant_dic[nonzero_prefix + '_' + v_name + '_iap']
        n_total = all_rows.count()
        p[all_rows_key] = (nonzero_rows.count() / n_total, n_total, all_rows.sum())

p

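# Sampling the data num_iterations times
1. Simulating the number of purchasers with binomial draws
2. Simulating the value spent with random draws (with replacement) from the observed positive spend values
3. Generating sim_final: simulated revenue per bucketed user (total simulated spend divided by the number of users bucketed)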

In [None]:
# Bootstrap simulation.
#
# For every key of `p` and every iteration:
#   1. draw how many users spend, Binomial(n = users bucketed, p = penetration)
#   2. draw that many values, with replacement, from the observed positive spend
#   3. record total drawn spend divided by the number of users bucketed
#
# Results (each dict maps a key of `p` to a list with one entry per iteration):
#   binomial  - the binomial draw
#   revenue   - the array of drawn spend values
#   sim_final - sum of the drawn values / users bucketed
num_iterations = 10000

# A fixed seed makes the simulated confidence intervals and p-values reproducible
# across kernel restarts; a local generator leaves numpy's global random state untouched.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

binomial = {}
revenue = {}
sim_final = {}
for variants, values in p.items():
    penetration, n_bucketed = values[0], values[1]
    spend_pool = variant_dic_nonzero[variants + '_iap']

    n_spenders_draws = binomial.setdefault(variants, [])
    spend_draws = revenue.setdefault(variants, [])
    per_user_draws = sim_final.setdefault(variants, [])

    for i in range(num_iterations):
        n_spenders = rng.binomial(n_bucketed, penetration)
        spend = rng.choice(spend_pool, size=n_spenders, replace=True)

        n_spenders_draws.append(n_spenders)
        # NOTE(review): every drawn array is retained; no later cell visible in this
        # notebook reads `revenue`, and it dominates memory use for large buckets.
        spend_draws.append(spend)
        per_user_draws.append(np.sum(spend) / n_bucketed)


# Get differences for key metric variants vs. control

In [None]:
def _diffs_vs_control(names, sims):
    """Return per-iteration differences of every non-control variant against control.

    Parameters
    ----------
    names : list of str
        Keys of `sims` belonging to one platform. Exactly one must end in '_Control'.
    sims : dict
        Maps each name to a sequence of simulated values, one per iteration.

    Returns
    -------
    dict
        Maps every non-control name (in the order given) to a list whose j-th
        element is sims[name][j] - sims[control][j].

    Raises
    ------
    ValueError
        If `names` does not contain exactly one control key.
    """
    controls = [name for name in names if name.endswith('_Control')]
    if len(controls) != 1:
        raise ValueError('expected exactly one *_Control key, found: {}'.format(controls))
    control = controls[0]
    baseline = np.asarray(sims[control])
    return {name: list(np.asarray(sims[name]) - baseline)
            for name in names if name != control}


variant_name = sorted(p.keys())

# Split by platform prefix and locate the control by name, rather than assuming
# that the sorted list divides into two equal halves with the control sorting first.
ad_variant_name = [name for name in variant_name if name.startswith('AD_')]
ios_variant_name = [name for name in variant_name if name.startswith('iOS_')]

ad_bootstrapped_diff = _diffs_vs_control(ad_variant_name, sim_final)
ios_bootstrapped_diff = _diffs_vs_control(ios_variant_name, sim_final)


In [None]:
# Show the bootstrapped differences of each AD variant as a numpy array
# (numpy abbreviates long arrays when printing).
for diffs in ad_bootstrapped_diff.values():
    print(np.array(diffs))

In [None]:
# One histogram per AD variant of the bootstrapped differences vs. control,
# overlaid with a normal curve fitted to the same differences.
for key, value in ad_bootstrapped_diff.items():
    diffs = np.array(value)
    # Store the array form back; replacing the value of an existing key does not
    # change the dict's size, so it is safe while iterating over .items().
    ad_bootstrapped_diff[key] = diffs

    mean = diffs.mean()
    std = diffs.std()

    counts, edges, _ = plt.hist(diffs, bins=1000, label=key, density=False)

    # The histogram shows counts, so the density is rescaled by
    # (number of samples * bin width) to sit on the same axis; unscaled, the
    # curve is on a different scale from the bars.
    x = np.linspace(diffs.min(), diffs.max(), 200)
    bin_width = edges[1] - edges[0]
    y_pdf = stats.norm.pdf(x, mean, std) * diffs.size * bin_width
    plt.plot(x, y_pdf, label='Normal PDF (scaled to counts)')

    # Single title call that includes the variant; previously a second plt.title()
    # overwrote the variant name.
    plt.title('Histogram of Revenue Differences vs. Control: ' + key)
    plt.legend()
    plt.xlabel('Diff vs. Control')
    plt.ylabel('Frequency')
    plt.show()



In [None]:
# Summary for each AD variant's bootstrapped differences:
#   line 1 - variant key
#   line 2 - median, then the (alpha/2, 1 - alpha/2) percentile interval
#   line 3 - the smaller of 2 * share of differences below zero and
#            2 * share of differences above zero
alpha = 0.05
output = {}

for keys, diffs in ad_bootstrapped_diff.items():
    diffs = np.array(diffs)
    lower_pct = 100 * (alpha / 2.)
    upper_pct = 100 * (1 - alpha / 2.)

    val = np.percentile(diffs, 50)
    low = np.percentile(diffs, lower_pct)
    high = np.percentile(diffs, upper_pct)

    print(keys)
    print('{}\t({}, {})'.format(val, low, high))

    share_below = 2 * np.sum(diffs < 0) / num_iterations
    share_above = 2 * np.sum(diffs > 0) / num_iterations
    print(min(share_below, share_above))

In [None]:
# Summary for each iOS variant's bootstrapped differences:
#   line 1 - variant key
#   line 2 - median, then the (alpha/2, 1 - alpha/2) percentile interval
#   line 3 - the smaller of 2 * share of differences below zero and
#            2 * share of differences above zero
for keys, diffs in ios_bootstrapped_diff.items():
    diffs = np.array(diffs)
    lower_pct = 100 * (alpha / 2.)
    upper_pct = 100 * (1 - alpha / 2.)

    val = np.percentile(diffs, 50)
    low = np.percentile(diffs, lower_pct)
    high = np.percentile(diffs, upper_pct)

    print(keys)
    print('{}\t({}, {})'.format(val, low, high))

    share_below = 2 * np.sum(diffs < 0) / num_iterations
    share_above = 2 * np.sum(diffs > 0) / num_iterations
    print(min(share_below, share_above))

In [None]:
# Pull each variant's bootstrapped differences out of the two dicts as numpy arrays,
# bound to a variable named after its key.
AD_rev_Subs_1d_3d, AD_rev_Subs_bump_3d, AD_rev_Subs_1d_3d_7d_14d = (
    np.array(ad_bootstrapped_diff[name])
    for name in ('AD_rev_Subs_1d_3d', 'AD_rev_Subs_bump_3d', 'AD_rev_Subs_1d_3d_7d_14d')
)
iOS_rev_Subs_1d_3d, iOS_rev_Subs_bump_3d, iOS_rev_Subs_1d_3d_7d_14d = (
    np.array(ios_bootstrapped_diff[name])
    for name in ('iOS_rev_Subs_1d_3d', 'iOS_rev_Subs_bump_3d', 'iOS_rev_Subs_1d_3d_7d_14d')
)

# All AD differences joined end to end into one 1-D array.
ad_diff = np.concatenate(
    (AD_rev_Subs_1d_3d, AD_rev_Subs_bump_3d, AD_rev_Subs_1d_3d_7d_14d),
    0,
)

# Datasets for the box plot: the three AD arrays followed by the three iOS arrays.
gph = [
    AD_rev_Subs_1d_3d,
    AD_rev_Subs_bump_3d,
    AD_rev_Subs_1d_3d_7d_14d,
    iOS_rev_Subs_1d_3d,
    iOS_rev_Subs_bump_3d,
    iOS_rev_Subs_1d_3d_7d_14d,
]

In [None]:
# multiple box plots on one figure
# One horizontal box per entry of `gph` (outliers hidden), labelled so each box
# can be identified without reading the code.

# Labels are listed in the order in which `gph` is assembled:
# the three AD variants followed by the three iOS variants.
box_labels = ['AD_rev_Subs_1d_3d', 'AD_rev_Subs_bump_3d', 'AD_rev_Subs_1d_3d_7d_14d',
              'iOS_rev_Subs_1d_3d', 'iOS_rev_Subs_bump_3d', 'iOS_rev_Subs_1d_3d_7d_14d']

plt.figure(figsize=(12,6))
plt.boxplot(gph, showfliers=False, vert=False)
# boxplot places the i-th dataset at position i + 1; with vert=False those
# positions lie on the y axis.
plt.yticks(range(1, len(box_labels) + 1), box_labels)
# Reference line at zero difference vs. control.
plt.axvline(0, color='grey', linestyle='--', linewidth=1)
plt.xlabel('Diff vs. Control')
plt.title('Bootstrapped Revenue Differences vs. Control')
plt.show()

# Archives

In [None]:
# ARCHIVED (inactive): every line in this cell is commented out, so it does nothing when run.
# Earlier draft of the simulation loop: same binomial draw and resampling as the active
# simulation cell, but with num_iterations = 10 and keeping only the per-iteration result
# in `sim_final` (a defaultdict of lists) instead of also storing each draw.

# from collections import defaultdict

# num_iterations = 10
# sim_final = defaultdict(list)
# for variant, values in p.items():
#     #print(int(values[1]))
#     for i in range(num_iterations):
#         n_non_zero = np.random.binomial(values[1], values[0])  
#         revenue = np.sum(np.random.choice(a=variant_dic_nonzero[variant + '_iap'], size=n_non_zero, replace=True))
#         sim_final[variant].append(revenue/values[1])

# sim_final

In [None]:
# ARCHIVED (inactive): every line in this cell is commented out, so it does nothing when run.
# Single-pair draft of the interval / p-value summary for AD control vs. Subs_1d_3d.
# Note the sign: this draft subtracts the variant FROM control, whereas the active
# cells compute variant minus control.

# bootstrapped_means_diff = np.array(sim_final['AD_rev_Control']) - np.array(sim_final['AD_rev_Subs_1d_3d'])
# alpha=0.05
# low = np.percentile(bootstrapped_means_diff, 100 * (alpha / 2.))
# val = np.percentile(bootstrapped_means_diff, 50)
# high = np.percentile(bootstrapped_means_diff, 100 * (1 - alpha / 2.))

# print(bootstrapped_means_diff.shape[0])
# # print(variant)
# print('{}\t({}, {})'.format(val,low,high))
# print('p-value: {}'.format(min(2*np.sum(bootstrapped_means_diff < 0)/bootstrapped_means_diff.shape[0],
#                                2*np.sum(bootstrapped_means_diff > 0)/bootstrapped_means_diff.shape[0])))

# print(bootstrapped_means_diff > 0)



In [None]:
# ARCHIVED (inactive): every line in this cell is commented out, so it does nothing when run.
# Draft that prints, for each key of `p`, the median and the (alpha/2, 1 - alpha/2)
# percentile interval of that variant's own `sim_final` values (not a difference
# vs. control, despite the variable name `bootstrapped_means_diff`).

# for variant in p.keys():
#     bootstrapped_means_diff = np.array(sim_final[variant])
#     alpha=0.05
#     low = np.percentile(bootstrapped_means_diff, 100 * (alpha / 2.))
#     val = np.percentile(bootstrapped_means_diff, 50)
#     high = np.percentile(bootstrapped_means_diff, 100 * (1 - alpha / 2.))

#     # print(ctrl.describe(), test.describe())
#     print(variant)
#     print('{}\t({}, {})'.format(val,low,high))
#     print('\n')