In [7]:
# Packages imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

%matplotlib inline

In [6]:
#Power analysis - Minimum required sample size for Test and Control Groups 

def get_two_proportions_sample_size(
    p1,
    p2,
    alpha=0.05,
    beta=0.2,
    two_side=False
):
    if two_side:
        z_a = stats.norm.ppf(1-alpha/2)
    else:
        z_a = stats.norm.ppf(1-alpha)
    z_b = stats.norm.ppf(1 - beta)
    
    delta = abs(p1 - p2)
    p_bar = (p1 + p2) / 2
    q_bar = 1 - p_bar
    q1 = 1 - p1
    q2 = 1 - p2
    
    n = (np.sqrt(p_bar*q_bar*2)*z_a + np.sqrt(p1*q1+p2*q2)*z_b)**2 / (delta**2)
    return int(np.ceil(n))

get_two_proportions_sample_size(0.015, 0.02)

8504

In [8]:
#Import dataframe, Check for duplicate users

df = pd.read_csv('churn_email_2021_12_03.csv')
duplicate_counts = df['renter_id'].value_counts(ascending=False)
multi_users = duplicate_counts[duplicate_counts > 1].count()

print(f'There are {multi_users} users that appear multiple times in the dataset')

There are 0 users that appear multiple times in the dataset


In [9]:
#Check for null values

df.isnull().sum()

renter_id                 0
test_group                0
rental_count              0
total_gross_revenue       0
promocode_spend_amount    0
dtype: int64

In [17]:
#New boolean column added for converted vs non-converted

converted = df['rental_count'] > 0
df['converted'] = converted.astype('int')

In [10]:
#Snapshot of group aggregates 

df[['test_group','rental_count','total_gross_revenue','promocode_spend_amount']].groupby('test_group').describe().style.format('{:.1f}')

Unnamed: 0_level_0,rental_count,rental_count,rental_count,rental_count,rental_count,rental_count,rental_count,rental_count,total_gross_revenue,total_gross_revenue,total_gross_revenue,total_gross_revenue,total_gross_revenue,total_gross_revenue,total_gross_revenue,total_gross_revenue,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount,promocode_spend_amount
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
test_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Control,41320.0,1.8,2.0,0.0,1.0,1.0,2.0,32.0,41320.0,39.5,48.3,0.0,14.0,25.0,46.0,1343.0,41320.0,0.1,1.3,0.0,0.0,0.0,0.0,66.5
Tail,403.0,1.8,2.0,0.0,1.0,1.0,2.0,14.0,403.0,39.8,53.9,0.0,14.0,22.5,46.0,489.0,403.0,0.1,1.2,0.0,0.0,0.0,0.0,21.4
Test,41670.0,1.8,2.0,0.0,1.0,1.0,2.0,27.0,41670.0,39.4,46.7,0.0,14.0,24.0,46.0,700.0,41670.0,0.4,2.2,0.0,0.0,0.0,0.0,111.0


In [14]:
#homogeneity of variance aka assumption that groups have equal variance. Significant result means unequal variance


x,pval = stats.levene(df['rental_count'][df['test_group']== 'Control'],
             df['rental_count'][df['test_group']== 'Test'],
             center= 'mean')

if pval > 0.05:
    print('P-Value = {}. Groups have equal variance'.format(pval))
else:
    print('P-Value = {}. Groups do not have equal variance'.format(pval))

P-Value = 0.5295790828884632. Groups have equal variance


In [18]:
#Snapshot of conversion rate and group variance

groups = ['Test','Control']
df_clean = df[df['test_group'].isin(groups)]

conversion_rates = df_clean.groupby('test_group')['converted']

std_p = lambda x: np.std(x, ddof=0)              # Std. deviation of the proportion
se_p = lambda x: stats.sem(x, ddof=0)            # Std. error of the proportion (std / sqrt(n))

conversion_rates = conversion_rates.agg([np.mean, std_p, se_p])
conversion_rates.columns = ['conversion_rate', 'std_deviation', 'std_error']


conversion_rates.style.format('{:.2f}')

Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
test_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,0.95,0.22,0.0
Test,0.95,0.23,0.0


In [19]:
#Aggregates per renter by group

sum_columns = ['rental_count', 'total_gross_revenue', 'promocode_spend_amount','converted']
stats_per_renter = df_clean.groupby('test_group').agg({'renter_id':'count', 'rental_count': 'sum', 'total_gross_revenue': 'sum', 'promocode_spend_amount': 'sum', 'converted': 'sum'})
stats_per_renter['rentals_per_renter'] = stats_per_renter['rental_count'] / stats_per_renter['renter_id']
stats_per_renter['gmv_per_renter'] = stats_per_renter['total_gross_revenue'] / stats_per_renter['renter_id']
stats_per_renter['promo_spend_per_renter'] = stats_per_renter['promocode_spend_amount'] / stats_per_renter['renter_id']
stats_per_renter.style.format('{:.2f}')

Unnamed: 0_level_0,renter_id,rental_count,total_gross_revenue,promocode_spend_amount,converted,rentals_per_renter,gmv_per_renter,promo_spend_per_renter
test_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Control,41320.0,73816.0,1632947.53,5449.84,39206.0,1.79,39.52,0.13
Test,41670.0,74011.0,1641165.11,15770.79,39419.0,1.78,39.38,0.38


In [20]:
#t-tests for rental, GMV, and promospend values. First 'Rental Count'. Not significant

import researchpy as rp

rp.ttest(group1= df['rental_count'][df['test_group'] == 'Control'], group1_name= "Control",
         group2= df['rental_count'][df['test_group'] == 'Test'], group2_name= "Test")

(   Variable        N      Mean        SD        SE  95% Conf.  Interval
 0   Control  41320.0  1.786447  2.021219  0.009943   1.766958  1.805936
 1      Test  41670.0  1.776122  2.015708  0.009875   1.756768  1.795476
 2  combined  82990.0  1.781263  2.018448  0.007007   1.767530  1.794996,
                Independent t-test     results
 0  Difference (Control - Test) =       0.0103
 1           Degrees of freedom =   82988.0000
 2                            t =       0.7368
 3        Two side test p value =       0.4612
 4       Difference < 0 p value =       0.7694
 5       Difference > 0 p value =       0.2306
 6                    Cohen's d =       0.0051
 7                    Hedge's g =       0.0051
 8                Glass's delta =       0.0051
 9                  Pearson's r =       0.0026)

In [21]:
#GMV. Not significant

rp.ttest(group1= df['total_gross_revenue'][df['test_group'] == 'Control'], group1_name= "Control",
         group2= df['total_gross_revenue'][df['test_group'] == 'Test'], group2_name= "Test")

(   Variable        N       Mean         SD        SE  95% Conf.   Interval
 0   Control  41320.0  39.519543  48.302610  0.237624  39.053795  39.985291
 1      Test  41670.0  39.384812  46.699272  0.228770  38.936419  39.833205
 2  combined  82990.0  39.451893  47.504086  0.164899  39.128693  39.775094,
                Independent t-test     results
 0  Difference (Control - Test) =       0.1347
 1           Degrees of freedom =   82988.0000
 2                            t =       0.4085
 3        Two side test p value =       0.6829
 4       Difference < 0 p value =       0.6586
 5       Difference > 0 p value =       0.3414
 6                    Cohen's d =       0.0028
 7                    Hedge's g =       0.0028
 8                Glass's delta =       0.0028
 9                  Pearson's r =       0.0014)

In [22]:
#Promo spend. Significant

rp.ttest(group1= df['promocode_spend_amount'][df['test_group'] == 'Control'], group1_name= "Control",
         group2= df['promocode_spend_amount'][df['test_group'] == 'Test'], group2_name= "Test")

(   Variable        N      Mean        SD        SE  95% Conf.  Interval
 0   Control  41320.0  0.131894  1.326445  0.006525   0.119104  0.144684
 1      Test  41670.0  0.378469  2.186945  0.010713   0.357470  0.399467
 2  combined  82990.0  0.255701  1.814560  0.006299   0.243355  0.268047,
                Independent t-test     results
 0  Difference (Control - Test) =      -0.2466
 1           Degrees of freedom =   82988.0000
 2                            t =     -19.6182
 3        Two side test p value =       0.0000
 4       Difference < 0 p value =       0.0000
 5       Difference > 0 p value =       1.0000
 6                    Cohen's d =      -0.1362
 7                    Hedge's g =      -0.1362
 8                Glass's delta =      -0.1859
 9                  Pearson's r =       0.0679)

In [23]:
#import churn deciles and add to main dataframe. Check null values in new column

churn_list = pd.read_csv('customer_churn_2021_11_10.txt', delimiter= '\t')
df_plus_churn = pd.merge(df,churn_list[['renter_id', 'decile']],on='renter_id',how='left')
df_plus_churn['decile'].isnull().sum()

3060

In [46]:
#fill null values with 0. Change data type to int. Create list for looping

df_plus_churn['decile'] = df_plus_churn['decile'].fillna(0)
df_plus_churn['decile'] = df_plus_churn['decile'].astype(int)
deciles = [1,2,3,4,5,6,7,8,9,10]

In [51]:
#Loop through each decile checking if any are significant. Other than promo spend, no deciles are significantly different

for decile in deciles:
    filtered_df = df_plus_churn[df_plus_churn['decile'] == decile]
    print('T-Test results for decile {}'.format(decile))
    print(rp.ttest(group1= filtered_df['promocode_spend_amount'][filtered_df['test_group'] == 'Control'], group1_name= "Control",
                    group2= filtered_df['promocode_spend_amount'][filtered_df['test_group'] == 'Test'], group2_name= "Test"))
    

T-Test results for decile 1
(   Variable       N      Mean        SD        SE  95% Conf.  Interval
0   Control  1304.0  0.086887  0.688282  0.019060   0.049495  0.124279
1      Test  1451.0  0.323118  1.571396  0.041253   0.242197  0.404040
2  combined  2755.0  0.211305  1.240222  0.023629   0.164974  0.257637,                Independent t-test    results
0  Difference (Control - Test) =     -0.2362
1           Degrees of freedom =   2753.0000
2                            t =     -5.0135
3        Two side test p value =      0.0000
4       Difference < 0 p value =      0.0000
5       Difference > 0 p value =      1.0000
6                    Cohen's d =     -0.1913
7                    Hedge's g =     -0.1913
8                Glass's delta =     -0.3432
9                  Pearson's r =      0.0951)
T-Test results for decile 2
(   Variable       N      Mean        SD        SE  95% Conf.  Interval
0   Control  1629.0  0.133333  0.972996  0.024107   0.086049  0.180618
1      Test  1565.0

In [59]:
#pivot data by decile. By renter averages for testing values. Clearer picture of differences between groups

df_plus_churn_clean = df_plus_churn[df_plus_churn['test_group'].isin(groups)]

df_churn_pivot = df_plus_churn_clean.pivot_table(index='decile', 
                                                 columns='test_group', 
                                                 values= ['rental_count', 'converted','total_gross_revenue', 'promocode_spend_amount'],
                                                 aggfunc='mean')
df_churn_pivot.style.format('{:.2f}')

Unnamed: 0_level_0,converted,converted,promocode_spend_amount,promocode_spend_amount,rental_count,rental_count,total_gross_revenue,total_gross_revenue
test_group,Control,Test,Control,Test,Control,Test,Control,Test
decile,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,0.92,0.91,0.33,0.94,1.38,1.36,33.69,34.89
1,0.94,0.93,0.09,0.32,1.37,1.38,28.88,29.55
2,0.94,0.93,0.13,0.39,1.23,1.29,31.02,31.0
3,0.93,0.93,0.11,0.42,1.24,1.23,31.24,31.39
4,0.91,0.93,0.24,0.33,1.19,1.19,30.72,32.94
5,0.92,0.94,0.26,0.57,1.15,1.15,34.18,32.32
6,0.94,0.93,0.21,0.36,1.23,1.2,33.47,34.85
7,0.94,0.93,0.24,0.38,1.2,1.16,34.28,33.52
8,0.94,0.94,0.15,0.33,1.24,1.26,31.89,32.68
9,0.94,0.94,0.14,0.37,1.4,1.39,33.88,33.97
