In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as ss
from tqdm.auto import tqdm

In [2]:
# Function to print all information about DataFrame
def review_dataframe(df):
    print(" DATA INFO ".center(125,'-'))
    print(df.info())
    
    print(" SHAPE OF DATASET ".center(125,'-'))
    print('Rows:{}'.format(df.shape[0]))
    print('Columns:{}'.format(df.shape[1]))
    
    print(" DATA TYPES ".center(125,'-'))
    print(df.dtypes)
    
    print(" STATISTICS OF DATA ".center(125,'-'))
    print(df.describe(include="all"))
    
    print(" MISSING VALUES ".center(125,'-'))
    print(df.isnull().sum()[df.isnull().sum()>0].sort_values(ascending = False))
    
    print(" DUPLICATED VALUES ".center(125,'-'))
    print(df.duplicated().sum())

In [3]:
df_for_bs = pd.read_csv('data/for_bootstrap.csv', decimal=',', sep=';')
df_for_bs.head()

Unnamed: 0.1,Unnamed: 0,value,experimentVariant
0,1,10.380495,Control
1,2,9.546867,Control
2,3,11.088215,Control
3,4,10.147274,Control
4,5,9.789808,Control


Description:

    value – metric value
    experimentVariant – Test version (Control – control, Treatment – test)

In [4]:
review_dataframe(df=df_for_bs)

--------------------------------------------------------- DATA INFO ---------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1000 non-null   int64  
 1   value              1000 non-null   float64
 2   experimentVariant  1000 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ KB
None
------------------------------------------------------ SHAPE OF DATASET -----------------------------------------------------
Rows:1000
Columns:3
--------------------------------------------------------- DATA TYPES --------------------------------------------------------
Unnamed: 0             int64
value                float64
experimentVariant     object
dtype: object
----------------------------------------------------- STATISTICS OF DATA ---------------

In [5]:
control_group = df_for_bs.query('experimentVariant == "Control"').value
control_group

0      10.380495
1       9.546867
2      11.088215
3      10.147274
4       9.789808
         ...    
495    10.973929
496    10.261074
497     8.780090
498    11.169078
499    11.499566
Name: value, Length: 500, dtype: float64

In [6]:
treatment_group = df_for_bs.query('experimentVariant == "Treatment"').value
treatment_group

500      10.380495
501       9.546867
502      11.088215
503      10.147274
504       9.789808
          ...     
995    1000.000000
996    1200.000000
997    1500.000000
998    2000.000000
999    3000.000000
Name: value, Length: 500, dtype: float64

In [7]:
print(f'Size of control group:    {control_group.shape[0]}')
print(f'Size of treatment group:  {treatment_group.shape[0]}')

Size of control group:    500
Size of treatment group:  500


In [8]:
# Checking for equals
treatment_group.equals(control_group)

False

In [9]:
def get_bootstrap_results(column_1,
                          column_2,
                          boot_iterations,
                          statistic,
                          boot_conf_level
                         ):
    boot_len = max(len(column_1), len(column_2))
    boot_data = []
    for iteration in range(boot_iterations):
        sample_column_1 = column_1.sample(boot_len, replace=True).values
        sample_column_2 = column_2.sample(boot_len, replace=True).values
        boot_data.append(statistic(sample_column_1 - sample_column_2))
        
    df_boot_data = pd.DataFrame(boot_data)
    
    p_value = min(sum(np.array(boot_data)<=0), 
                  sum(np.array(boot_data)>=0)
                 ) * 2 / len(boot_data)
    return p_value
    

In [10]:
df_results_bs_mw = pd.DataFrame(columns=['Boot Mean', 'Boot Median', 'P Mann-Whitneyu'])

result_mannwhitneyu = ss.mannwhitneyu(control_group, treatment_group, alternative='two-sided')

for iteration in tqdm(range(10)):
    result_bs_mean = get_bootstrap_results(column_1=control_group, 
                                           column_2=treatment_group, 
                                           boot_iterations=1000, 
                                           statistic=np.mean, 
                                           boot_conf_level=0.95
                                          )
    result_bs_median = get_bootstrap_results(column_1=control_group, 
                                             column_2=treatment_group, 
                                             boot_iterations=1000, 
                                             statistic=np.median, 
                                             boot_conf_level=0.95
                                            )

    temp_data = {'Boot Mean': [result_bs_mean], 'Boot Median': [result_bs_median], 'P Mann-Whitneyu': [result_mannwhitneyu.pvalue]}
    df_results_bs_mw = pd.concat([df_results_bs_mw, pd.DataFrame(temp_data)], ignore_index=True)
    
#     print(f'Boot mean: {result_bs_mean};\tBoot median: {result_bs_median};\tMN: {result_mannwhitneyu.pvalue}')

df_results_bs_mw

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,Boot Mean,Boot Median,P Mann-Whitneyu
0,0.0,0.942,0.859215
1,0.006,0.882,0.859215
2,0.002,0.958,0.859215
3,0.006,0.88,0.859215
4,0.004,0.9,0.859215
5,0.004,0.924,0.859215
6,0.008,0.834,0.859215
7,0.0,0.9,0.859215
8,0.006,0.912,0.859215
9,0.002,0.916,0.859215
