# Build a P-value interpreter function with ChatGPT

In [1]:
def interpret_p_value(p_value, threshold=0.05):
    """
    Print a plain-language reading of a p-value at a given significance level.

    Parameters:
    p_value (float): The p-value to interpret; must lie in [0, 1].
    threshold (float): The significance level to compare against; must lie
        in [0, 1]. Defaults to 0.05.

    Returns:
    None: The interpretation is printed, not returned.

    Raises:
    ValueError: If p_value or threshold falls outside [0, 1]. p_value is
        checked first.

    Description:
    Two lines are printed. The first states how the p-value compares with the
    threshold, the second states the conclusion:
    - p < threshold: Evidence suggests rejecting the null hypothesis.
    - p >= threshold: Not enough evidence to reject the null hypothesis.

    A p-value is only one input to a statistical decision; read it together
    with the study design and the other evidence available.
    """
    # Validate both inputs with the same range rule, p_value before threshold.
    range_checks = (
        (p_value, "p_value must be between 0 and 1."),
        (threshold, "threshold must be between 0 and 1."),
    )
    for value, message in range_checks:
        if not (0 <= value <= 1):
            raise ValueError(message)

    if p_value < threshold:
        report = (
            f"p-value ({p_value}) is less than the threshold ({threshold}).",
            "Evidence suggests rejecting the null hypothesis.",
        )
    else:
        report = (
            f"p-value ({p_value}) is greater than or equal to the threshold ({threshold}).",
            "Not enough evidence to reject the null hypothesis.",
        )

    for line in report:
        print(line)

# Example usage: 0.03 is below the 0.05 threshold, so the "reject the null
# hypothesis" message is printed
interpret_p_value(0.03, threshold=0.05)

p-value (0.03) is less than the threshold (0.05).
Evidence suggests rejecting the null hypothesis.


# Libraries and Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm

: 

In [None]:
# Load the data (the CSV path is relative, so it is resolved against the
# current working directory) and preview the first rows
df = pd.read_csv("Krushing Kingdoms.csv")
df.head()

: 

In [None]:
# df information
df.info()

: 

# Define Hypothesis

### Null Hypothesis: There is no difference in retention between the two groups

### Alternative Hypothesis: There is a difference in retention between the two groups

# EDA

In [None]:
# 1. Descriptive Statistics: summary statistics of df
df.describe()

# The very high standard deviation suggests outliers are present...need to clean.

: 

The data has outliers and we should clean it

In [None]:
# 2. Group Comparisons
# Per access level: mean and median minutes played, plus the mean day-1 and
# day-7 retention
aggregations = {
    'minutes_played': ['mean', 'median'],
    'day1_retention': 'mean',
    'day7_retention': 'mean',
}
grouped = df.groupby('team_level_access').agg(aggregations)
print(grouped)

: 

In [None]:
# Distribution of team level access: number of rows per access level
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='team_level_access', data=df, ax=ax)
ax.set_title('Distribution of Team Level Access')
plt.show()

: 

In [None]:
# Boxplot of minutes played by team level (one box per access level)
plt.figure(figsize=(6, 4))
sns.boxplot(x='team_level_access', y='minutes_played', data=df)
plt.title('Minutes Played by Team Level')
plt.show()

# The boxplot gives a visual view of the outliers in minutes_played

: 

In [None]:
# Binning the minutes played into categories
# Bin edges are in minutes; each label names the range of whole minutes it covers.
bins = [0, 60, 120, 180, 240, 300, 360]
labels = ['0-60', '61-120', '121-180', '181-240', '241-300', '301-360']
# right=True closes every bin on its upper edge so the bins agree with the
# labels (60 -> '0-60', 120 -> '61-120'); include_lowest=True keeps a value of
# exactly 0 in the first bin. Values above 360 match no bin and become NaN.
df['minutes_played_group'] = pd.cut(df['minutes_played'],
                                    bins=bins, labels=labels,
                                    right=True, include_lowest=True)

# Creating a pivot table: number of rows per (access level, minutes bin),
# with 0 where a combination does not occur
cohort_data = df.pivot_table(index='team_level_access',
                            columns='minutes_played_group',
                            aggfunc='size', fill_value=0)

# Plotting the cohort data (fmt="d" prints the counts as integers)
plt.figure(figsize=(12, 6))
sns.heatmap(cohort_data, annot=True, fmt="d", cmap="YlGnBu")
plt.title('Cohort Visualization of Minutes Played per Level Access')
plt.xlabel('Minutes Played Group')
plt.ylabel('Team Level Access')
plt.show()

: 

# Outliers

In [None]:
# Look at the minutes played
df.minutes_played.hist()

: 

In [None]:
# Summary statistics
df.describe()

: 

In [None]:
# Calculating IQR for minutes_played
# Q1 / Q3 are the 25th and 75th percentiles; the IQR is the spread between them
Q1 = df['minutes_played'].quantile(0.25)
Q3 = df['minutes_played'].quantile(0.75)
IQR = Q3 - Q1

: 

In [None]:
# Defining outliers for minutes_played: anything more than 1.5 * IQR below Q1
# or above Q3
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (df['minutes_played'] < lower_fence) | (df['minutes_played'] > upper_fence)


# Removing outliers based on minutes_played (keep only the rows not flagged)
df_cleaned = df.loc[~outliers]

df_cleaned.describe()  # Display summary statistics without outliers

: 

# Randomization

### Simple Randomization

In [None]:
# Let's say '1' is for group A and '0' is for group B
# Each of the number_of_users draws is made independently with probability 0.5
# per group, so the two group sizes are not forced to be equal
number_of_users = 500
group_assignment = np.random.choice([0,1], size = number_of_users, p = [0.5, 0.5])

# Count the number of elements per group
np.unique(group_assignment, return_counts = True)

: 

### Block Randomization

In [None]:
from sklearn.utils import shuffle

: 

In [None]:
# "block_size" is how many users of EACH group go into one block, so a block
# holds 2 * block_size users split evenly between group 0 and group 1
block_size = 50
blocks = [0] * block_size + [1] * block_size
# Only complete blocks are generated; if number_of_users is not a multiple of
# the block length, the remainder gets no assignment
n_blocks = number_of_users // len(blocks)
# Shuffle every block on its own so the groups stay balanced inside each
# consecutive block. Shuffling the concatenated list as a whole would only
# balance the overall totals, which is not block randomization.
group_assignment = [
    assignment
    for _ in range(n_blocks)
    for assignment in shuffle(blocks)
]

# Count the number of elements per group
np.unique(group_assignment, return_counts = True)

: 

### Stratified Randomization

In [None]:
# Build a copy from our df
df_stratified = df.copy()

: 

In [None]:
# Create an age group variable
# The labels are contiguous 10-year ranges ('26-35', not '26-34', so that age
# 35 is covered). Each row gets a label drawn uniformly at random: the age
# group is synthetic and only serves to demonstrate stratification.
age_groups = ['18-25', '26-35', '36-45', '46-55']
df_stratified['age_group'] = np.random.choice(age_groups,
                                              size = len(df_stratified),
                                              replace = True)
df_stratified.head()

: 

In [None]:
# Build a function for stratified randomization
def stratified_randomization(group):
    """
    Return a copy of one stratum with a random 'A'/'B' assignment per row.

    Parameters:
    group (pandas.DataFrame): The rows of a single stratum, e.g. one group
        handed over by DataFrame.groupby(...).apply.

    Returns:
    pandas.DataFrame: A copy of `group` with an extra 'group_assignment'
        column holding 'A' or 'B'.

    Description:
    Every row is assigned independently with probability 0.5 per arm, so the
    arms inside a stratum are balanced only in expectation, not exactly.
    The input is copied first: the function handed to groupby.apply should not
    mutate the object it receives.
    """
    assigned = group.copy()
    # Assign 'A' or 'B' with equal probability
    assigned['group_assignment'] = np.random.choice(['A', 'B'],
                                                    size=len(assigned),
                                                    p=[0.5, 0.5])
    return assigned

: 

In [None]:
# Apply the function once per age group; group_keys = False is passed so the
# group label is not added to the index of the result
df_stratified = df_stratified.groupby(
    'age_group', group_keys = False).apply(stratified_randomization)
df_stratified.head()

: 

In [None]:
# Check the results
df_stratified.groupby(['age_group', 'group_assignment']).size()

: 

### Cluster Randomization / Geotest

In [None]:
# Build a copy from our df
clustering_df = df.copy()

: 

In [None]:
# Create a city variable
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston']
clustering_df['city'] = np.random.choice(cities,
                                        size = len(clustering_df),
                                        replace = True)
clustering_df.head()

: 

In [None]:
# Create a list with two cities in one cluster and two in another
assigned_clusters = [0, 0 ,1, 1]
# Shuffle in place so that which position gets which cluster is random
np.random.shuffle(assigned_clusters)

: 

In [None]:
# Create a dictionary to map out the city and the assigned cluster
city_to_group = dict(zip(cities, assigned_clusters))
city_to_group

: 

In [None]:
# Create the cluster group variable
clustering_df['cluster_group'] = clustering_df['city'].map(city_to_group)
clustering_df.head()

: 

In [None]:
# Check the results
clustering_df.groupby(['city', 'cluster_group']).size()

: 

# Sample Size Calculation

In [None]:
# Settings
# Significance level
alpha = 0.05
# Target statistical power
power = 0.8
# Minimum Detectable effect: added to the baseline retention as an absolute
# lift, and multiplied by the baseline minutes as a relative lift, in the
# sample-size cells that follow
mde = 0.05

: 

### Sample Size Calculation for Proportions

In [None]:
# Computing the Baseline %: mean day-7 retention of the level_5 group in the
# outlier-free data
is_level_5 = df_cleaned['team_level_access'] == 'level_5'
baseline_retention = df_cleaned.loc[is_level_5, 'day7_retention'].mean()
print(f"The Baseline Retention is {baseline_retention}")

: 

In [None]:
# Calculate the effect size
effect_size = sm.stats.proportion_effectsize(baseline_retention,
                                            baseline_retention + mde)
effect_size

: 

In [None]:
# Initiate the power analysis
power_analysis = sm.stats.NormalIndPower()
# Solve for the number of observations per group; ratio = 1.0 requests
# equally sized groups
sample_size_discrete = power_analysis.solve_power(effect_size = effect_size,
                                                power = power,
                                                alpha = alpha,
                                                ratio = 1.0)
# Report the size rounded UP: rounding to nearest could report fewer users
# than the target power requires
print(f"The Sample Size per group is {int(np.ceil(sample_size_discrete))}")

: 

### Sample size for continuous outcomes

In [None]:
# Minutes played by the level_5 group in the outlier-free data
level5_minutes = df_cleaned.loc[
    df_cleaned['team_level_access'] == 'level_5', 'minutes_played']

# Computing the Baseline minutes
baseline_minutes = level5_minutes.mean()
print(f"The Baseline minutes is {baseline_minutes}")

# Baseline variability
baseline_sigma = level5_minutes.std()
print(f"The Baseline Variability is {baseline_sigma}")

: 

In [None]:
# Compute the effect size: the relative lift (mde) applied to the baseline
# mean, expressed in units of the baseline standard deviation
effect_size = mde * baseline_minutes / baseline_sigma
print(f"The effect size is {effect_size}")

: 

In [None]:
# Alpha and Beta
# Standard-normal quantile at 1 - alpha / 2 (alpha split across both tails)
Zalpha = stats.norm.ppf(1-alpha / 2)
# Standard-normal quantile at the target power
Zbeta = stats.norm.ppf(power)

: 

In [None]:
# Calculate the Sample size
# Per-group n for a two-sided comparison of two means with equal group sizes:
#     n = 2 * (Zalpha + Zbeta)^2 * sigma^2 / delta^2 = 2 * (Zalpha + Zbeta)^2 / d^2
# effect_size is already the standardized difference d = delta / sigma, so
# sigma must not be multiplied in a second time; the factor 2 accounts for the
# variance of a difference between two independent group means.
sample_size_continuous = 2 * (Zalpha + Zbeta)**2 / effect_size**2
# Report the size rounded UP so the target power is not undershot
print(f"The Sample Size per group is {np.ceil(sample_size_continuous):.0f}")

: 

### What if we don't clean the outliers?

In [None]:
# Minutes played by the level_5 group in the raw data (outliers still included)
level5_minutes_raw = df.loc[
    df['team_level_access'] == 'level_5', 'minutes_played']

# Computing the Baseline minutes
baseline_minutes = level5_minutes_raw.mean()
print(f"The Baseline minutes is {baseline_minutes}")

# Baseline variability
baseline_sigma = level5_minutes_raw.std()
print(f"The Baseline Variability is {baseline_sigma}")

: 

In [None]:
# Compute the effect size
effect_size = mde * baseline_minutes / baseline_sigma
print(f"The effect size is {effect_size}")

: 

In [None]:
# Alpha and Beta
Zalpha = stats.norm.ppf(1-alpha / 2)
Zbeta = stats.norm.ppf(power)

: 

In [None]:
# Calculate the Sample size
# Per-group n for a two-sided comparison of two means with equal group sizes:
#     n = 2 * (Zalpha + Zbeta)^2 * sigma^2 / delta^2 = 2 * (Zalpha + Zbeta)^2 / d^2
# effect_size is already the standardized difference d = delta / sigma, so
# sigma must not be multiplied in a second time; the factor 2 accounts for the
# variance of a difference between two independent group means.
sample_size_continuous = 2 * (Zalpha + Zbeta)**2 / effect_size**2
# Report the size rounded UP so the target power is not undershot
print(f"The Sample Size per group is {np.ceil(sample_size_continuous):.0f}")

: 

Pro Tip 1: Always clean outliers for A/B Testing

Pro Tip 2: Choose the proportion as the main outcome KPI and the continuous as secondary

# Post Analysis

### Post Analysis for Retention with Calculated Sample Size

In [None]:
# Sampling
# Round the required sample size UP: truncating with int() would draw fewer
# users than the power analysis asked for
N = int(np.ceil(sample_size_discrete))
# Draw N rows from each access level of the outlier-free data; the fixed
# random_state makes the draw reproducible.
# NOTE(review): presumably each level holds at least N rows - confirm, since
# sampling without replacement cannot return more rows than exist.
sample5 = df_cleaned[
    df_cleaned['team_level_access'] == 'level_5'].sample(n = N,
                                                         random_state = 1502)

sample7 = df_cleaned[
    df_cleaned['team_level_access'] == 'level_7'].sample(n = N,
                                                         random_state = 1502)

: 

In [None]:
# Calculating the number of successes and trials
# Trials are the non-missing day7_retention values; successes are their sum
retention5 = sample5['day7_retention']
retention7 = sample7['day7_retention']
n_5, successes_5 = retention5.count(), retention5.sum()
n_7, successes_7 = retention7.count(), retention7.sum()

: 

In [None]:
# Compute the level access retention rate
rr_5 = successes_5 / n_5
rr_7 = successes_7 / n_7
print(f"The Retention Rate for level 5 is {rr_5}")
print(f"The Retention Rate for level 7 is {rr_7}")

: 

In [None]:
# Proportions Test: two-sided z-test comparing the day-7 retention proportions
# of the level 5 and level 7 samples
import statsmodels.stats.proportion as proportion
z_value, p_value = proportion.proportions_ztest([successes_5, successes_7], [n_5, n_7],
                                                alternative = "two-sided")
# Read the resulting p-value at the 5% significance level
interpret_p_value(p_value, threshold=0.05)

: 

### Post Analysis for Retention with the Complete Data

In [None]:
# No sampling here: use every level 5 / level 7 row of the outlier-free data
sample5 = df_cleaned[df_cleaned['team_level_access'] == 'level_5']

sample7 = df_cleaned[df_cleaned['team_level_access'] == 'level_7']

: 

In [None]:
# Calculating the number of successes and trials
n_5= sample5['day7_retention'].count()
n_7= sample7['day7_retention'].count()
successes_5= sample5['day7_retention'].sum()
successes_7= sample7['day7_retention'].sum()

: 

In [None]:
# Compute the level access retention rate
rr_5 = successes_5 / n_5
rr_7 = successes_7 / n_7
print(f"The Retention Rate for level 5 is {rr_5}")
print(f"The Retention Rate for level 7 is {rr_7}")

: 

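The variance of a proportion depends only on its mean: $\sigma_p^2 = \bar{p}\,(1 - \bar{p})$, i.e. `variance_proportions = proportion_mean * (1 - proportion_mean)`.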

In [None]:
# Proportions Test
import statsmodels.stats.proportion as proportion
z_value, p_value = proportion.proportions_ztest([successes_5, successes_7], [n_5, n_7],
                                                alternative = "two-sided")
interpret_p_value(p_value, threshold=0.05)

: 

### Post Analysis for Minutes Played

In [None]:
# Prepare the inputs: minutes played per access level in the outlier-free data
access_level = df_cleaned['team_level_access']
minutes5 = df_cleaned.loc[access_level == 'level_5', 'minutes_played']

minutes7 = df_cleaned.loc[access_level == 'level_7', 'minutes_played']

: 

In [None]:
# 2 sample T-test: two-sided comparison of minutes played between level 5
# and level 7
# NOTE(review): equal_var is not passed, so SciPy's default variance
# assumption applies - confirm whether Welch's test is the better choice here
t_statistic, p_value = stats.ttest_ind(minutes5,
                                       minutes7,
                                       alternative = 'two-sided')
# Read the resulting p-value at the 5% significance level
interpret_p_value(p_value, threshold=0.05)

: 