In [6]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loads Dataset 
df = pd.read_csv('marketing_AB.csv')

# Data Cleaning: Checks duplicates and drops unwanted columns
df.duplicated(subset='user id').sum()
df.drop(["Unnamed: 0", "user id"], axis=1, inplace=True)

# Filters and checks categorical data
df_category = df[['test group', 'converted', 'most ads day', 'most ads hour']]
df_category.nunique()

# Test Preparation 
from scipy.stats import chi2_contingency
alpha = 0.05

# Statistical Test
for variable in df_category.columns:
  if variable != 'converted':
    # Creates a contingency table
    contingency_table = pd.crosstab(df_category[variable], df_category['converted'])

    # Performs chi-squared test
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Displays the results
    print(f"\nChi-squared test for {variable} vs. converted: ")
    print(f"Chi-squared value: {chi2}")
    print(f"p-value: {p}")

    # Checks for significance
    if p < alpha:
      print(f"The difference in conversion rates across {variable} is statistically significant.")
    else:
      print(f"There is no significance difference in conversion rates across {variable}.")


Chi-squared test for test group vs. converted: 
Chi-squared value: 54.005823883685245
p-value: 1.9989623063390075e-13
The difference in conversion rates across test group is statistically significant.

Chi-squared test for most ads day vs. converted: 
Chi-squared value: 410.0478857936585
p-value: 1.932184379244731e-85
The difference in conversion rates across most ads day is statistically significant.

Chi-squared test for most ads hour vs. converted: 
Chi-squared value: 430.76869230822086
p-value: 8.027629823696771e-77
The difference in conversion rates across most ads hour is statistically significant.


In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loads Dataset 
df = pd.read_csv('marketing_AB.csv')

# Data Cleaning: Checks duplicates and drops unwanted columns
df.duplicated(subset='user id').sum()
df.drop(["Unnamed: 0", "user id"], axis=1, inplace=True)

# Filters and checks categorical data
df_category = df[['test group', 'converted', 'most ads day', 'most ads hour']]
df_category.nunique()

# Test Preparation
from scipy.stats import normaltest, levene, ttest_ind, mannwhitneyu

# Statistical Test

# 1: Checks Assumptions
# Normality Assumption (Using D'Agostino's K-squared test)
dagostino_stat_true, dagostino_p_value_true = normaltest(df[df['converted'] == True]['total ads'])
dagostino_stat_false, dagostino_p_value_false = normaltest(df[df['converted'] == False]['total ads'])

print(f"D'Agostino's test for normality (True group): p-value = {dagostino_p_value_true}")
print(f"D'Agostino's test for normality (False group): p-value = {dagostino_p_value_false}")

# Equality of variances assumption (Levene's Test)
levene_stat, levene_p_value = levene(df[df['converted'] == True]['total ads'], 
                                     df[df['converted'] == False]['total ads'])
print(f"Levene's test for equality of variances: p-value = {levene_p_value}")

# 2: Performs a Suitable Test
alpha = 0.05

if dagostino_p_value_true > alpha and dagostino_p_value_false > alpha and levene_p_value > alpha:
  # Assumptions met - use t-test for means
  t_stat, t_p_value = ttest_ind(df[df['converted']]['total ads'], df[~df['converted']]['total ads'])
  print(f"Independent two-sample t-test: p-value = {t_p_value}")
else:
  # Assumptions not met - use MannWhitney U test for medians
  u_stat, u_p_value = mannwhitneyu(df[df['converted']]['total ads'], df[~df['converted']]['total ads'])
  print(f"Mann-Whitney U Test: p-value = {u_p_value}")



D'Agostino's test for normality (True group): p-value = 0.0
D'Agostino's test for normality (False group): p-value = 0.0
Levene's test for equality of variances: p-value = 0.0
Mann-Whitney U Test: p-value = 0.0
