In [853]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import random
from scipy.stats import chisquare
from scipy.stats import ttest_ind

In [854]:
# Import data
control_data = pd.read_csv(r'C:\Users\w1381\桌面\python-projects\5293\project\control_group.csv', sep = ';')
treat_data = pd.read_csv(r'C:\Users\w1381\桌面\python-projects\5293\project\test_group.csv', sep=';')

In [855]:
# Overview of the data
control_data.head()


Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase
0,Control Campaign,1.08.2019,2280,82702.0,56930.0,7016.0,2290.0,2159.0,1819.0,618.0
1,Control Campaign,2.08.2019,1757,121040.0,102513.0,8110.0,2033.0,1841.0,1219.0,511.0
2,Control Campaign,3.08.2019,2343,131711.0,110862.0,6508.0,1737.0,1549.0,1134.0,372.0
3,Control Campaign,4.08.2019,1940,72878.0,61235.0,3065.0,1042.0,982.0,1183.0,340.0
4,Control Campaign,5.08.2019,1835,,,,,,,


In [856]:
treat_data.head()

Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase
0,Test Campaign,1.08.2019,3008,39550,35820,3038,1946,1069,894,255
1,Test Campaign,2.08.2019,2542,100719,91236,4657,2359,1548,879,677
2,Test Campaign,3.08.2019,2365,70263,45198,7885,2572,2367,1268,578
3,Test Campaign,4.08.2019,2710,78451,25937,4216,2216,1437,566,340
4,Test Campaign,5.08.2019,2297,114295,95138,5863,2106,858,956,768


In [857]:
# Concate two datasets
data = pd.concat([control_data, treat_data], axis=0)

In [858]:
# Overview of the new dataset
data.head()

Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase
0,Control Campaign,1.08.2019,2280,82702.0,56930.0,7016.0,2290.0,2159.0,1819.0,618.0
1,Control Campaign,2.08.2019,1757,121040.0,102513.0,8110.0,2033.0,1841.0,1219.0,511.0
2,Control Campaign,3.08.2019,2343,131711.0,110862.0,6508.0,1737.0,1549.0,1134.0,372.0
3,Control Campaign,4.08.2019,1940,72878.0,61235.0,3065.0,1042.0,982.0,1183.0,340.0
4,Control Campaign,5.08.2019,1835,,,,,,,


In [859]:
data.tail()

Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase
25,Test Campaign,26.08.2019,2311,80841.0,61589.0,3820.0,2037.0,1046.0,346.0,284.0
26,Test Campaign,27.08.2019,2915,111469.0,92159.0,6435.0,2976.0,2552.0,992.0,771.0
27,Test Campaign,28.08.2019,2247,54627.0,41267.0,8144.0,2432.0,1281.0,1009.0,721.0
28,Test Campaign,29.08.2019,2805,67444.0,43219.0,7651.0,1920.0,1240.0,1168.0,677.0
29,Test Campaign,30.08.2019,1977,120203.0,89380.0,4399.0,2978.0,1625.0,1034.0,572.0


In [860]:
# Transform 'Campaign Name' into 'treatment'
treat_map = {"Control Campaign":0, "Test Campaign":1}
data["treatment"] = data["Campaign Name"].map(treat_map)


In [861]:
# # Rename some columns
# rename_map = {col: col.replace('#', 'number') for col in data.columns if '#' in col}

# # Rename the columns using the mapping
# data.rename(columns=rename_map, inplace=True)

In [862]:
data.head()

Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase,treatment
0,Control Campaign,1.08.2019,2280,82702.0,56930.0,7016.0,2290.0,2159.0,1819.0,618.0,0
1,Control Campaign,2.08.2019,1757,121040.0,102513.0,8110.0,2033.0,1841.0,1219.0,511.0,0
2,Control Campaign,3.08.2019,2343,131711.0,110862.0,6508.0,1737.0,1549.0,1134.0,372.0,0
3,Control Campaign,4.08.2019,1940,72878.0,61235.0,3065.0,1042.0,982.0,1183.0,340.0,0
4,Control Campaign,5.08.2019,1835,,,,,,,,0


In [863]:
data.tail()

Unnamed: 0,Campaign Name,Date,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase,treatment
25,Test Campaign,26.08.2019,2311,80841.0,61589.0,3820.0,2037.0,1046.0,346.0,284.0,1
26,Test Campaign,27.08.2019,2915,111469.0,92159.0,6435.0,2976.0,2552.0,992.0,771.0,1
27,Test Campaign,28.08.2019,2247,54627.0,41267.0,8144.0,2432.0,1281.0,1009.0,721.0,1
28,Test Campaign,29.08.2019,2805,67444.0,43219.0,7651.0,1920.0,1240.0,1168.0,677.0,1
29,Test Campaign,30.08.2019,1977,120203.0,89380.0,4399.0,2978.0,1625.0,1034.0,572.0,1


# 1) First method using regression analysis

In [864]:
# treatment parameter is the column name which indicates if it is treatment or not (string)
# features parameter should be a (list)
def SRM_regression_checker(df, features):
    features_formula = ' + '.join([f'Q("{feature}")' for feature in features])
    formula = f'treatment ~ {features_formula}'
    # fit the regression 
    model = smf.glm(formula, data=df).fit()
    # get the p-values for the main effect using a Wald test
    wald_p_values = model.wald_test_terms(scalar=True).table
    return wald_p_values

In [865]:
type(data["Date"].values[0])

str

In [866]:
# drop 'Date' and 'Campaign Name' column
data_2 = data.drop(['Date','Campaign Name'], axis=1)


In [867]:
# Notice there is NA in the dataset
data_2.isnull().sum()

Spend [USD]            0
# of Impressions       1
Reach                  1
# of Website Clicks    1
# of Searches          1
# of View Content      1
# of Add to Cart       1
# of Purchase          1
treatment              0
dtype: int64

In [868]:
# Display which row has NA value
data_2.loc[data_2.isnull().any(axis=1)]

Unnamed: 0,Spend [USD],# of Impressions,Reach,# of Website Clicks,# of Searches,# of View Content,# of Add to Cart,# of Purchase,treatment
4,1835,,,,,,,,0


# 2) Second method using Goodness of Fit Chi-squared test or two sample t test?

In [869]:
for feature in data_2.columns.to_list():
    control_obs = len(data_2[data_2['treatment'] == 0])
    treat_obs = len(data_2[data_2['treatment'] == 1])
    observation = [control_obs, treat_obs] # You may use list comprehension to combine these codes
    expected = [30, 30]
    chi_stats, p_value = chisquare(observation, f_exp = expected)
    print(f"{feature} : {p_value}")

Spend [USD] : 1.0
# of Impressions : 1.0
Reach : 1.0
# of Website Clicks : 1.0
# of Searches : 1.0
# of View Content : 1.0
# of Add to Cart : 1.0
# of Purchase : 1.0
treatment : 1.0


## Notice that above chi-squared test is wrong since we are dealing with continuous features. goodness of fit chi-sqaured test is not appropriate for continuous data.

In [870]:
for feature in data_2.columns:
    # Skip the 'treatment'
    if (feature != 'treatment'):
        control_group = data_2[data_2['treatment'] == 0][feature].dropna()
        treatment_group = data_2[data_2['treatment'] == 1][feature].dropna()

        # Perform the two sample t-test
        t_stat, p_value = ttest_ind(control_group, treatment_group, equal_var=False)

        # Print the feature name and the p-value
        print(f"{feature}: {p_value}")

Spend [USD]: 0.00433025048398858
# of Impressions: 9.549344891722425e-06
Reach: 2.0077300697963643e-06
# of Website Clicks: 0.12054876064243353
# of Searches: 0.26780825254796115
# of View Content: 0.63742989921356
# of Add to Cart: 8.693192807018178e-05
# of Purchase: 0.9760037958073526


In [871]:
SRM_regression_checker(data_2,  ["# of Impressions"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,58.312125,2.236614e-14,1
"Q(""# of Impressions"")",23.858773,1.036683e-06,1


In [872]:
SRM_regression_checker(data_2,  ["Reach"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,73.920967,8.130808e-18,1
"Q(""Reach"")",28.094022,1.155623e-07,1


In [873]:
SRM_regression_checker(data_2,  ["# of Website Clicks"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,0.62211,0.430264,1
"Q(""# of Website Clicks"")",2.486643,0.114816,1


In [874]:
SRM_regression_checker(data_2,  ["# of Searches"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,1.080967,0.298481,1
"Q(""# of Searches"")",1.293529,0.255399,1


In [875]:
SRM_regression_checker(data_2,  ["# of View Content"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,9.274681,0.002323,1
"Q(""# of View Content"")",0.226723,0.633964,1


In [876]:
SRM_regression_checker(data_2,  ["# of Add to Cart"])

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,51.462054,7.299441e-13,1
"Q(""# of Add to Cart"")",18.054547,2.146654e-05,1
