In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
from sklearn.preprocessing import LabelEncoder

# Read data file into object called df
df = pd.read_csv("./datasets/WA_Fn-UseC_-Marketing-Campaign-Eff-UseC_-FastF.csv")

# Label encoding for MarketSize column
label_encoder = LabelEncoder()
df['MarketSize'] = label_encoder.fit_transform(df['MarketSize'])


In [2]:
# Data visualization and exploration
print(df.head(5))
print(df.info())
print(df.describe())
df.groupby('Promotion')['AgeOfStore'].describe()
df.groupby('Promotion')['week'].describe()
df.groupby('Promotion')['SalesInThousands'].describe()
df.groupby('Promotion', as_index=False).agg({"SalesInThousands": "sum"})
total_sales = df.groupby(['week', 'Promotion'])['SalesInThousands'].sum()
avg_sales = df.groupby(['week', 'Promotion'])['SalesInThousands'].mean()
grouped_promo_data = df.groupby(by=['Promotion', 'week'])
promo_summary = grouped_promo_data.agg({'SalesInThousands':['min', 'mean', 'median', 'max', 'std', 'sum']})
grouped_store_data = df.groupby(by=['Promotion'])
store_summary = grouped_store_data.agg({'AgeOfStore':['min', 'mean', 'median', 'max', 'std']})


   MarketID  MarketSize  LocationID  AgeOfStore  Promotion  week  \
0         1           1           1           4          3     1   
1         1           1           1           4          3     2   
2         1           1           1           4          3     3   
3         1           1           1           4          3     4   
4         1           1           2           5          2     1   

   SalesInThousands  
0             33.73  
1             35.67  
2             29.03  
3             39.25  
4             27.81  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MarketID          548 non-null    int64  
 1   MarketSize        548 non-null    int32  
 2   LocationID        548 non-null    int64  
 3   AgeOfStore        548 non-null    int64  
 4   Promotion         548 non-null    int64  
 5   week              548

In [3]:
# Visualizing SalesInThousands by Promotion
plt.figure(figsize=(8, 6))
sns.boxplot(x='Promotion', y='SalesInThousands', data=df)
plt.title('SalesInThousands by Promotion')
plt.xlabel('Promotion')
plt.ylabel('SalesInThousands')
plt.savefig("plots/SalesInThousands_by_Promotion.png")
plt.close()

# Visualizing AgeOfStore by Promotion
plt.figure(figsize=(8, 6))
sns.boxplot(x='Promotion', y='AgeOfStore', data=df)
plt.title('AgeOfStore by Promotion')
plt.xlabel('Promotion')
plt.ylabel('AgeOfStore')
plt.savefig("plots/AgeOfStore_by_Promotion.png")
plt.close()

# Visualizing MarketSize distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='MarketSize', data=df)
plt.title('MarketSize Distribution')
plt.xlabel('MarketSize')
plt.ylabel('Count')
plt.savefig("plots/MarketSize_Distribution.png")
plt.close()

# Visualizing SalesInThousands over time (weeks) by Promotion
plt.figure(figsize=(12, 6))
sns.lineplot(x='week', y='SalesInThousands', hue='Promotion', data=df)
plt.title('SalesInThousands over Time by Promotion')
plt.xlabel('Week')
plt.ylabel('SalesInThousands')
plt.savefig("plots/SalesInThousands_over_Time_by_Promotion.png")
plt.close()

# Visualizing total SalesInThousands by Promotion and Week
total_sales = df.groupby(['week', 'Promotion'])['SalesInThousands'].sum().reset_index()
plt.figure(figsize=(12, 6))
sns.barplot(x='week', y='SalesInThousands', hue='Promotion', data=total_sales)
plt.title('Total SalesInThousands by Promotion and Week')
plt.xlabel('Week')
plt.ylabel('Total SalesInThousands')
plt.savefig("plots/Total_SalesInThousands_by_Promotion_and_Week.png")
plt.close()

# Visualizing average SalesInThousands by Promotion and Week
avg_sales = df.groupby(['week', 'Promotion'])['SalesInThousands'].mean().reset_index()
plt.figure(figsize=(12, 6))
sns.barplot(x='week', y='SalesInThousands', hue='Promotion', data=avg_sales)
plt.title('Average SalesInThousands by Promotion and Week')
plt.xlabel('Week')
plt.ylabel('Average SalesInThousands')
plt.savefig("plots/Average_SalesInThousands_by_Promotion_and_Week.png")
plt.close()

# Visualizing correlation between numeric variables
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig("plots/Correlation_Matrix.png")
plt.close()

# Explore distribution of SalesInThousands and AgeOfStore with histograms and KDE plots
plt.figure(figsize=(12, 6))
sns.histplot(df['SalesInThousands'], kde=True, color='blue', label='SalesInThousands')
sns.histplot(df['AgeOfStore'], kde=True, color='orange', label='AgeOfStore')
plt.title('Distribution of SalesInThousands and AgeOfStore')
plt.xlabel('Value')
plt.ylabel('Count')
plt.legend()
plt.savefig("plots/Distribution_of_SalesInThousands_and_AgeOfStore.png")
plt.close()

# Pairplot to visualize relationships between numeric variables
sns.pairplot(df, hue='Promotion')
plt.savefig("plots/Pairplot.png")
plt.close()

# Using swarmplot to visualize SalesInThousands by MarketSize and Promotion
plt.figure(figsize=(10, 6))
sns.swarmplot(x='MarketSize', y='SalesInThousands', hue='Promotion', data=df)
plt.title('SalesInThousands by MarketSize and Promotion')
plt.xlabel('MarketSize')
plt.ylabel('SalesInThousands')
plt.legend(title='Promotion', loc='upper left')
plt.savefig("plots/SalesInThousands_by_MarketSize_and_Promotion.png")
plt.close()

In [4]:
# Build ANOVA model using OLS
def build_anova_model(data):
    anova_lm = ols('SalesInThousands ~ Promotion', data=data).fit()
    anova_lm_robust = ols('SalesInThousands ~ Promotion', data=data).fit(cov_type='HC3')
    return anova_lm, anova_lm_robust

anova_lm, anova_lm_robust = build_anova_model(df)

print(anova_lm.summary())
print(anova_lm_robust.summary())

                            OLS Regression Results                            
Dep. Variable:       SalesInThousands   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.921
Date:                Sun, 30 Jul 2023   Prob (F-statistic):              0.166
Time:                        14:36:36   Log-Likelihood:                -2320.8
No. Observations:                 548   AIC:                             4646.
Df Residuals:                     546   BIC:                             4654.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     55.9494      1.929     29.004      0.0

In [5]:
# Post-hoc testing
def posthoc_tukey(anova_lm, data):
    posthoc = MultiComparison(data['SalesInThousands'], data['Promotion'])
    return posthoc.tukeyhsd().summary()

print("Post-Hoc Tukey Test:")
print(posthoc_tukey(anova_lm, df))

Post-Hoc Tukey Test:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
     1      2 -10.7696    0.0 -14.7738 -6.7654   True
     1      3  -2.7345 0.2444  -6.7388  1.2697  False
     2      3   8.0351    0.0   4.1208 11.9493   True
-----------------------------------------------------


In [6]:
# Separate ANOVA test for promotions 1 and 3
df_promo_1_3 = df[df['Promotion'].isin([1, 3])]
anova_lm_promo_1_3, anova_lm_robust_promo_1_3 = build_anova_model(df_promo_1_3)

print("ANOVA for Promotion 1 and 3:")
print(anova_lm_promo_1_3.summary())
print(anova_lm_robust_promo_1_3.summary())

ANOVA for Promotion 1 and 3:
                            OLS Regression Results                            
Dep. Variable:       SalesInThousands   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.418
Date:                Sun, 30 Jul 2023   Prob (F-statistic):              0.121
Time:                        14:36:36   Log-Likelihood:                -1522.6
No. Observations:                 360   AIC:                             3049.
Df Residuals:                     358   BIC:                             3057.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     59.4663  

# Data Analysis and Marketing Insights
**Data Overview:**
- The dataset contains 548 entries and 7 columns, giving us a substantial amount of data to work with.
- With columns like MarketID, MarketSize, LocationID, AgeOfStore, Promotion, week, and SalesInThousands, we have a diverse set of variables to analyze.
- The data types are well-structured, with int64, int32, and float64 values ensuring data integrity.

**Summary Statistics:**
- When examining the summary statistics, we find that the mean sales across all promotions and weeks is approximately 53.47 thousand dollars. The spread, indicated by the standard deviation of 16.76 thousand dollars, shows some variability in the sales data.
- The range of sales is impressive, ranging from a minimum of 17.34 thousand dollars to a maximum of 99.65 thousand dollars.
- Additionally, the mean age of stores in the dataset is around 8.50 years, with a standard deviation of 6.64 years, hinting at the varying age distribution of stores.

**ANOVA Model:**
- The ANOVA model allows us to investigate the impact of the "Promotion" variable on sales (SalesInThousands).
- Although the model does not provide a substantial R-squared value (0.004), suggesting a limited explanatory power, the analysis remains crucial to uncovering potential insights.
- The p-value for the F-statistic (Prob (F-statistic)) is 0.166, which, regrettably, does not reach statistical significance at the 0.05 level. As such, the "Promotion" variable does not seem to significantly influence overall sales.

**Robust ANOVA Model:**
- Recognizing the possibility of heteroscedasticity, we turn to a robust ANOVA model using the HC3 covariance type.
- The results closely mirror those of the non-robust model, with a p-value of 0.172 for the "Promotion" variable.
- Alas, the "Promotion" variable remains non-significant in the robust model as well, indicating little influence on overall sales.

**Post-Hoc Tukey Test:**
- Performing the post-hoc Tukey test, we compare mean sales across different promotion groups.
- Notably, Promotion 1 and Promotion 2 demonstrate a statistically significant difference in mean sales (reject=True). Furthermore, Promotion 2 and Promotion 3 also display a significant difference (reject=True).
- However, the comparison between Promotion 1 and Promotion 3 reveals no significant discrepancy in mean sales (reject=False).

**Separate ANOVA for Promotions 1 and 3:**
- Isolating Promotions 1 and 3, we delve into potential differences in mean sales between these two groups.
- The R-squared value for this model, though modest at 0.007, offers some indication of the "Promotion" variable's influence on sales within these groups.
- Nonetheless, the p-value for the "Promotion" variable stands at 0.121 (non-robust model) and 0.122 (robust model), falling short of statistical significance at the 0.05 level. Consequently, the difference in mean sales between Promotion 1 and Promotion 3 is not considered significant.

**Post-Hoc Tukey Test for Promotions 1 and 3:**
- The post-hoc Tukey test confirms the absence of a significant difference in mean sales between Promotions 1 and 3 (reject=False).

**Overall Marketing Insights:**
- Excitingly, the ANOVA and Tukey test results uncover a compelling insight: there is no significant difference in sales performance between Promotion 1 and Promotion 3. However, Promotion 2 lags behind both Promotion 1 and Promotion 3, displaying a statistically lower mean sales figure.
- Armed with these findings, the marketing and sales team may consider allocating more resources to Promotions 1 and 3 or devising strategies to enhance the effectiveness of Promotion 2.
- Armed with data-backed insights, the team can now make informed decisions and optimize marketing campaigns for higher sales and success.
