# Marketing Mix EDA & Hypothesis Testing

This notebook performs exploratory data analysis and hypothesis testing on the marketing mix dataset (People, Product, Place, Promotion).

In [None]:
# Cell 1: Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from datetime import datetime

# Never truncate the column list when a wide frame is displayed.
pd.options.display.max_columns = None

In [None]:
# Cell 2: Read the raw marketing CSV, report its (rows, columns) and preview it
df_raw = pd.read_csv(filepath_or_buffer='marketing_data.csv')

print(df_raw.shape)
df_raw.head(5)

In [None]:
# Cell 3: Basic info – check Dt_Customer, Income etc.
# Work on a copy so df_raw is left untouched for later reference.
df = df_raw.copy()

print(df.dtypes)
# The blank separator line is written as a '\n' escape: a physical line break
# inside a single-quoted/double-quoted literal is a SyntaxError.
print("\nSample Dt_Customer and Income:")
# The header is looked up as ' Income ' (with padding) because column names
# have not been whitespace-stripped at this point in the notebook.
print(df[['Dt_Customer', ' Income ']].head(10))

In [None]:
# Cell 4: Trim leading/trailing whitespace from every column name
df.columns = df.columns.map(str.strip)

df.columns

In [None]:
# Cell 5: Convert Dt_Customer from month/day/two-digit-year text to datetime
dt_customer_text = df['Dt_Customer']
df['Dt_Customer'] = pd.to_datetime(dt_customer_text, format='%m/%d/%y')

df[['Dt_Customer']].head()

In [None]:
# Cell 6: Clean Income – keep only digits, '.' and '-', then convert to numeric
income_text = df['Income'].astype(str)
income_stripped = income_text.str.replace(r'[^0-9.\-]', '', regex=True)

# Anything that still cannot be parsed (e.g. an emptied string) becomes NaN.
df['Income'] = pd.to_numeric(income_stripped, errors='coerce')

print(df[['Income']].head(10))
print('Missing Income:', df['Income'].isna().sum())

In [None]:
# Cell 7: Check Education and Marital_Status categories before cleaning.
# Line breaks in the labels are written as '\n' escapes; a physical newline
# inside a single-quoted literal is a SyntaxError.
print('Unique Education values before cleaning:\n', df['Education'].unique())
print('\nUnique Marital_Status values before cleaning:\n', df['Marital_Status'].unique())

In [None]:
# Cell 8: Standardize Education and Marital_Status categories

# Education: every level maps to itself except '2n Cycle', which is relabelled.
edu_map = {
    level: level
    for level in ('Basic', '2n Cycle', 'Graduation', 'Master', 'PhD')
}
edu_map['2n Cycle'] = '2nd Cycle'

# Marital status: the five regular labels are kept; the three irregular ones
# are folded into 'Single'.
marital_map = {
    status: status
    for status in ('Single', 'Married', 'Together', 'Divorced', 'Widow')
}
marital_map.update(dict.fromkeys(('Alone', 'Absurd', 'YOLO'), 'Single'))

for column_name, relabel in (('Education', edu_map), ('Marital_Status', marital_map)):
    df[column_name] = df[column_name].replace(relabel)

print('Education after cleaning:', df['Education'].unique())
print('Marital_Status after cleaning:', df['Marital_Status'].unique())

In [None]:
# Cell 9: Impute Income by (Education, Marital_Status) group mean
income_by_group = df.groupby(['Education', 'Marital_Status'])['Income']
group_means = income_by_group.transform('mean')

# Two passes: first the row's group mean, then the overall mean (computed
# after the first pass) for anything that is still missing.
df['Income'] = (
    df['Income']
    .fillna(group_means)
    .pipe(lambda income: income.fillna(income.mean()))
)

print('Missing Income after imputation:', df['Income'].isna().sum())
print(df['Income'].describe())

In [None]:
# Cell 10: Derive TotalChildren (kids + teens at home) and Age
reference_year = 2014  # year against which Year_Birth is turned into an age

df['TotalChildren'] = df['Kidhome'].add(df['Teenhome'])
df['Age'] = df['Year_Birth'].rsub(reference_year)  # reference_year - Year_Birth

df[['Year_Birth', 'Age', 'TotalChildren']].head()

In [None]:
# Cell 11: Row-wise total of the six Mnt* product-category amounts
product_cols = [
    f'Mnt{category}'
    for category in (
        'Wines', 'Fruits', 'MeatProducts',
        'FishProducts', 'SweetProducts', 'GoldProds',
    )
]

df['TotalSpend'] = df.loc[:, product_cols].sum(axis='columns')

df[[*product_cols, 'TotalSpend']].head()

In [None]:
# Cell 12: Row-wise total of web, catalog and store purchase counts
purchase_cols = [f'Num{channel}Purchases' for channel in ('Web', 'Catalog', 'Store')]

df['TotalPurchases'] = df.loc[:, purchase_cols].sum(axis='columns')

df[[*purchase_cols, 'TotalPurchases']].head()

In [None]:
# Cell 13: One histogram per key numeric variable (20 bins each)
num_cols = [
    'Age',
    'Income',
    'TotalChildren',
    'TotalSpend',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'TotalPurchases',
    'NumWebVisitsMonth',
]

df.hist(column=num_cols, bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
# Cell 14: Side-by-side boxplots of the numeric variables to inspect outliers
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df[num_cols], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Cell 15: Outlier treatment using IQR capping

def cap_outliers(series, factor=1.5):
    """Clip a Series to its IQR fences.

    Values below ``Q1 - factor * IQR`` are raised to that lower fence and
    values above ``Q3 + factor * IQR`` are lowered to that upper fence;
    values in between are returned unchanged.

    Parameters
    ----------
    series : pandas.Series
        Numeric data; must provide ``quantile`` and ``clip``.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    pandas.Series
        The clipped values. The input is not modified.
    """
    first_quartile, third_quartile = series.quantile([0.25, 0.75])
    fence_offset = factor * (third_quartile - first_quartile)
    return series.clip(
        lower=first_quartile - fence_offset,
        upper=third_quartile + fence_offset,
    )

# Keep the raw columns and add an IQR-capped '<name>_capped' copy of each.
for source_col in ('Income', 'TotalSpend', 'TotalPurchases'):
    df[f'{source_col}_capped'] = cap_outliers(df[source_col])

df[['Income', 'Income_capped', 'TotalSpend', 'TotalSpend_capped']].head()

In [None]:
# Cell 16: Boxplots of the capped columns
capped_cols = ['Income_capped', 'TotalSpend_capped', 'TotalPurchases_capped']

fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df[capped_cols], ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Cell 17: Encode Education as its position in edu_order (0 = first level)
edu_order = ['Basic', '2nd Cycle', 'Graduation', 'Master', 'PhD']
edu_map_ord = dict(zip(edu_order, range(len(edu_order))))
# Labels missing from edu_order map to NaN.
df['Education_Ord'] = df['Education'].map(edu_map_ord)

df[['Education', 'Education_Ord']].head()

In [None]:
# Cell 18: One-hot encode the nominal columns, dropping the first level of each
nominal_cols = ['Marital_Status', 'Country']
df_encoded = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

df_encoded.head()

In [None]:
# Cell 19: Pairwise correlation heatmap over the numeric / binary features
corr_cols = [
    'Age', 'Income_capped', 'TotalChildren', 'TotalSpend_capped',
    'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'TotalPurchases_capped',
    *[f'AcceptedCmp{campaign}' for campaign in range(1, 6)],
    'Response', 'Education_Ord',
]

corr = df_encoded.loc[:, corr_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation heatmap')
plt.show()

In [None]:
# Cell 20: Hypothesis 1 – Older vs younger store purchases
# Customers are split at the median age; the median itself counts as 'Older'.
median_age = df['Age'].median()
is_older = df['Age'] >= median_age
df['AgeGroup'] = np.where(is_older, 'Older', 'Younger')

older_store = df.loc[is_older, 'NumStorePurchases']
younger_store = df.loc[~is_older, 'NumStorePurchases']

# equal_var=False selects Welch's t-test (no equal-variance assumption).
welch_result = stats.ttest_ind(older_store, younger_store, equal_var=False)
t_stat, p_val = welch_result

print('Older mean store purchases:', older_store.mean())
print('Younger mean store purchases:', younger_store.mean())
print('t-statistic:', t_stat, 'p-value:', p_val)

In [None]:
# Cell 21: Distribution of store purchases for each age group
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='AgeGroup', y='NumStorePurchases', ax=ax)
ax.set_title('Store purchases by age group')
plt.show()

In [None]:
# Cell 22: Hypothesis 2 – Customers with kids prefer online

# 'Kids' means at least one child or teen at home.
has_kids = df['TotalChildren'] > 0
df['HasKids'] = np.where(has_kids, 'Kids', 'NoKids')

kids_web = df.loc[has_kids, 'NumWebPurchases']
nokids_web = df.loc[~has_kids, 'NumWebPurchases']

# Welch's t-test (unequal variances) on web purchase counts.
welch_result2 = stats.ttest_ind(kids_web, nokids_web, equal_var=False)
t_stat2, p_val2 = welch_result2

print('Kids mean web purchases:', kids_web.mean())
print('No-kids mean web purchases:', nokids_web.mean())
print('t-statistic:', t_stat2, 'p-value:', p_val2)

In [None]:
# Cell 23: Distribution of web purchases, customers with kids vs without
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='HasKids', y='NumWebPurchases', ax=ax)
ax.set_title('Web purchases by kids vs no kids')
plt.show()

In [None]:
# Cell 24: Hypothesis 3 – Channel cannibalization (store vs web/catalog)

# Correlation of store purchase counts with each of the other two channels.
store_purchases = df['NumStorePurchases']
corr_store_web = store_purchases.corr(df['NumWebPurchases'])
corr_store_catalog = store_purchases.corr(df['NumCatalogPurchases'])

print('Correlation store vs web:', corr_store_web)
print('Correlation store vs catalog:', corr_store_catalog)

In [None]:
# Cell 25: Store purchases regressed on web and on catalog purchases
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

panels = (
    ('NumWebPurchases', 'Store vs Web purchases'),
    ('NumCatalogPurchases', 'Store vs Catalog purchases'),
)
for ax, (channel_col, panel_title) in zip(axes, panels):
    sns.regplot(x=channel_col, y='NumStorePurchases', data=df, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Cell 26: Hypothesis 4 – US vs rest of world (total purchases)

# Country code 'US' against every other country pooled together.
is_us = df['Country'] == 'US'
df['IsUS'] = np.where(is_us, 'US', 'NonUS')

us_purchases = df.loc[is_us, 'TotalPurchases']
nonus_purchases = df.loc[~is_us, 'TotalPurchases']

# Welch's t-test (unequal variances) on total purchase counts.
welch_result3 = stats.ttest_ind(us_purchases, nonus_purchases, equal_var=False)
t_stat3, p_val3 = welch_result3

print('US mean total purchases:', us_purchases.mean())
print('Non-US mean total purchases:', nonus_purchases.mean())
print('t-statistic:', t_stat3, 'p-value:', p_val3)

In [None]:
# Cell 27: Distribution of total purchases, US vs all other countries
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='IsUS', y='TotalPurchases', ax=ax)
ax.set_title('Total purchases: US vs Non-US')
plt.show()

In [None]:
# Cell 28: Product performance (revenue)

# Column totals of the product amounts, largest first.
product_revenue = df.loc[:, product_cols].sum().sort_values(ascending=False)

print(product_revenue)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=product_revenue.index, y=product_revenue.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Revenue by product category')
ax.set_ylabel('Total spend')
plt.show()

In [None]:
# Cell 29: Age distribution split by the Response flag (last campaign)

fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='Response', y='Age', ax=ax)
ax.set_title('Age vs last campaign response')
plt.show()

In [None]:
# Cell 30: Acceptance rate by age bins

age_bins = [18, 30, 40, 50, 60, 80, 100]
# pd.cut builds right-closed intervals, so by default the first bin is (18, 30]
# and a customer aged exactly 18 would get NaN and drop out of the groupby.
# include_lowest=True closes the first interval on the left so age 18 is kept.
# Ages outside 18..100 still fall in no bin and are excluded from the rates.
df['AgeBin'] = pd.cut(df['Age'], bins=age_bins, include_lowest=True)

# AgeBin is categorical: observed=False keeps every bin in the result (empty
# bins show NaN) and states the behaviour explicitly rather than relying on
# the library default.
age_response = df.groupby('AgeBin', observed=False)['Response'].mean()

print(age_response)

plt.figure(figsize=(8, 4))
age_response.plot(kind='bar')
plt.ylabel('Response rate')
plt.title('Last campaign acceptance rate by age bin')
plt.show()

In [None]:
# Cell 31: Number of last-campaign acceptors (Response == 1) per country

accepted_last = df['Response'] == 1
country_accepts = df.loc[accepted_last, 'Country'].value_counts()

print(country_accepts.head())

fig, ax = plt.subplots(figsize=(8, 4))
country_accepts.plot.bar(ax=ax)
ax.set_ylabel('Number of customers')
ax.set_title('Customers who accepted last campaign by country')
plt.show()

In [None]:
# Cell 32: Mean total spend for each number of children at home

children_spend = df.groupby('TotalChildren')['TotalSpend'].agg('mean')

print(children_spend)

fig, ax = plt.subplots(figsize=(6, 4))
children_spend.plot.bar(ax=ax)
ax.set_ylabel('Average total spend')
ax.set_title('Average total spend by number of children at home')
plt.show()

In [None]:
# Cell 33: Per-customer total spend plotted against number of children

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='TotalChildren', y='TotalSpend', ax=ax)
ax.set_title('Total spend vs number of children')
plt.show()

In [None]:
# Cell 34: Education mix (as proportions) among customers with Complain == 1

complained = df['Complain'] == 1
complain_edu = df.loc[complained, 'Education'].value_counts(normalize=True)

print(complain_edu)

fig, ax = plt.subplots(figsize=(6, 4))
complain_edu.plot.bar(ax=ax)
ax.set_ylabel('Proportion of complainers')
ax.set_title('Education of customers who complained in last 2 years')
plt.show()

In [None]:
# Cell 35: Write the processed and the one-hot-encoded frames to CSV (no index)

output_frames = {
    'marketing_data_processed.csv': df,
    'marketing_data_encoded.csv': df_encoded,
}
for csv_name, frame in output_frames.items():
    frame.to_csv(csv_name, index=False)

print('Saved marketing_data_processed.csv and marketing_data_encoded.csv')