In [17]:
# Data Manipulation & Analysis
import pandas as pd  
import numpy as np   

# Data Visualization
import matplotlib.pyplot as plt  
import seaborn as sns            

# Statistical Analysis
import scipy.stats as stats       
from statsmodels.stats.multicomp import pairwise_tukeyhsd 
from scipy.stats import mannwhitneyu, kruskal ,shapiro

# Bonus (for reporting)
import warnings                   
warnings.filterwarnings('ignore') 

In [8]:
# Load data
data = pd.read_csv("sandwich.csv")

# Set visual style
sns.set_style("whitegrid")  # Clean background for plots
# Preview data




In [32]:
# Set dark and gray style
plt.style.use('ggplot') 
sns.set_palette("gray")  # Gray-scale palette for boxplots
plt.rcParams['axes.facecolor'] = 'gray'  # Dark gray background
plt.rcParams['figure.facecolor'] = '#2e2e2e'
plt.rcParams['text.color'] = 'white'
plt.rcParams['axes.labelcolor'] = 'white'
plt.rcParams['xtick.color'] = 'white'
plt.rcParams['ytick.color'] = 'white'

# Function to add individual median labels to each box
def add_median_labels(ax, data, x, hue):
    # Calculate medians for each group
    medians = data.groupby([x, hue])['antCount'].median().reset_index()
    print(f"Medians for {x} by {hue}:\n", medians)  # Debug print
    # Get unique categories and hues
    categories = data[x].unique()
    hues = data[hue].unique()
    # Iterate through each category and hue
    box_width = 0.4 / len(hues)  # Approximate width of each box
    for i, cat in enumerate(categories):
        for j, hue_val in enumerate(hues):
            # Get median for the current group
            median_row = medians[(medians[x] == cat) & (medians[hue] == hue_val)]
            if not median_row.empty:
                group_median = median_row['antCount'].iloc[0]
                # Calculate x-position: center of the box
                x_pos = i + (j * box_width) + (box_width / 2) - 0.2
                # Place label at the median y-position, centered in the box
                ax.text(x_pos, group_median, f'{group_median:.1f}', 
                        ha='center', va='center', color='white', fontsize=9, 
                        weight='bold', bbox=dict(facecolor='black', alpha=0.8, edgecolor='white', boxstyle='round,pad=0.2'))
                print(f"Label added: {cat}, {hue_val}, Median={group_median:.1f}, x_pos={x_pos:.3f}")  # Debug print

# Boxplot 1: antCount by Bread Type (hue=butter)
plt.figure(figsize=(12, 6))
ax1 = sns.boxplot(x='bread', y='antCount', hue='butter', data=data, palette=['#666666', '#b3b3b3'], order=['Whole Grain', 'Rye', 'White', 'Multi Grain'])
add_median_labels(ax1, data, 'bread', 'butter')
plt.title('Ant Count by Bread Type (Butter vs. No Butter)', color='white', fontsize=14, pad=20)
plt.xlabel('Bread Type', color='white', fontsize=12)
plt.ylabel('Ant Count', color='white', fontsize=12)
plt.xticks(rotation=45, color='white', fontsize=10)
plt.yticks(fontsize=10)
plt.legend(title='Butter', loc='upper left', fontsize=10, title_fontsize=12)
plt.tight_layout()
plt.savefig('bread_boxplot_visible_medians.png', dpi=300, bbox_inches='tight', facecolor='#2e2e2e')
plt.close()

# Boxplot 2: antCount by Topping Type (hue=butter)
plt.figure(figsize=(12, 6))
ax2 = sns.boxplot(x='topping', y='antCount', hue='butter', data=data, palette=['#666666', '#b3b3b3'], order=['Ham and gherkins', 'Peanut butter', 'Yeast spread'])
add_median_labels(ax2, data, 'topping', 'butter')
plt.title('Ant Count by Topping Type (Butter vs. No Butter)', color='white', fontsize=14, pad=20)
plt.xlabel('Topping Type', color='white', fontsize=12)
plt.ylabel('Ant Count', color='white', fontsize=12)
plt.xticks(rotation=45, color='white', fontsize=10)
plt.yticks(fontsize=10)
plt.legend(title='Butter', loc='upper left', fontsize=10, title_fontsize=12)
plt.tight_layout()
plt.savefig('topping_boxplot_visible_medians.png', dpi=300, bbox_inches='tight', facecolor='#2e2e2e')
plt.close()

Medians for bread by butter:
          bread butter  antCount
0  Multi Grain     no      37.0
1  Multi Grain    yes      53.5
2          Rye     no      40.5
3          Rye    yes      42.0
4        White     no      40.0
5        White    yes      47.5
6  Whole Grain     no      33.5
7  Whole Grain    yes      54.0
Label added: Rye, no, Median=40.5, x_pos=-0.100
Label added: Rye, yes, Median=42.0, x_pos=0.100
Label added: Multi Grain, no, Median=37.0, x_pos=0.900
Label added: Multi Grain, yes, Median=53.5, x_pos=1.100
Label added: White, no, Median=40.0, x_pos=1.900
Label added: White, yes, Median=47.5, x_pos=2.100
Label added: Whole Grain, no, Median=33.5, x_pos=2.900
Label added: Whole Grain, yes, Median=54.0, x_pos=3.100
Medians for topping by butter:
             topping butter  antCount
0  Ham and gherkins     no      50.5
1  Ham and gherkins    yes      64.0
2     Peanut butter     no      30.0
3     Peanut butter    yes      48.5
4      Yeast spread     no      30.0
5      Yeas

In [20]:
# Verify dataset properties
print("Dataset Overview:")
print(f"Number of observations: {len(data)}")
print(f"Missing values:\n{data.isna().sum()}")
print("\nValue Counts:")
print("Bread:\n", data['bread'].value_counts())
print("Topping:\n", data['topping'].value_counts())
print("Butter:\n", data['butter'].value_counts())

# Descriptive Statistics
print("\nDescriptive Statistics for antCount:")
print(data['antCount'].describe())
print("\nMedians by Bread Type (Butter vs No Butter):")
print(data.groupby(['bread', 'butter'])['antCount'].median())
print("\nMedians by Topping (Butter vs No Butter):")
print(data.groupby(['topping', 'butter'])['antCount'].median())
print("\nIQRs by Bread Type and Butter:")
for bread in data['bread'].unique():
    for butter in ['no', 'yes']:
        subset = data[(data['bread'] == bread) & (data['butter'] == butter)]['antCount']
        if len(subset) > 0:
            q1, q3 = np.percentile(subset, [25, 75])
            print(f"{bread} (Butter: {butter}): Q1={q1:.1f}, Q3={q3:.1f}, IQR={q3-q1:.1f}")
print("\nIQRs by Topping and Butter:")
for topping in data['topping'].unique():
    for butter in ['no', 'yes']:
        subset = data[(data['topping'] == topping) & (data['butter'] == butter)]['antCount']
        if len(subset) > 0:
            q1, q3 = np.percentile(subset, [25, 75])
            print(f"{topping} (Butter: {butter}): Q1={q1:.1f}, Q3={q3:.1f}, IQR={q3-q1:.1f}")

# Normality Tests
print("\nShapiro-Wilk Normality Tests:")
for bread in data['bread'].unique():
    for butter in ['yes', 'no']:
        subset = data[(data['bread'] == bread) & (data['butter'] == butter)]['antCount']
        if len(subset) >= 3:  # Shapiro-Wilk requires at least 3 samples
            stat, p = shapiro(subset)
            print(f"{bread} (Butter: {butter}): p-value={p:.4f}")
for topping in data['topping'].unique():
    for butter in ['yes', 'no']:
        subset = data[(data['topping'] == topping) & (data['butter'] == butter)]['antCount']
        if len(subset) >= 3:
            stat, p = shapiro(subset)
            print(f"{topping} (Butter: {butter}): p-value={p:.4f}")

# Statistical Tests
# Mann-Whitney U Test: Butter (yes vs no)
butter_yes = data[data['butter'] == 'yes']['antCount']
butter_no = data[data['butter'] == 'no']['antCount']
mwu_stat, mwu_p = mannwhitneyu(butter_yes, butter_no, alternative='two-sided')
print("\nMann-Whitney U Test (Butter: Yes vs No):")
print(f"Statistic: {mwu_stat:.1f}, p-value: {mwu_p:.4f}")

# Kruskal-Wallis H-Test: Bread Type
bread_groups = [data[data['bread'] == bread]['antCount'] for bread in data['bread'].unique()]
kw_stat_bread, kw_p_bread = kruskal(*bread_groups)
print("\nKruskal-Wallis H-Test (Bread Type):")
print(f"Statistic: {kw_stat_bread:.2f}, p-value: {kw_p_bread:.4f}")

# Kruskal-Wallis H-Test: Topping Type
topping_groups = [data[data['topping'] == topping]['antCount'] for topping in data['topping'].unique()]
kw_stat_topping, kw_p_topping = kruskal(*topping_groups)
print("\nKruskal-Wallis H-Test (Topping Type):")
print(f"Statistic: {kw_stat_topping:.2f}, p-value: {kw_p_topping:.4f}")

Dataset Overview:
Number of observations: 48
Missing values:
antCount    0
bread       0
topping     0
butter      0
dtype: int64

Value Counts:
Bread:
 bread
Rye            12
Multi Grain    12
White          12
Whole Grain    12
Name: count, dtype: int64
Topping:
 topping
Ham and gherkins    16
Peanut butter       16
Yeast spread        16
Name: count, dtype: int64
Butter:
 butter
no     24
yes    24
Name: count, dtype: int64

Descriptive Statistics for antCount:
count    48.000000
mean     43.500000
std      15.148906
min      18.000000
25%      30.500000
50%      43.000000
75%      57.000000
max      76.000000
Name: antCount, dtype: float64

Medians by Bread Type (Butter vs No Butter):
bread        butter
Multi Grain  no        37.0
             yes       53.5
Rye          no        40.5
             yes       42.0
White        no        40.0
             yes       47.5
Whole Grain  no        33.5
             yes       54.0
Name: antCount, dtype: float64

Medians by Topping (Butte