In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Let's simulate some coffee shop waiting times
np.random.seed(42)  # for reproducibility

# Scenario: The manager claims average wait time is 3 minutes
# You collected data for 60 customers over a week
# Here's your sample data (in minutes):
waiting_times = np.random.normal(loc=3.4, scale=0.8, size=60)

# Let's look at our data
print(f"Sample mean: {waiting_times.mean():.2f} minutes")
print(f"Sample standard deviation: {waiting_times.std():.2f} minutes")
print(f"Sample size: {len(waiting_times)}")

Sample mean: 3.28 minutes
Sample standard deviation: 0.72 minutes
Sample size: 60


### Task 1: Perform a one-sample t-test
You need to test whether the average waiting time is significantly different from 3 minutes. Since we don't know the true population standard deviation, we'll use a t-test.
Start by setting up your hypotheses clearly:<br>
$H_0: \mu = 3$ (the population mean is 3 minutes) <br>
$H_A: \mu \neq 3$ (the population mean is not 3 minutes — two-tailed test)

In [3]:
# Your code here - try to:
# 1. Calculate the test statistic manually first
x_hat = np.mean(waiting_times)
hyp_mean = 3.0
sd_sample = np.std(waiting_times, ddof=1)
n = len(waiting_times)
ddof = n - 1

t_stats = (x_hat - hyp_mean) / (sd_sample / np.sqrt(n))
p_value = 2 * (1 - stats.t.cdf(abs(t_stats), df=ddof))

print(f"manual test statistic: {t_stats}")
print(f"manual p-value: {p_value}")

manual test statistic: 2.944382096364252
manual p-value: 0.004623249382156747


In [5]:
# 2. Then use scipy.stats.ttest_1samp() to verify
ttest, pvalue = stats.ttest_1samp(
    a=waiting_times, popmean=hyp_mean, alternative="two-sided"
)

print(f"scipy test statistic: {ttest}")
print(f"scipy p-value: {pvalue}")

scipy test statistic: 2.9443820963642517
scipy p-value: 0.004623249382156754


### Task 2. Interpret the results
1) If the true average waiting time really were 3 minutes, there's only a 0.46% chance we'd see a sample mean as far from 3 as we did (3.24 minutes in this case)
2) It's really unlikely our data came from a population with a mean of 3 minutes. So we reject that idea and conclude the true mean is probably different from 3 minutes.

### Task 3: Explore the effect of sample size
1. Is the sample mean of the small sample similar to the full sample?
2. What happened to the t-statistic? Did it get larger or smaller?
3. What happened to the p-value?
4. With the small sample, would you still reject the null hypothesis at α = 0.05?
5. Why do you think sample size has this effect?

In [6]:
# Task 3: Effect of sample size
small_sample = waiting_times[:10]

print(f"\n--- Small Sample Analysis ---")
print(f"Small sample mean: {small_sample.mean():.2f} minutes")
print(f"Small sample std: {small_sample.std(ddof=1):.2f} minutes")
print(f"Small sample size: {len(small_sample)}")

# Calculate t-test for small sample
t_small, p_small = stats.ttest_1samp(small_sample, popmean=3.0, alternative="two-sided")

print(f"\nSmall sample t-statistic: {t_small:.4f}")
print(f"Small sample p-value: {p_small:.4f}")

# Compare with your original results
print(f"\n--- Comparison ---")
print(f"Original (n=60): t = {ttest:.4f}, p = {pvalue:.4f}")
print(f"Small (n=10): t = {t_small:.4f}, p = {p_small:.4f}")


--- Small Sample Analysis ---
Small sample mean: 3.76 minutes
Small sample std: 0.58 minutes
Small sample size: 10

Small sample t-statistic: 4.1466
Small sample p-value: 0.0025

--- Comparison ---
Original (n=60): t = 2.9444, p = 0.0046
Small (n=10): t = 4.1466, p = 0.0025


### Two-Sample T-Tests: Comparing Groups

In [7]:
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)

# Let's simulate waiting times for two shifts
# Morning shift: generally faster (experienced baristas, regular customers who know their orders)
morning_wait_times = np.random.normal(loc=2.8, scale=0.6, size=45)

# Evening shift: generally slower (tired baristas, more complex orders, people less rushed)
evening_wait_times = np.random.normal(loc=3.5, scale=0.9, size=50)

# Let's organize this in a way that mirrors real data
coffee_data = pd.DataFrame(
    {
        "wait_time": np.concatenate([morning_wait_times, evening_wait_times]),
        "shift": ["morning"] * 45 + ["evening"] * 50,
    }
)

print("Sample summary statistics:")
print(coffee_data.groupby("shift")["wait_time"].agg(["count", "mean", "std"]))

Sample summary statistics:
         count      mean       std
shift                             
evening     50  3.508677  0.815090
morning     45  2.670254  0.558168


## 2. Independent Two-Sample t-Test (Equal Variances)

This test compares the means of two independent groups, assuming they have similar variability. 

**Formula:**
$ t = \frac{\bar{x}_1 - \bar{x}_2}{s_p \cdot \sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}$

**Pooled Standard Deviation:**
$ s_p = \sqrt{\frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1 + n_2 - 2}}$

Where:

- $\bar{x}_1, \bar{x}_2$ = means of groups 1 and 2
- $s_1^2, s_2^2$ = variances of groups 1 and 2
- $n_1, n_2$ = sample sizes of groups 1 and 2
- $s_p$ = pooled standard deviation (weighted average of the two standard deviations)

**Degrees of freedom:** $df = n_1 + n_2 - 2$

In [8]:
morning_data, evening_data = (
    coffee_data[coffee_data["shift"] == shift]["wait_time"].values
    for shift in ["morning", "evening"]
)

In [9]:
x_morning, x_evening = np.mean(morning_data), np.mean(evening_data)
var_morning, var_evening = np.var(morning_data, ddof=1), np.var(evening_data, ddof=1)
n_morning, n_evening = len(morning_data), len(evening_data)
sd_morning, sd_evening = np.std(morning_data, ddof=1), np.std(evening_data, ddof=1)

In [10]:
# Welch’s t-Test (Equal Variance)
t_stat_welch, p_val_welch = stats.ttest_ind(
    morning_data,
    evening_data,
    equal_var=False,
)

# Welch’s t-Test (Non-equal Variance)
t_stat_welch_eq, p_val_welch_eq = stats.ttest_ind(
    morning_data,
    evening_data,
    equal_var=True,
)

se_morning, se_evening = stats.sem(morning_data), stats.sem(evening_data)
welch_df = (se_morning**2 + se_evening**2) ** 2 / (
    se_morning**4 / (len(morning_data) - 1) + se_evening**4 / (len(evening_data) - 1)
)
print(f"degrees of freedom: {welch_df:.2f}")
print(f"\nt-statistic (equal variance): {t_stat_welch:.4f}")
print(f"p-value (equal variance): {p_val_welch:.4e}")
print(f"\nt-statistic (non-equal variance): {t_stat_welch_eq:.4e}")
print(f"p-value (non-equal variance): {p_val_welch_eq:.4e}")

degrees of freedom: 87.05

t-statistic (equal variance): -5.8975
p-value (equal variance): 6.8771e-08

t-statistic (non-equal variance): -5.7852e+00
p-value (non-equal variance): 9.6727e-08


In [11]:
# Pooled Standard Deviation
pooled_num = (n_morning - 1) * var_morning + (n_evening - 1) * var_evening
pooled_den = n_morning + n_evening - 2
pooled_sd = np.sqrt(pooled_num / pooled_den)

In [12]:
print(f"SD via numpy with ddof=1: morning: {sd_morning}, evening: {sd_evening}")
print(f"Pooled SD (manual calculation): {pooled_sd}")

SD via numpy with ddof=1: morning: 0.5581683340428335, evening: 0.8150902303627161
Pooled SD (manual calculation): 0.7052988020769762


### Task 1: Interpret the Results
What's your conclusion at α = 0.05? Do morning and evening shifts have different average wait times? <br>
Why do you think Welch's test and the pooled variance test give slightly different results? <br>
Which test would you trust more in this case, and why? (Hint: look at the standard deviations of the two groups) <br>

In [None]:
print("Breaking down pooled standard deviation:")
print(f"Morning: n={n_morning}, variance={var_morning:.4f}, sd={sd_morning:.4f}")
print(f"Evening: n={n_evening}, variance={var_evening:.4f}, sd={sd_evening:.4f}")

# Each group contributes to the pooled variance weighted by its degrees of freedom
morning_contribution = (n_morning - 1) * var_morning
evening_contribution = (n_evening - 1) * var_evening

print(
    f"\nMorning contributes: {n_morning-1} × {var_morning:.4f} = {morning_contribution:.4f}"
)
print(
    f"Evening contributes: {n_evening-1} × {var_evening:.4f} = {evening_contribution:.4f}"
)

total_df = (n_morning - 1) + (n_evening - 1)
pooled_variance = (morning_contribution + evening_contribution) / total_df

print(f"\nPooled variance = {pooled_variance:.4f}")
print(f"Pooled SD = √{pooled_variance:.4f} = {np.sqrt(pooled_variance):.4f}")

# Compare to simple average (this would be wrong!)
simple_avg_sd = (sd_morning + sd_evening) / 2
print(f"\nSimple average of SDs (wrong!): {simple_avg_sd:.4f}")
print(f"Correct pooled SD: {pooled_sd:.4f}")

Breaking down pooled standard deviation:
Morning: n=45, variance=0.3116, sd=0.5582
Evening: n=50, variance=0.6644, sd=0.8151

Morning contributes: 44 × 0.3116 = 13.7083
Evening contributes: 49 × 0.6644 = 32.5542

Pooled variance = 0.4974
Pooled SD = √0.4974 = 0.7053

Simple average of SDs (wrong!): 0.6866
Correct pooled SD: 0.7053


In [26]:
# Standard error calculation for each approach

# Welch's approach (keeps variances separate)
se_welch = np.sqrt((var_morning / n_morning) + (var_evening / n_evening))
t_welch_manual = (x_morning - x_evening) / se_welch

print("Welch's approach:")
print(f"SE = ({var_morning:.4f}/{n_morning} + {var_evening:.4f}/{n_evening})")
print(f"SE = ({var_morning/n_morning:.6f} + {var_evening/n_evening:.6f})")
print(f"SE = {se_welch:.4f}")
print(f"t = {t_welch_manual:.4f}")

# Pooled approach (uses common variance)
se_pooled = pooled_sd * np.sqrt(1 / n_morning + 1 / n_evening)
t_pooled_manual = (x_morning - x_evening) / se_pooled

print("\nPooled approach:")
print(f"SE = {pooled_sd:.4f} × (1/{n_morning} + 1/{n_evening})")
print(f"SE = {pooled_sd:.4f} × {np.sqrt(1/n_morning + 1/n_evening):.4f}")
print(f"SE = {se_pooled:.4f}")
print(f"t = {t_pooled_manual:.4f}")

print(f"\nDifference in SE: {se_welch - se_pooled:.4f}")
print(f"Welch's SE is {'larger' if se_welch > se_pooled else 'smaller'}")

Welch's approach:
SE = (0.3116/45 + 0.6644/50)
SE = (0.006923 + 0.013287)
SE = 0.1422
t = -5.8975

Pooled approach:
SE = 0.7053 × (1/45 + 1/50)
SE = 0.7053 × 0.2055
SE = 0.1449
t = -5.7852

Difference in SE: -0.0028
Welch's SE is smaller


In [29]:
# Degrees of freedom comparison
df_pooled = n_morning + n_evening - 2
df_welch = welch_df

print(f"Pooled test df: {df_pooled}")
print(f"Welch's test df: {df_welch:.1f}")
print(f"Difference: {df_pooled - df_welch:.1f}")

Pooled test df: 93
Welch's test df: 87.0
Difference: 6.0


In [30]:
# Test for equal variances (Levene's test)
stat_levene, p_levene = stats.levene(morning_data, evening_data)
print("Levene's test for equal variances:")
print(f"Statistic: {stat_levene:.4f}")
print(f"p-value: {p_levene:.4f}")
print(
    f"Conclusion: {'Equal variances' if p_levene > 0.05 else 'Unequal variances'} (at α=0.05)"
)

# Variance ratio
variance_ratio = var_evening / var_morning
print(f"\nVariance ratio (evening/morning): {variance_ratio:.2f}")
print("Rule of thumb: ratios > 2 or < 0.5 suggest unequal variances")

Levene's test for equal variances:
Statistic: 5.9471
p-value: 0.0166
Conclusion: Unequal variances (at α=0.05)

Variance ratio (evening/morning): 2.13
Rule of thumb: ratios > 2 or < 0.5 suggest unequal variances
