# Statistical Analysis: Tax Comparison Between Yangon and Naypyitaw

**Research Question:** We hypothesize that average taxes in Yangon are higher than in Naypyitaw.

**Note:** This is a **one-tailed test** (directional hypothesis: Yangon > Naypyitaw)

---

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Read the supermarket sales CSV into the DataFrame used by the cells below,
# then report how many rows (transactions) it contains.
df = pd.read_csv('supermarket_sales.csv')
n_transactions = df.shape[0]
print(f"Dataset loaded: {n_transactions} transactions\n")

## 1. Descriptive Statistics

In [None]:
# Split the 'Tax 5%' column into one Series per city being compared.
yangon_taxes = df[df['City'] == 'Yangon']['Tax 5%']
naypyitaw_taxes = df[df['City'] == 'Naypyitaw']['Tax 5%']

# Fail early on an empty group: every statistic below would otherwise be NaN
# and the printed narrative would silently fall into the wrong branch.
if yangon_taxes.empty or naypyitaw_taxes.empty:
    raise ValueError(
        "No rows found for 'Yangon' and/or 'Naypyitaw' in the 'City' column"
    )

# Per-city summary statistics (mean, standard deviation, sample size).
yangon_mean = yangon_taxes.mean()
yangon_std = yangon_taxes.std()
yangon_n = len(yangon_taxes)

naypyitaw_mean = naypyitaw_taxes.mean()
naypyitaw_std = naypyitaw_taxes.std()
naypyitaw_n = len(naypyitaw_taxes)

# Totals are only used to show each city's share of the combined tax amount.
yangon_total = yangon_taxes.sum()
naypyitaw_total = naypyitaw_taxes.sum()
total_taxes = yangon_total + naypyitaw_total

print("="*70)
print("DESCRIPTIVE STATISTICS")
print("="*70)
print(f"\n{'City':<15} {'Mean ($)':<15} {'Std Dev ($)':<15} {'Sample Size':<15}")
print("-"*70)
print(f"{'Yangon':<15} {yangon_mean:<15.2f} {yangon_std:<15.2f} {yangon_n:<15}")
print(f"{'Naypyitaw':<15} {naypyitaw_mean:<15.2f} {naypyitaw_std:<15.2f} {naypyitaw_n:<15}")

print(f"\n{'='*70}")
print("TOTAL TAXES COLLECTED")
print("="*70)
print(f"Yangon:    ${yangon_total:,.2f} ({yangon_total/total_taxes*100:.2f}%)")
print(f"Naypyitaw: ${naypyitaw_total:,.2f} ({naypyitaw_total/total_taxes*100:.2f}%)")
print(f"Difference: {abs(yangon_total/total_taxes - naypyitaw_total/total_taxes)*100:.2f} percentage points")

print(f"\n{'='*70}")
print("INITIAL OBSERVATION")
print("="*70)
# Three-way comparison so that equal means are not reported as
# "Naypyitaw has higher average taxes".
if yangon_mean > naypyitaw_mean:
    print(f"Yangon mean (${yangon_mean:.2f}) > Naypyitaw mean (${naypyitaw_mean:.2f})")
    print("→ Data supports the hypothesis direction")
elif yangon_mean < naypyitaw_mean:
    print(f"Yangon mean (${yangon_mean:.2f}) < Naypyitaw mean (${naypyitaw_mean:.2f})")
    print("→ Data contradicts the hypothesis! Naypyitaw has higher average taxes.")
else:
    print(f"Yangon mean (${yangon_mean:.2f}) = Naypyitaw mean (${naypyitaw_mean:.2f})")
    print("→ Data shows no difference in average taxes between the two cities.")

## 2. Calculate Test Statistics

In [None]:
# Step 1: Difference in sample means (Yangon minus Naypyitaw, so a positive
# value points in the direction of the hypothesis).
mean_diff = yangon_mean - naypyitaw_mean
print("="*70)
print("STEP 1: DIFFERENCE IN SAMPLE MEANS")
print("="*70)
print(f"x̄₁ - x̄₂ = {yangon_mean:.2f} - {naypyitaw_mean:.2f} = {mean_diff:.2f}")
# Describe the sign that was actually observed instead of assuming it.
if mean_diff < 0:
    print("\nNote: Negative value indicates Yangon < Naypyitaw (opposite of hypothesis)")
elif mean_diff > 0:
    print("\nNote: Positive value indicates Yangon > Naypyitaw (same direction as hypothesis)")
else:
    print("\nNote: Zero difference indicates equal sample means")

# Step 2: Pooled standard deviation (weights each sample variance by n - 1).
pooled_var = ((yangon_n - 1) * yangon_std**2 + (naypyitaw_n - 1) * naypyitaw_std**2) / (yangon_n + naypyitaw_n - 2)
pooled_std = np.sqrt(pooled_var)
print(f"\n{'='*70}")
print("STEP 2: POOLED STANDARD DEVIATION")
print("="*70)
print("sp = √[((n₁-1)s₁² + (n₂-1)s₂²) / (n₁+n₂-2)]")
print(f"sp = √[(({yangon_n}-1)×{yangon_std:.2f}² + ({naypyitaw_n}-1)×{naypyitaw_std:.2f}²) / {yangon_n+naypyitaw_n-2}]")
print(f"sp = {pooled_std:.2f}")

# Step 3: Standard error of the difference in means.
size_factor = np.sqrt(1/yangon_n + 1/naypyitaw_n)
se = pooled_std * size_factor
print(f"\n{'='*70}")
print("STEP 3: STANDARD ERROR")
print("="*70)
print("SE = sp × √(1/n₁ + 1/n₂)")
print(f"SE = {pooled_std:.2f} × √(1/{yangon_n} + 1/{naypyitaw_n})")
print(f"SE = {pooled_std:.2f} × {size_factor:.4f}")
print(f"SE = {se:.2f}")

# Step 4: 95% Confidence Interval (two-sided).
# The critical value is taken from the t distribution with n₁ + n₂ - 2 degrees
# of freedom, matching the printed formula, rather than the normal
# approximation 1.96.
ci_dof = yangon_n + naypyitaw_n - 2
t_critical = stats.t.ppf(0.975, ci_dof)
ci_lower = mean_diff - t_critical * se
ci_upper = mean_diff + t_critical * se
ci_includes_zero = ci_lower <= 0 <= ci_upper
print(f"\n{'='*70}")
print("STEP 4: 95% CONFIDENCE INTERVAL")
print("="*70)
print("CI = (x̄₁ - x̄₂) ± t₀.₀₂₅,df × SE")
print(f"CI = {mean_diff:.2f} ± {t_critical:.3f} × {se:.2f}")
print(f"CI = {mean_diff:.2f} ± {t_critical * se:.2f}")
print(f"CI = [{ci_lower:.2f}, {ci_upper:.2f}]")
print(f"\nInterpretation: CI {'includes' if ci_includes_zero else 'does not include'} 0")
if ci_includes_zero:
    print("→ Suggests no significant difference")
else:
    print("→ Suggests significant difference exists")

## 3. Hypothesis Testing (One-Tailed)

In [None]:
# One-tailed two-sample t-test of H₁: mean Yangon tax > mean Naypyitaw tax.
print("="*70)
print("HYPOTHESIS TEST (ONE-TAILED)")
print("="*70)
print("\nH₀: μ_Yangon ≤ μ_Naypyitaw (Yangon taxes are not higher)")
print("H₁: μ_Yangon > μ_Naypyitaw (Yangon taxes are higher)")
print("\nSignificance level: α = 0.05")
print("\nNote: This is a ONE-TAILED test (testing if Yangon > Naypyitaw)")

# Calculate t-statistic from the difference in means and its standard error.
t_stat = mean_diff / se
# NOTE(review): this rebinds `df`, which held the DataFrame loaded at the top
# of the notebook. Re-run the data-loading cell before re-running any cell
# that indexes `df` as a DataFrame. The name is kept because the summary
# cell prints `df` as the degrees of freedom.
df = yangon_n + naypyitaw_n - 2

print(f"\n{'='*70}")
print("T-STATISTIC")
print("="*70)
print(f"t = (x̄₁ - x̄₂) / SE = {mean_diff:.2f} / {se:.2f} = {t_stat:.3f}")
print(f"df = n₁ + n₂ - 2 = {yangon_n} + {naypyitaw_n} - 2 = {df}")

# scipy returns a two-tailed p-value; it is converted to one-tailed below.
t_test_result = stats.ttest_ind(yangon_taxes, naypyitaw_taxes)
p_value_two_tailed = t_test_result.pvalue

# For H₁ "greater": if t > 0 the one-tailed p is half the two-tailed p;
# otherwise the observed effect is in the wrong direction, so p = 1 - p/2.
if t_stat > 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

print(f"\n{'='*70}")
print("P-VALUE")
print("="*70)
print(f"For one-tailed test with t = {t_stat:.3f} and df = {df}:")
print(f"Two-tailed p-value: {p_value_two_tailed:.4f}")
print(f"One-tailed p-value: {p_value_one_tailed:.4f}")
# Explain the one-tailed value according to the sign that was actually observed.
if t_stat < 0:
    print("\nNote: Since t < 0 (opposite direction), p-value is very high")
elif t_stat > 0:
    print("\nNote: Since t > 0 (hypothesized direction), one-tailed p-value is half the two-tailed p-value")
else:
    print("\nNote: Since t = 0, one-tailed p-value is 0.5")

## 4. Effect Size

In [None]:
# Cohen's d: the mean difference expressed in units of the pooled
# standard deviation.
cohens_d = mean_diff / pooled_std

print("=" * 70)
print("EFFECT SIZE (Cohen's d)")
print("=" * 70)
print(f"d = (x̄₁ - x̄₂) / sp = {mean_diff:.2f} / {pooled_std:.2f} = {cohens_d:.3f}")

# Walk the upper bounds in ascending order; the first bound that |d| falls
# below names the effect, and anything at or above the last bound is "Large".
for upper_bound, label in ((0.2, "Negligible"), (0.5, "Small"), (0.8, "Medium")):
    if abs(cohens_d) < upper_bound:
        effect = label
        break
else:
    effect = "Large"

print(f"\nInterpretation: {effect} effect (|d| {'<' if abs(cohens_d) < 0.2 else '>='} 0.2)")

## 5. Decision and Conclusion

In [None]:
# Apply the decision rule to the one-tailed p-value and print the conclusion.
print("="*70)
print("DECISION")
print("="*70)
print("\nDecision rule: Reject H₀ if p < 0.05 (one-tailed)")

reject_null = p_value_one_tailed < 0.05
# Use '≥' (not '>') for the non-rejection case so p exactly equal to 0.05
# is described correctly.
print(f"\nSince p = {p_value_one_tailed:.4f} {'<' if reject_null else '≥'} 0.05:")

if reject_null:
    decision = "REJECT H₀"
    conclusion = "The hypothesis is SUPPORTED: Yangon taxes are significantly higher than Naypyitaw."
else:
    decision = "FAIL TO REJECT H₀"
    conclusion = "The hypothesis is NOT SUPPORTED: Yangon taxes are NOT significantly higher than Naypyitaw."

print(f"\n→ {decision}")

print(f"\n{'='*70}")
print("CONCLUSION")
print("="*70)
print(f"\n{conclusion}")

# When the observed difference runs against the hypothesis, say so explicitly.
if mean_diff < 0:
    print("\nIn fact, the data shows the OPPOSITE:")
    print(f"Naypyitaw has higher average taxes (${naypyitaw_mean:.2f}) than Yangon (${yangon_mean:.2f}).")
    print(f"The difference is ${abs(mean_diff):.2f}.")

# This direction-agnostic statement uses the two-tailed p-value; label it so
# it is not confused with the one-tailed decision above.
print(f"\nThis difference is {'statistically significant' if p_value_two_tailed < 0.05 else 'NOT statistically significant'} (two-tailed test, α = 0.05).")
print(f"Effect size: {effect} (Cohen's d = {cohens_d:.3f})")

## 6. Summary Table

In [None]:
# Print a one-table recap of every statistic computed in the earlier cells.

# Recompute the degrees of freedom here so this cell does not depend on the
# name `df`, which the notebook also uses for the loaded DataFrame.
summary_dof = yangon_n + naypyitaw_n - 2
# A backslash-escaped quote inside an f-string replacement field is a
# SyntaxError before Python 3.12, so the label is bound to a name first.
cohens_label = "Cohen's d"

print("="*70)
print("SUMMARY OF RESULTS")
print("="*70)
print(f"\n{'Statistic':<30} {'Value':<20}")
print("-"*70)
print(f"{'Difference in means':<30} ${mean_diff:.2f}")
print(f"{'Pooled std deviation':<30} ${pooled_std:.2f}")
print(f"{'Standard error':<30} ${se:.2f}")
print(f"{'T-statistic':<30} {t_stat:.3f}")
print(f"{'Degrees of freedom':<30} {summary_dof}")
print(f"{'P-value (one-tailed)':<30} {p_value_one_tailed:.4f}")
print(f"{cohens_label:<30} {cohens_d:.3f} ({effect})")
print(f"{'95% CI':<30} [{ci_lower:.2f}, {ci_upper:.2f}]")
print(f"{'Decision':<30} {decision}")
print(f"{'Hypothesis':<30} {'Supported' if p_value_one_tailed < 0.05 else 'NOT Supported'}")
print("="*70)