In [None]:
import pandas as pd

In [None]:
# Load the Toyota sales records that the examples in this notebook analyse.
toyota_sales = pd.read_csv(
    'data/car_sales/toyota_sales_data.csv'
)

# Choosing the Right Measure: Decision Framework

## When to Use Each Measure

**Use MEAN when:**
- Data is fairly symmetric (no extreme outliers)
- You want to account for every value
- You need the value for further calculations (e.g., mean × count = total)

**Use MEDIAN when:**
- Data has outliers or is skewed
- You want the "typical middle" value
- Working with income, prices, or measurements with extremes

**Use MODE when:**
- Data is categorical
- You want the "most common" value
- Finding best-sellers or popular choices

## Example 1: Symmetric Data → Use MEAN

When mean and median are close, data is balanced. Mean is simpler and uses all data points.

In [None]:
# Select the sale amounts for Camry rows only. Camry is the notebook's example
# of a consistent model, so its mean and median are expected to land close together.
is_camry = toyota_sales['car_model'] == 'Camry'
camry_sales = toyota_sales.loc[is_camry, 'sale_amount']

# Centre, spread and sample size, one per line.
print(
    f"Mean:   ${camry_sales.mean():,.2f}",
    f"Median: ${camry_sales.median():,.2f}",
    f"Std Dev: ${camry_sales.std():,.2f}",
    f"Count: {len(camry_sales)}",
    sep="\n",
)

## Example 2: Skewed Data with Zeros → Use MEDIAN

When you have many zero values or outliers, median gives a better "typical" value.

In [None]:
# Load the commission dataset; the cells below read its commission_amount column.
toyota_commission = pd.read_csv(
    'data/car_sales/toyota_sales_with_commission.csv'
)

In [None]:
# Preview the first five rows to see what the commission data looks like.
toyota_commission.head(n=5)

In [None]:
# Compare mean and median commission, then count how many rows are exactly
# zero versus strictly positive to show how the zeros affect the average.
print(
    f"Mean commission:   ${toyota_commission['commission_amount'].mean():,.2f}",
    f"Median commission: ${toyota_commission['commission_amount'].median():,.2f}",
    f"\nZero commissions: {(toyota_commission['commission_amount'] == 0).sum()}",
    f"Non-zero commissions: {(toyota_commission['commission_amount'] > 0).sum()}",
    sep="\n",
)

In [None]:
# Keep only the strictly positive commissions so the zero rows no longer
# influence the statistics computed from this series.
non_zero_comm = toyota_commission.loc[
    toyota_commission['commission_amount'] > 0,
    'commission_amount',
]

In [None]:

# Mean and median of the positive-only commissions, one per line.
print(
    f"Non-zero Mean:   ${non_zero_comm.mean():,.2f}",
    f"Non-zero Median: ${non_zero_comm.median():,.2f}",
    sep="\n",
)

## Example 3: Categorical Data → Use MODE

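For text/category data, mode is the measure to use. A mean cannot be computed on categories, and a median only makes sense when the categories have a natural order (e.g., small / medium / large).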

In [None]:
# Most popular car model.
# mode() returns a Series (there can be ties), so take its first entry.
car_models = toyota_sales['car_model']
print("Mode (most popular):", car_models.mode()[0])

# value_counts() gives the frequency of every model for the full ranking.
print("\nAll models ranked:", car_models.value_counts(), sep="\n")

In [None]:
# Most common sale status.
# mode() returns a Series (there can be ties), so take its first entry.
sale_statuses = toyota_sales['sale_status']
print("Most common status:", sale_statuses.mode()[0])

# Frequency of every status value.
print("\nStatus breakdown:", sale_statuses.value_counts(), sep="\n")

## Practical Business Scenarios

Let's apply the decision framework to real questions.

In [None]:
# Scenario 1: "What's our average sale?"
# Compute each statistic once so the printed numbers and the decision text
# always come from the same values (the gap used to be a hard-coded "$366").
sale_mean = toyota_sales['sale_amount'].mean()
sale_median = toyota_sales['sale_amount'].median()
sale_gap = abs(sale_mean - sale_median)

# Rule of thumb for this notebook: a gap within 5% of the median is "close".
# Written as a multiplication so a zero median cannot cause a division error.
CLOSE_GAP_FRACTION = 0.05
gap_is_small = sale_gap <= CLOSE_GAP_FRACTION * abs(sale_median)

print("=== SCENARIO 1: Average Sale ===")
print(f"Mean:   ${sale_mean:,.2f}")
print(f"Median: ${sale_median:,.2f}")
print(f"Difference: ${sale_gap:,.2f}")
if gap_is_small:
    print(f"\nDecision: Mean and median are close (${sale_gap:,.0f} apart).")
    print("Either works, but MEAN is standard for 'average sale'.")
else:
    # Reached when the gap is large (or the statistics are NaN on empty data).
    print(f"\nDecision: Mean and median differ noticeably (${sale_gap:,.0f} apart).")
    print("The data looks skewed - report the MEDIAN alongside the mean.")

In [None]:
# Scenario 2: "What's a typical Tundra sale?"
print("\n=== SCENARIO 2: Typical Tundra Sale ===")
# Sale amounts for Tundra rows only.
tundra = toyota_sales.loc[toyota_sales['car_model'] == 'Tundra', 'sale_amount']
# Keep the spread in a variable so the decision line quotes the value that was
# actually computed instead of a number typed in by hand.
tundra_std = tundra.std()
print(f"Mean:   ${tundra.mean():,.2f}")
print(f"Median: ${tundra.median():,.2f}")
print(f"Std Dev: ${tundra_std:,.2f}")
print(f"\nDecision: High std dev (${tundra_std:,.0f}) suggests variability.")
print("MEDIAN is safer - not affected by a few very high or low sales.")

In [None]:
# Scenario 3: "Which car should we stock more of?"
# A popularity question about a categorical column, so the mode answers it.
stocked_models = toyota_sales['car_model']

print("\n=== SCENARIO 3: What to Stock ===")
print("Best-selling model (MODE):", stocked_models.mode()[0])
# First three entries of the frequency table.
print("\nTop 3:", stocked_models.value_counts().iloc[:3], sep="\n")
print("\nDecision: Use MODE - find the most popular product!")

## Quick Reference Guide

| Your Question | Which Measure | Why |
|--------------|---------------|-----|
| "What's the average?" | Mean | Standard interpretation, uses all data |
| "What's typical with outliers?" | Median | Resistant to extreme values |
| "What's most common?" | Mode | Shows frequency, works for categories |
| "Total ÷ Count = ?" | Mean | Mathematical calculations need mean |
| "What's the middle value?" | Median | Literal middle of sorted data |
| "What's our best-seller?" | Mode | Categorical popularity |

**Pro tip:** When in doubt, report both mean and median and explain the difference!

In [None]:
# One-stop comparison by car model
# A model whose sale-amount standard deviation exceeds this value is labelled
# 'Median'; every other model is labelled 'Mean'.
# NOTE(review): this is an absolute cut-off in sale_amount units, so it does
# not scale with price level - confirm it suits this data.
STD_DEV_THRESHOLD = 3000

# Named aggregation: each keyword becomes an output column, in this order.
comparison = (
    toyota_sales
    .groupby('car_model')['sale_amount']
    .agg(Mean='mean', Median='median', Std_Dev='std', Count='count')
    .round(2)
)

# Vectorised labelling instead of a row-wise apply. A NaN Std_Dev compares as
# False and is therefore labelled 'Mean', the same outcome as a plain
# `> threshold` test. Unlike apply(axis=1), this also works on an empty frame.
comparison['Best_Measure'] = (
    comparison['Std_Dev']
    .gt(STD_DEV_THRESHOLD)
    .map({True: 'Median', False: 'Mean'})
)

# Last expression of the cell: display the table ordered by mean sale amount.
comparison.sort_values('Mean')