In [1]:
import pandas as pd

In [2]:
# Base sales table for this notebook; the cells below read its
# car_model, sale_amount and sale_status columns.
sales_csv = 'data/car_sales/toyota_sales_data.csv'
toyota_sales = pd.read_csv(sales_csv)

# Choosing the Right Measure: Decision Framework

## When to Use Each Measure

**Use MEAN when:**
- Data is fairly symmetric (no extreme outliers)
- You want to account for every value
- You need the value for further calculations (e.g., totals or forecasts, since mean × count = total)

**Use MEDIAN when:**
- Data has outliers or is skewed
- You want the "typical middle" value
- Working with income, prices, or measurements with extremes

**Use MODE when:**
- Data is categorical
- You want the "most common" value
- Finding best-sellers or popular choices

## Example 1: Symmetric Data → Use MEAN

When the mean and median are close, the data is roughly symmetric. The mean is then a good default: it is the standard "average" and uses every data point.

In [3]:
# Isolate the Camry sale amounts, then report their centre, spread and size
is_camry = toyota_sales['car_model'].eq('Camry')
camry_sales = toyota_sales.loc[is_camry, 'sale_amount']

print(
    f"Mean:   ${camry_sales.mean():,.2f}",
    f"Median: ${camry_sales.median():,.2f}",
    f"Std Dev: ${camry_sales.std():,.2f}",
    f"Count: {len(camry_sales)}",
    sep="\n",
)

Mean:   $27,470.87
Median: $27,448.58
Std Dev: $1,430.89
Count: 856


## Example 2: Skewed Data with Zeros → Use MEDIAN

When you have many zero values or outliers, median gives a better "typical" value.

In [4]:
# Commission dataset for Example 2; the cells below read its
# commission_amount column.
commission_csv = 'data/car_sales/toyota_sales_with_commission.csv'
toyota_commission = pd.read_csv(commission_csv)

In [5]:
# Preview the first five rows to see which columns the commission file carries
toyota_commission.head(5)

Unnamed: 0,sale_id,sale_rep_id,sale_date,car_model,sale_amount,commission_pct,sale_status,commission_amount
0,1,16,2024-11-18,Tundra,44496.88,0.05,Completed,2224.84
1,2,11,2024-11-08,Tacoma,34824.72,,Pending,0.0
2,3,5,2024-11-03,Corolla,20275.08,,Completed,0.0
3,4,20,2024-11-06,Corolla,20068.93,,Completed,0.0
4,5,1,2024-11-26,Tundra,49811.99,0.03,Completed,1494.36


In [6]:
# Compare mean and median commission, then count how many sales paid
# nothing versus something.
commission_amounts = toyota_commission['commission_amount']
zero_count = commission_amounts.eq(0).sum()
non_zero_count = commission_amounts.gt(0).sum()

print(f"Mean commission:   ${commission_amounts.mean():,.2f}")
print(f"Median commission: ${commission_amounts.median():,.2f}")
print(f"\nZero commissions: {zero_count}")
print(f"Non-zero commissions: {non_zero_count}")

Mean commission:   $829.39
Median commission: $792.62

Zero commissions: 1274
Non-zero commissions: 3726


In [7]:
# Restrict to sales that actually paid a commission so the zeros no longer
# pull the statistics down.
paid_commission = toyota_commission['commission_amount'].gt(0)
non_zero_comm = toyota_commission.loc[paid_commission, 'commission_amount']

In [8]:

# Mean vs. median of the non-zero commissions only
print(
    f"Non-zero Mean:   ${non_zero_comm.mean():,.2f}",
    f"Non-zero Median: ${non_zero_comm.median():,.2f}",
    sep="\n",
)

Non-zero Mean:   $1,112.98
Non-zero Median: $1,005.76


## Example 3: Categorical Data → Use MODE

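For unordered text/category data, only the mode makes sense. The mean needs numeric values, and the median needs values that can at least be ordered, so neither works on categories like car models!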

In [9]:
# Most popular car model, followed by the full frequency ranking
models = toyota_sales['car_model']
top_model = models.mode().iloc[0]  # mode() can return several values; take the first

print("Mode (most popular):", top_model)
print("\nAll models ranked:")
print(models.value_counts())

Mode (most popular): RAV4

All models ranked:
car_model
RAV4          860
Camry         856
Corolla       827
Tacoma        826
Tundra        817
Highlander    814
Name: count, dtype: int64


In [10]:
# Most common sale status, followed by the count for every status
status_counts = toyota_sales['sale_status'].value_counts()
top_status = toyota_sales['sale_status'].mode().iloc[0]

print("Most common status:", top_status)
print("\nStatus breakdown:", status_counts, sep="\n")

Most common status: Completed

Status breakdown:
sale_status
Completed    3543
Pending      1075
Cancelled     382
Name: count, dtype: int64


## Practical Business Scenarios

Let's apply the decision framework to real questions.

In [12]:
# Scenario 1: "What's our average sale?"
# Compute each statistic once and reuse it, so the printed figures and the
# narrative line always agree with the data.
sale_amounts = toyota_sales['sale_amount']
sale_mean = sale_amounts.mean()
sale_median = sale_amounts.median()
mean_median_gap = abs(sale_mean - sale_median)

print("=== SCENARIO 1: Average Sale ===")
print(f"Mean:   ${sale_mean:,.2f}")
print(f"Median: ${sale_median:,.2f}")
print(f"Difference: ${mean_median_gap:,.2f}")
# The gap is interpolated (rounded to whole dollars) rather than typed by hand,
# so this sentence cannot go stale when the CSV changes.
print(f"\nDecision: Mean and median are close (${mean_median_gap:,.0f} apart).")
print("Either works, but MEAN is standard for 'average sale'.")

=== SCENARIO 1: Average Sale ===
Mean:   $32,979.71
Median: $32,613.76
Difference: $365.94

Decision: Mean and median are close ($366 apart).
Either works, but MEAN is standard for 'average sale'.


In [15]:
# Scenario 2: "What's a typical Tundra sale?"
print("\n=== SCENARIO 2: Typical Tundra Sale ===")
tundra = toyota_sales.loc[toyota_sales['car_model'] == 'Tundra', 'sale_amount']
tundra_std = tundra.std()  # computed once; reused in the narrative line below

print(f"Mean:   ${tundra.mean():,.2f}")
print(f"Median: ${tundra.median():,.2f}")
print(f"Std Dev: ${tundra_std:,.2f}")
# Interpolate the std dev (whole dollars) instead of hard-coding it, so the
# explanation stays in sync with the data.
print(f"\nDecision: High std dev (${tundra_std:,.0f}) suggests variability.")
print("MEDIAN is safer - not affected by a few very high or low sales.")


=== SCENARIO 2: Typical Tundra Sale ===
Mean:   $42,478.58
Median: $42,306.67
Std Dev: $4,236.08

Decision: High std dev ($4,236) suggests variability.
MEDIAN is safer - not affected by a few very high or low sales.


In [16]:
# Scenario 3: "Which car should we stock more of?"
# The mode names the single best seller; the frequency table gives the runners-up.
model_counts = toyota_sales['car_model'].value_counts()
best_seller = toyota_sales['car_model'].mode().iloc[0]

print("\n=== SCENARIO 3: What to Stock ===")
print("Best-selling model (MODE):", best_seller)
print("\nTop 3:", model_counts.iloc[:3], sep="\n")
print("\nDecision: Use MODE - find the most popular product!")


=== SCENARIO 3: What to Stock ===
Best-selling model (MODE): RAV4

Top 3:
car_model
RAV4       860
Camry      856
Corolla    827
Name: count, dtype: int64

Decision: Use MODE - find the most popular product!


## Quick Reference Guide

| Your Question | Which Measure | Why |
|--------------|---------------|-----|
| "What's the average?" | Mean | Standard interpretation, uses all data |
| "What's typical with outliers?" | Median | Resistant to extreme values |
| "What's most common?" | Mode | Shows frequency, works for categories |
| "Total ÷ Count = ?" | Mean | This is the definition of the mean; totals and projections build on it |
| "What's the middle value?" | Median | Literal middle of sorted data |
| "What's our best-seller?" | Mode | Categorical popularity |

**Pro tip:** When in doubt, report both mean and median and explain the difference!

In [17]:
# One-stop comparison by car model
# Rule-of-thumb cutoff used in this notebook: a model whose sale-amount
# standard deviation exceeds this value is labelled 'Median', otherwise 'Mean'.
STD_DEV_THRESHOLD = 3000

comparison = (
    toyota_sales.groupby('car_model')['sale_amount']
    # Named aggregation: output column name = aggregation function
    .agg(Mean='mean', Median='median', Std_Dev='std', Count='count')
    .round(2)
    # Vectorized label instead of a row-by-row apply; a missing std dev
    # compares as False and therefore falls back to 'Mean'.
    .assign(
        Best_Measure=lambda stats: stats['Std_Dev']
        .gt(STD_DEV_THRESHOLD)
        .map({True: 'Median', False: 'Mean'})
    )
)

# Show the models from lowest to highest mean sale amount
comparison.sort_values('Mean')

Unnamed: 0_level_0,Mean,Median,Std_Dev,Count,Best_Measure
car_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Corolla,22442.18,22477.28,1460.47,827,Mean
Camry,27470.87,27448.58,1430.89,856,Mean
RAV4,30985.72,30922.64,2356.26,860,Mean
Tacoma,34998.46,34989.82,2906.54,826,Mean
Highlander,40002.87,39988.89,2912.37,814,Mean
Tundra,42478.58,42306.67,4236.08,817,Median
