In [None]:
import pandas as pd

In [None]:
# Load the Toyota sales records from a CSV file given by a relative path.
sales_csv_path = 'data/car_sales/toyota_sales_data.csv'
toyota_sales = pd.read_csv(sales_csv_path)

# Standard Deviation - Variance in Original Units

**Standard deviation = √Variance**

## Why Standard Deviation?
- Same concept as variance (measures spread)
- But in **original units** (dollars, not squared dollars!)
- Much easier to interpret

- **Low std dev:** Data clustered near the mean
- **High std dev:** Data spread far from the mean

In [None]:
# Pull out the sale_amount column; the variance and standard deviation
# in the following cells are both computed from this selection.
sale_amounts = toyota_sales.loc[:, "sale_amount"]

In [None]:
# Sample variance: ddof=1 is the pandas default, spelled out here for clarity.
variance = sale_amounts.var(ddof=1)

In [None]:
# Sample standard deviation: ddof=1 is the pandas default, spelled out here for clarity.
std_dev = sale_amounts.std(ddof=1)


In [None]:
# Show both measures of spread with the same currency formatting.
print(
    f"Variance: ${variance:,.2f}",
    f"Standard Deviation: ${std_dev:,.2f}",
    sep="\n",
)

In [None]:
# Show that the square root of the variance matches the standard deviation.
print("\nRelationship check:")
print(f"√Variance = ${variance**0.5:,.2f}")
print(f"Std Dev = ${std_dev:,.2f}")

## Standard Deviation in Our Data

Let's see how spread differs across car models.

In [None]:
# Per-model summary of sale_amount: mean, variance, standard deviation and
# non-null count, rounded to two decimals.
sale_amount_by_model = toyota_sales.groupby('car_model')['sale_amount']
std_by_model = sale_amount_by_model.agg(
    Mean='mean',
    Variance='var',
    Std_Dev='std',
    Count='count',
).round(2)

# Display the models with the largest spread first.
std_by_model.sort_values(by='Std_Dev', ascending=False)

In [None]:
# Print the mean, spread and one-standard-deviation band for Tundra sales
print("=== TUNDRA ===")
tundra = toyota_sales.loc[toyota_sales['car_model'] == 'Tundra', 'sale_amount']
tundra_mean = tundra.mean()
tundra_std = tundra.std()
print(f"Mean: ${tundra_mean:,.2f}")
print(f"Std Dev: ${tundra_std:,.2f}")
print(f"Typical range: ${tundra_mean - tundra_std:,.2f} to ${tundra_mean + tundra_std:,.2f}")


In [None]:
# Print the mean, spread and one-standard-deviation band for Camry sales
print("\n=== CAMRY ===")
camry = toyota_sales.loc[toyota_sales['car_model'] == 'Camry', 'sale_amount']
camry_mean = camry.mean()
camry_std = camry.std()
print(f"Mean: ${camry_mean:,.2f}")
print(f"Std Dev: ${camry_std:,.2f}")
print(f"Typical range: ${camry_mean - camry_std:,.2f} to ${camry_mean + camry_std:,.2f}")

## The 68-95-99.7 Rule (For Normal Distributions)

For data that's roughly bell-shaped:
- About **68%** of data falls within 1 std dev of the mean
- About **95%** of data falls within 2 std devs of the mean
- About **99.7%** of data falls within 3 std devs of the mean

This is a helpful guideline for understanding spread.

In [None]:
# 68% rule for Camry, step 1: select the Camry sale amounts and compute
# their mean and standard deviation.
camry = toyota_sales.loc[toyota_sales['car_model'] == 'Camry', 'sale_amount']
mean, std = camry.mean(), camry.std()

In [None]:
# Count the Camry sales inside [mean - std, mean + std]; both endpoints are included.
in_band = camry.between(mean - std, mean + std)
within_1std = in_band.sum()
total = len(camry)
percentage = within_1std / total * 100

In [None]:
# Report the Camry one-standard-deviation band and the share of sales inside it.
lower, upper = mean - std, mean + std
print(
    f"Camry Mean: ${mean:,.2f}",
    f"Camry Std Dev: ${std:,.2f}",
    f"Range: ${lower:,.2f} to ${upper:,.2f}",
    f"\nSales within 1 std dev: {within_1std} out of {total}",
    f"Percentage: {percentage:.1f}%",
    sep="\n",
)

In [None]:
# Check the 68% rule for Corolla
corolla = toyota_sales.loc[toyota_sales['car_model'] == 'Corolla', 'sale_amount']
# NOTE: `mean` and `std` are reassigned here, replacing the Camry values
# computed in the earlier cells.
mean = corolla.mean()
std = corolla.std()

In [None]:
# Count the Corolla sales inside [mean - std, mean + std]; both endpoints are included.
in_band = corolla.between(mean - std, mean + std)
within_1std = in_band.sum()
total = len(corolla)
percentage = within_1std / total * 100

In [None]:
# Report the Corolla one-standard-deviation band and the share of sales inside it.
lower, upper = mean - std, mean + std
print(
    f"Corolla Mean: ${mean:,.2f}",
    f"Corolla Std Dev: ${std:,.2f}",
    f"Range: ${lower:,.2f} to ${upper:,.2f}",
    f"\nSales within 1 std dev: {within_1std} out of {total}",
    f"Percentage: {percentage:.1f}%",
    sep="\n",
)

In [None]:
# Compare all models side by side
model_stats = toyota_sales.groupby('car_model')['sale_amount'].agg([
    ('Mean', 'mean'),
    ('Std_Dev', 'std')
])

# Whole-dollar rounding is for display only.
comparison = model_stats.round(0)

# Add a column showing std dev as a percentage of the mean.
# The ratio is taken from the unrounded statistics so that the whole-dollar
# rounding above does not leak into the percentage.
comparison['Variability_%'] = (
    (model_stats['Std_Dev'] / model_stats['Mean']) * 100
).round(1)

comparison.sort_values('Std_Dev', ascending=False)

## Summary: Standard Deviation

**Key Points:**
- Standard deviation = √Variance
- Measures spread in **original units** (easier to interpret!)
- Higher std dev = more variability
- Lower std dev = more consistency
- Use `.std()` in Pandas

**Advantage over variance:** You can say "sales typically vary by about ±$4,200 from the mean" instead of "the variance is about 17.6 million squared dollars".

**Next:** Range - the simplest measure of spread!