In [1]:
import pandas as pd

In [2]:
# Load the sales CSV into a DataFrame; the path is relative to the working directory.
toyota_sales = pd.read_csv(filepath_or_buffer="data/car_sales/toyota_sales_data.csv")

# Standard Deviation - Variance in Original Units

**Standard deviation = √Variance**

## Why Standard Deviation?
- Same concept as variance (measures spread)
- But in **original units** (dollars, not squared dollars!)
- Much easier to interpret

- **Low std dev:** Data clustered near the mean
- **High std dev:** Data spread far from the mean

In [3]:
# Pull out the sale_amount column; variance and standard deviation are
# both computed from this Series in the following cells.
sale_amounts = toyota_sales.loc[:, "sale_amount"]

In [4]:
# Sample variance (pandas default ddof=1, spelled out here); units are squared dollars.
variance = sale_amounts.var(ddof=1)

In [5]:
# Sample standard deviation (pandas default ddof=1, spelled out here); units are dollars.
std_dev = sale_amounts.std(ddof=1)


In [6]:
# Show both measures side by side, one per line.
print(
    f"Variance: ${variance:,.2f}",
    f"Standard Deviation: ${std_dev:,.2f}",
    sep="\n",
)

Variance: $55,060,868.24
Standard Deviation: $7,420.30


In [7]:
# Confirm numerically that the square root of the variance equals the std dev.
variance_root = variance ** 0.5
print("\nRelationship check:")
print(f"√Variance = ${variance_root:,.2f}")
print(f"Std Dev = ${std_dev:,.2f}")


Relationship check:
√Variance = $7,420.30
Std Dev = $7,420.30


## Standard Deviation in Our Data

Let's see how spread differs across car models.

In [8]:
# Per-model summary of sale_amount: mean, variance, standard deviation and row count,
# each rounded to two decimals.
summary_columns = [
    ('Mean', 'mean'),
    ('Variance', 'var'),
    ('Std_Dev', 'std'),
    ('Count', 'count'),
]
std_by_model = (
    toyota_sales
    .groupby('car_model')['sale_amount']
    .agg(summary_columns)
    .round(decimals=2)
)

# Display with the widest-spread models first.
std_by_model.sort_values(by='Std_Dev', ascending=False)

Unnamed: 0_level_0,Mean,Variance,Std_Dev,Count
car_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tundra,42478.58,17944408.01,4236.08,817
Highlander,40002.87,8481919.85,2912.37,814
Tacoma,34998.46,8448002.73,2906.54,826
RAV4,30985.72,5551941.84,2356.26,860
Corolla,22442.18,2132979.07,1460.47,827
Camry,27470.87,2047449.6,1430.89,856


In [9]:
# Summarise the Tundra's spread as text: mean, std dev, and the band
# one std dev either side of the mean.
print("=== TUNDRA ===")
tundra = toyota_sales.loc[toyota_sales['car_model'] == 'Tundra', 'sale_amount']
tundra_mean = tundra.mean()
tundra_std = tundra.std()
print(f"Mean: ${tundra_mean:,.2f}")
print(f"Std Dev: ${tundra_std:,.2f}")
print(f"Typical range: ${tundra_mean - tundra_std:,.2f} to ${tundra_mean + tundra_std:,.2f}")


=== TUNDRA ===
Mean: $42,478.58
Std Dev: $4,236.08
Typical range: $38,242.50 to $46,714.66


In [10]:
# Same text summary for the Camry: mean, std dev, and the mean ± 1 std dev band.
print("\n=== CAMRY ===")
camry = toyota_sales.loc[toyota_sales['car_model'] == 'Camry', 'sale_amount']
camry_mean = camry.mean()
camry_std = camry.std()
print(f"Mean: ${camry_mean:,.2f}")
print(f"Std Dev: ${camry_std:,.2f}")
print(f"Typical range: ${camry_mean - camry_std:,.2f} to ${camry_mean + camry_std:,.2f}")


=== CAMRY ===
Mean: $27,470.87
Std Dev: $1,430.89
Typical range: $26,039.98 to $28,901.77


## The 68-95-99.7 Rule (For Normal Distributions)

For data that's roughly bell-shaped:
- **68%** of data falls within 1 std dev of the mean
- **95%** of data falls within 2 std devs of the mean
- **99.7%** of data falls within 3 std devs of the mean

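This is a helpful guideline for understanding spread, but it only holds when the data really is bell-shaped. If the share you measure within 1 std dev is far from 68%, the distribution is probably not normal. In the checks below, only about 57% of Camry and Corolla sales fall within 1 std dev, which suggests these prices are spread more evenly than a bell curve (a perfectly uniform distribution gives about 58%).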

In [11]:
# 68% rule check for Camry, step 1: isolate the Camry sale amounts and compute
# the mean and standard deviation that the following cells read as `mean` and `std`.
camry = toyota_sales.loc[toyota_sales['car_model'] == 'Camry', 'sale_amount']
mean, std = camry.mean(), camry.std()

In [12]:
# 68% rule check, step 2: how many Camry sales lie in [mean - std, mean + std]?
# Series.between is inclusive on both ends by default, matching the >= / <= test.
within_1std = camry.between(mean - std, mean + std).sum()
total = camry.size
percentage = within_1std / total * 100

In [13]:
# 68% rule check, step 3: report the Camry band and the share of sales inside it.
lower, upper = mean - std, mean + std
print(
    f"Camry Mean: ${mean:,.2f}",
    f"Camry Std Dev: ${std:,.2f}",
    f"Range: ${lower:,.2f} to ${upper:,.2f}",
    f"\nSales within 1 std dev: {within_1std} out of {total}",
    f"Percentage: {percentage:.1f}%",
    sep="\n",
)

Camry Mean: $27,470.87
Camry Std Dev: $1,430.89
Range: $26,039.98 to $28,901.77

Sales within 1 std dev: 490 out of 856
Percentage: 57.2%


In [14]:
# Check the 68% rule for Corolla
# Select the sale_amount values of the Corolla rows, then compute their mean
# and standard deviation.
# NOTE: `mean` and `std` are reassigned here (they previously held the Camry
# values), so the cells that follow must be run after this one.
corolla = toyota_sales.loc[toyota_sales['car_model'] == 'Corolla', 'sale_amount']
mean = corolla.mean()
std = corolla.std()

In [15]:
# How many Corolla sales lie in [mean - std, mean + std]?
# Series.between is inclusive on both ends by default, matching the >= / <= test.
within_1std = corolla.between(mean - std, mean + std).sum()
total = corolla.size
percentage = within_1std / total * 100

In [16]:
# Report the Corolla band and the share of sales that fall inside it.
lower, upper = mean - std, mean + std
print(
    f"Corolla Mean: ${mean:,.2f}",
    f"Corolla Std Dev: ${std:,.2f}",
    f"Range: ${lower:,.2f} to ${upper:,.2f}",
    f"\nSales within 1 std dev: {within_1std} out of {total}",
    f"Percentage: {percentage:.1f}%",
    sep="\n",
)

Corolla Mean: $22,442.18
Corolla Std Dev: $1,460.47
Range: $20,981.70 to $23,902.65

Sales within 1 std dev: 471 out of 827
Percentage: 57.0%


In [17]:
# Compare all models side by side
# Keep the full-precision statistics so the derived ratio below is not
# computed from numbers that were already rounded.
model_stats = toyota_sales.groupby('car_model')['sale_amount'].agg([
    ('Mean', 'mean'),
    ('Std_Dev', 'std')
])

# Display table: mean and std dev rounded to whole dollars.
comparison = model_stats.round(0)

# Std dev as a percentage of the mean (coefficient of variation), derived from
# the unrounded values and only then rounded to one decimal place.
comparison['Variability_%'] = (
    model_stats['Std_Dev'] / model_stats['Mean'] * 100
).round(1)

# Largest absolute spread first.
comparison.sort_values('Std_Dev', ascending=False)

Unnamed: 0_level_0,Mean,Std_Dev,Variability_%
car_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tundra,42479.0,4236.0,10.0
Highlander,40003.0,2912.0,7.3
Tacoma,34998.0,2907.0,8.3
RAV4,30986.0,2356.0,7.6
Corolla,22442.0,1460.0,6.5
Camry,27471.0,1431.0,5.2


## Summary: Standard Deviation

**Key Points:**
- Standard deviation = √Variance
- Measures spread in **original units** (easier to interpret!)
- Higher std dev = more variability
- Lower std dev = more consistency
- Use `.std()` in Pandas

**Advantage over variance:** You can say "Tundra sales typically vary by about ±$4,200 around the mean" instead of "the variance is about 18 million squared dollars"

**Next:** Range - the simplest measure of spread!