# Class 16 Warm-up: Normal Distributions and Data Standardization

In [None]:
# All imports live in this first cell so a fresh "Restart & Run All" has every name in scope.
# The original order is kept on purpose: `from datascience import *` is a star import, so
# anything imported after it (plt, norm) cannot be shadowed by it.
import numpy as np
from datascience import *
import matplotlib.pyplot as plt
from scipy.stats import norm
%matplotlib inline

## Weight and Height Data
Let's load our table again.

In [None]:
# Read the weight/height CSV into a Table and preview it with show(5).
data_path = "./data/weight-height.csv"
population = Table.read_table(data_path)
population.show(5)

Let's just focus on height.

In [None]:
# Keep only the two columns this warm-up works with: the gender label and the height.
height_columns = ("Gender", "Height")
pop_ht = population.select(*height_columns)
pop_ht.show(5)

## Plot the distribution by gender for the full 10,000 samples

In [None]:
# Heights for everyone, and split out by gender.
all_heights = pop_ht.column("Height")
male_heights = pop_ht.where("Gender", "Male").column("Height")
female_heights = pop_ht.where("Gender", "Female").column("Height")

# 30 evenly spaced bin edges spanning the full height range; both groups share them
# so the two histograms are drawn on identical bins.
bins = np.linspace(np.min(all_heights), np.max(all_heights), num=30)

plt.figure(figsize=(8, 5))

# One semi-transparent histogram per gender, overlapped on the same axes.
for group_heights, group_color, group_label in (
    (male_heights, "steelblue", "Male"),
    (female_heights, "tomato", "Female"),
):
    plt.hist(group_heights, bins=bins, color=group_color, alpha=0.6,
             label=group_label, edgecolor="white")

plt.title("Height Distribution by Gender")
plt.xlabel("Height (inches)")
plt.ylabel("Count")
plt.legend()
plt.tight_layout()
plt.show()

## Let's look at just the men.

In [None]:
# Histogram of the male heights only, drawn on the same bin edges as the by-gender plot.
# (This cell does not use scipy's `norm`, so it no longer imports it.)
plt.figure(figsize=(8, 5))
plt.hist(male_heights, bins=bins, color="steelblue", alpha=0.6, label="Male", edgecolor="white")
plt.title("Height Distribution of Men")
plt.xlabel("Height (inches)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

## Instead of counts, normalize the histogram so the y-axis shows probability density (total area = 1)

In [None]:
plt.figure(figsize=(8, 5))

# density=True rescales the bar heights so that the total AREA of the bars
# (height x bin width, summed over all bins) equals 1. The histogram then approximates
# a probability density function; the bar heights themselves do not sum to 1.
plt.hist(male_heights, density=True, bins=bins, color="steelblue", alpha=0.6, label="Male", edgecolor="white")

plt.title("Height Distribution of Men as Fraction")
plt.xlabel("Height (inches)")
# The y-axis is a density (per inch), not a probability.
plt.ylabel("Probability density")
plt.tight_layout()
plt.show()

## If the data are normally distributed, the mean and standard deviation are all you need to characterize the PDF

In [None]:
# Summarize the male heights with the two parameters of a normal distribution.
male_ht_avg = np.mean(male_heights)
male_ht_std = np.std(male_heights)  # np.std defaults to ddof=0 (population standard deviation)

# Plot the normal distribution on top of the histogram.
# `norm` (imported from scipy.stats in the first cell) creates a normal distribution
# object where loc is the mean and scale is the standard deviation.
dist = norm(loc=male_ht_avg, scale=male_ht_std)

# Generate x values: a grid from 55 up to (not including) 80 in steps of 0.1
x = np.arange(55, 80, 0.1)

# Generate y values using the PDF of the distribution object we created
y = dist.pdf(x)

plt.figure(figsize=(8, 5))
plt.hist(male_heights, density=True, bins=bins, color="steelblue", alpha=0.6, label="Male", edgecolor="white")
plt.plot(x, y, color='k')
plt.title("Height Distribution of Men as Fraction of the Population")
plt.xlabel("Height (inches)")
# Both the normalized histogram and the PDF curve are densities, not probabilities.
plt.ylabel("Probability density")
plt.tight_layout()
plt.show()

## Challenge #1: How well do you think that this normal distribution reflects the true height distribution of the population?

Answer:

#### If the population is normally distributed, we can answer all sorts of questions directly from the probability density and cumulative distribution functions.

**For example, what fraction of the male population has a height between 70 and 75 inches?**

In [None]:
# Method 1: Direct Count
# Count the men whose height falls in the 70-75 inch band and divide by the number of men.
# NOTE(review): are.between presumably includes 70 and excludes 75 - confirm in the datascience docs.
male_hts = pop_ht.where("Gender", "Male")
between70_75 = male_hts.where("Height", are.between(70, 75))

n_in_band = between70_75.num_rows
n_men = male_hts.num_rows

fraction = n_in_band / n_men
fraction

In [None]:
# Method 2: Use the normal distribution's CDF (cumulative distribution function)

In [None]:
# CDF of the fitted normal evaluated at each edge of the band
p_below_upper = dist.cdf(75)
p_below_lower = dist.cdf(70)

# The fraction between 70 and 75 inches is the difference of the two CDF values.
fraction_norm = p_below_upper - p_below_lower
fraction_norm

In [None]:
# Shade the band between `lower` and `upper` inches on the PDF.
# Every label below is built from these two values, so changing the band here
# keeps the legend, annotation and title in sync with what is actually shaded.
lower, upper = 70, 75

plt.figure(figsize=(8, 5))
# Density histogram
plt.hist(male_heights, density=True, bins=bins, color="steelblue", alpha=0.6, edgecolor="white", label="Male (density)")

# Fitted normal curve (x and y come from the cell that built `dist`)
plt.plot(x, y, color='k', lw=2, label='Normal fit')

# Create x values inside the band and fill under the PDF
x_band = np.linspace(lower, upper, 200)
y_band = dist.pdf(x_band)
plt.fill_between(x_band, y_band, color='orange', alpha=0.35, label=f'{lower}–{upper} in band')

# Reference lines at both edges of the band
for edge in (lower, upper):
    plt.axvline(edge, color='orange', linestyle='--', alpha=0.8)

# Annotate with the fraction from the normal model: CDF(upper) - CDF(lower)
fraction_norm = dist.cdf(upper) - dist.cdf(lower)
plt.text(0.99, 0.95, f"P({lower} ≤ H ≤ {upper}) ≈ {fraction_norm:.3f}",
         transform=plt.gca().transAxes, ha='right', va='top')

plt.title(f"Male Height — Density, Normal Fit, and {lower}–{upper} in Band")
plt.xlabel("Height (inches)")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()


## Challenge 2: What fraction of women are between 5 ft 2 inches and 5 ft 5 inches tall?
You can use either of the two methods shown above.

## Standardizing Data

To standardize a data set, you simply subtract the average from each point and divide by the standard deviation. The standardized data will then have a mean of zero and a standard deviation of one.

$$ z = \frac{x - \bar{x}}{\sigma} $$

**Let's standardize the male and female heights separately and plot them on the same plot.**

In [None]:
# Start from the male rows and pull their heights out as an array.
male_hts = pop_ht.where("Gender", "Male")
men = male_hts.column("Height")

# Center (mean) and scale (standard deviation) of the male heights
men_avg = np.mean(men)
men_std = np.std(men)

# z = (x - mean) / std for every height, attached to the table as a new column
men_standardized = (men - men_avg) / men_std
male_hts = male_hts.with_columns("Male Height Standardized", men_standardized)

In [None]:
# Display the table to check that the "Male Height Standardized" column was added.
male_hts

In [None]:
# Histogram of the standardized male heights, requesting 20 bins.
male_hts.hist("Male Height Standardized", bins=20)

## Challenge 3: 
Explain in words what the units are on the x-axis of this plot.

## Challenge 4: 
Plot the standardized height for the women. How does this distribution compare with that of the men?

Answer: