# The Original Regression Example

In [None]:
# Import Numpy and Datascience modules.
import numpy as np
from datascience import *

# Plotting 
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the family heights data set. `read_table` is a class method, so it is
# called on `Table` directly instead of on a throwaway empty `Table()` instance.
heights = Table.read_table("data/family_heights.csv")
heights

On average, the fathers were taller than the mothers, as the histograms below show.

In [None]:
# Histogram of the fathers' and mothers' heights, using 20 bins.
(
    heights
    .select("father", "mother")
    .hist(bins=20)
)

Galton multiplied the mother's height by a factor of 1.08 and averaged it with the father's height to calculate the "midparent height."

So the prediction of your height would be based on your midparent height:

$$ \text{midparent height} = \frac{\text{dad} + 1.08 \times \text{mom}}{2} $$

In [None]:
# Add a column holding each mother's height multiplied by the 1.08 factor,
# then compare it with the fathers' heights.
heights_adjusted = heights.with_column(
    "mother_adjusted",
    1.08 * heights.column("mother"),
)
(
    heights_adjusted
    .select("father", "mother_adjusted")
    .hist(bins=20)
)

In [None]:
# Scatter plot with midparentHeight on the horizontal axis and childHeight
# on the vertical axis.
heights.scatter(
    "midparentHeight",
    "childHeight",
)

In [None]:
def standard_units(xyz):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of ``xyz``.

    Parameters
    ----------
    xyz : array-like of numbers
        A NumPy array or a plain Python sequence such as a list.

    Returns
    -------
    array
        ``(xyz - mean(xyz)) / std(xyz)``, one entry per input value.
        ``np.std`` is called with its default ``ddof=0`` (population SD).
        If all values are equal the SD is 0 and the result is ``nan``
        (NumPy emits a warning rather than raising).
    """
    # np.subtract accepts any array-like, so plain lists work as well as
    # arrays; for a NumPy array it is the same operation as ``xyz - mean``.
    deviations = np.subtract(xyz, np.mean(xyz))
    return deviations / np.std(xyz)

In [None]:
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of ``t``.

    r is the mean of the element-wise product of the two columns after each
    has been converted to standard units.

    t        -- table exposing ``column(label)``
    label_x  -- label of the first column
    label_y  -- label of the second column
    """
    x_in_su = standard_units(t.column(label_x))
    y_in_su = standard_units(t.column(label_y))
    return np.mean(x_in_su * y_in_su)

def slope(t, label_x, label_y):
    """Return the slope of the regression line of ``label_y`` on ``label_x``.

    Computed as ``r * SD(y) / SD(x)``, where r is the correlation between
    the two columns of ``t``.
    """
    sd_x = np.std(t.column(label_x))
    sd_y = np.std(t.column(label_y))
    return correlation(t, label_x, label_y) * sd_y / sd_x

def intercept(t, label_x, label_y):
    """Return the intercept of the regression line of ``label_y`` on ``label_x``.

    Computed as ``mean(y) - slope * mean(x)``, so the line passes through
    the point of averages.
    """
    mean_x = np.mean(t.column(label_x))
    mean_y = np.mean(t.column(label_y))
    return mean_y - slope(t, label_x, label_y) * mean_x

In [None]:
def fit(table, x, y):
    """Return the regression-line height at every value in column ``x``.

    table -- table exposing ``column(label)``
    x     -- label of the predictor column
    y     -- label of the response column

    The result is ``slope * table.column(x) + intercept``, one value per row.
    """
    line_slope = slope(table, x, y)
    line_intercept = intercept(table, x, y)
    return line_slope * table.column(x) + line_intercept

In [None]:
# Attach the regression estimate for every row as a new 'Fitted' column.
heights_fitted = heights.with_column(
    'Fitted',
    fit(heights, 'midparentHeight', 'childHeight'),
)
heights_fitted

In [None]:
# Plot both childHeight and Fitted against midparentHeight.
(
    heights_fitted
    .select("midparentHeight", "childHeight", "Fitted")
    .scatter("midparentHeight")
)

In [None]:
# Same scatter plot, this time asking the library to draw its own fit line.
(
    heights
    .select("midparentHeight", "childHeight")
    .scatter("midparentHeight", fit_line=True)
)

In [None]:
# Correlation between midparent height and child height over the whole table.
r = correlation(
    heights,
    label_x="midparentHeight",
    label_y="childHeight",
)
r

In [None]:
# Square of the correlation r computed above; for a straight-line fit this is
# the proportion of the variance in child height accounted for by the line.
r**2

## Boys only

In [None]:
# Keep only the rows whose "sex" column equals "male", and preview three of them.
boys = heights.where(
    "sex",
    are.equal_to("male"),
)
boys.show(3)

In [None]:
# Regression estimates computed from the boys' rows only.
boys_fitted = boys.with_column(
    'Fitted',
    fit(boys, 'midparentHeight', 'childHeight'),
)
boys_fitted.show(3)

In [None]:
# Scatter plot of the boys' heights against midparent height, with a fit line.
(
    boys
    .select("midparentHeight", "childHeight")
    .scatter("midparentHeight", fit_line=True)
)

In [None]:
# Correlation between midparent height and child height for the boys only.
# Note: this rebinds the notebook-level name `r`.
r = correlation(
    boys,
    label_x="midparentHeight",
    label_y="childHeight",
)
r

In [None]:
# Square of the boys-only correlation r computed above: the proportion of the
# variance in the boys' heights accounted for by the straight-line fit.
r**2

In [None]:
# Repeat the analysis using only the rows whose "sex" column equals "female".
girls = heights.where(
    "sex",
    are.equal_to("female"),
)
girls_fitted = girls.with_column(
    'Fitted',
    fit(girls, 'midparentHeight', 'childHeight'),
)
(
    girls
    .select("midparentHeight", "childHeight")
    .scatter("midparentHeight", fit_line=True)
)

In [None]:
# Correlation between midparent height and child height for the girls only.
# Note: this rebinds the notebook-level name `r`.
r = correlation(
    girls,
    label_x="midparentHeight",
    label_y="childHeight",
)
r

In [None]:
# Square of the girls-only correlation r computed above: the proportion of the
# variance in the girls' heights accounted for by the straight-line fit.
r**2

## Try the prediction on yourself!

In [None]:
# If you are male

# Replace these with your own parents' heights, in inches.
moms_height_inches = 65.5
dads_height_inches = 70

# Midparent height: the mother's height is scaled by 1.08 before averaging.
midparent = (dads_height_inches + 1.08 * moms_height_inches) / 2

# Slope and intercept of the regression line fitted to the `boys` table.
m = slope(boys, "midparentHeight", "childHeight")
b = intercept(boys, "midparentHeight", "childHeight")

prediction = m * midparent + b

# Round to the printed precision BEFORE splitting into feet and inches, so a
# value such as 71.96 is reported as "6ft 0.0 in" instead of "5ft 12.0 in".
feet, inches = divmod(round(float(prediction), 1), 12)

print(f"Your midparent height is {midparent:.1f} inches.")
print(f"Your predicted height is {prediction:.1f} inches.")
print(f"Equivalently, your predicted height is {int(feet)}ft {inches:.1f} in.")

In [None]:
# If you are female

# Replace these with your own parents' heights, in inches.
moms_height_inches = 65.5
dads_height_inches = 70

# Midparent height: the mother's height is scaled by 1.08 before averaging.
midparent = (dads_height_inches + 1.08 * moms_height_inches) / 2

# Slope and intercept of the regression line fitted to the `girls` table.
m = slope(girls, "midparentHeight", "childHeight")
b = intercept(girls, "midparentHeight", "childHeight")

prediction = m * midparent + b

# Round to the printed precision BEFORE splitting into feet and inches, so a
# value such as 59.96 is reported as "5ft 0.0 in" instead of "4ft 12.0 in".
feet, inches = divmod(round(float(prediction), 1), 12)

print(f"Your midparent height is {midparent:.1f} inches.")
print(f"Your predicted height is {prediction:.1f} inches.")
print(f"Equivalently, your predicted height is {int(feet)}ft {inches:.1f} in.")

## History
"In 1886, Galton published a paper called Regression towards mediocrity in hereditary stature.[3] In the paper, he observed that extreme characteristics (e.g., height) in parents are not passed on completely to their offspring. Rather, the characteristics in the offspring regress towards a mediocre point. Today, this point is called the mean. By measuring the heights of hundreds of people, he was able to quantify regression to the mean, and estimate the size of the effect. Galton wrote that, “the average regression of the offspring is a constant fraction of their respective mid-parental deviations”. This means that the difference between a child and its parents for some characteristic is proportional to its parents' deviation from typical people in the population. If its parents are each two inches taller than the averages for men and women, on average, the child will be shorter than its parents by some factor times two inches. Today, this factor has been calculated to be one minus the regression coefficient. For height, Galton estimated this coefficient to be about two thirds: the height of an individual will measure around a midpoint that is two thirds of the parents’ deviation from the population average." --[Wikipedia](https://simple.wikipedia.org/wiki/Regression_toward_the_mean#:~:text=Regression%20toward%20the%20mean%20simply,parents%20tended%20to%20be%20taller.)

![Regression](data/midparent.png)