In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Bootstrapping for Linear Regression
**Cricket Thermometer**
For this example, we return to the idea of using the frequency of cricket chirps to predict temperature. But first, we will define some helper functions.

## Helper functions
You should understand what each of these functions does.

In [None]:
def standard_units(any_numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its distance from the mean of the array,
    measured in multiples of the array's standard deviation (``np.std``).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def correlation(t, label_x, label_y):
    """Return the correlation coefficient between two columns of table ``t``.

    Both columns are converted to standard units; the result is the mean of
    the element-wise products of the two standardized columns.
    """
    x_standard = standard_units(t.column(label_x))
    y_standard = standard_units(t.column(label_y))
    return np.mean(x_standard * y_standard)

def slope(t, label_x, label_y):
    """Return the slope of the regression line predicting ``label_y`` from ``label_x``.

    The slope is the correlation of the two columns multiplied by the
    standard deviation of the y column and divided by the standard deviation
    of the x column.
    """
    r = correlation(t, label_x, label_y)
    sd_y = np.std(t.column(label_y))
    sd_x = np.std(t.column(label_x))
    return r * sd_y / sd_x

def intercept(t, label_x, label_y):
    """Return the intercept of the regression line predicting ``label_y`` from ``label_x``.

    Built from the means of the two columns and the slope given by ``slope()``.
    """
    mean_y = np.mean(t.column(label_y))
    fitted_slope = slope(t, label_x, label_y)
    mean_x = np.mean(t.column(label_x))
    return mean_y - fitted_slope * mean_x

## Student Challenge 1
Explain what the **intercept()** function does.

## Load the data and fit a line

In [None]:
# Read the cricket measurements from the CSV file into a datascience Table.
cricket = Table.read_table("./data/cricket_thermometer.csv")
# Ending the cell with the bare name displays the table.
cricket

In [None]:
# Column names used as the predictor (x) and the response (y) in the cells that follow.
label_x = "Chirps_per_sec"
label_y = "Temperature_deg_F"
# Scatter plot of temperature against chirp rate; fit_line=True asks for a fitted line on the plot.
cricket.scatter(label_x, label_y, fit_line=True)

In [None]:
# Slope and intercept of the regression line, computed with the helper functions defined above.
m = slope(cricket, label_x, label_y) 
b = intercept(cricket, label_x, label_y)

# Report the fitted coefficients.
print(f"The slope of the best fit line is: {m}")
print(f"The intercept is {b}")

## Student Challenge 2
Use the equation of the best-fit line to predict the temperature associated with 19 chirps/second.

## Student Challenge 3
Extrapolation is risky. Explain why you would not use this same fit to predict the temperature associated with 40 chirps/second.

## Bootstrapping to find the confidence interval for the slope of the best-fit line.
We fit a line to our data, but our data is only a sample. We know that the slope of the line fit to the full population of all cricket measurements on all days is probably different, but how can we find the likely range of slopes?

**Bootstrapping!**

We create many "new" samples by resampling our existing data with replacement, and fit a line to each of these bootstrap samples to build up a distribution of slopes.

In [None]:
# Number of bootstrap resamples; more resamples give a smoother histogram
# at the cost of a longer run time.
num_resamples = 5000

# Fit a line to each bootstrap resample of the data and keep its slope.
# The array is built once from a list rather than grown with np.append,
# which copies the entire array on every call.
# No random seed is set, so the histogram differs slightly from run to run.
slopes = np.array([
    slope(cricket.sample(), label_x, label_y)
    for _ in range(num_resamples)
])

# Histogram of the bootstrap slopes, with the slope fitted to the original
# sample (m) marked on the horizontal axis for comparison.
Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
plt.plot(m, 0, marker="^", c="yellow", markersize=30)
plt.title("Best fit slope (yellow triangle) and distribution of possible slopes.");

In [None]:
# Hint: the 2.5th percentile of the bootstrap slopes marks the left end
# of the middle 95% of the distribution.
left = percentile(2.5, slopes)
left

## Student Challenge 4
Given the hint above, what is the 95% confidence interval for the slope of the best-fit line?

## Final Visualization
You have been learning to fit lines from scratch, but there are many powerful Python libraries that will fit lines and confidence intervals automatically. One of these is "seaborn," which, amusingly, was named after Sam Seaborn, the Deputy White House Communications Director and a speechwriter in the fictional TV presidential drama "The West Wing."

Here is a plot that shows you the range of lines that, based on our sample, encompasses the true best-fit line with 95% confidence.

In [None]:
import seaborn as sns

# Convert the datascience Table to a pandas DataFrame for seaborn
cricket_df = cricket.to_df()

# Create the figure and axes explicitly so every call below targets the same axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.regplot(
    data=cricket_df,
    x=label_x,       # same column names used for the earlier fit
    y=label_y,
    ci=95,           # show 95% confidence interval
    scatter_kws={"s": 40, "alpha": 0.8},
    line_kws={"color": "crimson", "linewidth": 2},
    ax=ax,
)
ax.set_title("Cricket Chirps vs Temperature with 95% CI (Seaborn)")
ax.set_xlabel("Chirps per second")
# The degree sign is written as an escape so a file-encoding mix-up cannot garble it.
ax.set_ylabel("Temperature (\N{DEGREE SIGN}F)")
fig.tight_layout()
plt.show()

## Student Challenge 5
How would the 95% confidence interval (pink area) change if we had more data?