In [None]:
## import statements
# These lines load the tests. 
from gofer.ok import check
import numpy as np
from datascience import *
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Fix for datascience plots

### Normal Distribution

$$
\phi(z) = {\frac{1}{\sqrt{2 \pi}}} e^{-\frac{1}{2}z^2}, ~~ -\infty < z < \infty
$$

In [None]:
# The standard normal curve (standardized data).
# plot_normal_cdf plots a normal curve with the specified parameters and shades
# the area below the curve between lbound and rbound.
# Signature: datascience.util.plot_normal_cdf(rbound=None, lbound=None, mean=0, sd=1)

plot_normal_cdf()

In [None]:
# Same curve, centered at mean=5; sd is not passed, so it keeps its default.
plot_normal_cdf(mean=5)

In [None]:
# Centered at 5 with sd=0.5: a smaller SD gives a narrower, taller curve.
plot_normal_cdf(mean=5, sd=0.5)

In [None]:
# rbound=1 puts the right edge of the shaded region at z = 1.
plot_normal_cdf(rbound=1)

In [None]:
from scipy import stats

In [None]:
# Cumulative area under the standard normal curve up to z = 1, i.e. P(Z <= 1).
stats.norm.cdf(1)

In [None]:
# lbound=1 puts the left edge of the shaded region at z = 1.
plot_normal_cdf(lbound=1)

In [None]:
# The total area under the curve is 1, so the area to the right of z = 1
# is 1 minus the cumulative area to its left.
1 - stats.norm.cdf(1)

In [None]:
# Shade the area under the standard normal curve between z = -1 and z = 1.

plot_normal_cdf(rbound=1, lbound=-1)

In [None]:
# Roughly 68% of the area lies within plus or minus one standard deviation:
# the cumulative area up to z = 1 minus the cumulative area up to z = -1.
stats.norm.cdf(1) - stats.norm.cdf(-1)

In [None]:
# Shade the area under the standard normal curve from z = -2 to z = 2.

plot_normal_cdf(lbound=-2, rbound=2)

In [None]:
# Roughly 95% of the area lies within plus or minus two standard deviations:
# the cumulative area up to z = 2 minus the cumulative area up to z = -2.
stats.norm.cdf(2) - stats.norm.cdf(-2)

### Correlation

In [None]:
# Small worked example: x runs 1 through 6, y is a hand-picked response.
x = np.arange(1, 7)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns('x', x, 'y', y)
t

In [None]:
# Scatter plot of y against x, addressing the columns by label.
t.scatter('x', 'y', s=30, color='red')

In [None]:
def standard_units(numbers_array):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its distance from the array's mean,
    measured in multiples of the array's standard deviation.

    Parameters:
        numbers_array: array of numbers to standardize.

    Returns:
        An array of the same length holding (value - mean) / SD, where SD
        is np.std with its default settings (the population SD).
    """
    center = np.mean(numbers_array)
    spread = np.std(numbers_array)
    return (numbers_array - center) / spread

In [None]:
# Append standard-unit versions of x and y next to the original columns.
t_su = (
    t.with_column('x (standard units)', standard_units(x))
     .with_column('y (standard units)', standard_units(y))
)
t_su

In [None]:
# Row-by-row product of the two standard-unit columns, looked up by label.
t_product = t_su.with_column(
    'product of standard units',
    t_su.column('x (standard units)') * t_su.column('y (standard units)'),
)
t_product

In [None]:
# The correlation coefficient r is the mean of the products of standard units.

r = t_product.column('product of standard units').mean()
r

### Plot the regression line with data in standard units

In [None]:
# In standard units the regression prediction is r times x.
t_predict = t_product.with_column(
    'y predict standard units',
    r * t_product.column('x (standard units)'),
)
t_predict

In [None]:
# Scatter the data in standard units, then grab the axes the scatter drew on
# and overlay the predicted values as a line on the same plot.
t_predict.scatter("x (standard units)", "y (standard units)")
ax=plt.gca()
ax.plot(t_predict.column("x (standard units)"), t_predict.column("y predict standard units"))

### Regression line in the original units

In [None]:
# Convert the standard-units line back to the original units:
#   slope     = r * SD(y) / SD(x)
#   intercept = mean(y) - slope * mean(x)
slope = r * t_predict.column("y").std() / t_predict.column("x").std()
intercept = t_predict.column("y").mean() - slope * t_predict.column("x").mean()
print("slope = ", slope)
print("intercept = ", intercept)

# Predicted y for every x, using the line in original units.
t_predict = t_predict.with_column('y predict', slope * t_predict.column("x") + intercept)
t_predict

In [None]:
# Scatter the data in original units, then overlay the predicted values as a
# line on the axes that the scatter just drew on.
t_predict.scatter("x", "y")
ax=plt.gca()
ax.plot(t_predict.column("x"), t_predict.column("y predict"))

#### What if the points lie exactly on a line?

In [None]:
# Points that sit exactly on the line y = 2x + 1.
x = np.arange(1, 7)
y = 2 * x + 1
t = Table().with_columns('x', x, 'y', y)
t

In [None]:
# Scatter plot of y against x, addressing the columns by label.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Append standard-unit versions of x and y next to the original columns.
t_su = (
    t.with_column('x (standard units)', standard_units(x))
     .with_column('y (standard units)', standard_units(y))
)
t_su

In [None]:
# Row-by-row product of the two standard-unit columns, looked up by label.
t_product = t_su.with_column(
    'product of standard units',
    t_su.column('x (standard units)') * t_su.column('y (standard units)'),
)
t_product

In [None]:
# r is the mean of the products of standard units. For points lying exactly
# on an upward-sloping line it comes out as 1 (up to floating-point rounding).

r = t_product.column('product of standard units').mean()
r

## Old Faithful - Lab 8

In [None]:
# Load the Old Faithful data set. The path is relative, so the Lab08 folder
# must sit two directory levels above the notebook's working directory.
faithful = Table.read_table("../../Lab08/faithful-new.csv")

In [None]:
# Per-column summary statistics for the table.
faithful.stats() 

In [None]:
# Scatter plot of wait against duration, drawn on an explicit figure/axes pair.
# (The Table shortcut for the same picture is faithful.scatter('duration', 'wait').)
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.scatter(faithful.column('duration'), faithful.column('wait'))
ax.set_title("Old Faithful")
ax.set_xlabel("duration")
ax.set_ylabel("wait")
fig.savefig("scatter.png")  # save to disk before the figure is shown
plt.show()

***
Standard Units
***
$\bar{x}$=mean$(x)$<br>
<br>variance=mean$((x-\bar{x})^2)$<br>
<br>SD = $\sqrt{\mathrm{mean}((x-\bar{x})^2)}$<br>
<br>Z = ${\frac{(x-\bar{x})}{SD}}$

In [None]:
# Mean and SD of each column (the array .std() method is the population SD,
# the same quantity np.std returns by default).
duration_mean = faithful.column("duration").mean()
duration_std = faithful.column("duration").std()
wait_mean = faithful.column("wait").mean()
wait_std = faithful.column("wait").std()

# Z = (x - mean) / SD, applied to each column in turn.
faithful_standard = (
    Table()
    .with_column("duration (standard units)", (faithful.column("duration") - duration_mean) / duration_std)
    .with_column("wait (standard units)", (faithful.column("wait") - wait_mean) / wait_std)
)
faithful_standard

In [None]:
# Same scatter as before, but with both variables in standard units.
faithful_standard.scatter("duration (standard units)", "wait (standard units)")

In [None]:
# Histogram of the wait column in standard units.
faithful_standard.hist("wait (standard units)")

***
Regression line
***
$$y = m x + b $$
$$ slope = m $$
$$ intercept = b $$
In standard units the intercept is 0, so the line reduces to
$$y = m x$$


When the data are in standard units, the slope of the best-fit line is just r.

In [None]:
# r is the mean of the products of the two standard-unit columns.
x = faithful_standard.column("duration (standard units)")
y = faithful_standard.column("wait (standard units)")
r = (x * y).mean()
r

In [None]:
# Points along the regression line in standard units (y = r * x).
# np.linspace includes both endpoints, so the line spans the full range of x;
# np.arange with a float step stops short of max(x), and NumPy's documentation
# recommends linspace for non-integer steps.
xp = np.linspace(np.min(x), np.max(x), 200)
yp = r * xp

In [None]:
# Data in standard units with the regression line y = r * x drawn in black.
# The trailing semicolon suppresses the text repr of the plot call.
plt.scatter(x, y)
plt.plot(xp, yp, color="black");

In [None]:
# Report the square of r, formatted to two decimal places.
print(f"The r-squared value is: {r**2:.2f}")

In [None]:
# Let the Table scatter method draw its own fit line (fit_line=True) for
# comparison with the line plotted by hand from r.
faithful_standard.scatter("duration (standard units)", "wait (standard units)", fit_line=True)

In [None]:
# Scatter in original units with the Table-drawn fit line.
# NOTE(review): here "wait" is on the x-axis and "duration" on the y-axis, the
# reverse of the standard-units plots (duration on x, wait on y) — confirm the
# swap is intentional.
faithful.scatter("wait", "duration", fit_line=True)