## Bootstrap Slope Confidence Interval Examples
Class 023

In [None]:
## import statements
# These lines load the tests. 
from gofer.ok import check
import numpy as np
from datascience import *
import pandas as pd
import matplotlib
from matplotlib import patches
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore', FutureWarning)
plt.style.use('ggplot')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

In [None]:
def standard_units(xyz):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its distance from the mean of ``xyz``,
    measured in standard deviations as computed by ``np.std``.
    """
    center = np.mean(xyz)
    spread = np.std(xyz)
    return (xyz - center) / spread
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table ``t``.

    r is the mean of the element-wise products of the two columns after each
    has been converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)
# Regression
def slope(t, label_x, label_y):
    """Return the slope of the regression line of ``label_y`` on ``label_x``.

    Computed as r * SD_y / SD_x, where r is the correlation of the two columns.
    """
    sd_x = np.std(t.column(label_x))
    sd_y = np.std(t.column(label_y))
    return correlation(t, label_x, label_y) * sd_y / sd_x
def intercept(t, label_x, label_y):
    """Return the intercept of the regression line of ``label_y`` on ``label_x``.

    Computed as mean(y) - slope * mean(x).
    """
    mean_x = np.mean(t.column(label_x))
    mean_y = np.mean(t.column(label_y))
    return mean_y - slope(t, label_x, label_y) * mean_x

In [None]:
# Load the eruption data from CSV and draw a scatter plot of the
# 'wait' column against the 'duration' column.
faithful = Table.read_table("../Lab08/faithful-new.csv")
faithful.scatter('duration','wait')

In [None]:
# NOTE(review): this cell repeats the previous one verbatim -- it re-reads the
# same CSV into `faithful` and redraws the same scatter plot.
faithful = Table.read_table("../Lab08/faithful-new.csv")
faithful.scatter('duration','wait')

#### Slope
***

slope of the regression line $= r \cdot \frac{SD_y}{SD_x}$<br>
intercept of the regression line $= \bar{y} - \text{slope} \cdot \bar{x}$<br><br>
***
equation of the regression line:<br> $\hat{y} = \text{slope} \cdot x + \text{intercept}$<br>
error in fit:<br> $\text{error} = y - \hat{y} = y - (\text{slope} \cdot x + \text{intercept})$<br>

***

In [None]:
# Mean and SD of each column; these are reused later for the slope and intercept.
duration_mean = np.mean(faithful.column("duration"))
duration_std = np.std(faithful.column("duration"))
wait_mean = np.mean(faithful.column("wait"))
wait_std = np.std(faithful.column("wait"))

# Both columns re-expressed in standard units: (value - mean) / SD.
faithful_standard = Table().with_columns(
    "duration (standard units)", standard_units(faithful.column("duration")),
    "wait (standard units)", standard_units(faithful.column("wait")),
)

In [None]:
# Correlation coefficient: the mean of the products of the standardized values.
r = np.mean(
    faithful_standard.column("duration (standard units)")
    * faithful_standard.column("wait (standard units)")
)
r

In [None]:
# Display the table loaded from the CSV file.
faithful

In [None]:
# Slope of the regression line in original units: r * SD_y / SD_x.
# NOTE(review): this rebinds the global name `slope`, which an earlier cell
# defined as the function slope(t, label_x, label_y); from here on `slope`
# is a number and the function is no longer reachable under that name.
slope = r * wait_std/duration_std
print("Slope; ",slope)
print("For every 1 minute longer eruption duration we wait about ",round(slope,1)," minutes longer")

In [None]:
# Intercept of the regression line: mean(y) - slope * mean(x).
# NOTE(review): this rebinds the global name `intercept`, which an earlier cell
# defined as the function intercept(t, label_x, label_y); from here on it is a number.
intercept = wait_mean - slope * duration_mean
intercept

In [None]:
# Residuals of the fitted line: error = y - y_predicted.
# x is read from column 0 and y from column 1 of the table; `slope` and
# `intercept` are the numeric values computed in the cells above.
tbl = faithful
# Work on whole columns at once instead of fetching tbl.row(i) for every row
# inside a Python loop; the arithmetic per element is unchanged.
predicted_y = tbl.column(0) * slope + intercept
# Kept as a list so the name still describes the object it holds.
error_list = list(tbl.column(1) - predicted_y)

In [None]:
# Residual plot: the fit error of each row against its 'duration' value.
plt.scatter(faithful.column('duration'), error_list)
plt.show()

In [None]:
def fit_line(tbl):
    """Fit a straight line to the first two columns of ``tbl``.

    Column 0 is used as x and column 1 as y. The slope and intercept are
    found by passing the mean squared error of the line to ``minimize``.

    Returns an array ``[slope, intercept]``.
    """
    x, y = tbl.column(0), tbl.column(1)

    def mean_squared_error(any_slope, any_intercept):
        # Mean of the squared vertical distances between the data and the line.
        residuals = y - (any_slope * x + any_intercept)
        return np.mean(residuals ** 2)

    best = minimize(mean_squared_error)
    return make_array(best[0], best[1])

In [None]:
# Slope and intercept fitted on the full table.
fit_line(faithful)

In [None]:
# Slope and intercept fitted on one resample of the table. Table.sample() with
# no arguments draws as many rows as the table has, with replacement, so the
# result changes from run to run.
fit_line(faithful.sample())



### Bootstrap 95% Confidence Interval

In [None]:
# Bootstrap the regression slope: refit the line on 1000 resamples of the table.
# Table.sample() with no arguments draws num_rows rows WITH replacement, which
# is what the bootstrap requires. The slopes are collected in one pass rather
# than by growing an array with np.append, which copies it on every iteration.
bootstrap_slope = np.array(
    [fit_line(faithful.sample())[0] for _ in range(1000)]
)

# The middle 95% of the bootstrap slopes gives the confidence interval.
lower_end = percentile(2.5, bootstrap_slope)
upper_end = percentile(97.5, bootstrap_slope)
Table().with_column("slope estimate", bootstrap_slope).hist(bins=np.arange(9, 13, .1), unit="minutes")
# Printed as [lower end, slope computed earlier from the full table, upper end].
print("95% confidence interval for the predicted additional wait for 1 minute longer duration of Old Faithful : [{:g}, {:g}, {:g}] minutes".format(lower_end, slope, upper_end))
# Draw the interval as a thick red bar along the x-axis of the histogram.
plt.plot([lower_end, upper_end], [0, 0], color='red', lw=10)
plt.show()

***
### 2018 Philadelphia Primary voter registration
Voter registration by Ward. Look at the average size of a Ward

In [None]:
# Local CSV file name. It is overwritten by the remote URL on the next line
# (presumably kept as an offline alternative -- confirm).
url = 'qualified_voter_listing_2018_primary_by_ward.csv'
url = 'https://phl.carto.com/api/v2/sql?q=SELECT+*+FROM+qualified_voter_listing_2018_primary_by_ward&filename=qualified_voter_listing_2018_primary_by_ward&format=csv&skipfields=cartodb_id,the_geom,the_geom_webmercator'
# Keep only rows whose 'ward' value contains "WD"; per the original note this
# eliminates the rows that hold totals rather than individual wards.
vote = Table.read_table(url)
vote = vote.where('ward', are.containing("WD"))
# Show the wards ordered by 'total', largest first.
vote.sort('total', descending=True)

In [None]:
# Mean of the 'total' column over the ward rows.
np.mean(vote['total'])

In [None]:
# Add a column with the female-to-male ratio of each row.
vote = vote.with_columns(
    "Gender_ratio", vote.column('female') / vote.column('male')
)

In [None]:
# Unweighted average of the 'total' column (same quantity as np.mean above).
np.average(vote['total'])

In [None]:
# Unweighted average of the per-row female-to-male ratio.
np.average(vote['Gender_ratio'])

In [None]:
# Histogram of the 'total' column with bin edges every 1000 from 10000 to 19000
# (np.arange excludes the stop value 20000).
plt.hist(
    vote.column('total'),
    bins=np.arange(10000, 20000, 1000),
    color='yellow',
    alpha=0.4,
    edgecolor='black',
    linewidth=1.2,
)
plt.title('Philadelphia Voting Ward Size')
# Mark the mean of 'total' with a large red dot on the x-axis.
plt.scatter(
    np.mean(vote.column('total')), 0,
    s=300, marker='o', c='red', alpha=0.8, edgecolors='blue',
)
plt.show()

In [None]:
# Histogram of the female-to-male ratio using matplotlib's default bins.
plt.hist(vote['Gender_ratio'])

In [None]:
def one_bootstrap_mean():
    """Return the mean of 'total' in one resample of the global ``vote`` table.

    ``vote.sample()`` is called with no arguments, so the resample has as many
    rows as ``vote`` and is drawn with replacement.
    """
    return np.mean(vote.sample().column('total'))

In [None]:
# One bootstrap estimate of the mean; the value changes on every run.
one_bootstrap_mean()

In [None]:
# 50th percentile of the 'total' column, used here as the median.
percentile(50, vote.column('total')) # median

In [None]:
# One resample of the ward rows, sorted by 'total' with the largest first.
# Rows can repeat because sample() draws with replacement by default.
vote.sample().sort('total',descending=True)

In [None]:
# 1000 bootstrap estimates of the mean of 'total'.
total_mean = [one_bootstrap_mean() for _ in range(1000)]

# Histogram of the bootstrap means (bin edges every 250 starting at 10000),
# with the mean of the original table marked by a red dot on the x-axis.
plt.hist(
    total_mean,
    bins=np.arange(10000, 20000, 250),
    color='yellow',
    alpha=0.4,
    edgecolor='black',
    linewidth=1.2,
)
plt.title('Philadelphia Voting Ward Size Bootstrap mean')
plt.scatter(
    np.mean(vote.column('total')), 0,
    s=300, marker='o', c='red', alpha=0.8, edgecolors='blue',
)
plt.show()

In [None]:
# Ends of the middle 95% of the bootstrap means: the 2.5th and 97.5th percentiles.
left, right = (percentile(pct, total_mean) for pct in (2.5, 97.5))

make_array(left, right)

In [None]:
# Histogram of the bootstrap means with the [left, right] percentile interval
# drawn as a thick yellow bar along y = 0.
plt.hist(total_mean)
plt.plot([left, right], [0, 0], color='yellow', lw=8)
plt.title("Philadelphia Voters by Ward")
# Fix the x-axis range to 10000-20000.
plt.xlim(10000,20000)
plt.tight_layout()
# Write the figure to a PNG file in the current working directory.
plt.savefig('bootstrap_CI.png')