# Bootstrap
Bootstrapping is a popular technique for estimating a parameter or statistic with limited data. In a sense, bootstrapping itself has parameters to estimate. How much data does one need? How many bootstrap replicates are required to achieve a good estimate of the target statistic? Using sample data from the 2020 Presidential election, we will attempt to estimate the vote for 4 of the closest battleground states: Georgia, Pennsylvania, Michigan, and Arizon.

In [15]:
import pandas as pd
import numpy as np
import pickle
import pprint

## The Data
From the original dataset, we will create a dictionary of sets of indices into the original dataset, each for a different sample size.

In [16]:
# Get the dataset containing the population.
vote = pd.read_csv('11-Python Statistics in EDA\\11.1.3-Inference and Modeling II\countypres_2000-2020.csv')
vote.head()

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,2000,ALABAMA,AL,AUTAUGA,1001.0,PRESIDENT,AL GORE,DEMOCRAT,4942.0,17208.0,20191203,TOTAL
1,2000,ALABAMA,AL,AUTAUGA,1001.0,PRESIDENT,GEORGE W. BUSH,REPUBLICAN,11993.0,17208.0,20191203,TOTAL
2,2000,ALABAMA,AL,AUTAUGA,1001.0,PRESIDENT,RALPH NADER,GREEN,160.0,17208.0,20191203,TOTAL
3,2000,ALABAMA,AL,AUTAUGA,1001.0,PRESIDENT,OTHER,OTHER,113.0,17208.0,20191203,TOTAL
4,2000,ALABAMA,AL,BALDWIN,1003.0,PRESIDENT,AL GORE,DEMOCRAT,13997.0,56480.0,20191203,TOTAL


In [17]:
# Get 2020 Presidential Election Votes in swing states for Biden 
swing=['GEORGIA', 'PENNSYLVANIA', 'MICHIGAN',  'ARIZONA', 'WISCONSIN', 'MINNESOTA', 'COLORADO', 'NORTH CAROLINA', 'OHIO', 'FLORIDA']
vote = vote[(vote['year']==2020) & (vote['office'] == 'PRESIDENT') & (vote['candidate'].isin(['JOSEPH R BIDEN JR','DONALD J TRUMP'])) & (vote['state'].isin(swing))]
vote.head()


Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
50930,2020,ARIZONA,AZ,APACHE,4001.0,PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,16460.0,35172.0,20210622,EARLY VOTE
50931,2020,ARIZONA,AZ,APACHE,4001.0,PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,6539.0,35172.0,20210622,ELECTION DAY
50932,2020,ARIZONA,AZ,APACHE,4001.0,PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,294.0,35172.0,20210622,PROVISIONAL
50942,2020,ARIZONA,AZ,APACHE,4001.0,PRESIDENT,DONALD J TRUMP,REPUBLICAN,7262.0,35172.0,20210622,EARLY VOTE
50943,2020,ARIZONA,AZ,APACHE,4001.0,PRESIDENT,DONALD J TRUMP,REPUBLICAN,4053.0,35172.0,20210622,ELECTION DAY


In [18]:
# Summarize proportion of vote for each candidate
biden = vote[vote['candidate'] == 'JOSEPH R BIDEN JR']
trump = vote[vote['candidate'] == 'DONALD J TRUMP']
print(biden.head())
print(biden.shape[0])
print(trump.head())
print(trump.shape[0])
total_votes = sum(vote[vote['candidate'] == 'DONALD J TRUMP']['totalvotes'].values)
biden_votes = sum(vote[vote['candidate'] == 'JOSEPH R BIDEN JR']['candidatevotes'].values) / total_votes
trump_votes = sum(vote[vote['candidate'] == 'DONALD J TRUMP']['candidatevotes'].values) / total_votes
results = {}
results['JOSEPH R BIDEN JR'] =  biden_votes
results['DONALD J TRUMP'] = trump_votes
print("Results of Popular Vote across Selected Swing States\n\tBiden: {}\n\tTrump: {}".format(results['JOSEPH R BIDEN JR'], results['DONALD J TRUMP']))


       year    state state_po county_name  county_fips     office  \
50930  2020  ARIZONA       AZ      APACHE       4001.0  PRESIDENT   
50931  2020  ARIZONA       AZ      APACHE       4001.0  PRESIDENT   
50932  2020  ARIZONA       AZ      APACHE       4001.0  PRESIDENT   
50945  2020  ARIZONA       AZ     COCHISE       4003.0  PRESIDENT   
50946  2020  ARIZONA       AZ     COCHISE       4003.0  PRESIDENT   

               candidate     party  candidatevotes  totalvotes   version  \
50930  JOSEPH R BIDEN JR  DEMOCRAT         16460.0     35172.0  20210622   
50931  JOSEPH R BIDEN JR  DEMOCRAT          6539.0     35172.0  20210622   
50932  JOSEPH R BIDEN JR  DEMOCRAT           294.0     35172.0  20210622   
50945  JOSEPH R BIDEN JR  DEMOCRAT         21563.0     60442.0  20210622   
50946  JOSEPH R BIDEN JR  DEMOCRAT          1495.0     60442.0  20210622   

               mode  
50930    EARLY VOTE  
50931  ELECTION DAY  
50932   PROVISIONAL  
50945    EARLY VOTE  
50946  ELECTION DA

In [19]:
# Function to compute the statistic
def compute_statistic(data):
    """Compute the statistic, proportion of votes."""
    return np.mean(sum(data['candidatevotes'].values) / sum(data['totalvotes'].values))

In [20]:
def compute_replicate(data, func):
    """Computes a replicate from the data."""                
    idx = np.random.choice(data.index, data.shape[0])        
    return func(data.loc[idx])
    

In [21]:
def compute_replicates(data, func, replicate_sizes, candidate):
    """Computes n_replicates for the data using func."""
    bs_replicates = {}
    for replicate_size in replicate_sizes:                
        replicates = np.empty(replicate_size)        
        for i in range(replicate_size):
            if i % 1000 == 0:
                print(".",end="")
            replicates[i] = compute_replicate(data,func)        
            
        idx = "n_replicates:".format(replicate_size)        
        
        # Compute Grand Mean and confidence intervals
        average = np.mean(replicates)
        ci = np.percentile(replicates,[2.5, 97.5])
        moe = (ci[1]-ci[0]) / 2
        diff = np.abs(results[candidate]-average)
        bs_replicates[idx] = {}
        bs_replicates[idx]['mean'] = average
        bs_replicates[idx]['ci'] = ci
        bs_replicates[idx]['moe'] = moe
        bs_replicates[idx]['diff'] = diff
        print("\t\tComputed {} replicates. \n\t\t\tSample Mean: {}\n\t\t\tPopulation Mean: {}\n\t\t\tConfidence Interval: {}\n\t\t\tMargin of Error: {}\n\t\t\tDifference from Population Mean: {}".format(
            replicate_size, str(average), str(results[candidate]), str(ci), str(moe), str(diff)))        
    return bs_replicates
    

In [22]:
def compute_samples(data, func, sample_sizes, replicate_sizes, candidate):
    """Creates the bootstrap samples and the associated replicates."""
    bs_samples = {}
    for sample_size in sample_sizes:
        print("\n\tComputing samples of size {}".format(sample_size))    
        idx = np.random.choice(data.index,sample_size)
        data = data.loc[idx]
        didx = "sample_size: {}".format(str(sample_size))
        bs_samples[didx] = compute_replicates(data, func, replicate_sizes, candidate)
    return bs_samples
    
    
    

In [23]:
def process_candidates(data, func, candidates, sample_sizes, replicate_sizes):
    """Processes all bootsrap samples for each candidate."""
    bs_candidates = {}
    for candidate in candidates:
        print("Processing {}".format(candidate))
        bs_candidates[candidate] = compute_samples(data, func, sample_sizes, replicate_sizes, candidate)
    return bs_candidates

In [24]:
def save_bs(bs, filepath):
    f = open(filepath, 'wb')
    pickle.dump(bs,f)
    f.close()
    

In [25]:
def read_bs(bs,filepath):
     f = open(filepath, 'rb') 
     bs = pickle.load(f)
     f.close()
     return bs
        

In [26]:
bs = {}
bs_file = '11-Python Statistics in EDA\\11.1.3-Inference and Modeling II\bootstrap_samples.pkl'
sample_sizes = np.array([50,100,250,500])
replicate_sizes = np.array([1000,2500,5000,10000])
candidates = np.unique(vote['candidate'].values)

bs = process_candidates(vote, compute_statistic, candidates, sample_sizes, replicate_sizes)

pprint(bs)
save_bs(bs, bs_file)


Processing DONALD J TRUMP

	Computing samples of size 50
.		Computed 1000 replicates. 
			Sample Mean: 0.3078919726063031
			Population Mean: 0.2603386225489249
			Confidence Interval: [0.18499811 0.4235862 ]
			Margin of Error: 0.11929404646714854
			Difference from Population Mean: 0.04755335005737815
...		Computed 2500 replicates. 
			Sample Mean: 0.3032880706696261
			Population Mean: 0.2603386225489249
			Confidence Interval: [0.18762342 0.42250109]
			Margin of Error: 0.11743883503986104
			Difference from Population Mean: 0.04294944812070117
.....		Computed 5000 replicates. 
			Sample Mean: 0.30452270210800786
			Population Mean: 0.2603386225489249
			Confidence Interval: [0.18754989 0.42584582]
			Margin of Error: 0.11914796710105756
			Difference from Population Mean: 0.044184079559082934
..........		Computed 10000 replicates. 
			Sample Mean: 0.30541930153559177
			Population Mean: 0.2603386225489249
			Confidence Interval: [0.18696303 0.42202021]
			Margin of Error: 0.117528