## Design Approach:
* Instead of the driver task creating bootstrap samples, that functionality is delegated to 
executor tasks by using the Spark `broadcast()` function.  This reduces run time by avoiding recreating the original data set
for every sample in an executor.
* Each parallel task consists of a `sample_id` and a `sample_seed`.  The `sample_seed` value is used by the executor
task to create a bootstrap sample.

In [1]:
from pyspark import RDD
import pickle
import base64
from pyspark import SparkConf, HiveContext
from pyspark.sql import  Row
import numpy as np
import pandas as pd
import datetime
import socket
import os
import gc

In [2]:
# Number of rows in the synthetic data set.
NUMBER_OF_ROWS = 400000

# Used together (NUMBER_EXECUTORS * NUMBER_CORES) as the partition count
# of the RDD that holds the bootstrap tasks.
NUMBER_EXECUTORS = 32
NUMBER_CORES = 2
#NUMBER_PARTITIONS = NUMBER_EXECUTORS*NUMBER_CORES

# Number of bootstrap samples to draw and analyze.
NUMBER_OF_SAMPLES = 1000 #SAMPLE_SETS * SAMPLE_SET_REPLICATIONS

# Exclusive upper bound of the range the per-sample seeds are drawn from.
MAX_RANDOM_SEED = 10 ** 5

In [3]:
sc.version

'2.3.2'

In [4]:
sc.applicationId

'local-1541468711605'

In [5]:
spark

In [6]:
# Seed the global NumPy generator so the synthetic data set is reproducible.
np.random.seed(21)

# The draws below consume the seeded stream in this order; keep the order
# unchanged to reproduce the same data.
col1 = np.random.exponential(scale=1, size=NUMBER_OF_ROWS)
col2 = np.random.normal(loc=0, scale=1, size=NUMBER_OF_ROWS)
col3 = np.random.randint(low=1, high=10, size=NUMBER_OF_ROWS)
category = np.random.choice(['a', 'b', 'c', 'd', 'e'], size=NUMBER_OF_ROWS)

# Assemble the raw data set: one categorical column and three numeric columns.
orig_df = pd.DataFrame({
    'category': category,
    'col1': col1,
    'col2': col2,
    'col3': col3,
})

In [7]:
# Sanity check of the generated frame: dimensions, first five rows, and type.
print(orig_df.shape)
print(orig_df.head())
print(type(orig_df))
# Row count per category (shown as the cell's output).
orig_df['category'].value_counts()

(400000, 4)
  category      col1      col2  col3
0        b  0.049952 -0.772821     6
1        a  0.341237  0.079250     8
2        b  1.276423  2.222970     8
3        a  0.021853  0.027416     8
4        b  0.230575  0.510981     8
<class 'pandas.core.frame.DataFrame'>


b    80201
e    80113
c    79918
a    79897
d    79871
Name: category, dtype: int64

In [8]:
# Broadcast the raw data to the executors and keep the returned handle.
# Tasks can read the executor-local copy through `orig_df_broadcast.value`,
# and the handle is also required to unpersist/destroy the broadcast later.
# A discarded handle cannot be used for either.
orig_df_broadcast = sc.broadcast(orig_df)
orig_df_broadcast

<pyspark.broadcast.Broadcast at 0x7f6ed2b90e80>

## Bootstrap core computations

In [9]:
# Define Bootstrap specific Exceptions
class BootstrapError(Exception):
    """Custom exception for bootstrap analysis."""

In [10]:
#
# Common calculation function
#
def calculateSampleStats(df,col):
    """Summarize one column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    col : str
        Name of the column to summarize.

    Returns
    -------
    tuple
        (minimum, mean, maximum, 0.5 quantile) of ``df[col]``, in that order.
    """
    # Select the column once and derive every statistic from it.
    values = df[col]
    return (
        values.min(),
        values.mean(),
        values.max(),
        values.quantile(0.5),
    )

In [11]:
#
# function to return summary of sample processing
#   Returns a single row of results for the sample
#
def processASampleReturnSummary(sample_run, source_df=None):
    """Build one bootstrap sample and return a single row of summary results.

    Parameters
    ----------
    sample_run : mapping
        One task record; must provide 'sample_id' and 'sample_seed'.
    source_df : pandas.DataFrame, optional
        Data set to resample.  When omitted, the module-level ``orig_df`` is
        used, which is the original behaviour.  Passing the ``.value`` of a
        Spark broadcast variable lets the task read the broadcast copy
        instead of a DataFrame captured through the global name.

    Returns
    -------
    dict
        sample_id, sample_seed, shape (as a string), worker_hostname,
        worker_pid, time_start, time_end and time_elapsed (as strings), plus
        ``<col>_min``, ``<col>_mean``, ``<col>_max`` and ``<col>_50th`` floats
        for 'col2' and 'col1'.
    """
    start_time = datetime.datetime.now()

    # Fall back to the module-level data set when no explicit source is given.
    if source_df is None:
        source_df = orig_df

    # retrieve sample id and seed for sampling
    sample_id = sample_run['sample_id']
    sample_seed = sample_run['sample_seed']

    # Bootstrap sample: same number of rows as the source, drawn with
    # replacement; the seed makes the draw reproducible.
    sample_df = source_df.sample(n=source_df.shape[0], replace=True,
                                 random_state=sample_seed)

    # Offset col2 by 10*sample_id; this only touches the sampled copy.
    sample_df['col2'] = sample_df['col2'] + 10*sample_id

    # Labels in the same order as the tuple returned by calculateSampleStats().
    # Defined once instead of on every loop iteration.
    stat_labels = ['min', 'mean', 'max', '50th']

    result_stats = dict()
    for c in ['col2', 'col1']:
        stats = calculateSampleStats(sample_df, c)
        result_stats.update({c + '_' + label: float(value)
                             for label, value in zip(stat_labels, stats)})

    # calculate run-time performance measures
    end_time = datetime.datetime.now()

    elapsed_time_str = '{}'.format(end_time - start_time)
    start_str = '{}'.format(start_time)
    end_str = '{}'.format(end_time)

    # Look the process id up once; it is both logged and returned.
    worker_pid = os.getpid()

    print('>>>>>>Pid: {:d}, completed processing sample_id {:d} at {}'
          .format(worker_pid, sample_id, datetime.datetime.now()))

    # return results of bootstrap analysis
    return dict(sample_id=sample_id, sample_seed=sample_seed,
                shape=str(sample_df.shape),
                worker_hostname=socket.gethostname(),
                worker_pid=worker_pid,
                time_start=start_str, time_end=end_str,
                time_elapsed=elapsed_time_str,
                **result_stats)
        

## Create and analyze bootstrap samples 

In [12]:
# Announce the run, then start the wall-clock timer for the bootstrap analysis.
print('Starting analysis for {:,d} samples'.format(NUMBER_OF_SAMPLES))
bootstrap_start = datetime.datetime.now()

Starting analysis for 1,000 samples


## Create sample ids and seeds to be used in bootstrap sampling

In [13]:
# Fix the global NumPy seed so the same sample seeds are drawn on every run.
np.random.seed(13)

# One row per bootstrap task: ids 1..NUMBER_OF_SAMPLES, each paired with a
# distinct seed drawn without replacement from range(MAX_RANDOM_SEED).
sample_seeds = pd.DataFrame({
    'sample_id': np.arange(1, NUMBER_OF_SAMPLES + 1),
    'sample_seed': np.random.choice(range(MAX_RANDOM_SEED),
                                    size=NUMBER_OF_SAMPLES,
                                    replace=False),
})
sample_seeds.head(10)

Unnamed: 0,sample_id,sample_seed
0,1,72031
1,2,27978
2,3,55639
3,4,51955
4,5,52145
5,6,3011
6,7,83607
7,8,68952
8,9,90269
9,10,69234


In [14]:
sample_seeds.tail(10)

Unnamed: 0,sample_id,sample_seed
990,991,41440
991,992,92587
992,993,166
993,994,24458
994,995,74793
995,996,21813
996,997,62437
997,998,32343
998,999,81160
999,1000,53014


## Run the parallel tasks to create sample and compute metrics

In [15]:
# Turn the (sample_id, sample_seed) table into an RDD with one element per
# bootstrap task, spread over NUMBER_EXECUTORS * NUMBER_CORES partitions.
sample_rdd = (
    sc.parallelize(sample_seeds.to_dict(orient='records'))
      .repartition(NUMBER_EXECUTORS * NUMBER_CORES)
)
print("sample_rdd partitions: {:d}".format(sample_rdd.getNumPartitions()))

# map() applies the summary function to each task record in parallel.
# Note that `results_df` is an RDD of result dicts, not a DataFrame.
results_df = sample_rdd.map(processASampleReturnSummary)

# collect() brings the result rows back to the driver, where they are
# assembled into a pandas DataFrame.
bootstrap_results = pd.DataFrame(results_df.collect())
print('completed creating pandas dataframe creation time: {}'.format(
    datetime.datetime.now() - bootstrap_start))
print(bootstrap_results.shape)

print('shape of bootstrap_results is {}'.format(bootstrap_results.shape))

sample_rdd partitions: 64
completed creating pandas dataframe creation time: 0:01:04.819175
(1000, 16)
shape of bootstrap_results is (1000, 16)


## Show sample results

In [16]:
bootstrap_results.head()

Unnamed: 0,col1_50th,col1_max,col1_mean,col1_min,col2_50th,col2_max,col2_mean,col2_min,sample_id,sample_seed,shape,time_elapsed,time_end,time_start,worker_hostname,worker_pid
0,0.693381,12.805805,0.99993,2.942334e-07,5069.994987,5074.473786,5069.995823,5065.590632,507,54430,"(400000, 4)",0:00:00.354485,2018-11-06 01:45:25.904430,2018-11-06 01:45:25.549945,004fe3182434,158
1,0.690119,12.584154,0.995685,2.942334e-07,5079.999734,5084.431249,5079.999181,5075.576396,508,82077,"(400000, 4)",0:00:00.431357,2018-11-06 01:45:26.367148,2018-11-06 01:45:25.935791,004fe3182434,158
2,0.692338,12.805805,0.997924,2.942334e-07,5090.000959,5094.473786,5090.00213,5085.299111,509,74715,"(400000, 4)",0:00:00.329984,2018-11-06 01:45:26.734229,2018-11-06 01:45:26.404245,004fe3182434,158
3,0.69288,12.805805,0.999963,2.942334e-07,5099.998244,5104.473786,5099.997106,5095.576396,510,17481,"(400000, 4)",0:00:00.318063,2018-11-06 01:45:27.066749,2018-11-06 01:45:26.748686,004fe3182434,158
4,0.693692,12.805805,0.998533,2.942334e-07,5109.998487,5114.473786,5109.998127,5105.299111,511,38621,"(400000, 4)",0:00:00.620760,2018-11-06 01:45:27.709299,2018-11-06 01:45:27.088539,004fe3182434,158


In [17]:
bootstrap_results.tail()

Unnamed: 0,col1_50th,col1_max,col1_mean,col1_min,col2_50th,col2_max,col2_mean,col2_min,sample_id,sample_seed,shape,time_elapsed,time_end,time_start,worker_hostname,worker_pid
995,0.69238,12.805805,0.998973,2.942334e-07,5019.995991,5024.431249,5019.996335,5015.299111,502,12630,"(400000, 4)",0:00:00.161538,2018-11-06 01:46:24.552215,2018-11-06 01:46:24.390677,004fe3182434,149
996,0.692749,12.805805,0.999727,2.942334e-07,5029.998916,5034.431249,5029.998461,5025.576396,503,37375,"(400000, 4)",0:00:00.172546,2018-11-06 01:46:24.725527,2018-11-06 01:46:24.552981,004fe3182434,149
997,0.69202,12.584154,0.997529,7.826788e-07,5039.995463,5044.473786,5039.997495,5035.299111,504,45086,"(400000, 4)",0:00:00.072915,2018-11-06 01:46:24.799365,2018-11-06 01:46:24.726450,004fe3182434,149
998,0.690182,12.805805,0.999054,7.826788e-07,5049.99747,5054.431249,5049.997462,5045.299111,505,67360,"(400000, 4)",0:00:00.090000,2018-11-06 01:46:24.890238,2018-11-06 01:46:24.800238,004fe3182434,149
999,0.692336,12.805805,0.999025,2.942334e-07,5059.996564,5064.473786,5059.998082,5055.299111,506,56635,"(400000, 4)",0:00:00.087523,2018-11-06 01:46:24.984491,2018-11-06 01:46:24.896968,004fe3182434,149
