# Problem Set 2
### Heather Chen
### 1. Parallel versus serial computation of a bootstrapped cross-validation
#### (a)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the Auto data ('?' marks a missing value), discard incomplete rows, and
# append three 0/1 indicator columns in a single chained expression:
#   'mpg high' - 1 when mpg is at or above the median mpg of the retained rows
#   'orgn1'    - 1 when origin == 1
#   'orgn2'    - 1 when origin == 2
auto_df = (
    pd.read_csv("data/Auto.csv", na_values='?')
    .dropna()
    .assign(**{
        'mpg high': lambda df: np.where(df['mpg'] >= np.median(df['mpg']), 1, 0),
        'orgn1': lambda df: np.where(df['origin'] == 1, 1, 0),
        'orgn2': lambda df: np.where(df['origin'] == 2, 1, 0),
    })
)
auto_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg high,orgn1,orgn2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0,1,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0,1,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0,1,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0,1,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0,1,0


In [7]:
import timeit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import multiprocessing
from dask import compute, delayed
import dask.multiprocessing

import warnings
# Install a blanket "ignore" filter: no category or module is specified, so
# every warning raised in this process after this line is suppressed.
# NOTE(review): this would also hide any warnings emitted while the models
# below are being fitted - confirm that silencing them is intended.
warnings.filterwarnings("ignore")

# Serial baseline: draw N random 65/35 train/test splits one after another,
# fit a logistic regression on each training part, and record the mean
# squared error of the predicted labels on the held-out part.
time1 = timeit.default_timer()

N = 100
feature_columns = ['cylinders', 'displacement', 'horsepower', 'weight',
                   'acceleration', 'year', 'orgn1', 'orgn2']
X = auto_df[feature_columns]
MSE_vec_bs = []

for bs_ind in range(N):
    # The draw index doubles as the split seed, so each draw is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, auto_df['mpg high'], test_size=0.35, random_state=bs_ind
    )
    LogReg = LogisticRegression()
    LogReg.fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    MSE = ((y_test - y_pred) ** 2).mean()
    MSE_vec_bs.append(MSE)

# Summarise the N per-draw errors by their mean and (population) standard deviation.
MSE_bs = np.mean(MSE_vec_bs)
MSE_bs_std = np.std(MSE_vec_bs)

time2 = timeit.default_timer()

print('Test estimate MSE bootstrap=', round(MSE_bs,5))
print('Test estimate MSE bootstrap standard err=', round(MSE_bs_std,5))
print('Seconds for computation',time2-time1)

Test estimate MSE bootstrap= 0.10196
Test estimate MSE bootstrap standard err= 0.02076
Seconds for computation 1.443614651999951


#### (b)

In [8]:
# Query the CPU count reported by multiprocessing; the parallel run further
# down passes this value as the number of workers.
num_cores = multiprocessing.cpu_count()
print('Number of available cores is', num_cores)

Number of available cores is 8


In [11]:
def logistic(bs, seed, data):
    """Fit a logistic regression on one random split of ``data`` and score it.

    The frame is split 65/35 into training and test parts, a
    ``LogisticRegression`` is fitted on the training part, and the mean
    squared error of the predicted labels on the test part is returned.

    The features and the response are taken from ``data`` itself rather than
    from notebook-level globals, so the function gives the right answer for
    whatever frame it is handed and does not depend on an earlier cell having
    defined ``X``.

    Parameters
    ----------
    bs : int
        Index of the resampling draw. Not used in the computation; kept so
        existing callers that pass it continue to work.
    seed : int
        Passed as ``random_state`` to ``train_test_split`` so the split is
        reproducible.
    data : pandas.DataFrame
        Must contain the columns 'cylinders', 'displacement', 'horsepower',
        'weight', 'acceleration', 'year', 'orgn1', 'orgn2' and the response
        column 'mpg high'.

    Returns
    -------
    float
        Mean of the squared differences between the held-out response and
        the predicted labels.
    """
    feature_cols = ['cylinders', 'displacement', 'horsepower', 'weight',
                    'acceleration', 'year', 'orgn1', 'orgn2']

    X_train, X_test, y_train, y_test = train_test_split(
        data[feature_cols], data['mpg high'],
        test_size=0.35, random_state=seed)
    LogReg = LogisticRegression().fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    MSE = ((y_test - y_pred) ** 2).mean()
    return MSE

# Parallel run: describe the 100 draws as lazy dask tasks, then evaluate them
# together on the multiprocessing scheduler with one worker per reported core.
start_time = timeit.default_timer()

lazy_logistic = delayed(logistic)
lazy_values = [lazy_logistic(draw, draw, auto_df) for draw in range(100)]

results_par = compute(
    *lazy_values,
    scheduler=dask.multiprocessing.get,
    num_workers=num_cores,
)

# Mean and (population) standard deviation of the per-draw errors.
MSE_bs2 = np.mean(results_par)
MSE_bs_std2 = np.std(results_par)

end_time = timeit.default_timer()

print('Test estimate MSE bootstrap=', MSE_bs2)
print('Test estimate MSE bootstrap standard err=', MSE_bs_std2)

print('Seconds for computation=', end_time-start_time, 'seconds')

Test estimate MSE bootstrap= 0.10195652173913043
Test estimate MSE bootstrap standard err= 0.020756917194664794
Seconds for computation= 0.5612498440000309 seconds
