## Problem Set #2

MACS 30250, Dr. Evans

Linghui Wu

In [1]:
import multiprocessing

# Ask the OS how many CPU cores it reports; the value is reused further down
# as the worker count for the parallel run.
num_core = multiprocessing.cpu_count()
core_report = "The number of available cores is {}.".format(num_core)
print(core_report)

The number of available cores is 8.


### Question 1

In [2]:
# Import the libraries
import timeit
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dask import compute, delayed
import dask.multiprocessing

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the Auto data, treating "?" entries as missing, and keep only the rows
# that have no missing values. Chaining .dropna() yields the same frame as the
# in-place drop while keeping the cell idempotent on re-run.
auto = pd.read_csv("data/Auto.csv", na_values="?").dropna()

auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [4]:
# Binary response: 1 when a car's mpg is strictly above the sample median,
# 0 when it is at or below the median.
mpg_median = auto["mpg"].median()
auto["mpg_high"] = np.where(auto["mpg"] > mpg_median, 1, 0)

# Boolean indicators for origin codes 1 and 2; rows with any other origin
# value are False in both columns.
auto["orgn1"] = auto["origin"].eq(1)
auto["orgn2"] = auto["origin"].eq(2)

auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg_high,orgn1,orgn2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0,True,False
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0,True,False
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0,True,False
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0,True,False
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0,True,False


#### (a)

In [5]:
# Predictor columns fed to the logistic regression; `mpg_high` is the binary response.
feature_cols = [
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "year",
    "orgn1",
    "orgn2",
]
X = auto[feature_cols]
y = auto["mpg_high"]

In [6]:
# Serial baseline: fit on 100 random 65/35 train/test splits, timing the whole loop
start1 = timeit.default_timer()
mse1 = []

for i in range(100):
    # The loop index doubles as the split seed, so the 100 splits are reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.65, random_state=i
    )
    clf = LogisticRegression(solver="lbfgs", max_iter=200, n_jobs=1).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Labels are 0/1, so the mean squared difference equals the misclassification rate
    mse1.append(((y_test - y_pred) ** 2).mean())

avg_mse1 = np.mean(mse1)
end1 = timeit.default_timer()

print("The average error rate is {}.".format(avg_mse1))
print("The computation takes {} seconds.".format(end1 - start1))

The average error rate is 0.10108695652173912.
The computation takes 2.4696098749999997 seconds.


#### (b)

In [7]:
# Define the error rate calculation function
def cal_err_rate(boot_num, seed, data_X, data_y):
    """Fit a logistic regression on one random split and return its test error.

    Parameters
    ----------
    boot_num : any
        Not used by the computation; kept so existing callers that pass an
        iteration index keep working.
    seed : int
        Passed as ``random_state`` to ``train_test_split`` so the split is
        reproducible.
    data_X : feature table handed to ``train_test_split``.
    data_y : response values handed to ``train_test_split``.

    Returns
    -------
    The mean squared difference between the held-out responses and the
    model's predictions (for 0/1 labels this is the misclassification rate).
    """
    # 65% of the rows go to training, the remainder is held out for testing
    splits = train_test_split(data_X, data_y, train_size=0.65, random_state=seed)
    X_train, X_test, y_train, y_test = splits

    model = LogisticRegression(solver="lbfgs", max_iter=200, n_jobs=1).fit(X_train, y_train)
    residuals = y_test - model.predict(X_test)
    return (residuals ** 2).mean()

In [8]:
# Perform parallel computation
start2 = timeit.default_timer()

# Build one lazy task per split; nothing runs until compute() is called.
mse2 = [delayed(cal_err_rate)(i, i, X, y) for i in range(100)]

# The keyword that selects a Dask scheduler is `scheduler=`. The previous
# `schedule=` spelling is not an argument compute() recognises; unrecognised
# keywords are forwarded to the scheduler function, so the multiprocessing
# scheduler was presumably never selected and the tasks ran on the default
# (threaded) scheduler instead. "processes" requests the multiprocessing
# scheduler with `num_core` worker processes.
# compute() returns a tuple with one entry per argument, so `results_par`
# is a 1-tuple holding the list of 100 error rates.
results_par = compute(mse2, scheduler="processes", num_workers=num_core)

avg_mse2 = np.array(results_par).mean()
end2 = timeit.default_timer()

print("The average error rate is {}.".format(avg_mse2))
print("The computation takes {} seconds.".format(end2 - start2))

The average error rate is 0.10108695652173912.
The computation takes 2.529451879 seconds.


**Unfortunately, using Dask to parallelize the computation fails to shorten the computation time. This is probably because, even though the tasks are distributed across different CPU cores, the overhead of inter-process communication outweighs the gains on such a small dataset. The multiprocessing implementation is expected to yield a significant improvement in efficiency if the data size is large enough.**