# Linear Regression Dask Test

Testing linear regression using Dask

### Setup

In [None]:
# Load IPython's autoreload extension so that edited project modules are
# re-imported automatically before a cell runs (mode 2 reloads all modules).
%load_ext autoreload
%autoreload 2

### Load Configuration

In [2]:
"""
Loads common configuration parameters
"""
from pathlib import Path, PurePath

from project.src.utils.configuration_manager import Config

# config.ini is looked up one directory above the current working directory.
parent_dir = Path().resolve().parent
config_path = PurePath(parent_dir, 'config.ini')

# Shared configuration object used by the rest of the notebook.
config = Config(config_path)

/home/justin/Code/interpretability_project/project/notebooks
Loading configuration from: /home/justin/Code/interpretability_project/project/config.ini
raw_input: /home/justin/Code/interpretability_project/project/data/raw
input_path: /home/justin/Code/interpretability_project/project/data/interim/2018_Yellow_Taxi_Trip_Data_float64
figures_dir: figures
preprocessed_dir: /home/justin/Code/interpretability_project/project/data/preprocessed
train_data: /home/justin/Code/interpretability_project/project/data/preprocessed/X_train
train_target: /home/justin/Code/interpretability_project/project/data/preprocessed/y_train
test_data: /home/justin/Code/interpretability_project/project/data/preprocessed/X_test
test_target: /home/justin/Code/interpretability_project/project/data/preprocessed/y_test


### Start local Dask Client

In [None]:
from dask.distributed import Client, LocalCluster

# Reuse the client left over from a previous run of this cell if there is one;
# otherwise start a local cluster. Only NameError (no `client` defined yet) is
# treated as "no client": a failure inside client.restart() is surfaced instead
# of being swallowed and silently replaced by a second cluster.
try:
    client
except NameError:
    # Each worker is capped at 4G; drop memory_limit to use Dask's default.
    cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    print('Setting new client')
    client = Client(cluster)
    print(client)
else:
    if client:
        print('Restarting client')
        client.restart()
client

Setting new client
<Client: 'tcp://127.0.0.1:39049' processes=5 threads=10, memory=20.00 GB>


## Prepare / Load dataset

In [None]:
from project.src.preprocessing.dataset_manager import DatasetManager
# Build the dataset helper from the shared configuration object.
dataset_manager = DatasetManager(config)

In [5]:
# Write the train/test sets (timed). overwrite=False is passed, so sets that
# already exist on disk are not meant to be regenerated.
%time dataset_manager.write_dataset(overwrite=False)

[PurePosixPath('/home/justin/Code/interpretability_project/project/data/preprocessed/X_train'), PurePosixPath('/home/justin/Code/interpretability_project/project/data/preprocessed/y_train'), PurePosixPath('/home/justin/Code/interpretability_project/project/data/preprocessed/X_test'), PurePosixPath('/home/justin/Code/interpretability_project/project/data/preprocessed/y_test')]
Not overwriting existing training and test sets
CPU times: user 1.11 ms, sys: 202 µs, total: 1.31 ms
Wall time: 777 µs


In [6]:
# Fetch the training features and targets from the dataset helper.
X_train, y_train = dataset_manager.get_training_set()

### Additional Preprocessing Step

In [7]:
# Categorical columns handed to the one-hot-encoding pipeline below.
categorical_columns_to_transform = ['payment_type']
# Ignore datetime for now
# datetime_columns_to_transform = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
# Empty list: no datetime columns are passed for transformation.
datetime_columns_to_transform = []

In [8]:
from dask_ml.preprocessing import OneHotEncoder
# sparse=False requests a dense (non-sparse) encoded output.
encoder = OneHotEncoder(sparse=False)

In [13]:
import project.src.preprocessing.preprocessing_pipelines as preprocessing

# Run the one-hot-encoding pipeline over the training features.
# NOTE(review): X_train is rebound from itself, so re-running this cell feeds
# the already-transformed frame back into the pipeline — confirm that is safe
# or restart from the load cell.
X_train = preprocessing.ohe_preprocessing_pipeline(encoder, 
                                 X_train, 
                                 categorical_columns_to_transform = categorical_columns_to_transform, 
                                 datetime_columns_to_transform = datetime_columns_to_transform)

In [14]:
# Peek at the first rows of the transformed training features.
X_train.head()

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip,payment_type_1,payment_type_2,payment_type_3,payment_type_4,payment_type_5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1.0,1.2,11.0,0.0,0.5,0.0,0.3,11.8,1.0,0.0,0.0,0.0,0.0
1,1.0,12.03,39.0,0.0,0.5,5.76,0.3,45.56,1.0,0.0,0.0,0.0,0.0
2,2.0,0.86,5.5,0.0,0.5,0.0,0.3,6.3,1.0,0.0,0.0,0.0,0.0
3,2.0,1.09,6.0,0.0,0.5,0.0,0.3,6.8,1.0,0.0,0.0,0.0,0.0
5,2.0,1.32,10.0,0.0,0.5,0.0,0.3,10.8,1.0,0.0,0.0,0.0,0.0


In [15]:
# Name of the column in y_train that the regression predicts.
target = 'tip_fraction'
# Peek at the first target values.
y_train[target].head()

index
0    0.250000
1    0.199956
2    0.200000
3    0.200000
5    0.200000
Name: tip_fraction, dtype: float64

### Preparation for dask

In [17]:
"""
Get the lengths of each block to allow conversion to DF
https://nbviewer.jupyter.org/github/PuneetGrov3r/MediumPosts/blob/master/Tackle/BigData-IncrementalLearningAndDask.ipynb#Method-2:-Using-Dask:
"""

'\nGet the lengths of each block to allow conversion to DF\nhttps://nbviewer.jupyter.org/github/PuneetGrov3r/MediumPosts/blob/master/Tackle/BigData-IncrementalLearningAndDask.ipynb#Method-2:-Using-Dask:\n\n'

In [16]:
from project.src.preprocessing.ddf_rescaler import DDFRescaler

In [None]:
# Wrap the training features and the target column in the project's rescaler.
# NOTE(review): presumably this converts the dataframes into dask arrays with
# known block lengths (see the note cell above) — confirm in ddf_rescaler.
rescaler = DDFRescaler(X_train, y_train[target])

In [None]:
# Pull the target (y) and feature (Xo) containers out of the rescaler.
y = rescaler.y
Xo = rescaler.X

In [None]:
# Drop the first column of the feature array.
# NOTE(review): Xo is rebound from itself, so each re-run of this cell removes
# one more column; which feature sits in column 0 is not visible here — confirm.
Xo = Xo[:, 1:]

In [None]:
# Check
# Xo[-5:].compute()

In [None]:
# Rows per chunk along axis 0; X and y share the value so their blocks line up.
ROW_CHUNK_SIZE = 200000

# Single rechunk pass for the features: all columns in one chunk (axis 1) and
# ROW_CHUNK_SIZE rows per chunk (axis 0). The final chunk layout is the same
# as applying the two rechunks one after the other.
Xo = Xo.rechunk({0: ROW_CHUNK_SIZE, 1: Xo.shape[1]})
y = y.rechunk({0: ROW_CHUNK_SIZE})

### Train/validation/test prep

In [None]:
# Number of rows in the training part: 80% of the rows of Xo, truncated to int.
tr_len = int(0.8*Xo.shape[0])
print(tr_len)

In [None]:
# Positional split: the first tr_len rows are used for training and the
# remaining rows for validation (no shuffling happens in this cell).
xtrain, ytrain = Xo[:tr_len], y[:tr_len]
xvalid, yvalid = Xo[tr_len:], y[tr_len:]
# Display the four shapes as a sanity check.
xtrain.shape, ytrain.shape, xvalid.shape, yvalid.shape

### Train LR model

In [17]:
from dask_ml.linear_model import LinearRegression

In [19]:
# Linear regression estimator configured with an L1 penalty.
est = LinearRegression(penalty='l1')

In [41]:
import joblib
%time est.fit(xtrain, y=ytrain)
filename = 'trained_models/lr_estimator_w_cat.sav'
joblib.dump(est, filename)

CPU times: user 1h 26min 34s, sys: 5min 51s, total: 1h 32min 25s
Wall time: 5h 47s


['trained_models/lr_estimator_w_cat.sav']

In [42]:
# Predict on the validation split; values are materialised later via .compute().
preds = est.predict(xvalid)

In [43]:
# Materialise only the first ten predictions as a quick check (timed).
%time preds[0:10].compute()

CPU times: user 133 ms, sys: 3.27 ms, total: 136 ms
Wall time: 372 ms


array([-0.15151002, -1.03993868, -0.0201289 , -1.07174199, -0.02375292,
       -0.99538063, -1.06519956, -0.08597101, -0.98240305, -0.14028903])

In [44]:
# MAE
%time abs(preds-yvalid)).mean(axis=0).compute()

CPU times: user 7.34 s, sys: 402 ms, total: 7.74 s
Wall time: 20.1 s


0.2027258400610944

In [47]:
# Same MAE expression as above, printed without the timing output.
print((abs(preds-yvalid)).mean(axis=0).compute())

0.2027258400610944


In [45]:
# MSE: mean squared difference between predictions and validation targets.
%time ((preds-yvalid)**2).mean(axis=0).compute()

CPU times: user 7.56 s, sys: 435 ms, total: 8 s
Wall time: 19.3 s


27.662080827329344

In [48]:
# Same MSE expression as above, printed without the timing output.
print(((preds-yvalid)**2).mean(axis=0).compute())

27.662080827329344


#### Previous test

In [None]:
from dask_ml.linear_model import LinearRegression

In [None]:
# Estimator with default settings (no explicit penalty argument).
# NOTE(review): this rebinds `est`, replacing the penalty='l1' estimator
# created earlier in the notebook if both sections are run in order.
est = LinearRegression()

In [None]:
# Fit the estimator on the training split (timed).
%time est.fit(xtrain, y=ytrain)

In [None]:
# Predict on the validation split with the estimator fitted in this section.
preds = est.predict(xvalid)

In [None]:
# Materialise only the first ten predictions as a quick check (timed).
%time preds[0:10].compute()

In [None]:
# import matplotlib.pyplot as plt

In [None]:
# plt.scatter(preds.compute(), yvalid.compute())

### Test Model

In [38]:
# Shape of the predictions; should match the validation targets.
preds.shape

(22446926,)

In [39]:
# Shape of the validation targets, for comparison with the predictions.
yvalid.shape

(22446926,)

In [46]:
# MAE: mean absolute difference between predictions and validation targets.
%time (abs(preds-yvalid)).mean(axis=0).compute()

CPU times: user 4.73 s, sys: 245 ms, total: 4.98 s
Wall time: 6.89 s


0.48113266567598045

In [45]:
# MSE: mean squared difference between predictions and validation targets.
%time ((preds-yvalid)**2).mean(axis=0).compute()

CPU times: user 4.66 s, sys: 338 ms, total: 5 s
Wall time: 9.2 s


7.723888769609159

Although this isn't an apples-to-apples comparison (only numerical inputs, not categoricals, were used in this regression, and more data was used to train the model), the linear model over the entire dataset seems to offer a slightly lower MAE but a higher MSE than the LightGBM model.  

This means that predictions are closer on average, but the predictions that miss are farther from the mark.

### Save model

In [47]:
# Show which estimator class is about to be saved.
print(type(est))

<class 'dask_ml.linear_model.glm.LinearRegression'>


In [3]:
# Path used by the save/load cells of this section.
# NOTE(review): this rebinds `filename`, which the training cell earlier set to
# a different path ('..._w_cat.sav') — confirm which model is meant here.
filename = 'trained_models/lr_estimator.sav'
import joblib

In [4]:
# joblib.dump(est, filename)

### Load model

In [None]:
# Reload the estimator stored at `filename`. joblib.load unpickles the file,
# so only load files from a trusted source.
# NOTE(review): the dump to this filename is commented out above, so this
# relies on a file written in an earlier session — confirm it exists.
test_est = joblib.load(filename)

In [None]:
# Check the reloaded model by predicting on the validation split.
preds = test_est.predict(xvalid)

In [None]:
# MAE of the reloaded model's predictions against the validation targets.
%time (abs(preds-yvalid)).mean(axis=0).compute()

### Evaluate weights

In [None]:
# Display the fitted coefficients of the estimator.
est.coef_

In [None]:
import eli5

In [None]:
# Show the class of the estimator whose weights are inspected below.
print(type(est))

In [None]:
from sklearn.linear_model import LinearRegression as sklearn_lr
# Unfitted scikit-learn estimator; its fitted attributes are filled in by hand
# in the next cell rather than by calling fit().
sklearn_est = sklearn_lr()

In [None]:
# Copy the fitted coefficients and intercept from the dask-ml estimator onto
# the scikit-learn estimator so that tooling written against the scikit-learn
# API can read them.
sklearn_est.coef_ = est.coef_
sklearn_est.intercept_ = est.intercept_

In [None]:
# Show the feature names that will label the weights.
# NOTE(review): `input_columns` is not assigned anywhere in the visible part of
# this notebook; on a fresh kernel this raises NameError unless it is defined
# first — confirm where it comes from and that it matches the columns of Xo.
print(input_columns)

In [None]:
# Render the per-feature weights of the linear model.
# eli5 documents target_names as a list of names (or a rename dict), so the
# single target name is wrapped in a list instead of being passed as a bare
# string.
eli5.explain_weights(sklearn_est,
                     feature_names=input_columns,
                     target_names=[target])