# Template

Copy this notebook when starting a new one that loads the dataset and uses Dask.

### Setup

In [4]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load Configuration

In [5]:
"""
Loads common configuration parameters
"""
import utils.configuration_manager as configuration_manager
from pathlib import PurePath
from os import getcwd

config_path = PurePath(getcwd(),'config.ini')
config = configuration_manager.Config(config_path)

# Assumes parquet directory as input
input_path = config.input_path
print('Input path: '+ input_path)

# For result storage
output_directory = config.output_directory
print('Output path: ' + output_directory)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini
Input path: data/2018_Yellow_Taxi_Trip_Data_float64
Output path: output


### Start local Dask Client

In [6]:
from dask.distributed import Client, LocalCluster

# Reuse the client left over from a previous run of this cell if there is one;
# otherwise start a local cluster and connect to it.
# Only the "name is not defined yet" case may fall through to cluster creation:
# a bare `except:` would also hide genuine restart failures (and
# KeyboardInterrupt) and then start a second cluster on top of the first.
try:
    client
except NameError:
    # To cap per-worker memory, pass e.g. memory_limit='4G' to LocalCluster.
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
else:
    if client:
        print('Restarting client')
        client.restart()
client

Restarting client




0,1
Client  Scheduler: tcp://127.0.0.1:46335  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.61 GB


### Dask dataframe loader

In [7]:
import dask.dataframe as dd
import fastparquet

In [8]:
# Open the parquet dataset found at input_path as a Dask dataframe
ddf = dd.read_parquet(input_path)

In [9]:
# Preview the first rows of the dataset
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


Categorical columns are ignored for simplicity.

### Define what we are trying to model

In [17]:
# Name of the column the model is trained to predict
target = 'tip_amount'

In [18]:
"""
Subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
ddf['total_amount_wo_tip'] = ddf['total_amount'] - ddf['tip_amount']

In [None]:
# Start from every float-typed column as a candidate model input
float_columns = ddf.select_dtypes(include=['float'])
input_columns = float_columns.columns

In [34]:
# Keep the target itself and the tip-inclusive total out of the model inputs
excluded_columns = (target, 'total_amount')
input_columns = [name for name in input_columns if name not in excluded_columns]

In [35]:
# Show the final list of input column names
print(input_columns)

['passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tolls_amount', 'improvement_surcharge', 'total_amount_wo_tip']


In [36]:
# Preview the selected input columns
ddf[input_columns].head()

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.0,1.2,11.0,0.0,0.5,0.0,0.3,11.8
1,1.0,12.03,39.0,0.0,0.5,5.76,0.3,45.56
2,2.0,0.86,5.5,0.0,0.5,0.0,0.3,6.3
3,2.0,1.09,6.0,0.0,0.5,0.0,0.3,6.8
4,2.0,1.78,9.5,0.0,0.5,0.0,0.3,10.3


In [37]:
# Preview the target column
ddf[target].head()

index
0    2.95
1    9.11
2    1.26
3    1.36
4    2.06
Name: tip_amount, dtype: float64

### Preparing dataset for dask

In [38]:
"""
Get the lengths of each block to allow conversion to DF
https://nbviewer.jupyter.org/github/PuneetGrov3r/MediumPosts/blob/master/Tackle/BigData-IncrementalLearningAndDask.ipynb#Method-2:-Using-Dask:
"""
lengths = []
for part in ddf.partitions:
    l = part.shape[0].compute()
    lengths.append(l)
#     print(l, part.shape[1])

In [40]:
# Convert the inputs and the target to Dask arrays, passing the partition
# lengths so the chunk sizes are known
X = ddf[input_columns].to_dask_array(lengths=lengths)
y = ddf[target].to_dask_array(lengths=lengths)

In [61]:
"""
Resizing blocks in order to prevent broadcasting errors due to different input sizes
"""
chunk_length = 200000
import dask
from dask_ml.preprocessing import RobustScaler

Xo = dask.array.zeros((X.shape[0],1), chunks=(chunk_length,1))

for i, col_ in enumerate(ddf[input_columns + [target]].columns):
    if col_ == target:
        rsc = RobustScaler()
        y = rsc.fit_transform(y.reshape(-1, 1)).reshape(1, -1)[0]
    else:
        rsc = RobustScaler()
        temp = rsc.fit_transform(X[:,i].reshape(-1, 1))
        Xo = dask.array.concatenate([Xo, temp], axis=1)

In [62]:
# Drop the all-zero placeholder column that seeded the concatenation.
# NOTE(review): not idempotent -- running this cell a second time also removes
# the first real feature column.
Xo = Xo[:, 1:]

In [63]:
# Sanity check: materialise the last five rows of the scaled inputs
Xo[-5:].compute()

array([[ 0.        , -0.48412698, -0.52380952, -0.5       ,  0.        ,
         0.        ,  0.        , -0.55      ],
       [ 0.        ,  0.08333333, -0.0952381 , -0.5       ,  0.        ,
         0.        ,  0.        , -0.1       ],
       [ 0.        ,  0.03968254,  0.        , -0.5       ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.29761905, -0.33333333, -0.5       ,  0.        ,
         0.        ,  0.        , -0.35      ],
       [ 0.        ,  0.35714286,  0.47619048, -0.5       ,  0.        ,
         0.        ,  0.        ,  0.5       ]])

In [64]:
# Put all feature columns into a single chunk, then cut the rows into
# chunk_length-sized blocks
n_features = Xo.shape[1]
Xo = Xo.rechunk({1: n_features}).rechunk({0: chunk_length})
# Give the target the same row blocks as the features
y = y.rechunk({0: chunk_length})

### Train/validation/test prep

In [65]:
# The first 80% of the rows are used for training
n_rows = Xo.shape[0]
tr_len = int(0.8 * n_rows)
print(tr_len)

89787700


In [66]:
# Positional split: rows before tr_len are for training, the rest for validation
xtrain = Xo[:tr_len]
ytrain = y[:tr_len]
xvalid = Xo[tr_len:]
yvalid = y[tr_len:]
# Display the resulting shapes
xtrain.shape, ytrain.shape, xvalid.shape, yvalid.shape

((89787700, 8), (89787700,), (22446926, 8), (22446926,))

### Train LR model

In [68]:
from dask_ml.linear_model import LinearRegression

In [69]:
# dask_ml linear regression estimator with its default settings
est = LinearRegression()

In [None]:
# Fit on the training split (scaled inputs, scaled target)
est.fit(xtrain, y=ytrain)

In [None]:
# Predict on the validation split; the model was fitted on the scaled target,
# so the predictions are on that scaled scale as well
preds = est.predict(xvalid)

In [None]:
# Materialise the first ten predictions
preds[0:10].compute()

In [None]:
# `plt` is not imported in any visible cell, so on a fresh kernel this cell
# failed with NameError; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Predicted vs. actual values for the validation split (both on the scaled
# target scale). Both arrays are fully materialised in local memory.
plt.scatter(preds.compute(), yvalid.compute())
plt.xlabel('predicted tip_amount (scaled)')
plt.ylabel('actual tip_amount (scaled)');