# Getting up to speed with Dask

## Part 1: Not Dask

Let's do our usual analysis on a laptop-sized machine with a dataset that fits comfortably in memory.

AWS EC2 instance type: r5.xlarge (2 CPU, 16GB RAM)

In [1]:
import pandas as pd
import numpy as np
import datetime
# import s3fs
from pathlib import Path

# change this path if you changed in Part 0
data_path = Path('data')
seed = 42

# Load and explore data

In [2]:
taxi_dtypes = {
    'store_and_fwd_flag': str,
    'RatecodeID': 'float64',
    'VendorID': 'float64',
    'passenger_count': 'float64',
    'payment_type': 'float64',
}

def load_csv(f):
    """ Your good ol' pd.read_csv """
    return pd.read_csv(
        f,
        dtype=taxi_dtypes, 
        parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

In [4]:
%%time

taxi = load_csv(data_path/'yellow_tripdata_2019-01.csv')

CPU times: user 16.5 s, sys: 2.44 s, total: 18.9 s
Wall time: 19.4 s


In [4]:
# rows and appx. memory size in GB
len(taxi), taxi.memory_usage(deep=True).sum() / 1e9

(7667792, 1.548894112)

In [5]:
taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,N,151,239,1.0,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1.0,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,N,239,246,1.0,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2.0,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,N,236,236,1.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2.0,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,N,193,193,2.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2.0,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,N,193,193,2.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [6]:
taxi.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                    float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [7]:
%%time 
np.round(taxi.describe(), 3).T

CPU times: user 4.57 s, sys: 207 ms, total: 4.78 s
Wall time: 4.78 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,7667792.0,1.637,0.54,1.0,1.0,2.0,2.0,4.0
passenger_count,7667792.0,1.567,1.224,0.0,1.0,1.0,2.0,9.0
trip_distance,7667792.0,2.801,3.738,0.0,0.9,1.53,2.8,831.8
RatecodeID,7667792.0,1.058,0.678,1.0,1.0,1.0,1.0,99.0
PULocationID,7667792.0,165.501,66.392,1.0,130.0,162.0,234.0,265.0
DOLocationID,7667792.0,163.753,70.364,1.0,113.0,162.0,234.0,265.0
payment_type,7667792.0,1.292,0.473,1.0,1.0,1.0,2.0,4.0
fare_amount,7667792.0,12.409,262.072,-362.0,6.0,8.5,13.5,623259.86
extra,7667792.0,0.328,0.507,-60.0,0.0,0.0,0.5,535.38
mta_tax,7667792.0,0.497,0.053,-0.5,0.5,0.5,0.5,60.8


# Feature engineering

Adapted from this [Kaggle kernel](https://www.kaggle.com/gaborfodor/from-eda-to-the-top-lb-0-367#Feature-Extraction)

In [8]:
def make_features(df):
    """ Create some features """
    df['pickup_weekday'] = df.tpep_pickup_datetime.dt.weekday
    df['pickup_weekofyear'] = df.tpep_pickup_datetime.dt.weekofyear
    df['pickup_hour'] = df.tpep_pickup_datetime.dt.hour
    df['pickup_minute'] = df.tpep_pickup_datetime.dt.minute
    df['pickup_year_seconds'] = (df.tpep_pickup_datetime - datetime.datetime(2019, 1, 1, 0, 0, 0)).dt.seconds
    df['pickup_week_hour'] = (df.pickup_weekday * 24) + df.pickup_hour
    df['store_and_fwd_flag'] = (df.store_and_fwd_flag == 'Y').astype(int)
    df['VendorID'] = df.VendorID.fillna(-1)
    df['RatecodeID'] = df.RatecodeID.fillna(-1)

In [9]:
%%time

make_features(taxi)

CPU times: user 3.31 s, sys: 132 ms, total: 3.44 s
Wall time: 3.44 s


In [10]:
taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_year_seconds,pickup_week_hour
0,1.0,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,0,151,239,1.0,...,0.0,0.3,9.95,,1,1,0,46,2800,24
1,1.0,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,0,239,246,1.0,...,0.0,0.3,16.3,,1,1,0,59,3587,24
2,2.0,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,0,236,236,1.0,...,0.0,0.3,5.8,,4,51,13,48,49710,109
3,2.0,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,0,193,193,2.0,...,0.0,0.3,7.55,,2,48,15,52,57145,63
4,2.0,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,0,193,193,2.0,...,0.0,0.3,55.55,,2,48,15,56,57417,63


# Machine learning

In [11]:
# features and target column names
numeric_feat = [
    'pickup_weekday', 
    'pickup_weekofyear', 
    'pickup_hour', 
    'pickup_minute', 
    'pickup_year_seconds',
    'pickup_week_hour', 
    'passenger_count',
]
categorical_feat = [
    'VendorID', 
    'RatecodeID', 
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
]
features = numeric_feat + categorical_feat
y_col = 'total_amount'

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [13]:
%%time

X_train, X_test, y_train, y_test = train_test_split(
    taxi[features], taxi[y_col], test_size=0.33, random_state=seed)

CPU times: user 2.09 s, sys: 517 ms, total: 2.61 s
Wall time: 2.61 s


In [14]:
xgb = XGBRegressor(
    n_estimators=10, 
    max_depth=3, 
    learning_rate=0.1, 
    random_state=seed, 
    n_jobs=-1,
)

In [15]:
%%time

_ = xgb.fit(X_train, y_train)

CPU times: user 1min 18s, sys: 928 ms, total: 1min 19s
Wall time: 1min 18s


In [17]:
%%time

# get test RMSE
preds = xgb.predict(X_test)
np.sqrt(mean_squared_error(preds, y_test))

CPU times: user 956 ms, sys: 240 ms, total: 1.2 s
Wall time: 1.2 s


29.79722106531764

<br>
<br>

# Moar data!!

![](https://i.chzbgr.com/full/6993318656/hC83012C2/analyze-all-the-data)

In [23]:
list(data_path.iterdir())

[PosixPath('data/yellow_tripdata_2019-12.csv'),
 PosixPath('data/yellow_tripdata_2019-08.csv'),
 PosixPath('data/yellow_tripdata_2019-10.csv'),
 PosixPath('data/yellow_tripdata_2019-07.csv'),
 PosixPath('data/yellow_tripdata_2019-05.csv'),
 PosixPath('data/yellow_tripdata_2019-03.csv'),
 PosixPath('data/yellow_tripdata_2019-04.csv'),
 PosixPath('data/yellow_tripdata_2019-09.csv'),
 PosixPath('data/yellow_tripdata_2019-11.csv'),
 PosixPath('data/yellow_tripdata_2019-06.csv'),
 PosixPath('data/yellow_tripdata_2019-02.csv'),
 PosixPath('data/yellow_tripdata_2019-01.csv')]

In [None]:
%%time

dfs = []
for f in data_path.iterdir():
    df = load_csv(f)
    dfs.append(df)
taxi_2019 = pd.concat(dfs)

![](https://i.kym-cdn.com/entries/icons/original/000/010/437/Oneeternitylater.jpg)

![](kernel_dead.png)

![](https://memegenerator.net/img/instances/61402104.jpg)

# Dask to the rescue!

See Part 2