# Add procs, load packages

In [1]:
# Launch 4 worker processes, then bring the packages into scope.
# NOTE(review): presumably the packages are loaded after the workers exist so
# that they are available on every process — confirm for your Julia version.
addprocs(4)
using JuliaDB
using OnlineStats

# Identify the CSVs to load

In [2]:
# Keep the last 11 matching CSV paths. `max(1, end-10)` clamps the start index
# so that a directory holding fewer than 11 matches yields all of them instead
# of throwing a BoundsError.
files = glob("*.csv", "/Users/joshday/datasets/taxi/")[max(1, end-10):end]

11-element Array{String,1}:
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-02.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-03.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-04.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-05.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-06.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-07.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-08.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-09.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-10.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-11.csv"
 "/Users/joshday/datasets/taxi/yellow_tripdata_2017-12.csv"

# Load dataset larger than memory (<3 minutes)

In [3]:
# Parse every CSV in `files` into one distributed table, using one chunk per
# file. The result is also written under "bin" so a later session can reload
# it. `filenamecol` adds the `YearMonth` column (presumably holding each row's
# source file name — confirm).
@time t = loadtable(
    files;
    output = "bin",
    chunks = length(files),
    filenamecol = :YearMonth
)

Distributed Table with 103786750 rows in 11 chunks:
Columns:
#   colname                type
───────────────────────────────────
1   YearMonth              String
2   VendorID               Int64
3   tpep_pickup_datetime   DateTime
4   tpep_dropoff_datetime  DateTime
5   passenger_count        Int64
6   trip_distance          Float64
7   RatecodeID             Int64
8   store_and_fwd_flag     String
9   PULocationID           Int64
10  DOLocationID           Int64
11  payment_type           Int64
12  fare_amount            Float64
13  extra                  Float64
14  mta_tax                Float64
15  tip_amount             Float64
16  tolls_amount           Float64
17  improvement_surcharge  Float64
18  total_amount           Float64

147.683731 seconds (3.32 M allocations: 181.816 MiB, 0.17% gc time)


# Kill the Kernel, Restart Julia, Load Table from Binary

In [1]:
# Fresh session: start the 4 worker processes again, then load the packages.
addprocs(4)
using JuliaDB
using OnlineStats

# Reopen the table from the "bin" directory; `@time` reports how long it takes.
@time t = load("bin")

Distributed Table with 103786750 rows in 11 chunks:
Columns:
#   colname                type
───────────────────────────────────
1   YearMonth              String
2   VendorID               Int64
3   tpep_pickup_datetime   DateTime
4   tpep_dropoff_datetime  DateTime
5   passenger_count        Int64
6   trip_distance          Float64
7   RatecodeID             Int64
8   store_and_fwd_flag     String
9   PULocationID           Int64
10  DOLocationID           Int64
11  payment_type           Int64
12  fare_amount            Float64
13  extra                  Float64
14  mta_tax                Float64
15  tip_amount             Float64
16  tolls_amount           Float64
17  improvement_surcharge  Float64
18  total_amount           Float64

  0.314782 seconds (130.26 k allocations: 6.893 MiB)


# Calculate the Average Fare, Grouped by Passenger Count

In [2]:
# Reduce `fare_amount` with a `Mean` statistic within each `passenger_count`
# group of `t`, and time the whole pass.
@time groupreduce(
    Mean(),
    t,
    :passenger_count;
    select = :fare_amount
)

 38.113225 seconds (1.78 M allocations: 98.364 MiB)


Distributed Table with 11 rows in 1 chunks:
passenger_count  Mean
─────────────────────────────────────────────────
0                Mean: n=165554 | value=13.3116
1                Mean: n=74072569 | value=12.9409
2                Mean: n=15143017 | value=13.7458
3                Mean: n=4357927 | value=13.536
4                Mean: n=2075625 | value=13.8076
5                Mean: n=4922074 | value=13.0787
6                Mean: n=3049058 | value=13.0415
7                Mean: n=338 | value=43.7983
8                Mean: n=317 | value=45.7156
9                Mean: n=270 | value=54.5304
192              Mean: n=1 | value=6.5