Skip to content

knapply/data.table-vs-parquet

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Read

Brendan Knapp 10/8/2019

# Local cache location for the downloaded trip-data CSV.
dl_path <- "datasets/yellow_tripdata_2010-01.csv"

# Fetch the file only when no cached copy exists, so re-running the document
# does not repeat the download.
if (!file.exists(dl_path)) {
  # download.file() does not create missing parent directories; make sure the
  # target folder is there first (no-op when it already exists).
  dir.create(dirname(dl_path), recursive = TRUE, showWarnings = FALSE)
  download.file(
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-01.csv",
    destfile = dl_path,
    # Binary mode keeps the bytes exactly as served; on Windows the default
    # text mode distinguishes text from binary transfers.
    mode = "wb"
  )
}
library(data.table)
library(arrow)
library(scales)
library(microbenchmark)
library(ggplot2)
# Replacement names for the 18 columns of the trip file, applied positionally.
col_names <- c("vendor_id", "pickup_datetime", "dropoff_datetime", 
               "passenger_count", "trip_distance", "pickup_longitude", 
               "pickup_latitude", "rate_code", "store_and_fwd_flag", 
               "dropoff_longitude", "dropoff_latitude", "payment_type", 
               "fare_amount", "surcharge", "mta_tax", "tip_amount", 
               "tolls_amount", "total_amount")

# Read the downloaded month of trips as a data.table.
init <- fread(dl_path)
# Rename every column by reference (data.table idiom) instead of going
# through the `colnames<-` replacement function.
setnames(init, col_names)

# Stack five copies of the table to build a larger benchmark input.
big_df <- rbindlist(
  replicate(n = 5, init, simplify = FALSE)
)

# Report the dimensions with thousands separators.
setNames(comma(dim(big_df)), c("# rows", "# cols"))
##       # rows       # cols 
## "74,318,890"         "18"
# Output locations for the three on-disk representations being compared.
csv_path <- "datasets/csvy-file.csv"
csvy_path <- "datasets/csvy-file.csvy"
parquet_path <- "datasets/parquet-file.parquet"


# Plain CSV.
fwrite(big_df, file = csv_path)
# CSVY: the same CSV body preceded by a YAML header. Without `yaml = TRUE`
# fwrite() emits an ordinary CSV and the ".csvy" file would be identical to
# the ".csv" one. Requires the 'yaml' package to be installed.
fwrite(big_df, file = csvy_path, yaml = TRUE)
# Parquet via arrow.
write_parquet(big_df, sink = parquet_path)

# Human-readable size of each file, in the order csv, csvy, parquet.
number_bytes(
  file.size(c(csv_path, csvy_path, parquet_path))
)
## [1] "8 Gb" "8 Gb" "6 Gb"
# Time a full read of each file, five repetitions per reader.
# NOTE(review): the timings recorded in this document were produced when the
# ".csvy" file had no YAML header; re-run the benchmark to refresh them.
res <- microbenchmark::microbenchmark(
  DT_csv = fread(csv_path, showProgress = FALSE),

  # `yaml = TRUE` makes fread() parse the YAML header written above rather
  # than treating the file as a plain CSV.
  DT_csvy = fread(csvy_path, yaml = TRUE, showProgress = FALSE),
  
  arrow_parquet = read_parquet(parquet_path),
  

  times = 5
)
res
## Unit: seconds
##           expr      min       lq      mean   median        uq       max
##         DT_csv 79.13196 79.41450  80.33133 79.48502  81.33019  82.29499
##        DT_csvy 78.60473 80.06919  80.38983 80.19296  80.24047  82.84178
##  arrow_parquet 79.95758 84.43273 110.11436 85.31017 110.12591 190.74540
##  neval
##      5
##      5
##      5
# Plot the timing distribution of each benchmarked reader.
print(ggplot2::autoplot(res))

# Number of threads data.table is configured to use in this session.
print(data.table::getDTthreads())
## [1] 8
# Record R, platform and package versions for reproducibility.
print(sessionInfo())
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.2.1        microbenchmark_1.4-6 scales_1.0.0        
## [4] arrow_0.15.0         data.table_1.12.4   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.2        knitr_1.25        magrittr_1.5     
##  [4] tidyselect_0.2.5  bit_1.1-14        munsell_0.5.0    
##  [7] colorspace_1.4-1  R6_2.4.0          rlang_0.4.0      
## [10] dplyr_0.8.3       stringr_1.4.0     tools_3.6.1      
## [13] grid_3.6.1        gtable_0.3.0      xfun_0.10        
## [16] withr_2.1.2       htmltools_0.3.6   lazyeval_0.2.2   
## [19] yaml_2.2.0        bit64_0.9-7       assertthat_0.2.1 
## [22] digest_0.6.21     tibble_2.1.3      crayon_1.3.4     
## [25] purrr_0.3.2       glue_1.3.1.9000   evaluate_0.14    
## [28] rmarkdown_1.16    stringi_1.4.3     pillar_1.4.2.9001
## [31] compiler_3.6.1    pkgconfig_2.0.3

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors