From 523c60a36bf45b4df5e66f3951a91948c22d5261 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 20 Jan 2024 22:53:52 -0500 Subject: [PATCH] `schema`: use par_sort_unstable when sorting unique values per field as there can be potentially many values (high cardinality); most of the time, it's worth the parallel overhead --- src/cmd/schema.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cmd/schema.rs b/src/cmd/schema.rs index 780bf187d..94425a030 100644 --- a/src/cmd/schema.rs +++ b/src/cmd/schema.rs @@ -88,6 +88,7 @@ use csv::ByteRecord; use grex::RegExpBuilder; use itertools::Itertools; use log::{debug, error, info, warn}; +use rayon::slice::ParallelSliceMut; use serde::Deserialize; use serde_json::{json, value::Number, Map, Value}; use stats::Frequencies; @@ -678,7 +679,7 @@ fn construct_map_of_unique_values( let header_string = convert_to_string(header_byte_slice)?; // sort the values so enum list so schema can be diff'ed between runs - unique_values.sort_unstable(); + unique_values.par_sort_unstable(); // if log::log_enabled!(log::Level::Debug) { // // we do this as this debug is relatively expensive