Merge pull request #1666 from jqnatividad/1665-dedup-numeric

`dedup`: add --numeric option
jqnatividad · Mar 12, 2024 · 33d1022
2 parents 98ed5e2 + c7f8fe1
commit 33d1022
Show file tree

Hide file tree

Showing 2 changed files with 103 additions and 38 deletions.
diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs
@@ -10,11 +10,6 @@ than memory CSV files. This will make dedup run in streaming mode with constant
 
 Either way, the output will not only be deduplicated, it will also be sorted.
 
-Note that dedup's sorting will only be done alphabetically, not numerically. That is,
-10 will come before 2. If you need to sort numerically, use the sort command first with
-the --numeric option and pipe it to dedup with the --sorted option.
-(i.e. qsv sort --numeric in.csv | qsv dedup --sorted)
-
 A duplicate count will also be sent to <stderr>.
 
 For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
@@ -28,6 +23,7 @@ dedup options:
                                Note that the outputs will remain at the full width
                                of the CSV.
                                See 'qsv select --help' for the format details.
+    -N, --numeric              Compare according to string numerical value
     -i, --ignore-case          Compare strings disregarding case.
     --sorted                   The input is already sorted. Do not load the CSV into
                                memory to sort it first. Meant to be used in tandem and
@@ -63,7 +59,7 @@ use serde::Deserialize;
 use simdutf8::basic::from_utf8;
 
 use crate::{
-    cmd::sort::iter_cmp,
+    cmd::sort::{iter_cmp, iter_cmp_num},
     config::{Config, Delimiter},
     select::SelectColumns,
     util, CliResult,
@@ -72,6 +68,7 @@ use crate::{
 struct Args {
     arg_input:           Option<String>,
     flag_select:         SelectColumns,
+    flag_numeric:        bool,
     flag_ignore_case:    bool,
     flag_sorted:         bool,
     flag_dupes_output:   Option<String>,
@@ -84,9 +81,23 @@ struct Args {
     flag_memcheck:       bool,
 }
 
+enum ComparisonMode {
+    Numeric,
+    IgnoreCase,
+    Normal,
+}
+
 pub fn run(argv: &[&str]) -> CliResult<()> {
     let args: Args = util::get_args(USAGE, argv)?;
-    let ignore_case = args.flag_ignore_case;
+
+    let compare_mode = if args.flag_numeric {
+        ComparisonMode::Numeric
+    } else if args.flag_ignore_case {
+        ComparisonMode::IgnoreCase
+    } else {
+        ComparisonMode::Normal
+    };
+
     let rconfig = Config::new(&args.arg_input)
         .delimiter(args.flag_delimiter)
         .no_headers(args.flag_no_headers)
@@ -119,10 +130,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
             };
             let a = sel.select(&record);
             let b = sel.select(&next_record);
-            let comparison = if ignore_case {
-                iter_cmp_ignore_case(a, b)
-            } else {
-                iter_cmp(a, b)
+            let comparison = match compare_mode {
+                ComparisonMode::Normal => iter_cmp(a, b),
+                ComparisonMode::Numeric => iter_cmp_num(a, b),
+                ComparisonMode::IgnoreCase => iter_cmp_ignore_case(a, b),
             };
             match comparison {
                 cmp::Ordering::Equal => {
@@ -152,40 +163,65 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         util::njobs(args.flag_jobs);
 
         let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
-        if ignore_case {
-            all.par_sort_by(|r1, r2| {
-                let a = sel.select(r1);
-                let b = sel.select(r2);
-                iter_cmp_ignore_case(a, b)
-            });
-        } else {
-            all.par_sort_by(|r1, r2| {
-                let a = sel.select(r1);
-                let b = sel.select(r2);
-                iter_cmp(a, b)
-            });
+        match compare_mode {
+            ComparisonMode::Normal => {
+                all.par_sort_by(|r1, r2| {
+                    let a = sel.select(r1);
+                    let b = sel.select(r2);
+                    iter_cmp(a, b)
+                });
+            },
+            ComparisonMode::Numeric => {
+                all.par_sort_by(|r1, r2| {
+                    let a = sel.select(r1);
+                    let b = sel.select(r2);
+                    iter_cmp_num(a, b)
+                });
+            },
+            ComparisonMode::IgnoreCase => {
+                all.par_sort_by(|r1, r2| {
+                    let a = sel.select(r1);
+                    let b = sel.select(r2);
+                    iter_cmp_ignore_case(a, b)
+                });
+            },
         }
 
         for (current, current_record) in all.iter().enumerate() {
             let a = sel.select(current_record);
             if let Some(next_record) = all.get(current + 1) {
                 let b = sel.select(next_record);
-                if ignore_case {
-                    if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
-                        dupe_count += 1;
-                        if dupes_output {
-                            dupewtr.write_byte_record(current_record)?;
+                match compare_mode {
+                    ComparisonMode::Normal => {
+                        if iter_cmp(a, b) == cmp::Ordering::Equal {
+                            dupe_count += 1;
+                            if dupes_output {
+                                dupewtr.write_byte_record(current_record)?;
+                            }
+                        } else {
+                            wtr.write_byte_record(current_record)?;
                         }
-                    } else {
-                        wtr.write_byte_record(current_record)?;
-                    }
-                } else if iter_cmp(a, b) == cmp::Ordering::Equal {
-                    dupe_count += 1;
-                    if dupes_output {
-                        dupewtr.write_byte_record(current_record)?;
-                    }
-                } else {
-                    wtr.write_byte_record(current_record)?;
+                    },
+                    ComparisonMode::Numeric => {
+                        if iter_cmp_num(a, b) == cmp::Ordering::Equal {
+                            dupe_count += 1;
+                            if dupes_output {
+                                dupewtr.write_byte_record(current_record)?;
+                            }
+                        } else {
+                            wtr.write_byte_record(current_record)?;
+                        }
+                    },
+                    ComparisonMode::IgnoreCase => {
+                        if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
+                            dupe_count += 1;
+                            if dupes_output {
+                                dupewtr.write_byte_record(current_record)?;
+                            }
+                        } else {
+                            wtr.write_byte_record(current_record)?;
+                        }
+                    },
                 }
             } else {
                 wtr.write_byte_record(current_record)?;

diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs
@@ -74,6 +74,35 @@ fn dedup_issue_1381() {
     assert_eq!(got, expected);
 }
 
+#[test]
+fn dedup_issue_1665_numeric() {
+    let wrk = Workdir::new("dedup_issue_1665_numeric");
+    wrk.create(
+        "in.csv",
+        vec![
+            svec!["data"],
+            svec!["1"],
+            svec!["3"],
+            svec!["3"],
+            svec!["5"],
+            svec!["10"],
+        ],
+    );
+
+    let mut cmd = wrk.command("dedup");
+    cmd.arg("-N").arg("in.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["data"],
+        svec!["1"],
+        svec!["3"],
+        svec!["5"],
+        svec!["10"],
+    ];
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn dedup_select() {
     let wrk = Workdir::new("dedup_select");