Skip to content

Commit

Permalink
Merge pull request #1666 from jqnatividad/1665-dedup-numeric
Browse files Browse the repository at this point in the history
`dedup`: add --numeric option
  • Loading branch information
jqnatividad committed Mar 12, 2024
2 parents 98ed5e2 + c7f8fe1 commit 33d1022
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 38 deletions.
112 changes: 74 additions & 38 deletions src/cmd/dedup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ than memory CSV files. This will make dedup run in streaming mode with constant
Either way, the output will not only be deduplicated, it will also be sorted.
Note that dedup's sorting will only be done alphabetically, not numerically. That is,
10 will come before 2. If you need to sort numerically, use the sort command first with
the --numeric option and pipe it to dedup with the --sorted option.
(i.e. qsv sort --numeric in.csv | qsv dedup --sorted)
A duplicate count will also be sent to <stderr>.
For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
Expand All @@ -28,6 +23,7 @@ dedup options:
Note that the outputs will remain at the full width
of the CSV.
See 'qsv select --help' for the format details.
-N, --numeric Compare according to string numerical value
-i, --ignore-case Compare strings disregarding case.
--sorted The input is already sorted. Do not load the CSV into
memory to sort it first. Meant to be used in tandem and
Expand Down Expand Up @@ -63,7 +59,7 @@ use serde::Deserialize;
use simdutf8::basic::from_utf8;

use crate::{
cmd::sort::iter_cmp,
cmd::sort::{iter_cmp, iter_cmp_num},
config::{Config, Delimiter},
select::SelectColumns,
util, CliResult,
Expand All @@ -72,6 +68,7 @@ use crate::{
struct Args {
arg_input: Option<String>,
flag_select: SelectColumns,
flag_numeric: bool,
flag_ignore_case: bool,
flag_sorted: bool,
flag_dupes_output: Option<String>,
Expand All @@ -84,9 +81,23 @@ struct Args {
flag_memcheck: bool,
}

/// Strategy used to compare the selected columns of two records, both when
/// sorting and when deciding whether adjacent records are duplicates.
///
/// `run` derives exactly one mode from the CLI flags: `--numeric` takes
/// precedence over `--ignore-case`, and `Normal` is used when neither is set.
///
/// The enum is fieldless, so it derives `Copy`: it can be matched repeatedly
/// and captured by sorting closures without any ownership concerns.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ComparisonMode {
    /// Compare with `iter_cmp_num` (selected by `-N`/`--numeric`).
    Numeric,
    /// Compare with `iter_cmp_ignore_case` (selected by `-i`/`--ignore-case`).
    IgnoreCase,
    /// Compare with `iter_cmp`; the default when no comparison flag is given.
    Normal,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let ignore_case = args.flag_ignore_case;

let compare_mode = if args.flag_numeric {
ComparisonMode::Numeric
} else if args.flag_ignore_case {
ComparisonMode::IgnoreCase
} else {
ComparisonMode::Normal
};

let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers)
Expand Down Expand Up @@ -119,10 +130,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
};
let a = sel.select(&record);
let b = sel.select(&next_record);
let comparison = if ignore_case {
iter_cmp_ignore_case(a, b)
} else {
iter_cmp(a, b)
let comparison = match compare_mode {
ComparisonMode::Normal => iter_cmp(a, b),
ComparisonMode::Numeric => iter_cmp_num(a, b),
ComparisonMode::IgnoreCase => iter_cmp_ignore_case(a, b),
};
match comparison {
cmp::Ordering::Equal => {
Expand Down Expand Up @@ -152,40 +163,65 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
util::njobs(args.flag_jobs);

let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
if ignore_case {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp_ignore_case(a, b)
});
} else {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp(a, b)
});
match compare_mode {
ComparisonMode::Normal => {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp(a, b)
});
},
ComparisonMode::Numeric => {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp_num(a, b)
});
},
ComparisonMode::IgnoreCase => {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp_ignore_case(a, b)
});
},
}

for (current, current_record) in all.iter().enumerate() {
let a = sel.select(current_record);
if let Some(next_record) = all.get(current + 1) {
let b = sel.select(next_record);
if ignore_case {
if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
dupe_count += 1;
if dupes_output {
dupewtr.write_byte_record(current_record)?;
match compare_mode {
ComparisonMode::Normal => {
if iter_cmp(a, b) == cmp::Ordering::Equal {
dupe_count += 1;
if dupes_output {
dupewtr.write_byte_record(current_record)?;
}
} else {
wtr.write_byte_record(current_record)?;
}
} else {
wtr.write_byte_record(current_record)?;
}
} else if iter_cmp(a, b) == cmp::Ordering::Equal {
dupe_count += 1;
if dupes_output {
dupewtr.write_byte_record(current_record)?;
}
} else {
wtr.write_byte_record(current_record)?;
},
ComparisonMode::Numeric => {
if iter_cmp_num(a, b) == cmp::Ordering::Equal {
dupe_count += 1;
if dupes_output {
dupewtr.write_byte_record(current_record)?;
}
} else {
wtr.write_byte_record(current_record)?;
}
},
ComparisonMode::IgnoreCase => {
if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
dupe_count += 1;
if dupes_output {
dupewtr.write_byte_record(current_record)?;
}
} else {
wtr.write_byte_record(current_record)?;
}
},
}
} else {
wtr.write_byte_record(current_record)?;
Expand Down
29 changes: 29 additions & 0 deletions tests/test_dedup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,35 @@ fn dedup_issue_1381() {
assert_eq!(got, expected);
}

/// Regression test for issue #1665: `dedup -N` must compare values numerically.
///
/// The input holds a duplicated `3` and a `10`; the expected output keeps a
/// single `3` and leaves `10` after `5`.
#[test]
fn dedup_issue_1665_numeric() {
    let workdir = Workdir::new("dedup_issue_1665_numeric");

    // Header row followed by the data rows, including the duplicate "3".
    let input_rows = vec![
        svec!["data"],
        svec!["1"],
        svec!["3"],
        svec!["3"],
        svec!["5"],
        svec!["10"],
    ];
    workdir.create("in.csv", input_rows);

    let mut dedup_cmd = workdir.command("dedup");
    dedup_cmd.arg("-N");
    dedup_cmd.arg("in.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut dedup_cmd);
    let wanted = vec![
        svec!["data"],
        svec!["1"],
        svec!["3"],
        svec!["5"],
        svec!["10"],
    ];
    assert_eq!(actual, wanted);
}

#[test]
fn dedup_select() {
let wrk = Workdir::new("dedup_select");
Expand Down

0 comments on commit 33d1022

Please sign in to comment.