Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump calamine to 0.24 #1595

Merged
merged 5 commits into from
Feb 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 12 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ cached = { version = "0.48", default-features = false, features = [
"proc_macro",
"redis_ahash",
], optional = true }
calamine = { version = "0.23", features = ["dates"] }
calamine = { version = "0.24", features = ["dates"] }
censor = { version = "0.3", optional = true }
chrono = { version = "0.4", default-features = false }
console = { version = "0.15", optional = true }
Expand Down
157 changes: 68 additions & 89 deletions src/cmd/excel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ Excel options:
for the full list of supported formats.
Note that if a date format is invalid, qsv will fall back and
return the date as if no date-format was specified.
--keep-zero-time Keep the time part of a date-time field if it is 00:00:00.
By default, qsv will remove the time part if it is 00:00:00.
--range <range> An Excel format range, like C:T or C3:T25, to extract to the CSV.
-j, --jobs <arg> The number of jobs to run in parallel.
When not set, the number of jobs is set to the number of CPUs detected.
Expand All @@ -97,7 +99,7 @@ Common options:

use std::{cmp, fmt::Write, path::PathBuf};

use calamine::{open_workbook_auto, DataType, Range, Reader, SheetType};
use calamine::{open_workbook_auto, Data, Range, Reader, SheetType};
use indicatif::HumanCount;
use log::info;
use rayon::prelude::*;
Expand All @@ -110,17 +112,18 @@ use crate::{

#[derive(Deserialize)]
struct Args {
arg_input: String,
flag_sheet: String,
flag_metadata: String,
flag_flexible: bool,
flag_trim: bool,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_quiet: bool,
flag_date_format: Option<String>,
flag_range: String,
flag_jobs: Option<usize>,
arg_input: String,
flag_sheet: String,
flag_metadata: String,
flag_flexible: bool,
flag_trim: bool,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_quiet: bool,
flag_date_format: Option<String>,
flag_keep_zero_time: bool,
flag_range: String,
flag_jobs: Option<usize>,
}

#[derive(PartialEq)]
Expand Down Expand Up @@ -529,19 +532,20 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
info!("exporting sheet ({sheet})... processing first row as header...");
let first_row = match rows_iter.next() {
Some(first_row) => first_row,
None => &[DataType::Empty],
None => &[Data::Empty],
};
for cell in first_row {
col_name = match *cell {
DataType::String(ref s) => s.to_string(),
DataType::Empty => String::new(),
DataType::Error(ref _e) => String::new(),
DataType::Int(ref i) => i.to_string(),
DataType::DateTime(ref f) | DataType::Float(ref f) => f.to_string(),
DataType::Bool(ref b) => b.to_string(),
DataType::DateTimeIso(ref dt) => dt.to_string(),
DataType::DurationIso(ref d) => d.to_string(),
DataType::Duration(ref d) => d.to_string(),
Data::String(ref s) => s.to_string(),
Data::Empty => String::new(),
Data::Error(ref _e) => String::new(),
Data::Int(ref i) => i.to_string(),
Data::Float(ref f) => f.to_string(),
Data::DateTime(ref exceldatetime) => exceldatetime.to_string(),
Data::Bool(ref b) => b.to_string(),
Data::DateTimeIso(ref dt) => dt.to_string(),
Data::DurationIso(ref d) => d.to_string(),
// Data::Duration(ref d) => d.to_string(),
};
record.push_field(&col_name);
}
Expand Down Expand Up @@ -580,16 +584,15 @@ pub fn run(argv: &[&str]) -> CliResult<()> {

// set chunk_size to number of rows per core/thread
let chunk_size = row_count.div_ceil(ncpus);
let keep_zero_time = args.flag_keep_zero_time;

let processed_rows: Vec<Vec<csv::StringRecord>> = rows
.par_chunks(chunk_size)
.map(|chunk| {
let mut record = csv::StringRecord::with_capacity(500, col_count);
let mut trimmed_record = csv::StringRecord::with_capacity(500, col_count);
let mut cell_date_flag: bool = false;
let mut float_val = 0_f64;
let mut float_flag: bool = false;
let mut work_date;
let mut float_val;
let mut work_date = String::new();
let mut ryu_buffer = ryu::Buffer::new();
let mut itoa_buffer = itoa::Buffer::new();
let mut formatted_date = String::new();
Expand All @@ -599,39 +602,35 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for row in chunk {
for cell in *row {
match *cell {
DataType::Empty => record.push_field(""),
DataType::String(ref s) => record.push_field(s),
DataType::Int(ref i) => record.push_field(itoa_buffer.format(*i)),
DataType::Float(ref f) => {
Data::Empty => record.push_field(""),
Data::String(ref s) => record.push_field(s),
Data::Int(ref i) => record.push_field(itoa_buffer.format(*i)),
Data::Float(ref f) => {
float_val = *f;
float_flag = true;
cell_date_flag = false;
},
DataType::DateTime(ref f) => {
float_val = *f;
float_flag = true;
cell_date_flag = true;
},
DataType::Error(ref e) => record.push_field(&format!("{e:?}")),
DataType::Bool(ref b) => {
record.push_field(if *b { "true" } else { "false" });
// push the ryu-formatted float value if it's
// not an integer, or if the candidate
// integer is too big or too small to be an i64
#[allow(clippy::cast_precision_loss)]
if float_val.fract().abs() > f64::EPSILON
|| float_val > i64::MAX as f64
|| float_val < i64::MIN as f64
{
record.push_field(ryu_buffer.format_finite(float_val));
} else {
// it's an i64 integer. We can't use ryu to format it, because it
// would be formatted as a
// float (with a trailing ".0"). So we use itoa.
record.push_field(itoa_buffer.format(float_val as i64));
}
},
DataType::DateTimeIso(ref dt) => record.push_field(dt),
DataType::DurationIso(ref d) => record.push_field(d),
DataType::Duration(ref d) => record.push_field(ryu_buffer.format(*d)),
};

#[allow(clippy::cast_precision_loss)]
if float_flag {
if cell_date_flag {
// its a date, so convert it
work_date = if float_val.fract() > f64::EPSILON {
// if it has a fractional part, then its a datetime
if let Some(dt) = cell.as_datetime() {
Data::DateTime(ref edt) => {
if edt.is_datetime() {
work_date.clear();
if let Some(dt) = edt.as_datetime() {
if date_format.is_empty() {
// no date format specified, so we'll just use the
// default format for the datetime
dt.to_string()
work_date = dt.to_string();
} else {
// a date format was specified, so we'll use it
(formatted_date).clear();
Expand All @@ -640,51 +639,31 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
{
// the format string was ok, so use to_string()
// to actually apply the DelayedFormat
formatted_date.to_string()
work_date = formatted_date.to_string();
} else {
// if there was a format error, revert to the
// default format
dt.to_string()
work_date = dt.to_string();
}
}
} else {
format!("ERROR: Cannot convert {float_val} to datetime")
}
} else if let Some(d) = cell.as_date() {
// if it has no fractional part and calamine can return it
// as_date, then its a date
if date_format.is_empty() {
d.to_string()
} else {
formatted_date.clear();
if write!(formatted_date, "{}", d.format(&date_format)).is_ok()
{
formatted_date.to_string()
} else {
d.to_string()
if !keep_zero_time && work_date.ends_with(" 00:00:00") {
work_date.truncate(work_date.len() - 9);
}
}
} else {
format!("ERROR: Cannot convert {float_val} to date")
// it's not a datetime, it's a duration
work_date = edt.as_duration().unwrap().to_string();
};

record.push_field(&work_date);
// its not a date, so just push the ryu-formatted float value if its
// not an integer or the candidate
// integer is too big or too small to be an i64
} else if float_val.fract().abs() > f64::EPSILON
|| float_val > i64::MAX as f64
|| float_val < i64::MIN as f64
{
record.push_field(ryu_buffer.format_finite(float_val));
} else {
// its an i64 integer. We can't use ryu to format it, because it
// will be formatted as a
// float (have a ".0"). So we use itoa.
record.push_field(itoa_buffer.format(float_val as i64));
}
// reset the float flag
float_flag = false;
}
},
Data::Error(ref e) => record.push_field(&format!("{e:?}")),
Data::Bool(ref b) => {
record.push_field(if *b { "true" } else { "false" });
},
Data::DateTimeIso(ref dt) => record.push_field(dt),
Data::DurationIso(ref d) => record.push_field(d),
};
}

if trim {
Expand Down
16 changes: 8 additions & 8 deletions tests/test_excel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,18 +176,18 @@ fn excel_date_xlsx_date_format() {
let expected = vec![
svec!["date", "plaincol"],
svec![
"1980-12-25",
"Thu 1980-12-25",
"it will still parse the dates below as date even if plaincol is not in the default \
--dates-whitelist because the cell format was set to date"
],
svec!["Tue 2001-09-11 08:30:00", "2001-09-11"],
svec!["Tue 2001-09-11 08:30:00", "Tue 2001-09-11"],
svec!["not a date", "Tue 2001-09-11 08:30:00"],
svec![
"Wednesday, Mar-14-2012",
"the date below is not parsed as a date coz we didn't explicitly set the cell format \
to a date format and \"plaincol\" is not in the --dates-whitelist"
],
svec!["2001-09-11", "9/11/01 8:30 am"],
svec!["Tue 2001-09-11", "9/11/01 8:30 am"],
];
assert_eq!(got, expected);
}
Expand All @@ -204,11 +204,11 @@ fn excel_xlsx_data_types() {
let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["int", "float", "bool", "date", "duration", "string", "emojis", "foreign"],
svec!["1", "1.1", "true", "2001-09-11", "0.4305555555555556", "The", "The", "敏捷的棕色狐狸在森林里奔跑"],
svec!["2", "1.32434354545454", "false", "2023-10-07", "0.989849537037037", "quick", "🍔", "Franz jagt im komplett verwahrlosten Taxi quer durch Bayern"],
svec!["3", "0.423546456564534", "1", "1941-12-07", "1.2815162037037038", "brown", "is", "Le rusé goupil franchit d'un bond le chien somnolent."],
svec!["4", "-54545.6565756785", "0", "2001-09-11 08:30:00", "0.9791666666666666", "fox", "💩", "El rápido zorro marrón"],
svec!["5", "-5446563454.43546", "true", "1945-08-06 08:15:00", "0.0004629629629629629", "jumped", "🙀", "いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす"]
svec!["1", "1.1", "true", "2001-09-11", "PT37200S", "The", "The", "敏捷的棕色狐狸在森林里奔跑"],
svec!["2", "1.32434354545454", "false", "2023-10-07", "PT85523S", "quick", "🍔", "Franz jagt im komplett verwahrlosten Taxi quer durch Bayern"],
svec!["3", "0.423546456564534", "1", "1941-12-07", "P1DT24323S", "brown", "is", "Le rusé goupil franchit d'un bond le chien somnolent."],
svec!["4", "-54545.6565756785", "0", "2001-09-11 08:30:00", "PT84600S", "fox", "💩", "El rápido zorro marrón"],
svec!["5", "-5446563454.43546", "true", "1945-08-06 08:15:00", "PT40S", "jumped", "🙀", "いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす"]
];
assert_eq!(got, expected);
}
Expand Down