Skip to content

Commit

Permalink
Cleanup CSV WriterBuilder, Default to AutoSI Second Precision (apache…
Browse files Browse the repository at this point in the history
…#4735) (apache#4909)

* Cleanup CSV WriterBuilder (apache#4735)

* Update test

* Review feedback

* Clippy
  • Loading branch information
tustvold committed Oct 11, 2023
1 parent d83008b commit 556c5ff
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 97 deletions.
134 changes: 79 additions & 55 deletions arrow-csv/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,53 +70,36 @@ use csv::ByteRecord;
use std::io::Write;

use crate::map_csv_error;

const DEFAULT_DATE_FORMAT: &str = "%F";
const DEFAULT_TIME_FORMAT: &str = "%T";
const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f";
const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z";
const DEFAULT_NULL_VALUE: &str = "";

/// A CSV writer
#[derive(Debug)]
pub struct Writer<W: Write> {
/// The object to write to
writer: csv::Writer<W>,
/// Whether file should be written with headers. Defaults to `true`
/// Whether file should be written with headers, defaults to `true`
has_headers: bool,
/// The date format for date arrays
/// The date format for date arrays, defaults to RFC3339
date_format: Option<String>,
/// The datetime format for datetime arrays
/// The datetime format for datetime arrays, defaults to RFC3339
datetime_format: Option<String>,
/// The timestamp format for timestamp arrays
/// The timestamp format for timestamp arrays, defaults to RFC3339
timestamp_format: Option<String>,
/// The timestamp format for timestamp (with timezone) arrays
/// The timestamp format for timestamp (with timezone) arrays, defaults to RFC3339
timestamp_tz_format: Option<String>,
/// The time format for time arrays
/// The time format for time arrays, defaults to RFC3339
time_format: Option<String>,
/// Is the beginning-of-writer
beginning: bool,
/// The value to represent null entries
null_value: String,
/// The value to represent null entries, defaults to [`DEFAULT_NULL_VALUE`]
null_value: Option<String>,
}

impl<W: Write> Writer<W> {
/// Create a new CsvWriter from a writable object, with default options
pub fn new(writer: W) -> Self {
let delimiter = b',';
let mut builder = csv::WriterBuilder::new();
let writer = builder.delimiter(delimiter).from_writer(writer);
Writer {
writer,
has_headers: true,
date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
beginning: true,
null_value: DEFAULT_NULL_VALUE.to_string(),
}
WriterBuilder::new().with_delimiter(delimiter).build(writer)
}

/// Write a vector of record batches to a writable object
Expand All @@ -138,7 +121,7 @@ impl<W: Write> Writer<W> {
}

let options = FormatOptions::default()
.with_null(&self.null_value)
.with_null(self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE))
.with_date_format(self.date_format.as_deref())
.with_datetime_format(self.datetime_format.as_deref())
.with_timestamp_format(self.timestamp_format.as_deref())
Expand Down Expand Up @@ -207,9 +190,9 @@ impl<W: Write> RecordBatchWriter for Writer<W> {
#[derive(Clone, Debug)]
pub struct WriterBuilder {
/// Optional column delimiter. Defaults to `b','`
delimiter: Option<u8>,
delimiter: u8,
/// Whether to write column names as file headers. Defaults to `true`
has_headers: bool,
has_header: bool,
/// Optional date format for date arrays
date_format: Option<String>,
/// Optional datetime format for datetime arrays
Expand All @@ -227,14 +210,14 @@ pub struct WriterBuilder {
impl Default for WriterBuilder {
fn default() -> Self {
Self {
has_headers: true,
delimiter: None,
date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
null_value: Some(DEFAULT_NULL_VALUE.to_string()),
has_header: true,
delimiter: b',',
date_format: None,
datetime_format: None,
time_format: None,
timestamp_format: None,
timestamp_tz_format: None,
null_value: None,
}
}
}
Expand All @@ -254,7 +237,7 @@ impl WriterBuilder {
/// let file = File::create("target/out.csv").unwrap();
///
/// // create a builder that doesn't write headers
/// let builder = WriterBuilder::new().has_headers(false);
/// let builder = WriterBuilder::new().with_header(false);
/// let writer = builder.build(file);
///
/// writer
Expand All @@ -265,48 +248,92 @@ impl WriterBuilder {
}

/// Set whether to write headers
#[deprecated(note = "Use Self::with_header")]
#[doc(hidden)]
pub fn has_headers(mut self, has_headers: bool) -> Self {
self.has_headers = has_headers;
self.has_header = has_headers;
self
}

/// Set whether to write the CSV file with a header
pub fn with_header(mut self, header: bool) -> Self {
self.has_header = header;
self
}

/// Returns `true` if this writer is configured to write a header
pub fn header(&self) -> bool {
self.has_header
}

/// Set the CSV file's column delimiter as a byte character
pub fn with_delimiter(mut self, delimiter: u8) -> Self {
self.delimiter = Some(delimiter);
self.delimiter = delimiter;
self
}

/// Get the CSV file's column delimiter as a byte character
pub fn delimiter(&self) -> u8 {
self.delimiter
}

/// Set the CSV file's date format
pub fn with_date_format(mut self, format: String) -> Self {
self.date_format = Some(format);
self
}

/// Get the CSV file's date format if set, defaults to RFC3339
pub fn date_format(&self) -> Option<&str> {
self.date_format.as_deref()
}

/// Set the CSV file's datetime format
pub fn with_datetime_format(mut self, format: String) -> Self {
self.datetime_format = Some(format);
self
}

/// Get the CSV file's datetime format if set, defaults to RFC3339
pub fn datetime_format(&self) -> Option<&str> {
self.datetime_format.as_deref()
}

/// Set the CSV file's time format
pub fn with_time_format(mut self, format: String) -> Self {
self.time_format = Some(format);
self
}

/// Get the CSV file's datetime time if set, defaults to RFC3339
pub fn time_format(&self) -> Option<&str> {
self.time_format.as_deref()
}

/// Set the CSV file's timestamp format
pub fn with_timestamp_format(mut self, format: String) -> Self {
self.timestamp_format = Some(format);
self
}

/// Get the CSV file's timestamp format if set, defaults to RFC3339
pub fn timestamp_format(&self) -> Option<&str> {
self.timestamp_format.as_deref()
}

/// Set the value to represent null in output
pub fn with_null(mut self, null_value: String) -> Self {
self.null_value = Some(null_value);
self
}

/// Use RFC3339 format for date/time/timestamps
/// Get the value to represent null in output
pub fn null(&self) -> &str {
self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE)
}

/// Use RFC3339 format for date/time/timestamps (default)
#[deprecated(note = "Use WriterBuilder::default()")]
pub fn with_rfc3339(mut self) -> Self {
self.date_format = None;
self.datetime_format = None;
Expand All @@ -318,21 +345,18 @@ impl WriterBuilder {

/// Create a new `Writer`
pub fn build<W: Write>(self, writer: W) -> Writer<W> {
let delimiter = self.delimiter.unwrap_or(b',');
let mut builder = csv::WriterBuilder::new();
let writer = builder.delimiter(delimiter).from_writer(writer);
let writer = builder.delimiter(self.delimiter).from_writer(writer);
Writer {
writer,
has_headers: self.has_headers,
beginning: true,
has_headers: self.has_header,
date_format: self.date_format,
datetime_format: self.datetime_format,
time_format: self.time_format,
timestamp_format: self.timestamp_format,
timestamp_tz_format: self.timestamp_tz_format,
beginning: true,
null_value: self
.null_value
.unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()),
null_value: self.null_value,
}
}
}
Expand Down Expand Up @@ -411,11 +435,11 @@ mod tests {

let expected = r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
"#;
assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap());
}
Expand Down Expand Up @@ -512,7 +536,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
let mut file = tempfile::tempfile().unwrap();

let builder = WriterBuilder::new()
.has_headers(false)
.with_header(false)
.with_delimiter(b'|')
.with_null("NULL".to_string())
.with_time_format("%r".to_string());
Expand Down Expand Up @@ -560,7 +584,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
)
.unwrap();

let builder = WriterBuilder::new().has_headers(false);
let builder = WriterBuilder::new().with_header(false);

let mut buf: Cursor<Vec<u8>> = Default::default();
// drop the writer early to release the borrow.
Expand Down Expand Up @@ -652,7 +676,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo

let mut file = tempfile::tempfile().unwrap();

let builder = WriterBuilder::new().with_rfc3339();
let builder = WriterBuilder::new();
let mut writer = builder.build(&mut file);
let batches = vec![&batch];
for batch in batches {
Expand Down
42 changes: 0 additions & 42 deletions arrow/tests/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,48 +53,6 @@ fn test_export_csv_timestamps() {
}
drop(writer);

let left = "c1,c2
2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000
2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n";
let right = String::from_utf8(sw).unwrap();
assert_eq!(left, right);
}

#[test]
fn test_export_csv_timestamps_using_rfc3339() {
let schema = Schema::new(vec![
Field::new(
"c1",
DataType::Timestamp(TimeUnit::Millisecond, Some("Australia/Sydney".into())),
true,
),
Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true),
]);

let c1 = TimestampMillisecondArray::from(
// 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST).
// The offset (difference to UTC) is +10:00.
// 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT)
// The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30.
//
vec![Some(1555584887378), Some(1635577147000)],
)
.with_timezone("Australia/Sydney");
let c2 =
TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]);
let batch =
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap();

let mut sw = Vec::new();
let mut writer = arrow_csv::WriterBuilder::new()
.with_rfc3339()
.build(&mut sw);
let batches = vec![&batch];
for batch in batches {
writer.write(batch).unwrap();
}
drop(writer);

let left = "c1,c2
2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378
2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n";
Expand Down

0 comments on commit 556c5ff

Please sign in to comment.