Skip to content

Commit

Permalink
Add String datatype to stage2 (#68)
Browse files Browse the repository at this point in the history
* Add String datatype to stage2

* Fix default method for the Dictionary trait

* Update simple unit test to use String

* Extend Documentation

* implements #83
  • Loading branch information
ellmau committed Nov 21, 2022
1 parent ac05973 commit 910d36c
Show file tree
Hide file tree
Showing 16 changed files with 149 additions and 28 deletions.
52 changes: 46 additions & 6 deletions src/io/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@

use crate::error::Error;
use crate::physical::datatypes::{data_value::VecT, DataTypeName};
use crate::physical::dictionary::Dictionary;
use csv::Reader;

/// Imports a csv file
/// Needs a list of Options of [DataTypeName] and a [csv::Reader] reference
/// Needs a list of Options of [DataTypeName] and a [csv::Reader] reference, as well as a [Dictionary][crate::physical::dictionary::Dictionary]
/// # Parameters
/// * `datatypes`: a list of [`DataTypeName`] options, whose length needs to match the number of fields in the csv-file.
/// If an entry is [`None`], the field will be ignored; [`Some(DataTypeName)`] describes the datatype of the field in the csv-file.
/// # Behaviour
/// If the value in a field does not match the datatype given for it in `datatypes` (i.e. it cannot be parsed into such a value), the whole line is ignored and an error message is emitted to the log.
pub fn read<T>(
datatypes: &[Option<DataTypeName>],
csv_reader: &mut Reader<T>,
dict: &mut dyn Dictionary,
) -> Result<Vec<VecT>, Error>
where
T: std::io::Read,
Expand All @@ -21,6 +28,7 @@ where
DataTypeName::U64 => VecT::U64(Vec::new()),
DataTypeName::Float => VecT::Float(Vec::new()),
DataTypeName::Double => VecT::Double(Vec::new()),
DataTypeName::String => VecT::String(Vec::new()),
})
}));
});
Expand All @@ -30,7 +38,7 @@ where
if let Err(Error::RollBack(rollback)) =
row.iter().enumerate().try_for_each(|(idx, item)| {
if let Some(datatype) = datatypes[idx] {
match datatype.parse(item) {
match datatype.parse(item, dict) {
Ok(val) => {
result[idx].as_mut().map(|vect| {
vect.push(&val);
Expand Down Expand Up @@ -62,6 +70,7 @@ where
#[cfg(test)]
mod test {
use super::*;
use crate::physical::dictionary::PrefixedStringDictionary;
use csv::ReaderBuilder;
use quickcheck_macros::quickcheck;
use test_log::test;
Expand All @@ -76,7 +85,11 @@ Boston;United States;4628910
.delimiter(b';')
.from_reader(data.as_bytes());

let x = read(&[None, None, None], &mut rdr);
let x = read(
&[None, None, None],
&mut rdr,
&mut PrefixedStringDictionary::new(),
);
assert!(x.is_ok());
assert_eq!(x.unwrap().len(), 0);
}
Expand All @@ -87,7 +100,7 @@ Boston;United States;4628910
let data = "\
10;20;30;40;20;valid
asdf;12.2;413;22.3;23;invalid
node01;22;33.33;12.333332;10;valid
node01;22;33.33;12.333332;10;valid again
node02;1312;12.33;313;1431;valid
node03;123;123;13;55;123;invalid
";
Expand All @@ -103,14 +116,40 @@ node03;123;123;13;55;123;invalid
Some(DataTypeName::Double),
Some(DataTypeName::Float),
Some(DataTypeName::U64),
None,
Some(DataTypeName::String),
],
&mut rdr,
&mut PrefixedStringDictionary::new(),
);

assert!(imported.is_ok());
assert_eq!(imported.as_ref().unwrap().len(), 4);
assert_eq!(imported.as_ref().unwrap().len(), 5);
assert_eq!(imported.as_ref().unwrap()[0].len(), 3);
log::debug!("imported: {:?}", imported);
assert_eq!(
imported.as_ref().unwrap()[4]
.get(0)
.map(|v| v.as_string().unwrap()),
Some(0usize)
);
assert_eq!(
imported.as_ref().unwrap()[4]
.get(1)
.map(|v| v.as_string().unwrap()),
Some(1usize)
);
assert_eq!(
imported.as_ref().unwrap()[4]
.get(2)
.map(|v| v.as_string().unwrap()),
Some(0usize)
);
assert_eq!(
imported.as_ref().unwrap()[4]
.get(3)
.map(|v| v.as_string().unwrap()),
None
);
}

#[quickcheck]
Expand Down Expand Up @@ -151,6 +190,7 @@ node03;123;123;13;55;123;invalid
Some(DataTypeName::Float),
],
&mut rdr,
&mut PrefixedStringDictionary::new(),
);

assert!(imported.is_ok());
Expand Down
11 changes: 11 additions & 0 deletions src/physical/columns/adaptive_column_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ pub enum AdaptiveColumnBuilderT {
Float(AdaptiveColumnBuilder<Float>),
/// Case Double
Double(AdaptiveColumnBuilder<Double>),
/// Case String
String(AdaptiveColumnBuilder<usize>),
}

impl AdaptiveColumnBuilderT {
Expand All @@ -141,6 +143,7 @@ impl AdaptiveColumnBuilderT {
DataTypeName::U64 => Self::U64(AdaptiveColumnBuilder::new()),
DataTypeName::Float => Self::Float(AdaptiveColumnBuilder::new()),
DataTypeName::Double => Self::Double(AdaptiveColumnBuilder::new()),
DataTypeName::String => Self::String(AdaptiveColumnBuilder::new()),
}
}

Expand Down Expand Up @@ -168,6 +171,13 @@ impl AdaptiveColumnBuilderT {
panic!("value does not match AdaptiveColumn type");
}
}
Self::String(cb) => {
cb.add(
value
.as_string()
.expect("Value does not match AdaptiveColumn type"),
);
}
}
}

Expand All @@ -177,6 +187,7 @@ impl AdaptiveColumnBuilderT {
Self::U64(cb) => cb.count(),
Self::Float(cb) => cb.count(),
Self::Double(cb) => cb.count(),
Self::String(cb) => cb.count(),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions src/physical/columns/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ pub enum ColumnT {
Float(ColumnEnum<Float>),
/// Case ColumnEnum<Double>
Double(ColumnEnum<Double>),
/// Case String, stored as `ColumnEnum<usize>` (the [`usize`] values stand for the strings)
String(ColumnEnum<usize>),
}

generate_datatype_forwarder!(forward_to_column_enum);
Expand Down
2 changes: 2 additions & 0 deletions src/physical/columns/interval_column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ pub enum IntervalColumnT {
Float(IntervalColumnEnum<Float>),
/// Case Double
Double(IntervalColumnEnum<Double>),
/// Case String
String(IntervalColumnEnum<usize>),
}

generate_datatype_forwarder!(forward_to_interval_column_enum);
Expand Down
21 changes: 15 additions & 6 deletions src/physical/columns/ranged_column_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ pub enum RangedColumnScanT<'a> {
Float(RangedColumnScanCell<'a, Float>),
/// Case Double
Double(RangedColumnScanCell<'a, Double>),
/// Case String
String(RangedColumnScanCell<'a, usize>),
}

generate_datatype_forwarder!(forward_to_ranged_column_scan_cell);
Expand Down Expand Up @@ -399,18 +401,25 @@ impl<'a> ColumnScan for RangedColumnScanT<'a> {
match self {
Self::U64(cs) => match value {
Self::Item::U64(val) => cs.seek(val).map(DataValueT::U64),
Self::Item::Float(_val) => None,
Self::Item::Double(_val) => None,
Self::Item::Float(_) => None,
Self::Item::Double(_) => None,
Self::Item::String(_) => None,
},
Self::Float(cs) => match value {
Self::Item::U64(_val) => None,
Self::Item::U64(_) => None,
Self::Item::Float(val) => cs.seek(val).map(DataValueT::Float),
Self::Item::Double(_val) => None,
Self::Item::Double(_) => None,
Self::Item::String(_) => None,
},
Self::Double(cs) => match value {
Self::Item::U64(_val) => None,
Self::Item::Float(_val) => None,
Self::Item::U64(_) => None,
Self::Item::Float(_) => None,
Self::Item::Double(val) => cs.seek(val).map(DataValueT::Double),
Self::Item::String(_) => None,
},
Self::String(cs) => match value {
Self::Item::String(val) => cs.seek(val).map(DataValueT::String),
_ => None, // no type mixing allowed, so in any other case it should be [None]
},
}
}
Expand Down
6 changes: 5 additions & 1 deletion src/physical/datatypes/data_type_name.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::error::Error;

use super::DataValueT;
use crate::physical::dictionary::Dictionary;

/// Descriptors to refer to the possible data types at runtime.
#[derive(Clone, Copy, Debug, Ord, PartialOrd, Eq, PartialEq)]
Expand All @@ -11,15 +12,18 @@ pub enum DataTypeName {
Float,
/// Data type [`super::double::Double`]
Double,
/// Data type `String`, uses [`usize`] and a [dictionary][crate::physical::dictionary::Dictionary]
String,
}

impl DataTypeName {
/// Parses a string, based on the name of the Datatype
pub fn parse(&self, string: &str) -> Result<DataValueT, Error> {
pub fn parse(&self, string: &str, dict: &mut dyn Dictionary) -> Result<DataValueT, Error> {
Ok(match self {
DataTypeName::U64 => DataValueT::U64(string.parse::<u64>()?),
DataTypeName::Float => DataValueT::Float(super::Float::new(string.parse::<f32>()?)?),
DataTypeName::Double => DataValueT::Double(super::Double::new(string.parse::<f64>()?)?),
DataTypeName::String => DataValueT::String(dict.add(string.to_string())),
})
}
}
44 changes: 38 additions & 6 deletions src/physical/datatypes/data_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,39 +18,50 @@ pub enum DataValueT {
Float(Float),
/// Case Double
Double(Double),
/// Case String
String(usize),
}

impl DataValueT {
/// Extracts the wrapped [`u64`].
///
/// Yields `Some(value)` if this [`DataValueT`] is the `U64` variant and `None` for every other variant.
pub fn as_u64(&self) -> Option<u64> {
    if let DataValueT::U64(inner) = *self {
        Some(inner)
    } else {
        None
    }
}

/// Extracts the wrapped [`Float`].
///
/// Yields `Some(value)` if this [`DataValueT`] is the `Float` variant and `None` for every other variant.
pub fn as_float(&self) -> Option<Float> {
    if let DataValueT::Float(inner) = *self {
        Some(inner)
    } else {
        None
    }
}

/// Extracts the wrapped [`Double`].
///
/// Yields `Some(value)` if this [`DataValueT`] is the `Double` variant and `None` for every other variant.
pub fn as_double(&self) -> Option<Double> {
    if let DataValueT::Double(inner) = *self {
        Some(inner)
    } else {
        None
    }
}

/// Extracts the [`usize`] stored in the `String` variant.
///
/// Yields `Some(value)` if this [`DataValueT`] is the `String` variant and `None` for every other variant.
pub fn as_string(&self) -> Option<usize> {
    if let DataValueT::String(inner) = *self {
        Some(inner)
    } else {
        None
    }
}

/// Compares its value with another given [`DataValueT`]
pub fn compare(&self, other: &Self) -> Option<Ordering> {
match self {
DataValueT::U64(val) => other.as_u64().map(|otherval| val.cmp(&otherval)),
DataValueT::Float(val) => other.as_float().map(|otherval| val.cmp(&otherval)),
DataValueT::Double(val) => other.as_double().map(|otherval| val.cmp(&otherval)),
DataValueT::String(val) => other.as_string().map(|otherval| val.cmp(&otherval)),
}
}

Expand All @@ -60,6 +71,7 @@ impl DataValueT {
Self::U64(_) => DataTypeName::U64,
Self::Float(_) => DataTypeName::Float,
Self::Double(_) => DataTypeName::Double,
Self::String(_) => DataTypeName::String,
}
}
}
Expand All @@ -70,6 +82,7 @@ impl std::fmt::Display for DataValueT {
Self::U64(val) => write!(f, "{}", val),
Self::Float(val) => write!(f, "{}", val),
Self::Double(val) => write!(f, "{}", val),
Self::String(val) => write!(f, "str{}", val),
}
}
}
Expand All @@ -83,6 +96,8 @@ pub enum VecT {
Float(Vec<Float>),
/// Case Vec<Double>
Double(Vec<Double>),
/// Case String, stored as `Vec<usize>` (the [`usize`] values stand for the strings)
String(Vec<usize>),
}

generate_datatype_forwarder!(forward_to_vec);
Expand All @@ -94,6 +109,7 @@ impl VecT {
DataTypeName::U64 => Self::U64(Vec::new()),
DataTypeName::Float => Self::Float(Vec::new()),
DataTypeName::Double => Self::Double(Vec::new()),
DataTypeName::String => Self::String(Vec::new()),
}
}

Expand All @@ -103,6 +119,7 @@ impl VecT {
Self::U64(_) => DataTypeName::U64,
Self::Float(_) => DataTypeName::Float,
Self::Double(_) => DataTypeName::Double,
Self::String(_) => DataTypeName::String,
}
}

Expand All @@ -117,16 +134,28 @@ impl VecT {
VecT::U64(vec) => vec.get(index).copied().map(DataValueT::U64),
VecT::Float(vec) => vec.get(index).copied().map(DataValueT::Float),
VecT::Double(vec) => vec.get(index).copied().map(DataValueT::Double),
VecT::String(vec) => vec.get(index).copied().map(DataValueT::String),
}
}

/// Inserts the value into the corresponding vector.
/// Note that the enum-variant of the [DataValueT] has to match the variant of the [VecT]; a mismatch is not handled gracefully but causes a panic.
pub(crate) fn push(&mut self, value: &DataValueT) {
match self {
VecT::U64(vec) => vec.push(value.as_u64().unwrap()),
VecT::Float(vec) => vec.push(value.as_float().unwrap()),
VecT::Double(vec) => vec.push(value.as_double().unwrap()),
VecT::U64(vec) => {
vec.push(value.as_u64().expect(
"expecting VecT::U64 and DataValueT::U64, but DataValueT does not match",
))
}
VecT::Float(vec) => vec.push(value.as_float().expect(
"expecting VecT::Float and DataValueT::Float, but DataValueT does not match",
)),
VecT::Double(vec) => vec.push(value.as_double().expect(
"expecting VecT::Double and DataValueT::Double, but DataValueT does not match",
)),
VecT::String(vec) => vec.push(value.as_string().expect(
"expecting VecT::String and DataValueT::String, but DataValueT does not match",
)),
};
}

Expand All @@ -153,6 +182,9 @@ impl VecT {
VecT::Double(vec) => vec
.get(idx_a)
.and_then(|&val_a| vec.get(idx_b).map(|val_b| val_a.cmp(val_b))),
VecT::String(vec) => vec
.get(idx_a)
.and_then(|&val_a| vec.get(idx_b).map(|val_b| val_a.cmp(val_b))),
}
}
}
9 changes: 6 additions & 3 deletions src/physical/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ pub mod string_dictionary;
pub use string_dictionary::StringDictionary;

/// This Dictionary Trait defines dictionaries, which keep ownership of the inserted elements.
pub trait Dictionary: Default {
/// Initialize a new Dictionary
fn init() -> Self {
pub trait Dictionary {
/// Construct a new and empty [`Dictionary`]
fn new() -> Self
where
Self: Sized + Default,
{
Self::default()
}
/// Add a new string to the dictionary
Expand Down

0 comments on commit 910d36c

Please sign in to comment.