Skip to content

Commit

Permalink
support Cast String to FixedList
Browse files Browse the repository at this point in the history
  • Loading branch information
AEsir777 committed Nov 8, 2023
1 parent 9ab52f6 commit aad6939
Show file tree
Hide file tree
Showing 11 changed files with 328 additions and 128 deletions.
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), info UNION(price FLOAT, movein DATE, note STRING),PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), info UNION(price FLOAT, movein DATE, note STRING), stockPrice STRING, PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, stars INT8, views INT64, release TIMESTAMP, film DATE, u8 UINT8, u16 UINT16, u32 UINT32, u64 UINT64, hugedata INT128), content BYTEA, audience MAP(STRING, INT64), grade union(credit boolean, grade1 double, grade2 int64), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16, level INT8, code UINT64, temprature UINT32, ulength UINT16, ulevel UINT8, hugedata INT128, MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vOrganisation.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1,"{revenue: 138, location: ['toronto', 'montr,eal'], stock: {price: [96, 56], volume: 1000}}",3.12
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78,"{revenue: 152, location: [\"vanco,uver north area\"], stock: {price: [15, 78, 671], volume: 432}}",abcd
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52,"{revenue: 558, location: ['very long city name', 'new york'], stock: {price: [22], volume: 99}}",2023-12-15
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1,"{revenue: 138, location: ['toronto', 'montr,eal'], stock: {price: [96, 56], volume: 1000}}",3.12,"[3324.123, 342423.4321, 432.1231]"
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78,"{revenue: 152, location: [\"vanco,uver north area\"], stock: {price: [15, 78, 671], volume: 432}}",abcd,NULL
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52,"{revenue: 558, location: ['very long city name', 'new york'], stock: {price: [22], volume: 99}}",2023-12-15,"[1,4231, 432.123]"
80 changes: 80 additions & 0 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/vector/value_vector.h"

#include "common/exception/not_implemented.h"
#include "common/null_buffer.h"
#include "common/types/value/nested.h"
#include "common/types/value/value.h"
Expand Down Expand Up @@ -294,6 +295,34 @@ std::unique_ptr<Value> ValueVector::getAsValue(uint64_t pos) {
value->childrenSize = children.size();
value->children = std::move(children);
} break;
// FIXED_LIST: build one child Value per list element and attach them to `value`.
case PhysicalTypeID::FIXED_LIST: {
auto childDataType = FixedListType::getChildType(&dataType);
auto numElements = FixedListType::getNumElementsInList(&dataType);
std::vector<std::unique_ptr<Value>> children;
// The element count is known from the type, so size the vector up front.
children.reserve(numElements);
// Dispatch on the child's physical type to the matching FixedListVector::getAsValue
// specialization; only the five numeric types listed below are handled here.
switch (childDataType->getPhysicalType()) {
case PhysicalTypeID::INT64: {
FixedListVector::getAsValue<int64_t>(this, children, pos, numElements);
} break;
case PhysicalTypeID::INT32: {
FixedListVector::getAsValue<int32_t>(this, children, pos, numElements);
} break;
case PhysicalTypeID::INT16: {
FixedListVector::getAsValue<int16_t>(this, children, pos, numElements);
} break;
case PhysicalTypeID::DOUBLE: {
FixedListVector::getAsValue<double>(this, children, pos, numElements);
} break;
case PhysicalTypeID::FLOAT: {
FixedListVector::getAsValue<float>(this, children, pos, numElements);
} break;
// Any other child type is treated as unreachable (excluded from coverage).
default: // LCOV_EXCL_START
KU_UNREACHABLE;
// LCOV_EXCL_STOP
}
// Hand the populated children over to the resulting Value.
value->childrenSize = numElements;
value->children = std::move(children);
} break;
case PhysicalTypeID::STRUCT: {
auto& fieldVectors = StructVector::getFieldVectors(this);
std::vector<std::unique_ptr<Value>> children;
Expand Down Expand Up @@ -536,6 +565,57 @@ void ListVector::sliceDataVector(
}
}

// Appends `numElements` INT64 Values to `children`, one per element of the fixed list read
// from `vector` at entry `pos`.
// NOTE(review): assumes getNumBytesPerValue() is the byte size of one whole fixed-list entry,
// so that entry `pos` starts at getData() + getNumBytesPerValue() * pos -- confirm.
template<>
void FixedListVector::getAsValue<int64_t>(ValueVector* vector,
    std::vector<std::unique_ptr<Value>>& children, uint64_t pos, uint64_t numElements) {
    // The entry's base address does not depend on the element index; compute it once.
    const auto* elements =
        reinterpret_cast<const int64_t*>(vector->getData() + vector->getNumBytesPerValue() * pos);
    // Use a 64-bit index so it has the same width as numElements.
    for (uint64_t i = 0; i < numElements; ++i) {
        children.push_back(Value::createDefaultValue(LogicalType{LogicalTypeID::INT64}).copy());
        // Fill the Value that was just appended. Going through back() (instead of children[i])
        // stays correct even if `children` already held elements on entry.
        children.back()->val.int64Val = elements[i];
    }
}

// Appends `numElements` INT32 Values to `children`, one per element of the fixed list read
// from `vector` at entry `pos`.
// NOTE(review): assumes getNumBytesPerValue() is the byte size of one whole fixed-list entry,
// so that entry `pos` starts at getData() + getNumBytesPerValue() * pos -- confirm.
template<>
void FixedListVector::getAsValue<int32_t>(ValueVector* vector,
    std::vector<std::unique_ptr<Value>>& children, uint64_t pos, uint64_t numElements) {
    // The entry's base address does not depend on the element index; compute it once.
    const auto* elements =
        reinterpret_cast<const int32_t*>(vector->getData() + vector->getNumBytesPerValue() * pos);
    // Use a 64-bit index so it has the same width as numElements.
    for (uint64_t i = 0; i < numElements; ++i) {
        children.push_back(Value::createDefaultValue(LogicalType{LogicalTypeID::INT32}).copy());
        // Fill the Value that was just appended. Going through back() (instead of children[i])
        // stays correct even if `children` already held elements on entry.
        children.back()->val.int32Val = elements[i];
    }
}

// Appends `numElements` INT16 Values to `children`, one per element of the fixed list read
// from `vector` at entry `pos`.
// NOTE(review): assumes getNumBytesPerValue() is the byte size of one whole fixed-list entry,
// so that entry `pos` starts at getData() + getNumBytesPerValue() * pos -- confirm.
template<>
void FixedListVector::getAsValue<int16_t>(ValueVector* vector,
    std::vector<std::unique_ptr<Value>>& children, uint64_t pos, uint64_t numElements) {
    // The entry's base address does not depend on the element index; compute it once.
    const auto* elements =
        reinterpret_cast<const int16_t*>(vector->getData() + vector->getNumBytesPerValue() * pos);
    // Use a 64-bit index so it has the same width as numElements.
    for (uint64_t i = 0; i < numElements; ++i) {
        children.push_back(Value::createDefaultValue(LogicalType{LogicalTypeID::INT16}).copy());
        // Fill the Value that was just appended. Going through back() (instead of children[i])
        // stays correct even if `children` already held elements on entry.
        children.back()->val.int16Val = elements[i];
    }
}

// Appends `numElements` FLOAT Values to `children`, one per element of the fixed list read
// from `vector` at entry `pos`.
// NOTE(review): assumes getNumBytesPerValue() is the byte size of one whole fixed-list entry,
// so that entry `pos` starts at getData() + getNumBytesPerValue() * pos -- confirm.
template<>
void FixedListVector::getAsValue<float>(ValueVector* vector,
    std::vector<std::unique_ptr<Value>>& children, uint64_t pos, uint64_t numElements) {
    // The entry's base address does not depend on the element index; compute it once.
    const auto* elements =
        reinterpret_cast<const float*>(vector->getData() + vector->getNumBytesPerValue() * pos);
    // Use a 64-bit index so it has the same width as numElements.
    for (uint64_t i = 0; i < numElements; ++i) {
        children.push_back(Value::createDefaultValue(LogicalType{LogicalTypeID::FLOAT}).copy());
        // Fill the Value that was just appended. Going through back() (instead of children[i])
        // stays correct even if `children` already held elements on entry.
        children.back()->val.floatVal = elements[i];
    }
}

// Appends `numElements` DOUBLE Values to `children`, one per element of the fixed list read
// from `vector` at entry `pos`.
// NOTE(review): assumes getNumBytesPerValue() is the byte size of one whole fixed-list entry,
// so that entry `pos` starts at getData() + getNumBytesPerValue() * pos -- confirm.
template<>
void FixedListVector::getAsValue<double>(ValueVector* vector,
    std::vector<std::unique_ptr<Value>>& children, uint64_t pos, uint64_t numElements) {
    // The entry's base address does not depend on the element index; compute it once.
    const auto* elements =
        reinterpret_cast<const double*>(vector->getData() + vector->getNumBytesPerValue() * pos);
    // Use a 64-bit index so it has the same width as numElements.
    for (uint64_t i = 0; i < numElements; ++i) {
        children.push_back(Value::createDefaultValue(LogicalType{LogicalTypeID::DOUBLE}).copy());
        // Fill the Value that was just appended. Going through back() (instead of children[i])
        // stays correct even if `children` already held elements on entry.
        children.back()->val.doubleVal = elements[i];
    }
}

void StructVector::copyFromRowData(ValueVector* vector, uint32_t pos, const uint8_t* rowData) {
KU_ASSERT(vector->dataType.getPhysicalType() == PhysicalTypeID::STRUCT);
auto& structFields = getFieldVectors(vector);
Expand Down
32 changes: 20 additions & 12 deletions src/function/cast_string_to_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ struct CastStringHelper {
uint64_t /*rowToAdd*/ = 0, const CSVReaderConfig* /*csvReaderConfig*/ = nullptr) {
simpleIntegerCast<int64_t>(input, len, result, LogicalType{LogicalTypeID::INT64});
}

static void castToFixedList(const char* input, uint64_t len, ValueVector* vector,
uint64_t rowToAdd, const CSVReaderConfig* csvReaderConfig);
};

template<>
Expand Down Expand Up @@ -167,7 +170,7 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char
if (!skipToCloseQuotes(input, end)) {
return false;
}
} else if (*input == '{') { // must have closing brackets fro {, ] if they are not quoted
} else if (*input == '{') { // must have closing brackets {, ] if they are not quoted
if (!skipToClose(input, end, lvl, '}', csvReaderConfig)) {
return false;
}
Expand Down Expand Up @@ -331,7 +334,6 @@ struct SplitStringFixedListOperation {
if (str.empty() || isNull(str)) {
throw ConversionException("Cast failed. NULL is not allowed for FIXEDLIST.");
}
auto type = FixedListType::getChildType(&resultVector->dataType);
CastStringHelper::cast(start, str.length(), value);
resultVector->setValue(offset, value);
offset++;
Expand All @@ -347,8 +349,8 @@ static void validateNumElementsInList(uint64_t numElementsRead, const LogicalTyp
}
}

void castStringToFixedList(const char* input, uint64_t len, ValueVector* vector, uint64_t rowToAdd,
const CSVReaderConfig* csvReaderConfig) {
void CastStringHelper::castToFixedList(const char* input, uint64_t len, ValueVector* vector,
uint64_t rowToAdd, const CSVReaderConfig* csvReaderConfig) {
KU_ASSERT(vector->dataType.getLogicalTypeID() == LogicalTypeID::FIXED_LIST);
auto childDataType = FixedListType::getChildType(&vector->dataType);

Expand All @@ -359,7 +361,7 @@ void castStringToFixedList(const char* input, uint64_t len, ValueVector* vector,

auto startOffset = state.count * rowToAdd;
switch (childDataType->getLogicalTypeID()) {
// TODO: currently only allow these type
// TODO(Kebing): currently only allow these type
case LogicalTypeID::INT64: {
SplitStringFixedListOperation<int64_t> split{startOffset, vector};
startListCast(input, len, split, csvReaderConfig, vector);
Expand All @@ -381,11 +383,17 @@ void castStringToFixedList(const char* input, uint64_t len, ValueVector* vector,
startListCast(input, len, split, csvReaderConfig, vector);
} break;
default: {
throw NotImplementedException("Unsupported data type: Driver::castStringToFixedList");
throw NotImplementedException("Unsupported data type: Function::castStringToFixedList");
}
}
}

// ku_string_t overload of the string-to-FIXED_LIST cast: exposes the string's bytes as a
// (const char*, length) pair and delegates to CastStringHelper::castToFixedList with the
// remaining arguments passed through unchanged.
void CastString::castToFixedList(const ku_string_t& input, ValueVector* resultVector,
    uint64_t rowToAdd, const CSVReaderConfig* csvReaderConfig) {
    const auto* rawChars = reinterpret_cast<const char*>(input.getData());
    const auto numChars = input.len;
    CastStringHelper::castToFixedList(rawChars, numChars, resultVector, rowToAdd, csvReaderConfig);
}

// ---------------------- cast String to Map ------------------------------ //
struct SplitStringMapOperation {
SplitStringMapOperation(uint64_t& offset, ValueVector* resultVector)
Expand Down Expand Up @@ -420,7 +428,7 @@ static bool parseKeyOrValue(const char*& input, const char* end, T& state, bool
if (*input == '"' || *input == '\'') {
if (!skipToCloseQuotes(input, end)) {
return false;
};
}
} else if (*input == '{') {
if (!skipToClose(input, end, lvl, '}', csvReaderConfig)) {
return false;
Expand All @@ -429,7 +437,7 @@ static bool parseKeyOrValue(const char*& input, const char* end, T& state, bool
if (!skipToClose(
input, end, lvl, CopyConstants::DEFAULT_CSV_LIST_END_CHAR, csvReaderConfig)) {
return false;
};
}
} else if (isKey && *input == '=') {
return state.handleKey(start, input, csvReaderConfig);
} else if (!isKey && (*input == csvReaderConfig->delimiter || *input == '}')) {
Expand Down Expand Up @@ -500,9 +508,9 @@ void CastStringHelper::cast(const char* input, uint64_t len, map_entry_t& /*resu

template<>
void CastString::operation(const ku_string_t& input, map_entry_t& result, ValueVector* resultVector,
uint64_t rowToAdd, const CSVReaderConfig* CSVReaderConfig) {
uint64_t rowToAdd, const CSVReaderConfig* csvReaderConfig) {
CastStringHelper::cast(reinterpret_cast<const char*>(input.getData()), input.len, result,
resultVector, rowToAdd, CSVReaderConfig);
resultVector, rowToAdd, csvReaderConfig);
}

// ---------------------- cast String to Struct ------------------------------ //
Expand Down Expand Up @@ -854,8 +862,8 @@ void CastString::copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std:
strVal.data(), strVal.length(), val, vector, rowToAdd, csvReaderConfig);
} break;
case LogicalTypeID::FIXED_LIST: {
// TODO: add fix list function wrapper
castStringToFixedList(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
CastStringHelper::castToFixedList(
strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
} break;
case LogicalTypeID::STRUCT: {
struct_entry_t val;
Expand Down
Loading

0 comments on commit aad6939

Please sign in to comment.