Skip to content

Commit

Permalink
Merge pull request #2886 from kuzudb/fix-copy-string
Browse files Browse the repository at this point in the history
Fix parquet reader list slice
  • Loading branch information
acquamarin committed Feb 14, 2024
2 parents 022eadb + 78ca55e commit eb855a3
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 18 deletions.
10 changes: 6 additions & 4 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,10 +611,12 @@ void ListVector::appendDataVector(kuzu::common::ValueVector* dstVector,
}
}

void ListVector::sliceDataVector(
ValueVector* vectorToSlice, uint64_t childIdx, uint64_t numValues) {
for (auto i = 0u; i < numValues - childIdx; i++) {
vectorToSlice->copyFromVectorData(i, vectorToSlice, i + childIdx);
// Moves the entries at positions [offset, numValues) of `vectorToSlice` down to
// positions [0, numValues - offset), in place, by calling copyFromVectorData()
// once per entry with the same vector as both destination and source.
//
// vectorToSlice: the vector whose entries are shifted toward the front.
// offset:        position of the first entry to keep.
// numValues:     number of entries currently held in the vector.
//
// NOTE(review): assumes copyFromVectorData(dstPos, srcVector, srcPos) copies a
// single entry and is safe when source and destination are the same vector --
// confirm against ValueVector.
void ListVector::sliceDataVector(ValueVector* vectorToSlice, uint64_t offset, uint64_t numValues) {
    // offset == 0: every entry is already in place.
    // offset >= numValues: there is no tail to move; returning here also keeps the
    // unsigned subtraction below from wrapping around to a huge count.
    if (offset == 0 || offset >= numValues) {
        return;
    }
    const uint64_t numValuesToKeep = numValues - offset;
    // The index is 64-bit to match the bound; a 32-bit index would wrap and never
    // reach a bound larger than UINT32_MAX.
    for (uint64_t i = 0; i < numValuesToKeep; i++) {
        vectorToSlice->copyFromVectorData(i, vectorToSlice, i + offset);
    }
}

Expand Down
2 changes: 1 addition & 1 deletion src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ class ListVector {
const ValueVector* srcVector, const uint8_t* srcData);
static void appendDataVector(
ValueVector* dstVector, ValueVector* srcDataVector, uint64_t numValuesToAppend);
static void sliceDataVector(ValueVector* vectorToSlice, uint64_t childIdx, uint64_t numValues);
static void sliceDataVector(ValueVector* vectorToSlice, uint64_t offset, uint64_t numValues);
};

class FixedListVector {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ uint64_t ListColumnReader::read(uint64_t numValues, parquet_filter_t& /*filter*/
childActualNumValues = childColumnReader->read(childReqNumValues, childFilter,
childDefinesPtr, childRepeatsPtr, vectorToRead.get());
} else {
// we do: use the overflow values
childActualNumValues = overflowChildCount;
overflowChildCount = 0;
}
Expand Down Expand Up @@ -100,11 +99,9 @@ uint64_t ListColumnReader::read(uint64_t numValues, parquet_filter_t& /*filter*/
resultPtr[resultOffset].offset = childIdx + currentChunkOffset;
resultPtr[resultOffset].size = 1;
} else if (childDefinesPtr[childIdx] == maxDefine - 1) {
// empty list
resultPtr[resultOffset].offset = childIdx + currentChunkOffset;
resultPtr[resultOffset].size = 0;
} else {
// value is NULL somewhere up the stack
resultOut->setNull(resultOffset, true);
resultPtr[resultOffset].offset = 0;
resultPtr[resultOffset].size = 0;
Expand All @@ -119,7 +116,6 @@ uint64_t ListColumnReader::read(uint64_t numValues, parquet_filter_t& /*filter*/
if (childIdx < childActualNumValues && resultOffset == numValues) {
common::ListVector::sliceDataVector(vectorToRead.get(), childIdx, childActualNumValues);
overflowChildCount = childActualNumValues - childIdx;
// move values in the child repeats and defines *backward* by child_idx
for (auto repdefIdx = 0u; repdefIdx < overflowChildCount; repdefIdx++) {
childDefinesPtr[repdefIdx] = childDefinesPtr[childIdx + repdefIdx];
childRepeatsPtr[repdefIdx] = childRepeatsPtr[childIdx + repdefIdx];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,32 +39,31 @@ void StructColumnReader::registerPrefetch(ThriftFileTransport& transport, bool a
}
}

uint64_t StructColumnReader::read(uint64_t num_values, parquet_filter_t& filter,
uint64_t StructColumnReader::read(uint64_t numValuesToRead, parquet_filter_t& filter,
uint8_t* define_out, uint8_t* repeat_out, common::ValueVector* result) {
auto& fieldVectors = common::StructVector::getFieldVectors(result);
KU_ASSERT(common::StructType::getNumFields(type.get()) == fieldVectors.size());
if (pendingSkips > 0) {
applyPendingSkips(pendingSkips);
}

uint64_t read_count = num_values;
uint64_t numValuesRead = numValuesToRead;
for (auto i = 0u; i < fieldVectors.size(); i++) {
auto child_num_values = childReaders[i]->read(
num_values, filter, define_out, repeat_out, fieldVectors[i].get());
auto numValuesChildrenRead = childReaders[i]->read(
numValuesToRead, filter, define_out, repeat_out, fieldVectors[i].get());
if (i == 0) {
read_count = child_num_values;
} else if (read_count != child_num_values) {
numValuesRead = numValuesChildrenRead;
} else if (numValuesRead != numValuesChildrenRead) {
throw std::runtime_error("Struct child row count mismatch");
}
}
// set the validity mask for this level
for (auto i = 0u; i < read_count; i++) {
for (auto i = 0u; i < numValuesRead; i++) {
if (define_out[i] < maxDefine) {
result->setNull(i, true);
}
}

return read_count;
return numValuesRead;
}

void StructColumnReader::skip(uint64_t num_values) {
Expand Down

0 comments on commit eb855a3

Please sign in to comment.