Skip to content

Commit

Permalink
Truncate string with size > 4096 for CSV copier
Browse files Browse the repository at this point in the history
  • Loading branch information
mewim committed Oct 10, 2022
1 parent afe7466 commit 9af4e9f
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 12 deletions.
100 changes: 100 additions & 0 deletions history.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
MATCH (e:edge) RETURN e.ID, e.value;
MATCH (e:edge) RETURN e.value;
CREATE node table edgecase(ID INT64, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/edge_cases_csvs/e2.csv";
CREATE node table edgecase(ID INT64, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/edge_cases_csvs/e1.csv";
COPY edgecase FROM "/Users/lc/Desktop/edge_cases_csvs/e2.csv";
COPY edgecase FROM "/Users/lc/Desktop/edge_cases_csvs/e3.csv";
CREATE node table edgecase(ID STRING, value STRING);
CREATE node table edgecase(ID STRING, value STRING, PRIMARY KEY (ID));
COPY edgecase FROM "/Users/lc/Desktop/edge_cases_csvs/normal.csv";
CREATE node table edgecase(ID STRING, value STRING, PRIMARY KEY (ID));
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
MATCH (e:edgecase) RETURN value;
MATCH (e:edgecase) RETURN e.value;
MATCH (e:edgecase) RETURN e.ID
MATCH (e:edgecase) RETURN e.ID;
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv";
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
MATCH (e:edgecase) RETURN e.ID;
MATCH (e:edgecase) RETURN e.value;
MATCH e RETURN e.value;
MATCH e RETURN e;
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="B" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
MATCH (e:edgecase) WHERE e.ID="B" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="B" RETURN e.value;
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
MATCH (e:edgecase) WHERE e.ID="B" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
MATCH (e:edgecase) WHERE e.ID="A" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="B" RETURN e.value;
CREATE NODE TABLE edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
CREATE node table edgecase(ID STRING, value STRING);
COPY edgecase FROM "/Users/lc/Desktop/LongString.csv" (HEADER=true);
CREATE (c:City {name: "Zagreb", population_size: 1000000});
CREATE (c:edgecase {ID: "c", value: "cccccc"});
MATCH (e:edgecase) WHERE e.ID="c" RETURN e.value;
MATCH (e:edgecase) WHERE e.ID="C" RETURN e.value;
MATCH (e:edgecase) RETURN e.value;
MATCH (e:edgecase) RETURN e.value SKIP1;
MATCH (e:edgecase) RETURN e.value SKIP 1;
MATCH (e:edgecase) RETURN e.value SKIP 2;
MATCH (e:edgecase) RETURN e.ID SKIP 2;
MATCH (e:edgecase) RETURN e.ID SKIP 1;
MATCH (e:edgecase) RETURN e.ID SKIP 2;
MATCH (e:edgecase) RETURN e.ID SKIP 3;
MATCH (e:edgecase) RETURN e.ID SKIP 1;
MATCH (e:edgecase) RETURN e.ID SKIP 0;
CREATE (c:edgecase {ID: "C", value: "cccccc"});
MATCH (e:edgecase) RETURN e.ID SKIP 3;
MATCH (e:edgecase) RETURN e.ID SKIP 4;
MATCH (e:edgecase) RETURN e.ID LIMIT 5;
MATCH (e:edgecase) RETURN e.ID LIMIT 100;
CREATE (c:edgecase {ID: "C", value: "cccccc"});
MATCH (e:edgecase) RETURN e.ID LIMIT 10;
MATCH (c:nd) RETURN e.value LIMIT 10;
CREATE (c:edgecase {ID: "C", value: "TestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATes
tATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATes
tATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATe
tATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestATestA");
CREATE NODE TABLE n(ID STRING, value STRING);
CREATE NODE TABLE nd(ID STRING, value STRING);
CREATE (c:nd {ID: "C", value: "cccccc"});
MATCH (c:nd) RETURN c.value;
MATCH (c:nd) RETURN c.ID;
CREATE NODE TABLE nd(ID STRING, value STRING);
CREATE (c:nd {ID: "C", value: "cccccc"});
MATCH (n:nd) WHERE n.ID="C" RETURN n.ID;
MATCH (n:nd) RETURN n.ID;
MATCH (n:nd) RETURN COUNT(*)
6 changes: 5 additions & 1 deletion src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,11 @@ char* CSVReader::getString() {
setNextTokenIsProcessed();
auto strVal = line + linePtrStart;
if (strlen(strVal) > DEFAULT_PAGE_SIZE) {
throw CSVReaderException(StringUtils::getLongStringErrorMessage(strVal, DEFAULT_PAGE_SIZE));
if (this->logger != nullptr) {
logger->warn(StringUtils::getLongStringErrorMessage(strVal, DEFAULT_PAGE_SIZE));
}
// If the string is too long, truncate it.
strVal[DEFAULT_PAGE_SIZE] = '\0';
}
return strVal;
}
Expand Down
5 changes: 5 additions & 0 deletions src/common/include/csv_reader/csv_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <fstream>

#include "spdlog/spdlog.h"
#include "src/common/include/configs.h"
#include "src/common/types/include/literal.h"
#include "src/common/types/include/types_include.h"
Expand Down Expand Up @@ -54,6 +55,9 @@ class CSVReader {

~CSVReader();

// Attaches an optional logger to this reader. getString() reports a warning through it
// (after a null check) when it truncates an over-long string.
inline void setLogger(shared_ptr<spdlog::logger> logger) {
    this->logger = logger;
}

// returns true if there are more lines to be parsed in a block of a CSV file, else false.
bool hasNextLine();
// returns true if the currently-pointed to line has more data to be parsed, else false.
Expand Down Expand Up @@ -85,6 +89,7 @@ class CSVReader {
private:
FILE* fd;
const CSVReaderConfig& config;
shared_ptr<spdlog::logger> logger;
bool nextLineIsNotProcessed, isEndOfBlock, nextTokenIsNotProcessed;
char* line;
size_t lineCapacity, lineLen;
Expand Down
2 changes: 2 additions & 0 deletions src/storage/in_mem_csv_copier/in_mem_node_csv_copier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ void InMemNodeCSVCopier::populateColumnsAndCountUnstrPropertyListSizesTask(uint6
vector<PageByteCursor> overflowCursors(copier->nodeTableSchema->getNumStructuredProperties());
CSVReader reader(
copier->csvDescription.filePath, copier->csvDescription.csvReaderConfig, blockId);
reader.setLogger(copier->logger);
skipFirstRowIfNecessary(blockId, copier->csvDescription, reader);
auto bufferOffset = 0u;
while (reader.hasNextLine()) {
Expand Down Expand Up @@ -226,6 +227,7 @@ void InMemNodeCSVCopier::populateUnstrPropertyListsTask(
copier->logger->trace("Start: path={0} blkIdx={1}", copier->csvDescription.filePath, blockId);
CSVReader reader(
copier->csvDescription.filePath, copier->csvDescription.csvReaderConfig, blockId);
reader.setLogger(copier->logger);
skipFirstRowIfNecessary(blockId, copier->csvDescription, reader);
auto bufferOffset = 0u;
PageByteCursor overflowPagesCursor;
Expand Down
2 changes: 2 additions & 0 deletions src/storage/in_mem_csv_copier/in_mem_rel_csv_copier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ void InMemRelCSVCopier::populateAdjColumnsAndCountRelsInAdjListsTask(
copier->logger->debug("Start: path=`{0}` blkIdx={1}", copier->csvDescription.filePath, blockId);
CSVReader reader(
copier->csvDescription.filePath, copier->csvDescription.csvReaderConfig, blockId);
reader.setLogger(copier->logger);
skipFirstRowIfNecessary(blockId, copier->csvDescription, reader);
vector<bool> requireToReadTableLabels{true, true};
vector<nodeID_t> nodeIDs{2};
Expand Down Expand Up @@ -522,6 +523,7 @@ void InMemRelCSVCopier::populateAdjAndPropertyListsTask(
copier->logger->trace("Start: path=`{0}` blkIdx={1}", copier->csvDescription.filePath, blockId);
CSVReader reader(
copier->csvDescription.filePath, copier->csvDescription.csvReaderConfig, blockId);
reader.setLogger(copier->logger);
skipFirstRowIfNecessary(blockId, copier->csvDescription, reader);
vector<bool> requireToReadTableLabels{true, true};
vector<nodeID_t> nodeIDs{2};
Expand Down
11 changes: 0 additions & 11 deletions test/copy_csv/copy_csv_fault_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ class CopyCSVFaultTest : public EmptyDBTest {
}
};

// Fault-test fixture whose input CSVs live in the long-string dataset directory.
class CopyCSVLongStringTest : public CopyCSVFaultTest {
    // Directory the fixture reads its input CSV files from.
    string getInputCSVDir() override {
        return "dataset/copy-csv-fault-tests/long-string/";
    }
};

// Fault-test fixture whose input CSVs live in the duplicate-ids dataset directory.
class CopyCSVDuplicateIDTest : public CopyCSVFaultTest {
    // Directory the fixture reads its input CSV files from.
    string getInputCSVDir() override {
        return "dataset/copy-csv-fault-tests/duplicate-ids/";
    }
};
Expand All @@ -32,13 +28,6 @@ class CopyNodeCSVUnmatchedColumnTypeTest : public CopyCSVFaultTest {
string getInputCSVDir() override { return "dataset/copy-csv-fault-tests/long-string/"; }
};

// Expects the COPY statement on the long-string dataset to fail with a CSVReader exception
// that states both the maximum string length and the length of the offending input.
TEST_F(CopyCSVLongStringTest, LongStringError) {
    const string expectedMessage =
        "Failed to execute statement: COPY person FROM "
        "\"dataset/copy-csv-fault-tests/long-string/vPerson.csv\".\nError: CSVReader "
        "exception: Maximum length of strings is 4096. Input string's length is 5625.";
    ASSERT_EQ(getCopyCSVException(), expectedMessage);
}

TEST_F(CopyCSVDuplicateIDTest, DuplicateIDsError) {
ASSERT_EQ(getCopyCSVException(),
"Failed to execute statement: COPY person FROM "
Expand Down
28 changes: 28 additions & 0 deletions test/copy_csv/copy_csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ class CopyCSVEmptyListsTest : public InMemoryDBTest {
}
};

// Fixture that loads the long-string dataset into an in-memory database.
class CopyCSVLongStringTest : public InMemoryDBTest {
    // Directory the fixture reads its input CSV files from.
    string getInputCSVDir() override {
        return "dataset/copy-csv-fault-tests/long-string/";
    }
};

struct KnowsTablePTablePKnowsLists {
table_id_t knowsRelTableID;
table_id_t pNodeTableID;
Expand Down Expand Up @@ -322,3 +326,27 @@ TEST_F(CopyCSVSpecialCharTest, CopySpecialCharsCsv) {
TEST_F(CopyCSVEmptyListsTest, CopyCSVEmptyLists) {
testCopyCSVEmptyListsTest();
}

// Verifies that copying a CSV containing an over-long string succeeds and stores the value
// truncated to the maximum string length, while the other rows and columns are loaded
// unchanged. The test keeps its historical name even though it no longer checks an error.
TEST_F(CopyCSVLongStringTest, LongStringError) {
    // Maximum stored string length; unsigned so it compares cleanly with string lengths.
    constexpr size_t maxStringLength = 4096;

    auto storageManager = database->getStorageManager();
    auto& catalog = *database->getCatalog();
    auto tableID = catalog.getReadOnlyVersion()->getNodeTableIDFromName("person");
    auto propertyIdx = catalog.getReadOnlyVersion()->getNodeProperty(tableID, "fName");
    auto col =
        storageManager->getNodesStore().getNodePropertyColumn(tableID, propertyIdx.propertyID);

    // Build the expected value: "Alice" repeated until the limit is reached, then cut to
    // exactly maxStringLength characters.
    const string expectedResultName = "Alice";
    string expectedTruncatedName;
    expectedTruncatedName.reserve(maxStringLength + expectedResultName.length());
    while (expectedTruncatedName.length() < maxStringLength) {
        expectedTruncatedName += expectedResultName;
    }
    expectedTruncatedName.resize(maxStringLength);

    EXPECT_EQ(maxStringLength, col->readValue(0).strVal.length());
    EXPECT_EQ(expectedTruncatedName, col->readValue(0).strVal);
    // A string below the limit must be left untouched.
    EXPECT_EQ("Bob", col->readValue(1).strVal);

    // Columns of the row with the truncated string must still be parsed correctly.
    propertyIdx = catalog.getReadOnlyVersion()->getNodeProperty(tableID, "gender");
    col = storageManager->getNodesStore().getNodePropertyColumn(tableID, propertyIdx.propertyID);
    EXPECT_EQ(1, col->readValue(0).val.int64Val);
    EXPECT_EQ(2, col->readValue(1).val.int64Val);
}

0 comments on commit 9af4e9f

Please sign in to comment.