diff --git a/src/binder/bind/bind_export_database.cpp b/src/binder/bind/bind_export_database.cpp index 73a069b25c3..14a5ec8b879 100644 --- a/src/binder/bind/bind_export_database.cpp +++ b/src/binder/bind/bind_export_database.cpp @@ -1,6 +1,7 @@ #include "binder/copy/bound_export_database.h" #include "binder/query/bound_regular_query.h" #include "common/exception/binder.h" +#include "common/string_utils.h" #include "main/client_context.h" #include "parser/parser.h" #include "parser/port_db.h" @@ -44,6 +45,21 @@ static std::vector getExportInfo( return exportData; } +FileType getFileType(std::unordered_map& options) { + auto fileType = FileType::CSV; + if (options.find("FORMAT") != options.end()) { + auto value = options.at("FORMAT"); + if (value.getDataType()->getLogicalTypeID() != LogicalTypeID::STRING) { + throw BinderException("The type of format option must be a string."); + } + auto valueStr = value.getValue(); + StringUtils::toUpper(valueStr); + fileType = FileTypeUtils::fromString(valueStr); + options.erase("FORMAT"); + } + return fileType; +} + ExportedTableData Binder::extractExportData(std::string selQuery, std::string tableName) { auto parsedStatement = Parser::parseQuery(selQuery); ExportedTableData exportedTableData; @@ -65,10 +81,14 @@ std::unique_ptr Binder::bindExportDatabaseClause(const Statement auto& exportDatabaseStatement = ku_dynamic_cast(statement); auto boundFilePath = exportDatabaseStatement.getFilePath(); auto exportData = getExportInfo(catalog, clientContext->getTx(), this); - // TODO(Jiamin): add more supported file types - auto fileType = FileType::CSV; - auto csvConfig = CSVReaderConfig::construct( - bindParsingOptions(exportDatabaseStatement.getParsingOptionsRef())); + auto parsedOptions = bindParsingOptions(exportDatabaseStatement.getParsingOptionsRef()); + auto fileType = getFileType(parsedOptions); + if (fileType != FileType::CSV && fileType != FileType::PARQUET) { + throw BinderException("Export database currently only supports csv and parquet files."); + } + if (fileType != FileType::CSV && parsedOptions.size() != 0) { + throw BinderException{"Only export to csv can have options."}; + } // try to create the directory, if it doesn't exist yet if (!vfs->fileOrPathExists(boundFilePath)) { vfs->createDir(boundFilePath); @@ -76,7 +96,7 @@ std::unique_ptr Binder::bindExportDatabaseClause(const Statement throw BinderException(stringFormat("Directory {} already exists.", boundFilePath)); } return std::make_unique( - boundFilePath, fileType, std::move(exportData), csvConfig.option.copy()); + boundFilePath, fileType, std::move(exportData), std::move(parsedOptions)); } } // namespace binder } // namespace kuzu diff --git a/src/binder/bind/bind_file_scan.cpp b/src/binder/bind/bind_file_scan.cpp index 96e9f40f70d..230b88e2711 100644 --- a/src/binder/bind/bind_file_scan.cpp +++ b/src/binder/bind/bind_file_scan.cpp @@ -4,6 +4,7 @@ #include "common/exception/copy.h" #include "common/file_system/virtual_file_system.h" #include "common/string_format.h" +#include "common/string_utils.h" using namespace kuzu::parser; using namespace kuzu::binder; @@ -53,6 +54,7 @@ std::unordered_map Binder::bindParsingOptions( std::unordered_map options; for (auto& option : parsingOptions) { auto name = option.first; + common::StringUtils::toUpper(name); auto expr = expressionBinder.bindExpression(*option.second); KU_ASSERT(expr->expressionType == ExpressionType::LITERAL); auto literalExpr = ku_dynamic_cast(expr.get()); diff --git a/src/common/copier_config/csv_reader_config.cpp b/src/common/copier_config/csv_reader_config.cpp index 832cb81ab9d..4fffee643c7 100644 --- a/src/common/copier_config/csv_reader_config.cpp +++ b/src/common/copier_config/csv_reader_config.cpp @@ -1,7 +1,6 @@ #include "common/copier_config/csv_reader_config.h" #include "common/exception/binder.h" -#include "common/string_utils.h" namespace kuzu { namespace common { @@ -56,7 +55,6 @@ CSVReaderConfig CSVReaderConfig::construct( auto config = CSVReaderConfig(); for (auto& op : options) { auto name = op.first; - StringUtils::toUpper(name); auto isValidStringParsingOption = validateStringParsingOptionName(name); auto isValidBoolParsingOption = validateBoolParsingOptionName(name); if (isValidBoolParsingOption) { diff --git a/src/common/copier_config/reader_config.cpp b/src/common/copier_config/reader_config.cpp index 767d34747d1..c44c17aa39c 100644 --- a/src/common/copier_config/reader_config.cpp +++ b/src/common/copier_config/reader_config.cpp @@ -1,6 +1,7 @@ #include "common/copier_config/reader_config.h" #include "common/assert.h" +#include "common/exception/binder.h" #include "common/exception/copy.h" namespace kuzu { @@ -57,5 +58,17 @@ std::string FileTypeUtils::toString(FileType fileType) { } } +FileType FileTypeUtils::fromString(std::string fileType) { + if (fileType == "CSV") { + return FileType::CSV; + } else if (fileType == "PARQUET") { + return FileType::PARQUET; + } else if (fileType == "NPY") { + return FileType::NPY; + } else { + throw BinderException(stringFormat("Unsupported file type: {}.", fileType)); + } +} + } // namespace common } // namespace kuzu diff --git a/src/include/binder/copy/bound_export_database.h b/src/include/binder/copy/bound_export_database.h index 3afb8014c0b..06dfa4e4f9a 100644 --- a/src/include/binder/copy/bound_export_database.h +++ b/src/include/binder/copy/bound_export_database.h @@ -2,7 +2,6 @@ #include "binder/binder.h" #include "binder/bound_statement.h" #include "binder/query/bound_regular_query.h" -#include "common/copier_config/csv_reader_config.h" #include "common/copier_config/reader_config.h" namespace kuzu { @@ -21,22 +20,27 @@ struct ExportedTableData { class BoundExportDatabase : public BoundStatement { public: BoundExportDatabase(std::string filePath, common::FileType fileType, - std::vector exportData, common::CSVOption csvOption) + std::vector exportData, + std::unordered_map csvOption) : BoundStatement{common::StatementType::EXPORT_DATABASE, BoundStatementResult::createEmptyResult()}, - filePath{std::move(filePath)}, fileType{fileType}, - exportData(std::move(exportData)), csvOption{std::move(csvOption)} {} + exportData(std::move(exportData)), + boundFileInfo(fileType, std::vector{std::move(filePath)}) { + boundFileInfo.options = std::move(csvOption); + } - inline std::string getFilePath() const { return filePath; } - inline common::FileType getFileType() const { return fileType; } - inline const common::CSVOption* getCopyOption() const { return &csvOption; } + inline std::string getFilePath() const { return boundFileInfo.filePaths[0]; } + inline common::FileType getFileType() const { return boundFileInfo.fileType; } + inline common::CSVOption getCopyOption() const { + auto csvConfig = common::CSVReaderConfig::construct(boundFileInfo.options); + return csvConfig.option.copy(); + } + inline const common::ReaderConfig* getBoundFileInfo() const { return &boundFileInfo; } inline const std::vector* getExportData() const { return &exportData; } private: - std::string filePath; - common::FileType fileType; std::vector exportData; - common::CSVOption csvOption; + common::ReaderConfig boundFileInfo; }; } // namespace binder diff --git a/src/include/common/copier_config/reader_config.h b/src/include/common/copier_config/reader_config.h index 60cef54c2eb..1b2d187e99b 100644 --- a/src/include/common/copier_config/reader_config.h +++ b/src/include/common/copier_config/reader_config.h @@ -22,6 +22,7 @@ enum class FileType : uint8_t { struct FileTypeUtils { static FileType getFileTypeFromExtension(std::string_view extension); static std::string toString(FileType fileType); + static FileType fromString(std::string fileType); }; struct ReaderConfig { diff --git a/src/include/planner/operator/persistent/logical_export_db.h b/src/include/planner/operator/persistent/logical_export_db.h index 53c1a3ff8ec..f2f24189cb3 100644 --- a/src/include/planner/operator/persistent/logical_export_db.h +++ b/src/include/planner/operator/persistent/logical_export_db.h @@ -1,6 +1,7 @@ #pragma once #include "common/copier_config/csv_reader_config.h" +#include "common/copier_config/reader_config.h" #include "planner/operator/logical_operator.h" namespace kuzu { @@ -8,27 +9,30 @@ namespace planner { class LogicalExportDatabase : public LogicalOperator { public: - explicit LogicalExportDatabase(std::string filePath, common::CSVOption copyToOption, - std::vector> plans) + explicit LogicalExportDatabase( + common::ReaderConfig boundFileInfo, std::vector> plans) : LogicalOperator{LogicalOperatorType::EXPORT_DATABASE, std::move(plans)}, - filePath{std::move(filePath)}, copyToOption{std::move(copyToOption)} {} + boundFileInfo{std::move(boundFileInfo)} {} inline std::string getExpressionsForPrinting() const override { return std::string{}; } void computeFactorizedSchema() override; void computeFlatSchema() override; - inline std::string getFilePath() const { return filePath; } - inline const common::CSVOption* getCopyOption() const { return ©ToOption; } + inline std::string getFilePath() const { return boundFileInfo.filePaths[0]; } + inline common::FileType getFileType() const { return boundFileInfo.fileType; } + inline common::CSVOption getCopyOption() const { + auto csvConfig = common::CSVReaderConfig::construct(boundFileInfo.options); + return csvConfig.option.copy(); + } + inline const common::ReaderConfig* getBoundFileInfo() const { return &boundFileInfo; } inline std::unique_ptr copy() override { - return make_unique( - filePath, std::move(copyToOption), std::move(children)); + return make_unique(std::move(boundFileInfo), std::move(children)); } private: - std::string filePath; - common::CSVOption copyToOption; + common::ReaderConfig boundFileInfo; }; } // namespace planner diff --git a/src/include/processor/operator/persistent/export_db.h b/src/include/processor/operator/persistent/export_db.h index 9644bac864a..fb94ba52095 100644 --- a/src/include/processor/operator/persistent/export_db.h +++ b/src/include/processor/operator/persistent/export_db.h @@ -1,6 +1,6 @@ #pragma once -#include "common/copier_config/csv_reader_config.h" +#include "common/copier_config/reader_config.h" #include "processor/operator/physical_operator.h" namespace kuzu { @@ -8,11 +8,11 @@ namespace processor { class ExportDB : public PhysicalOperator { public: - ExportDB(std::string filePath, common::CSVOption copyToOption, uint32_t id, - const std::string& paramsString, std::vector> children) + ExportDB(common::ReaderConfig boundFileInfo, uint32_t id, const std::string& paramsString, + std::vector> children) : PhysicalOperator{PhysicalOperatorType::EXPORT_DATABASE, std::move(children), id, paramsString}, - filePath(filePath), copyToOption{std::move(copyToOption)} {} + boundFileInfo{std::move(boundFileInfo)} {} bool canParallel() const override { return false; } @@ -22,12 +22,11 @@ class ExportDB : public PhysicalOperator { inline std::unique_ptr clone() override { return std::make_unique( - filePath, std::move(copyToOption), id, paramsString, std::move(children)); + std::move(boundFileInfo), id, paramsString, std::move(children)); } private: - std::string filePath; - common::CSVOption copyToOption; + common::ReaderConfig boundFileInfo; }; } // namespace processor } // namespace kuzu diff --git a/src/planner/plan/plan_port_db.cpp b/src/planner/plan/plan_port_db.cpp index a2353dbc62e..d9953b3c6b6 100644 --- a/src/planner/plan/plan_port_db.cpp +++ b/src/planner/plan/plan_port_db.cpp @@ -1,8 +1,8 @@ #include "binder/copy/bound_export_database.h" +#include "common/string_utils.h" #include "planner/operator/persistent/logical_copy_to.h" #include "planner/operator/persistent/logical_export_db.h" #include "planner/planner.h" - using namespace kuzu::binder; using namespace kuzu::storage; using namespace kuzu::catalog; @@ -20,18 +20,21 @@ std::unique_ptr Planner::planExportDatabase(const BoundStatement& s auto exportData = boundExportDatabase.getExportData(); auto logicalOperators = std::vector>(); auto plan = std::make_unique(); + auto fileTypeStr = FileTypeUtils::toString(fileType); + StringUtils::toLower(fileTypeStr); + auto copyToSuffix = "." + fileTypeStr; for (auto& exportTableData : *exportData) { auto regularQuery = exportTableData.getRegularQuery(); KU_ASSERT(regularQuery->getStatementType() == StatementType::QUERY); auto tablePlan = getBestPlan(*regularQuery); - auto copyTo = - std::make_shared(filePath + "/" + exportTableData.tableName + ".csv", - fileType, exportTableData.columnNames, exportTableData.getColumnTypesRef(), - boundExportDatabase.getCopyOption()->copy(), tablePlan->getLastOperator()); + auto copyTo = std::make_shared( + filePath + "/" + exportTableData.tableName + copyToSuffix, fileType, + exportTableData.columnNames, exportTableData.getColumnTypesRef(), + boundExportDatabase.getCopyOption(), tablePlan->getLastOperator()); logicalOperators.push_back(std::move(copyTo)); } auto exportDatabase = make_shared( - filePath, boundExportDatabase.getCopyOption()->copy(), std::move(logicalOperators)); + boundExportDatabase.getBoundFileInfo()->copy(), std::move(logicalOperators)); plan->setLastOperator(std::move(exportDatabase)); return plan; } diff --git a/src/processor/map/map_port_db.cpp b/src/processor/map/map_port_db.cpp index 2287ed9b267..f7a5fd5ced2 100644 --- a/src/processor/map/map_port_db.cpp +++ b/src/processor/map/map_port_db.cpp @@ -13,15 +13,14 @@ std::unique_ptr PlanMapper::mapExportDatabase( planner::LogicalOperator* logicalOperator) { auto exportDatabase = ku_dynamic_cast(logicalOperator); - auto filePath = exportDatabase->getFilePath(); std::vector> children; for (auto childCopyTo : exportDatabase->getChildren()) { auto childPhysicalOperator = mapOperator(childCopyTo.get()); children.push_back(std::move(childPhysicalOperator)); } std::unique_ptr resultSetDescriptor; - return std::make_unique(filePath, exportDatabase->getCopyOption()->copy(), - getOperatorID(), exportDatabase->getExpressionsForPrinting(), std::move(children)); + return std::make_unique(exportDatabase->getBoundFileInfo()->copy(), getOperatorID(), + exportDatabase->getExpressionsForPrinting(), std::move(children)); } } // namespace processor diff --git a/src/processor/operator/persistent/export_db.cpp b/src/processor/operator/persistent/export_db.cpp index 61884cf012d..7f80a5c2496 100644 --- a/src/processor/operator/persistent/export_db.cpp +++ b/src/processor/operator/persistent/export_db.cpp @@ -8,6 +8,7 @@ #include "catalog/catalog_entry/node_table_catalog_entry.h" #include "catalog/catalog_entry/rel_table_catalog_entry.h" #include "common/file_system/virtual_file_system.h" +#include "common/string_utils.h" using namespace kuzu::common; using namespace kuzu::transaction; @@ -26,10 +27,14 @@ static void writeStringStreamToFile( } static void writeCopyStatement( - stringstream& ss, std::string tableName, std::string filePath, std::string copyOption) { + stringstream& ss, std::string tableName, ReaderConfig* boundFileInfo) { ss << "COPY "; - ss << tableName << " FROM \"" << filePath << "/" << tableName << ".csv"; - ss << "\"" << copyOption << std::endl; + ss << tableName << " FROM \"" << boundFileInfo->filePaths[0] << "/" << tableName; + auto fileTypeStr = FileTypeUtils::toString(boundFileInfo->fileType); + StringUtils::toLower(fileTypeStr); + ss << "." << fileTypeStr; + auto csvConfig = common::CSVReaderConfig::construct(boundFileInfo->options); + ss << "\"" << csvConfig.option.toCypher() << std::endl; } std::string getSchemaCypher(main::ClientContext* clientContext, transaction::Transaction* tx) { @@ -52,16 +57,16 @@ std::string getMacroCypher(catalog::Catalog* catalog, transaction::Transaction* return ss.str(); } -std::string getCopyCypher(catalog::Catalog* catalog, transaction::Transaction* tx, - std::string filePath, std::string copyOption) { +std::string getCopyCypher( + catalog::Catalog* catalog, transaction::Transaction* tx, ReaderConfig* boundFileInfo) { stringstream ss; for (auto& nodeTableEntry : catalog->getNodeTableEntries(tx)) { auto tableName = nodeTableEntry->getName(); - writeCopyStatement(ss, tableName, filePath, copyOption); + writeCopyStatement(ss, tableName, boundFileInfo); } for (auto& relTableEntry : catalog->getRelTableEntries(tx)) { auto tableName = relTableEntry->getName(); - writeCopyStatement(ss, tableName, filePath, copyOption); + writeCopyStatement(ss, tableName, boundFileInfo); } return ss.str(); } @@ -70,17 +75,17 @@ bool ExportDB::getNextTuplesInternal(ExecutionContext* context) { // write the schema.cypher file writeStringStreamToFile(context->clientContext->getVFSUnsafe(), getSchemaCypher(context->clientContext, context->clientContext->getTx()), - filePath + "/schema.cypher"); + boundFileInfo.filePaths[0] + "/schema.cypher"); // write macro.cypher file writeStringStreamToFile(context->clientContext->getVFSUnsafe(), getMacroCypher(context->clientContext->getCatalog(), context->clientContext->getTx()), - filePath + "/macro.cypher"); + boundFileInfo.filePaths[0] + "/macro.cypher"); // write the copy.cypher file // for every table, we write COPY FROM statement writeStringStreamToFile(context->clientContext->getVFSUnsafe(), - getCopyCypher(context->clientContext->getCatalog(), context->clientContext->getTx(), - filePath, copyToOption.toCypher()), - filePath + "/copy.cypher"); + getCopyCypher( + context->clientContext->getCatalog(), context->clientContext->getTx(), &boundFileInfo), + boundFileInfo.filePaths[0] + "/copy.cypher"); return false; } } // namespace processor diff --git a/test/test_files/copy/export_db.test b/test/test_files/copy/export_db.test index 17ad25bdafb..8d3007554c4 100644 --- a/test/test_files/copy/export_db.test +++ b/test/test_files/copy/export_db.test @@ -26,7 +26,7 @@ 5 -CASE ExportDatabaseWithCSVOption --STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db2" (header=true) +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db2" (format="csv", header=true) ---- ok -IMPORT_DATABASE "${KUZU_EXPORT_DB_DIRECTORY}/demo-db2" -STATEMENT MATCH (u:User) WHERE u.name = 'Adam' SET u.age = 50 @@ -35,6 +35,44 @@ -STATEMENT MATCH (u:User) WHERE u.name='Adam' RETURN u.age ---- 1 50 --STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db2" (header=true) +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db2" (format='csv', header=true) ---- error Binder exception: Directory ${KUZU_EXPORT_DB_DIRECTORY}/demo-db2 already exists. + +-CASE ExportDatabaseWithPARQUET +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db3" (format='parquet') +---- ok +-IMPORT_DATABASE "${KUZU_EXPORT_DB_DIRECTORY}/demo-db3" +-STATEMENT MATCH (u:User) WHERE u.name = 'Adam' SET u.age = 50 +---- ok +-LOG ReturnAge +-STATEMENT MATCH (u:User) WHERE u.name='Adam' RETURN u.age +---- 1 +50 + +-CASE ExportDatabaseError +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" (format='TURTLE') +---- error +Binder exception: Unsupported file type: TURTLE. + +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" (format='npy') +---- error +Binder exception: Export database currently only supports csv and parquet files. + +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" (format=false) +---- error +Binder exception: The type of format option must be a string. + +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" (format='PARQUET', header=true) +---- error +Binder exception: Only export to csv can have options. + +-STATEMENT Export Database "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" (header=true) +---- ok +-IMPORT_DATABASE "${KUZU_EXPORT_DB_DIRECTORY}/demo-db4" +-STATEMENT MATCH (u:User) WHERE u.name = 'Adam' SET u.age = 50 +---- ok +-LOG ReturnAge +-STATEMENT MATCH (u:User) WHERE u.name='Adam' RETURN u.age +---- 1 +50