diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index aa9f01a86217..b3cb6d56e3a6 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -3,6 +3,16 @@ set(TORCH_API_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/dataloader.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_classification.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_comments.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_dssm.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_empty_values.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_exponent_values.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_generic.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_learning_to_rank.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_part_of_speech_tagging.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_sequence_classification.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_sequence_to_sequence.cpp ${TORCH_API_TEST_DIR}/expanding-array.cpp ${TORCH_API_TEST_DIR}/integration.cpp ${TORCH_API_TEST_DIR}/jit.cpp diff --git a/test/cpp/api/data/ctf/ctf_sample_classification.cpp b/test/cpp/api/data/ctf/ctf_sample_classification.cpp new file mode 100644 index 000000000000..9126bd24bf97 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_classification.cpp @@ -0,0 +1,126 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_CLASSIFICATION_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "features", + "features", + 5, + CTFInputStreamType::Feature, + CTFDataStorage::Dense); + input_streams.emplace_back( + 1, + "class", + "class", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_classification.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 (implicit) +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto sparse_stream_ptr = + static_cast*>( + sequence[input_stream_id].get()); + // |class 23:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(23); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 2 3 4 5 6 + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 (implicit) +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + 
sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + + { + input_stream_id = 1; + auto sparse_stream_ptr = + static_cast*>( + sequence[input_stream_id].get()); + // |class 13:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(13); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 1 2 0 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_comments.cpp b/test/cpp/api/data/ctf/ctf_sample_comments.cpp new file mode 100644 index 000000000000..726e533a01a6 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_comments.cpp @@ -0,0 +1,189 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_COMMENTS_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "A", "A", 5, CTFInputStreamType::Feature, CTFDataStorage::Dense); + input_streams.emplace_back( + 1, "B", "B", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 2, "C", "C", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_comments.ctf"), + input_streams, + CTFDataType(CTFDataType::Float)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Float, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 (implicit) +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 100:3 123:4 + sparse_stream_ptr->data.push_back(3); + sparse_stream_ptr->indices.push_back(100); + sparse_stream_ptr->data.push_back(4); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C 8 + dense_stream_ptr->data.push_back(8); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 0 1 2 3 4 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + dense_stream_ptr->data.push_back(4); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 (implicit) +#ifdef CTF_DEBUG + 
sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 0 1.1 22 0.3 54 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1.1); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(0.3); + dense_stream_ptr->data.push_back(54); + } + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C 123917 + dense_stream_ptr->data.push_back(123917); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 1134:1.911 13331:0.014 + sparse_stream_ptr->data.push_back(1.911); + sparse_stream_ptr->indices.push_back(1134); + sparse_stream_ptr->data.push_back(0.014); + sparse_stream_ptr->indices.push_back(13331); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 2 (implicit) +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C -0.001 + dense_stream_ptr->data.push_back(-0.001); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 3.9 1.11 121.2 99.13 0.04 + dense_stream_ptr->data.push_back(3.9); + dense_stream_ptr->data.push_back(1.11); + dense_stream_ptr->data.push_back(121.2); + dense_stream_ptr->data.push_back(99.13); + dense_stream_ptr->data.push_back(0.04); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 999:0.001 918918:-9.19 + sparse_stream_ptr->data.push_back(0.001); + sparse_stream_ptr->indices.push_back(999); + sparse_stream_ptr->data.push_back(-9.19); + sparse_stream_ptr->indices.push_back(918918); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_dssm.cpp b/test/cpp/api/data/ctf/ctf_sample_dssm.cpp new file mode 100644 index 000000000000..77abdb3a2687 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_dssm.cpp @@ -0,0 +1,131 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_DSSN_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "src", "src", 0, 
CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "tgt", "tgt", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_dssm.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |src 12:1 23:1 345:2 45001:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(23); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(345); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(45001); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tgt 233:1 766:2 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(233); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(766); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |src 123:1 56:1 10324:1 18001:3 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(56); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10324); + sparse_stream_ptr->data.push_back(3); + sparse_stream_ptr->indices.push_back(18001); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tgt 233:1 2344:2 8889:1 2234:1 253434:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(233); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(2344); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(8889); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(2234); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(253434); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff 
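
Note: throughout these test hunks the angle-bracket template arguments appear to have been stripped during extraction (e.g. "std::make_shared>(...)" and "static_cast*>(...)"). For orientation, the following is a minimal sketch of the parse-and-check pattern the DSSM test above follows, with the elided pieces filled in as assumptions: the value type is taken to be double (to match CTFDataType::Double), the stream-description type is given the placeholder name CTFInputStreamInformation (its real name is not visible in this diff), and the header is assumed to be installed as <torch/data/ctf/ctf_parser.h>.

#include <torch/data/ctf/ctf_parser.h> // assumed include path for the new CTF parser

#include <memory>
#include <string>
#include <vector>

using namespace torch::data::ctf;

// Sketch: parse ctf_sample_dssm.ctf and inspect the first "|src 12:1 23:1 345:2 45001:1" entry.
void parse_dssm_sample() {
  std::vector<CTFInputStreamInformation> input_streams; // placeholder type name
  input_streams.emplace_back(
      0, "src", "src", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse);
  input_streams.emplace_back(
      1, "tgt", "tgt", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse);

  CTFConfiguration config(
      std::string("./test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf"),
      input_streams,
      CTFDataType(CTFDataType::Double));

  CTFParser<double> ctf_parser(config); // value type assumed to be double
  ctf_parser.read_from_file();

  // The parser owns a CTFDataset; each CTF line becomes one sequence, and each
  // "index:value" pair lands in the parallel indices/data vectors of the
  // corresponding sparse input stream.
  std::shared_ptr<CTFDataset<double>> dataset = ctf_parser.get_dataset();
  auto* src = static_cast<CTFSparseInputStreamData<double>*>(
      dataset->sequences[0][0].get());
  // Expected from "|src 12:1 23:1 345:2 45001:1":
  //   indices = {12, 23, 345, 45001}, data = {1, 1, 2, 1}.
  (void)src;
}
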
--git a/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp b/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp new file mode 100644 index 000000000000..15acf25c703e --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp @@ -0,0 +1,89 @@ +#include + +#include +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_EMPTY_VALUES_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "F0", "F0", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "F1", "F1", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + input_streams.emplace_back( + 2, "F2", "F2", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_empty_values.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + // |F0 + } +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + dataset.sequences.push_back(sequence); + } + { + // 2 +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + // |F0 |F1 |F2 + } +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + dataset.sequences.push_back(sequence); + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp b/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp new file mode 100644 index 000000000000..6ddb1acbaa75 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp @@ -0,0 +1,93 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_EXPONENT_VALUE_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "F0", "F0", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "T0", "T0", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_exponent_values.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser 
ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |F0 0:0.421826 1:1.42167 2:-4.13626e-000123 5:-1.83832 7:-0.000114865 + // 9:-36288.6 11:113.553 13:4.25123e+009 16:-1.78095e-005 18:-0.00162638 + // 19:-1.07109 + sparse_stream_ptr->indices.push_back(0); + sparse_stream_ptr->data.push_back(0.421826); + sparse_stream_ptr->indices.push_back(1); + sparse_stream_ptr->data.push_back(1.42167); + sparse_stream_ptr->indices.push_back(2); + sparse_stream_ptr->data.push_back(-4.13626e-000123); + sparse_stream_ptr->indices.push_back(5); + sparse_stream_ptr->data.push_back(-1.83832); + sparse_stream_ptr->indices.push_back(7); + sparse_stream_ptr->data.push_back(-0.000114865); + sparse_stream_ptr->indices.push_back(9); + sparse_stream_ptr->data.push_back(-36288.6); + sparse_stream_ptr->indices.push_back(11); + sparse_stream_ptr->data.push_back(113.553); + sparse_stream_ptr->indices.push_back(13); + sparse_stream_ptr->data.push_back(4.25123e+009); + sparse_stream_ptr->indices.push_back(16); + sparse_stream_ptr->data.push_back(-1.78095e-005); + sparse_stream_ptr->indices.push_back(18); + sparse_stream_ptr->data.push_back(-0.00162638); + sparse_stream_ptr->indices.push_back(19); + sparse_stream_ptr->data.push_back(-1.07109); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |T0 1 + dense_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_generic.cpp b/test/cpp/api/data/ctf/ctf_sample_generic.cpp new file mode 100644 index 000000000000..d5e0371314f0 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_generic.cpp @@ -0,0 +1,246 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_GENERIC_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "a", "a", 3, CTFInputStreamType::Feature, CTFDataStorage::Dense); + input_streams.emplace_back( + 1, "b", "b", 2, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_generic.ctf"), + input_streams, + CTFDataType(CTFDataType::Int32)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int32, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 100 +#ifdef CTF_DEBUG + sequence_id = 100; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, 
input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + // a 4 5 6 + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + // |a 7 8 9 + dense_stream_ptr->data.push_back(7); + dense_stream_ptr->data.push_back(8); + dense_stream_ptr->data.push_back(9); + // |a 7 8 9 + dense_stream_ptr->data.push_back(7); + dense_stream_ptr->data.push_back(8); + dense_stream_ptr->data.push_back(9); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + // |b 101 201 + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + // |b 102983 14532 + dense_stream_ptr->data.push_back(102983); + dense_stream_ptr->data.push_back(14532); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 200 +#ifdef CTF_DEBUG + sequence_id = 200; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 300 400 + dense_stream_ptr->data.push_back(300); + dense_stream_ptr->data.push_back(400); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 10 20 30 + dense_stream_ptr->data.push_back(10); + dense_stream_ptr->data.push_back(20); + dense_stream_ptr->data.push_back(30); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 333 +#ifdef CTF_DEBUG + sequence_id = 333; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + + // |b 500 100 + dense_stream_ptr->data.push_back(500); + dense_stream_ptr->data.push_back(100); + // |b 600 -900 + dense_stream_ptr->data.push_back(600); + dense_stream_ptr->data.push_back(-900); + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + } + + { + // 400 +#ifdef CTF_DEBUG + sequence_id = 400; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + 
dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + // |a 4 5 6 + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + // |a 4 5 6 TODO: repeated lines should be considered invalid + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + // |b 101 201 + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + // |b 101 201 TODO: repeated lines should be considered invalid + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 500 +#ifdef CTF_DEBUG + sequence_id = 500; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp b/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp new file mode 100644 index 000000000000..d2b288fdcbd1 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp @@ -0,0 +1,226 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_LEARNING_TO_RANK_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "features", + "features", + 12, + CTFInputStreamType::Feature, + CTFDataStorage::Dense); + input_streams.emplace_back( + 1, + "rating", + "rating", + 1, + CTFInputStreamType::Label, + CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_learning_to_rank.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, 
input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 23 35 0 0 0 21 2345 0 0 0 0 0 + dense_stream_ptr->data.push_back(23); + dense_stream_ptr->data.push_back(35); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(21); + dense_stream_ptr->data.push_back(2345); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + // |features 0 123 0 22 44 44 290 22 22 22 33 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(123); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(44); + dense_stream_ptr->data.push_back(44); + dense_stream_ptr->data.push_back(290); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(33); + dense_stream_ptr->data.push_back(0); + // |features 0 0 0 0 0 0 1 0 0 0 0 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 4 + dense_stream_ptr->data.push_back(4); + // |rating 2 + dense_stream_ptr->data.push_back(2); + // |rating 1 + dense_stream_ptr->data.push_back(1); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 34 56 0 0 0 45 1312 0 0 0 0 0 + dense_stream_ptr->data.push_back(34); + dense_stream_ptr->data.push_back(56); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(1312); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + // |features 45 45 0 0 0 12 335 0 0 0 0 0 + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(12); + dense_stream_ptr->data.push_back(335); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + 
dense_stream_ptr->data.push_back(0); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 1 + dense_stream_ptr->data.push_back(1); + // |rating 0 + dense_stream_ptr->data.push_back(0); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 2 +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 0 0 0 0 0 0 22 0 0 0 0 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 0 + dense_stream_ptr->data.push_back(0); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp b/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp new file mode 100644 index 000000000000..a1aef3d709eb --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp @@ -0,0 +1,129 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_PART_OF_SPEECH_TAGGING_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "word", + "word", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "tag", "tag", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |word 123:1 + 
sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |word 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tag 12:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + // |tag 10:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10); + // |tag 13:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(13); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |word 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tag 12:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + // |tag 10:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp b/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp new file mode 100644 index 000000000000..a4990dea097e --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp @@ -0,0 +1,125 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_SEQUENCE_CLASSIFICATION_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "word", + "word", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, + "class", + "class", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_sequence_classification.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto 
sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->indices.push_back(234); + sparse_stream_ptr->data.push_back(1); + // |word 123:1 + sparse_stream_ptr->indices.push_back(123); + sparse_stream_ptr->data.push_back(1); + // |word 890:1 + sparse_stream_ptr->indices.push_back(890); + sparse_stream_ptr->data.push_back(1); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |class 3:1 + sparse_stream_ptr->indices.push_back(3); + sparse_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 11:1 + sparse_stream_ptr->indices.push_back(11); + sparse_stream_ptr->data.push_back(1); + // |word 344:1 + sparse_stream_ptr->indices.push_back(344); + sparse_stream_ptr->data.push_back(1); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |class 2:1 + sparse_stream_ptr->indices.push_back(2); + sparse_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp b/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp new file mode 100644 index 000000000000..49eb060001bd --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp @@ -0,0 +1,124 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_SEQUENCE_TO_SEQUENCE_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "sourceWord", + "sourceWord", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, + "targetWord", + "targetWord", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_sequence_to_sequence.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |sourceWord 234:1 + 
sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |sourceWord 11:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(11); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |targetWord 344:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(344); + // |targetWord 456:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(456); + // |targetWord 2222:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(2222); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf new file mode 100644 index 000000000000..55cb3984d67c --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf @@ -0,0 +1,2 @@ +|class 23:1 |features 2 3 4 5 6 +|class 13:1 |features 1 2 0 2 3 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf new file mode 100644 index 000000000000..ffdfdc06ea5c --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf @@ -0,0 +1,3 @@ +|B 100:3 123:4 |C 8 |A 0 1 2 3 4 |# a CTF comment +|# another comment |A 0 1.1 22 0.3 54 |C 123917 |B 1134:1.911 13331:0.014 +|C -0.001 |# a comment with an escaped pipe: '|#' |A 3.9 1.11 121.2 99.13 0.04 |B 999:0.001 918918:-9.19 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf new file mode 100644 index 000000000000..c9e188a4d4fa --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf @@ -0,0 +1,2 @@ +|src 12:1 23:1 345:2 45001:1 |tgt 233:1 766:2 234:1 +|src 123:1 56:1 10324:1 18001:3 |tgt 233:1 2344:2 8889:1 2234:1 253434:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf new file mode 100644 index 000000000000..0fc1eb65c623 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf @@ -0,0 +1,2 @@ +1|F0 +2|F0 |F1 |F2 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf new file mode 100644 index 000000000000..8fead46727a9 --- /dev/null +++ 
b/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf
@@ -0,0 +1 @@
+0 |F0 0:0.421826 1:1.42167 2:-4.13626e-000123 5:-1.83832 7:-0.000114865 9:-36288.6 11:113.553 13:4.25123e+009 16:-1.78095e-005 18:-0.00162638 19:-1.07109 |T0 1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf
new file mode 100644
index 000000000000..66f0dcf1c40d
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf
@@ -0,0 +1,11 @@
+100 |a 1 2 3 |b 100 200 |# comment at the end of line.
+100 |a 4 5 6 |b 101 201
+100 |b 102983 14532 |a 7 8 9
+100 |a 7 8 9
+200 |b 300 400 |a 10 20 30
+333 |b 500 100
+333 |b 600 -900
+400 |a 1 2 3 |b 100 200
+|a 4 5 6 |b 101 201
+|a 4 5 6 |b 101 201
+500 |a 1 2 3 |b 100 200
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf
new file mode 100644
index 000000000000..9c8eb9c8cbf8
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf
@@ -0,0 +1,6 @@
+0 |rating 4 |features 23 35 0 0 0 21 2345 0 0 0 0 0
+0 |rating 2 |features 0 123 0 22 44 44 290 22 22 22 33 0
+0 |rating 1 |features 0 0 0 0 0 0 1 0 0 0 0 0
+1 |rating 1 |features 34 56 0 0 0 45 1312 0 0 0 0 0
+1 |rating 0 |features 45 45 0 0 0 12 335 0 0 0 0 0
+2 |rating 0 |features 0 0 0 0 0 0 22 0 0 0 0 0
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf
new file mode 100644
index 000000000000..74758fa30dfa
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf
@@ -0,0 +1,3 @@
+1 |word 134:1 |tag 12:1
+1 |word 123:1 |tag 10:1
+1 |word 123:1 |tag 13:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf
new file mode 100644
index 000000000000..6d4681fb0b56
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf
@@ -0,0 +1,3 @@
+2 |word 234:1 |tag 22:1
+2 |word 223:1 |tag 20:1
+2 |word 223:1 |tag 23:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf
new file mode 100644
index 000000000000..550d09c030b1
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf
@@ -0,0 +1,3 @@
+3 |word 334:1 |tag 32:1
+3 |word 323:1 |tag 30:1
+3 |word 323:1 |tag 33:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf
new file mode 100644
index 000000000000..8410928bde0b
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf
@@ -0,0 +1,3 @@
+4 |word 434:1 |tag 42:1
+4 |word 423:1 |tag 40:1
+4 |word 423:1 |tag 43:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf
new file mode 100644
index 000000000000..ea7cb6344531
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf
@@ -0,0 +1,3 @@
+5 |word 534:1 |tag 52:1
+5 |word 523:1 |tag 50:1
+5 |word 523:1 |tag 53:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf
new file mode 100644
index 000000000000..023b6c69d1de
--- /dev/null
+++
b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf @@ -0,0 +1,2 @@ +6 |word 634:1 |tag 62:1 +6 |word 623:1 |tag 60:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf new file mode 100644 index 000000000000..263aba8a7d17 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf @@ -0,0 +1 @@ +7 |word 734:1 |tag 72:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf new file mode 100644 index 000000000000..4f325f78aba6 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf @@ -0,0 +1,5 @@ +0 |word 234:1 |tag 12:1 +0 |word 123:1 |tag 10:1 +0 |word 123:1 |tag 13:1 +1 |word 234:1 |tag 12:1 +1 |word 123:1 |tag 10:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf new file mode 100644 index 000000000000..61ea12fa60b1 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf @@ -0,0 +1,5 @@ +0 |word 234:1 |class 3:1 +0 |word 123:1 +0 |word 890:1 +1 |word 11:1 |class 2:1 +1 |word 344:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf new file mode 100644 index 000000000000..342045e9ea73 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf @@ -0,0 +1,5 @@ +0 |sourceWord 234:1 |targetWord 344:1 +0 |sourceWord 123:1 |targetWord 456:1 +0 |sourceWord 123:1 |targetWord 2222:1 +0 |sourceWord 11:1 +1 |sourceWord 123:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_samples.h b/test/cpp/api/data/ctf/samples/ctf_samples.h new file mode 100644 index 000000000000..35721b94ca43 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_samples.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +static const std::string CTF_SAMPLE_DIR("./test/cpp/api/data/ctf/samples"); + +#ifdef CTF_DEBUG +template +void print_data(CTFDataset dataset) { + + size_t index = 0; + for (const auto& sequence_data : dataset.sequences) { + std::cerr << dataset.sequences_id[index] << " "; + for (const auto input_stream : sequence_data) { + auto input_stream_id = input_stream.get()->input_stream_id; + const auto& input_stream_info = dataset.input_streams[input_stream_id]; + + std::string input_stream_type; + if (input_stream_info.type == CTFInputStreamType::Feature) { + input_stream_type = "F"; + } else { + input_stream_type = "L"; + } + std::cerr << " |" << input_stream_info.name << "(" << input_stream_type + << ")"; + + if (input_stream_info.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + reinterpret_cast*>( + input_stream.get()); + + if (dense_data->data.empty()) { + std::cerr << " "; + } else { + for (const auto& value : dense_data->data) { + std::cerr << " " << value; + } + } + } else { + // TODO: print row start somewhere + CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + input_stream.get()); + + if (sparse_data->data.empty()) { + std::cerr << " "; + } else { + size_t col_index = 0; + for (const auto& value : sparse_data->data) { + std::cerr << " " << sparse_data->indices[col_index++] << ":" + << value; + } + } + } + } + std::cerr << std::endl; + ++index; + } +} +#endif + +} // namespace ctf +} // 
namespace data +} // namespace torch diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index 461dfe56338b..1eb2b09108a5 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -1,10 +1,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1274,3 +1277,310 @@ TEST(DataLoaderTest, StatefulDatasetWithCollate) { ASSERT_TRUE(batch->data[0].allclose(torch::ones(kBatchSize + 1))); ASSERT_TRUE(batch->target[0].allclose(torch::zeros(kBatchSize - 1))); } + +class DummyChunkDataSet : public datasets::ChunkDataSet< + DummyChunkDataSet, + std::vector, + samplers::SequentialSampler, + samplers::SequentialSampler> { + public: + using BatchType = torch::optional>; + using BatchRequestType = size_t; + DummyChunkDataSet(size_t num_chunks, size_t batch_size) + : datasets::ChunkDataSet< + DummyChunkDataSet, + std::vector, + samplers::SequentialSampler, + samplers::SequentialSampler>(), + num_chunks_(num_chunks), + batch_size_(batch_size), + chunk_sampler_(std::move(samplers::SequentialSampler(num_chunks))), + example_sampler_(std::move(samplers::SequentialSampler(batch_size))) {} + + std::vector read_chunk(size_t chunk_index) override { + std::vector batch(batch_size_); + size_t counter = chunk_index * batch_size_; + for (auto& i : batch) { + i = counter++; + } + return batch; + } + + /// Simply returns an entire chunk to test the API for now. + torch::optional> get_batch(size_t batch_size) override { + int index = chunk_index_.fetch_add(1); + if (index < num_chunks_) { + return read_chunk(index); + } + return torch::nullopt; + } + + samplers::SequentialSampler get_chunk_sampler() override { + return chunk_sampler_; + } + + samplers::SequentialSampler get_example_sampler() override { + return example_sampler_; + } + + size_t get_chunk_count() override { + return num_chunks_; + } + + private: + std::atomic chunk_index_{0}; + size_t num_chunks_; + size_t batch_size_; + samplers::SequentialSampler chunk_sampler_; + samplers::SequentialSampler example_sampler_; +}; + +TEST(DataTest, DataLoaderWithChunkSupportSingleWorker) { + const size_t kBatchSize = 13; + const size_t kNumChunks = 10; + + auto dataset = torch::data::datasets::make_shared_dataset( + kNumChunks, kBatchSize) + .map(transforms::BatchLambda, int>( + [](const std::vector& x) { + return std::accumulate(x.begin(), x.end(), 0); + })); + auto data_loader = + torch::data::make_data_loader(dataset, DataLoaderOptions(kBatchSize)); + + int count = 0; + for (int sum : *data_loader) { + int res = 0; + for (int i = 0; i < kBatchSize; ++i) { + res += count * kBatchSize + i; + } + ASSERT_EQ(sum, res); + count++; + } + ASSERT_EQ(count, 10); +} + +TEST(DataTest, DataLoaderWithChunkSupportMultiWorker) { + const size_t kBatchSize = 13; + const size_t kNumChunks = 10; + + auto dataset = torch::data::datasets::make_shared_dataset( + kNumChunks, kBatchSize) + .map(transforms::BatchLambda, int>( + [](const std::vector& x) { + return std::accumulate(x.begin(), x.end(), 0); + })); + auto data_loader = + torch::data::make_data_loader(dataset, DataLoaderOptions(kBatchSize)); + + int count = 0; + int result_sum = 0; + int expected_sum = 0; + for (int sum : *data_loader) { + result_sum += sum; + for (int i = 0; i < kBatchSize; ++i) { + expected_sum += count * kBatchSize + i; + } + count++; + } + ASSERT_EQ(result_sum, expected_sum); +} + +/// ctf_sample_part_of_speech_tagging.ctf has 2 batches with 1 example each 
+TEST(DataTest, CTFDataLoaderWithChunkSupportSingleWorkerSingleChunk) { + const size_t batch_size = 1; + const size_t total_workers = 1; + const size_t total_example = 2; + const size_t max_jobs = 2 * total_workers; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(config); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .batch_size(batch_size) + .chunk_loading(true) + .workers(total_workers) + .max_jobs(max_jobs)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + if (batch.size() != 0) { + count_example += batch.size(); + ASSERT_EQ(batch.size(), batch_size); + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... + } + } + ASSERT_EQ(total_example, count_example); +} + +// ctf_sample_part_of_speech_tagging.ctf has a single batch with 2 examples +TEST( + DataTest, + CTFDataLoaderWithChunkSupportSingleWorkerSingleChunkTwoExamplePerBatch) { + const size_t batch_size = 2; + const size_t total_example = 2; + const size_t total_worker = 1; + const size_t max_jobs = 2 * total_worker; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(config); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .workers(total_worker) + .max_jobs(max_jobs) + .batch_size(batch_size) + .chunk_loading(true)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + if (batch.size() != 0) { + ASSERT_EQ(batch.size(), batch_size); + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... 
+ count_example += batch.size(); + } + } + ASSERT_EQ(total_example, count_example); +} + +// ctf_sample_multiple_chunks_0000[0...6].ctf has 7 chunks with 3 batches each +// (last chunk has 1 batch) and one example per batch +TEST(DataTest, CTFDataLoaderWithChunkSupportMultipleWorkersMultipleChunks) { + const size_t batch_size = 3; + const size_t total_example = 7; + const size_t total_prefetch = 2; + const size_t total_worker = 10; + const size_t max_jobs = 2 * total_worker; + + std::vector configs; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + + for (size_t i = 0; i < total_example; ++i) { + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_multiple_chunks_000" + std::to_string(i) + ".ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + configs.push_back(config); + } + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(configs, total_prefetch); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .workers(total_worker) + .max_jobs(max_jobs) + .batch_size(batch_size) + .chunk_loading(true)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + count_example += batch.size(); + for (size_t b = 0; b < batch.size(); ++b) { + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... 
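      // Hedged sketch of a per-sequence check (assumption: each element of
      // `batch` is a CTFSequenceData holding one entry per configured input
      // stream, i.e. "word" and "tag" as set up above):
      ASSERT_EQ(batch[b].size(), input_streams.size());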
+ } + } + ASSERT_EQ(total_example, count_example); +} diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7de5815c5c55..342c5391ea68 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -239,6 +239,7 @@ if (NOT NO_API) ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/ctf/reader.cpp ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp diff --git a/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h b/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h new file mode 100644 index 000000000000..c1eaa0e2cc42 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace torch { +namespace data { +namespace ctf { + + +template < + typename DataType = double, + typename ChunkSampler = samplers::RandomSampler, + typename ExampleSampler = samplers::RandomSampler> +class CTFChunkDataset + : public datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler> { + public: + using BatchType = std::vector; + using ChunkSamplerType = ChunkSampler; + using ExampleSamplerType = ExampleSampler; + + /// Loads multiple CTF files on multiple chunks with parallelization + /// TODO: CTF files are not splitted, so they must fit in memory + explicit CTFChunkDataset( + std::vector configs, + size_t prefetch_count) + : datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler>(prefetch_count, false), + config_(configs), + chunk_sampler_(std::move(ChunkSampler(0))), + example_sampler_(std::move(ExampleSampler(0))) { + num_chunks_ = configs.size(); + } + + /// Loads a single CTF file on a single chunk without parallelization + /// TODO: CTF files are not splitted, so they must fit in memory + explicit CTFChunkDataset(ctf::CTFConfiguration config) + : datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler>(1, false), + chunk_sampler_(std::move(ChunkSampler(0))), + example_sampler_(std::move(ExampleSampler(0))) { + num_chunks_ = 1; + config_.push_back(config); + } + + std::vector read_chunk(size_t chunk_index) override { + // read file (which is a full chunk) + ctf::CTFParser ctf_parser(config_[chunk_index]); + ctf_parser.read_from_file(); + std::shared_ptr> ctf_dataset = + ctf_parser.get_dataset(); + + return std::move(ctf_dataset->sequences); + } + + ChunkSampler get_chunk_sampler() override { + return chunk_sampler_; + } + + ExampleSampler get_example_sampler() override { + return example_sampler_; + } + + size_t get_chunk_count() override { + return num_chunks_; + } + + + private: + std::vector config_; + size_t num_chunks_; + ChunkSampler chunk_sampler_; + ExampleSampler example_sampler_; +}; + + +} // namespace ctf +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/include/torch/data/ctf/ctf_parser.h b/torch/csrc/api/include/torch/data/ctf/ctf_parser.h new file mode 100644 index 000000000000..ed0ff9311929 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/ctf_parser.h @@ -0,0 +1,931 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#ifdef CTF_DEBUG +#include +#endif +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +/* + * CTF general 
format + * [Sequence_Id](Sample or Comment)+ + * where + * sequence_Id=(empty|[0-9]+) + * Sample=|Input_Name (Value )* + * Comment=|# some content + * Example: + * 100 |a 1 2 3 |b 100 200 + * 100 |a 4 5 6 |b 101 201 + * 100 |b 102983 14532 |a 7 8 9 + * 100 |a 7 8 9 + * 200 |b 300 400 |a 10 20 30 + * 333 |b 500 100 + * 333 |b 600 -900 + * 400 |a 1 2 3 |b 100 200 + * |a 4 5 6 |b 101 201 + * |a 4 5 6 |b 101 201 + * 500 |a 1 2 3 |b 100 200 + */ + +/// +/// Beginning of type definitions +/// + +/// +/// Enumeration type denoting data type of symbolic data entities or actual +/// data. +/// +enum class CTFDataType : unsigned int { + Unknown = 0, + Float = 1, + Double = 2, + UChar = 3, // So far only used internally in deserializers. + Float16 = 4, + Int8 = 5, + Int16 = 6, + Int32 = 7, +}; + +/// +/// Enumeration type denoting the format of storage +/// +enum class CTFDataStorage { Dense, Sparse }; +enum class CTFInputStreamType { Feature, Label }; + +/// +/// Input Stream information +/// +struct CTFInputStreamInformation { + // Self-assigned Unique ID of the input stream (do not assign it!) + // TODO: ugly, fix this! + size_t __id__; + // Unique name of the input stream + std::string name; + // Unique alias of the input + // Useful when the name is long + std::string alias; + // expected number of elements in a sample + // TODO: Only useful if number of samples is known + size_t dimension; + // Input streams belong to either Feature or Label + CTFInputStreamType type; + // Data storage of the stream + CTFDataStorage storage; + + CTFInputStreamInformation( + std::string name, + std::string alias, + size_t dimension, + CTFInputStreamType type, + CTFDataStorage storage) + : name(std::move(name)), + alias(std::move(alias)), + dimension(dimension), + type(type), + storage(storage){}; + + // Used for unit tests + CTFInputStreamInformation( + size_t id, + std::string name, + std::string alias, + size_t dimension, + CTFInputStreamType type, + CTFDataStorage storage) + : __id__(id), + name(std::move(name)), + alias(std::move(alias)), + dimension(dimension), + type(type), + storage(storage){}; +}; +inline bool operator==( + const CTFInputStreamInformation& lhs, + const CTFInputStreamInformation& rhs) { + return ( + lhs.__id__ == rhs.__id__ && lhs.name == rhs.name && + lhs.alias == rhs.alias && lhs.dimension == rhs.dimension && + lhs.type == rhs.type && lhs.storage == rhs.storage); +} + +inline bool operator!=( + const CTFInputStreamInformation& lhs, + const CTFInputStreamInformation& rhs) { + return !(lhs == rhs); +} + +/// +/// Helper to centralize all input information in a single object +/// +class CTFConfiguration { + public: + explicit CTFConfiguration( + const std::string& filepath, + const std::vector& input_streams_info, + CTFDataType data_type) + : filepath_(std::move(filepath)), + input_streams_info_(std::move(input_streams_info)), + data_type_(data_type){}; + + const std::vector& get_input_streams_info() const { + return input_streams_info_; + } + + const std::string& get_file_path() const { + return filepath_; + } + CTFDataType get_ctf_data_type() const { + return data_type_; + } + + private: + std::string filepath_; + std::vector input_streams_info_; + CTFDataType data_type_; +}; + +/// +/// Sequence ID type +/// -1 is used to flag an uninitialized Sequence ID +/// +typedef long int CTFSequenceID; + +#ifdef CTF_DEBUG +/// +/// Maps Sequenced ID to index at vector +/// +typedef std::map CTFSequenceMap; +#endif + +/// +/// Input Stream ID type +/// All Input Streamsare stored on a vector +/// 
and CTFInputStreamID is the index of a particular stream +/// +typedef size_t CTFInputStreamID; + +/// +/// Maps Input Stream names to a unique index +/// +typedef std::unordered_map + CTFInputStreamMapByName; + +/// +/// Used during sparse data parsing +/// +const size_t CTFValueIndexUninitialized = SIZE_MAX; +typedef size_t CTFValueIndex; + +/// +/// Sequence data type +/// The global vector of sequences and the vector of samples will use it +/// +struct CTFInpuStreamDataBase { + explicit CTFInpuStreamDataBase(size_t input_stream_id) + : input_stream_id(input_stream_id) {} + size_t input_stream_id; +}; +typedef std::shared_ptr CTFInpuStreamDataBasePtr; +typedef std::vector CTFSequenceData; + +/// +/// Dense data +/// +template +struct CTFDenseInputStreamData : CTFInpuStreamDataBase { + explicit CTFDenseInputStreamData(size_t input_stream_id, size_t capacity = 0) + : CTFInpuStreamDataBase(input_stream_id) { + if (capacity > 0) { + // TODO: On a per input stream storage, sample dimension is not useful + // data.reserve(capacity); + } + } + + std::vector data; +}; + +/// +/// Sparse data +/// +template +struct CTFSparseInputStreamData : CTFInpuStreamDataBase { + explicit CTFSparseInputStreamData( + size_t input_stream_id, + size_t dimension = 0) + : CTFInpuStreamDataBase(input_stream_id) { + if (dimension > 0) { + /// TODO: Reserve something for sparse input? 1%? 0.05% of dimension? + // data.reserve(dimension); + } + } + + std::vector indices; + std::vector data; +}; + +/// +/// CTFDataset centralizes all parsed data +/// +template +struct CTFDataset { + explicit CTFDataset( + CTFDataType data_type, + const std::vector& input_streams_info) + : data_type(data_type), input_streams_info(input_streams_info) {} + + bool operator==(const CTFDataset& rhs) const { + // Datasets must have the same type and number of sequences + if (this->data_type != rhs.data_type || + this->sequences.size() != rhs.sequences.size()) { + return false; + } + + for (size_t sequence_index = 0; sequence_index < this->sequences.size(); + ++sequence_index) { + // Each sequence buffer must have the same number of input streams + if (this->sequences[sequence_index].size() != + rhs.sequences[sequence_index].size()) { + return false; + } + // Each input stream must have the same number of values + for (size_t sequence_data_index = 0; + sequence_data_index < this->sequences[sequence_index].size(); + ++sequence_data_index) { + auto this_stream_ptr = + this->sequences[sequence_index][sequence_data_index].get(); + auto this_stream_id = this_stream_ptr->input_stream_id; + auto rhs_stream_ptr = + rhs.sequences[sequence_index][sequence_data_index].get(); + auto rhs_stream_id = rhs_stream_ptr->input_stream_id; + // Input streams IDs must match + if (this_stream_id != rhs_stream_id) { + return false; + } + + // Input stream metadata must match + const auto& this_input_stream_info = + this->input_streams_info[this_stream_id]; + const auto& rhs_input_stream_info = + rhs.input_streams_info[rhs_stream_id]; + if (this_input_stream_info != rhs_input_stream_info) { + return false; + } + + // Values inside each input stream must match + if (rhs_input_stream_info.storage == CTFDataStorage::Dense) { + auto this_dense_stream_ptr = + static_cast*>(this_stream_ptr); + auto rhs_dense_stream_ptr = + static_cast*>(rhs_stream_ptr); + if (this_dense_stream_ptr->data != rhs_dense_stream_ptr->data) { + return false; + } + } else { + auto this_sparse_stream_ptr = + static_cast*>(this_stream_ptr); + auto rhs_sparse_stream_ptr = + 
static_cast*>(rhs_stream_ptr); + if ((this_sparse_stream_ptr->indices != + rhs_sparse_stream_ptr->indices) || + (this_sparse_stream_ptr->data != rhs_sparse_stream_ptr->data)) { + return false; + } + } + } + } + + return true; + } + + bool operator!=(const CTFDataset& rhs) const { + return !(this == rhs); + } + + // TODO: Do we need this? Maybe for logging, only + CTFDataType data_type; + // Contains all sequences + // TODO: Performance consideration: CNTK knows the number of sequences in the + // chunk, allowing accurate memory reservation. Pytorch approach doesn't + std::vector sequences; + + // CTF Input Stream definitions for features and labels + std::vector input_streams_info; + + // Input stream map (maps input stream name to a unique ID) + CTFInputStreamMapByName input_streams_map; +#ifdef CTF_DEBUG + std::vector sequences_id; +#endif +}; + +/// +/// Beginning of implementation +/// + +template +class CTFParser { + public: + explicit CTFParser(const CTFConfiguration& config) + : data_type_(config.get_ctf_data_type()), + dataset_(std::make_shared>( + config.get_ctf_data_type(), + config.get_input_streams_info())), + scratch_(CTF_SCRATCH_LENGTH, '\0'), + reader_(std::make_shared(config.get_file_path())), + has_initial_sequence_id_(false), + previous_sequence_id_(-1) { + // TODO: Improve validation by iterating all streams checking + // CTFInputStreamType? + if (dataset_->input_streams_info.size() < 2) { + std::string error_msg( + "Missing 'features' or 'labels' CTF stream definitions!"); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // TODO: Improve it for unit testing too + // Creating unique IDs for all input streams + for (size_t i = 0; i < dataset_->input_streams_info.size(); ++i) { + CTFInputStreamInformation& stream = dataset_->input_streams_info[i]; + const std::string& name = stream.name; + dataset_->input_streams_map[name] = i; + dataset_->input_streams_info[i].__id__ = i; + } + } + +#ifdef CTF_DEBUG + void print_data(void) const { + size_t index = 0; + for (const auto& sequence_data : dataset_->sequences) { + std::cerr << dataset_->sequences_id[index] << " "; + for (const auto input_stream : sequence_data) { + auto input_stream_id = input_stream.get()->input_stream_id; + const auto& input_stream_info = + dataset_->input_streams_info[input_stream_id]; + + std::string input_stream_type; + if (input_stream_info.type == CTFInputStreamType::Feature) { + input_stream_type = "F"; + } else { + input_stream_type = "L"; + } + std::cerr << " |" << input_stream_info.name << "(" << input_stream_type + << ")"; + + if (input_stream_info.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + reinterpret_cast*>( + input_stream.get()); + + if (dense_data->data.empty()) { + std::cerr << " "; + } else { + for (const auto& value : dense_data->data) { + std::cerr << " " << value; + } + } + } else { + CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + input_stream.get()); + + if (sparse_data->data.empty()) { + std::cerr << " "; + } else { + size_t col_index = 0; + for (const auto& value : sparse_data->data) { + std::cerr << " " << sparse_data->indices[col_index++] << ":" + << value; + } + } + } + } + std::cerr << std::endl; + ++index; + } + } +#endif + + std::shared_ptr> get_dataset() { + return dataset_; + } + + void read_from_file() { +#ifdef CTF_DEBUG + size_t read_count = 0; +#endif + + do { +#ifdef CTF_DEBUG + std::cout << "Read count: " << ++read_count << " starting at " + << 
reader_->get_position() << std::endl; +#endif + // CTF files start with valid alpha-numeric characters + if (is_non_printable(reader_->peek_char())) { + std::string error_msg( + "Non printable character anon print CTF file at position " + + std::to_string(reader_->get_position()) + "(" + + std::to_string(static_cast(reader_->peek_char())) + ")"); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // There can be an explicit sequence ID at the beginning of the line or + // the last known is used implicitly + CTFSequenceID sequence_id; + bool is_new_sequence = get_sequence_id(sequence_id); + + while (!is_eol(reader_->peek_char())) { + // After the sequence ID, there can be many input streams/comments + if (!get_input_stream(sequence_id, is_new_sequence)) { + if (!discard_comment()) { + std::string error_msg( + "Invalid CTF File. Neither a CTF Value nor a " + "CTF Comment was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } + } + // Discard EOL + reader_->get_char(); + } while (reader_->can_read()); + } + + private: + CTFParser() = delete; + DISALLOW_COPY_AND_ASSIGN(CTFParser); + + bool get_sequence_id(CTFSequenceID& sequence_id) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + + // Flag to identify when a new Sequence ID is found + bool is_new = false; + + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // Sequence ID must start with a digit + char c = reader_->peek_char(); + if (!is_digit(c)) { +#ifdef CTF_DEBUG + std::cout << "Not a Sequence ID at position " << initial_pos << std::endl; +#endif + if (has_initial_sequence_id_) { + sequence_id = previous_sequence_id_; +#ifdef CTF_DEBUG + std::cout << "Using previous Sequence ID (" << previous_sequence_id_ + << ")" << std::endl; +#endif + } else { + is_new = true; + sequence_id = previous_sequence_id_ + 1; + +#ifdef CTF_DEBUG + std::cout << "Incremented previous Sequence ID (" << sequence_id << ")" + << std::endl; +#endif + } + previous_sequence_id_ = sequence_id; + return is_new; + } + + // Get all consecutive digits + while (is_digit(reader_->peek_char())) { + c = reader_->get_char(); + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the ID + while (is_value_delimiter(reader_->peek_char())) { + reader_->get_char(); + } + + // After Sequence ID, there must be a '|' + if (!is_name_prefix(reader_->peek_char())) { + std::string error_msg( + "Missing name delimiter for one of the sequences at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Convert string and return integral value + sequence_id = static_cast(std::stoull(scratch_.data())); +#ifdef CTF_DEBUG + std::cout << "Found Sequence ID '" << std::to_string(sequence_id) + << "' at position " << std::to_string(initial_pos) << std::endl; +#endif + + // Decides whether this is a new example or an existing one + if (previous_sequence_id_ != sequence_id && sequence_id != LONG_MAX) { + is_new = true; + } + + previous_sequence_id_ = sequence_id; + has_initial_sequence_id_ = true; + return is_new; + } + + bool get_input_stream( + const CTFSequenceID& sequence_id, + bool& is_new_sequence) { + // Create a new sequence with input_streams_info pre-allocated + if 
(is_new_sequence) { + is_new_sequence = false; + +#ifdef CTF_DEBUG + dataset_->sequences_id.emplace_back(sequence_id); +#endif + + // New sequence to be appended to dataset_->sequences + CTFSequenceData sequence; + + for (auto const& stream : dataset_->input_streams_info) { + CTFInputStreamID input_stream_id = + dataset_->input_streams_map[stream.name]; + if (stream.storage == CTFDataStorage::Dense) { + // TODO: Performance consideration: CNTK knows the number of samples + // in the sequence, allowing accurate memory reservation (index built + // during init) + sequence.emplace_back( + std::make_shared>( + input_stream_id, stream.dimension)); + } else { + sequence.emplace_back( + std::make_shared>( + input_stream_id, stream.dimension)); + } + } + + dataset_->sequences.emplace_back(sequence); + } + + // Reads the Input Stream name and lookup its input stream reference + CTFInputStreamID input_stream_id; + if (!get_input_stream_name(input_stream_id)) { + return false; + } + const CTFInputStreamInformation& input_stream = + dataset_->input_streams_info[input_stream_id]; + + // Appends all values to the input stream + if (!get_input_stream(input_stream)) { + return false; + } + + // TODO: Check actual number of values records of the stream + return true; + } + + // Parses input name from buffer and returns both CTFInputStreamInformation + // reference and true if the input name belongs to an existing Input Stream + bool get_input_stream_name(CTFInputStreamID& input_stream_id) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // CTF Name must start with a '|' + if (!is_name_prefix(reader_->peek_char())) { +#ifdef CTF_DEBUG + std::cout << "Not a CTF Name at position " << initial_pos << std::endl; +#endif + return false; + } + + // Discard | and get all consecutive digits and alpha characters + char c = reader_->get_char(); + while (is_digit(reader_->peek_char()) || is_alpha(reader_->peek_char())) { + c = reader_->get_char(); + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the CTF Name + while (is_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + } + + // After CTF Name, there must be a CTF value or another CTF Name + c = reader_->peek_char(); + if (!is_number(c) && !is_name_prefix(c) && !is_eol(c)) { +#ifdef CTF_DEBUG + std::cerr << "Unexpected symbol '" << c << "' after CTF Name at position " + << reader_->get_position() << std::endl; +#endif + reader_->rewind_char(); + return false; + } + + // Return the CTF Name + // TODO: Can be done better? 
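    // One possible answer to the TODO above (sketch only, not wired in):
    // reuse a member std::string, e.g. a hypothetical `name_buffer_`, and
    // assign into it to avoid a fresh allocation per stream name:
    //   name_buffer_.assign(scratch_.data(), idx);
    //   auto it = dataset_->input_streams_map.find(name_buffer_);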
+ std::string name = std::string(scratch_.begin(), scratch_.begin() + idx); +#ifdef CTF_DEBUG + std::cout << "Found CTF Name '" << name << "' at position " << initial_pos + << std::endl; +#endif + + /// Match input name with the ones at 'features' and 'labels' + bool found = false; + auto it = dataset_->input_streams_map.find(name); + if (it != dataset_->input_streams_map.end()) { + input_stream_id = it->second; + found = true; + } + + if (!found) { + std::string error_msg( + "CTF Stream not found for input name '" + name + "'."); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + return true; + } + + bool get_input_stream_value(const CTFInputStreamInformation& input_stream) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // Temporary data/index holders + CTFValueIndex ctf_index = CTFValueIndexUninitialized; + DataType ctf_value; + + // CTF Value must start with a digit, dot, signal or exponent symbol + char c = reader_->peek_char(); + if (!is_number(c)) { +#ifdef CTF_DEBUG + std::cerr << "Unexpected symbol '" << c << "' at position " << initial_pos + << std::endl; +#endif + return false; + } + + // Get all consecutive digits and decimal point, if any + bool is_float = false; + size_t sign_count = 0; + bool has_exponent = false; + while (is_number(reader_->peek_char()) || + is_sparse_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + if (is_exponent(c)) { + has_exponent = true; + } + if (is_sign(c)) { + if ((sign_count > 1 && !has_exponent) || (sign_count > 2)) { + std::string error_msg( + "Invalid CTF Value. CTF value with more than one " + "positive or negative sign at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + ++sign_count; + } + if (is_decimal_point(c)) { + if (is_float) { + std::string error_msg( + "Invalid CTF Value. CTF value with more than one " + "decimal point at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + is_float = true; + } + if (is_sparse_value_delimiter(c)) { + if (input_stream.storage == CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected sparse index delimiter ':' at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + // Validate found ctf value index + if (is_float) { + std::string error_msg( + "Unexpected symbol '.' 
at index of CTF Value at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } else { + // Discard colon, grab cft index value and reset ctf value string + c = reader_->get_char(); + ctf_index = static_cast(std::stoull( + std::string(scratch_.begin(), scratch_.begin() + idx))); + idx = 0; +#ifdef CTF_DEBUG + std::cout << "Found CTF Value Index '" << ctf_index + << "' at position " << reader_->get_position() << std::endl; +#endif + } + } + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the CTF Value + while (is_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + } + + // After CTF Value, there must be another CTF Value or CTF Comment + c = reader_->peek_char(); + if (!is_number(c) && !is_comment_prefix(c) && !is_eol(c)) { + std::string error_msg( + "Unexpected symbol '" + std::to_string(c) + + "' after CTF Value at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Grab CTF value + if (ctf_index != CTFValueIndexUninitialized) { + if (input_stream.storage == CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected CTF Value format. Dense format was expected but " + "a sparse one was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } else { + if (input_stream.storage != CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected CTF Value format. Sparse format was expected but " + "a dense one was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } + ctf_value = static_cast( + std::stod(std::string(scratch_.begin(), scratch_.begin() + idx))); +#ifdef CTF_DEBUG + std::cout << "Found CTF Value '" << ctf_value << "' at position " + << reader_->get_position() << std::endl; +#endif + + if (input_stream.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + dense_data->data.emplace_back(ctf_value); + } else { + CTFSparseInputStreamData* sparse_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + // std::cerr << "data.emplace_back(" << ctf_value << ") for input stream + // id " << input_stream.__id__ << std::endl; + sparse_data->data.emplace_back(ctf_value); + sparse_data->indices.emplace_back(ctf_index); + } + return true; + } + + bool discard_comment(void) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + + // Used for matching quotes inside a comment + // Helps detecting end of comment + size_t quote_count = 0; + + // CTF Comment must start with |# + char c = reader_->get_char(); + if (!is_comment_prefix(c)) { +#ifdef CTF_DEBUG + std::cout << "Not a CTF Comment at position " << initial_pos << std::endl; +#endif + reader_->rewind_char(); + return false; + } + + c = reader_->get_char(); + if (!is_comment_suffix(c)) { + std::string error_msg( + "Not a CTF Comment at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Get all consecutive digits and alpha 
characters + while (!is_eol(reader_->peek_char())) { + c = reader_->peek_char(); + // Comment symbol can show up when properly escaped + if (is_escape_delimiter(c)) { + ++quote_count; + } + + // If new ctf sample is found, end current comment + if (is_name_prefix(c) && (quote_count % 2 == 0)) { + break; + } + + c = reader_->get_char(); + } + +#ifdef CTF_DEBUG + std::cout << "Skipping CTF Comment at position " << reader_->get_position() + << std::endl; +#endif + return true; + } + + bool get_input_stream(const CTFInputStreamInformation& input_stream) { + // Adds a new row start for the input stream + if (input_stream.storage == CTFDataStorage::Sparse) { + CTFSparseInputStreamData* sparse_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + } + + // Get them all and push to the right stream + while (!is_name_prefix(reader_->peek_char()) && + !is_comment_prefix(reader_->peek_char()) && + !is_eol(reader_->peek_char())) { + if (!get_input_stream_value(input_stream)) { +#ifdef CTF_DEBUG + std::cout << "CTF Value not found. An empty one will be used." + << std::endl; +#endif + } + } + + return true; + } + + // type for CTF values + CTFDataType data_type_; + // dataset holding all parsed entries + std::shared_ptr> dataset_; + // responsible for reading the CTF file + std::shared_ptr reader_; + // Local buffer for string parsing + const size_t CTF_SCRATCH_LENGTH = 128; + std::vector scratch_; + // Used to decide whether the first row of the CTF file has a Sequence ID + bool has_initial_sequence_id_; + // Used to detect when a sequence is over + CTFSequenceID previous_sequence_id_; +}; + +} // namespace ctf +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/include/torch/data/ctf/reader.h b/torch/csrc/api/include/torch/data/ctf/reader.h new file mode 100644 index 000000000000..2974a6e885e8 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/reader.h @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +// TODO: Should we use memory-mapped files to speed up buffering? + +/// A sequential text reader to feed the CTF parser +/// +/// The C file API was used due to performance constraints +/// Current implementation caches chunks of data from file in memory +/// and parses CTF from it.
When it gets empty, buffer is refilled and the cycle +/// is repeated until EOF is reached +/// +class Reader { + public: + virtual ~Reader(); + explicit Reader(const std::string& filename); + + inline bool can_read(void) const { + return (!is_buffer_empty() || can_buffer()); + } + inline const char& peek_char(void) { + if (is_buffer_empty()) { + refill(); + } + if (rewinded_char_) { + return previous_char_; + } else { + return buffer_[buffer_pos_]; + } + } + inline const char& get_char(void) { + if (buffer_pos_ > 0) { + previous_char_ = buffer_[buffer_pos_ - 1]; + } + if (is_buffer_empty()) { + refill(); + } + if (rewinded_char_) { + rewinded_char_ = false; + return previous_char_; + } else { + return buffer_[buffer_pos_++]; + } + } + inline const size_t& get_position(void) const { + return buffer_pos_; + } + inline void rewind_char(void) { + rewinded_char_ = true; + } + + private: + /// File handling + bool refill(void); + inline bool can_buffer(void) const { + return (!is_eof_); + } + inline bool is_buffer_empty(void) const { + return ((buffer_size_ == 0) || (buffer_size_ == buffer_pos_)); + } + std::string filename_; + std::shared_ptr file_; + bool is_eof_; + + /// Buffer handling buffer_size must be big enough + /// to fit a really long line on the CTF file + const size_t CTF_MAX_BUFFER_SIZE = 2 * 1024 * 1024; + std::vector buffer_; + size_t buffer_pos_; + size_t buffer_size_; + bool rewinded_char_; + char previous_char_; + + Reader() = delete; + DISALLOW_COPY_AND_ASSIGN(Reader); +}; + +static const char SPACE_CHAR = ' '; +static const char TAB_CHAR = '\t'; +static const char NAME_PREFIX = '|'; +static const char INDEX_DELIMITER = ':'; +static const char ESCAPE_SYMBOL = '#'; + +inline bool is_name_prefix(const char& c) { + return (c == NAME_PREFIX); +} + +inline bool is_comment_prefix(const char& c) { + return (is_name_prefix(c)); +} + +inline bool is_comment_suffix(const char& c) { + return (c == '#'); +} + +inline bool is_decimal_point(const char& c) { + return (c == '.'); +} + +inline bool is_sparse_value_delimiter(const char& c) { + return (c == ':'); +} + +inline bool is_digit(const char& c) { + return (c >= '0' && c <= '9'); +} + +inline bool is_alpha(const char& c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +inline bool is_sign(const char& c) { + return c == '+' || c == '-'; +} + +inline bool is_exponent(const char& c) { + return c == 'e' || c == 'E'; +} + +inline bool is_number(const char& c) { + return (is_digit(c) || is_decimal_point(c) || is_sign(c) || is_exponent(c)); +} + +inline bool is_printable(const char& c) { + return c >= SPACE_CHAR; +} + +inline bool is_non_printable(const char& c) { + return !is_printable(c); +} + +inline bool is_value_delimiter(const char& c) { + return c == SPACE_CHAR || c == TAB_CHAR; +} + +inline bool is_eol(const char& c) { + return (c == '\r' || c == '\n'); +} + +inline bool is_escape_delimiter(const char& c) { + return (c == '\'' || c == '"'); +} + +inline bool is_column_delimiter(const char& c) { + return is_value_delimiter(c) || (is_non_printable(c) && !is_eol(c)); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/torch/csrc/api/include/torch/data/ctf/utils.h b/torch/csrc/api/include/torch/data/ctf/utils.h new file mode 100644 index 000000000000..e57ba0109f6c --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/utils.h @@ -0,0 +1,19 @@ +#pragma once + +namespace torch { +namespace data { +namespace ctf { + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + 
TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#define DISABLE_COPY_AND_MOVE(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName& operator=(const TypeName&) = delete; \ + TypeName(TypeName&&) = delete; \ + TypeName& operator=(TypeName&&) = delete + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/torch/csrc/api/include/torch/data/datasets.h b/torch/csrc/api/include/torch/data/datasets.h index 82c31fe96a58..df565e972358 100644 --- a/torch/csrc/api/include/torch/data/datasets.h +++ b/torch/csrc/api/include/torch/data/datasets.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/torch/csrc/api/include/torch/data/datasets/chunk.h b/torch/csrc/api/include/torch/data/datasets/chunk.h new file mode 100644 index 000000000000..04f4bb967ff2 --- /dev/null +++ b/torch/csrc/api/include/torch/data/datasets/chunk.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace datasets { + +/// A stateful dataset that supports hierarchical sampling and prefetching of +/// entire chunks. +/// +/// A chunk could be an entire file, such as an audio data file or an image, +/// or part of a file in the case of a large text file split based on seek +/// positions. +/// +/// Unlike a regular dataset, a chunk dataset requires two samplers to operate +/// and keeps internal state. The `ChunkSampler` selects which chunk to load +/// next, while the `ExampleSampler` determines the order of Examples returned +/// in each `get_batch` call. The hierarchical sampling approach used here is +/// inspired by this paper: http://martin.zinkevich.org/publications/nips2010.pdf +template < + typename Self, + typename Batch = std::vector>, + typename ChunkSampler = samplers::RandomSampler, + typename ExampleSampler = samplers::RandomSampler> +class ChunkDataSet : public StatefulDataset { + public: + using SelfType = Self; + using BatchType = Batch; + using ChunkSamplerType = ChunkSampler; + using ExampleSamplerType = ExampleSampler; + + /// Reads an entire chunk. A derived class needs to override this method. + virtual Batch read_chunk(size_t chunk_index) = 0; + + /// Returns the chunk sampler for this dataset. + virtual ChunkSampler get_chunk_sampler() = 0; + + /// Returns the example sampler for this dataset. + virtual ExampleSampler get_example_sampler() = 0; + + /// Returns the number of chunks available in this dataset. + virtual size_t get_chunk_count() = 0; + + /// Default get_batch method of BatchDataSet. This method returns Example + /// batches created from the preloaded chunks. The implementation is dataset + /// agnostic and does not need overriding in different chunk datasets. + optional get_batch(size_t batch_size) override { + // Temporary: tests will have a simple implementation. + return torch::nullopt; + } + + /// This will clear any internal state and start the internal prefetching + /// mechanism for the chunk dataset. + virtual void reset() {} + + /// size() is not used for chunk datasets.
+ optional size() const override { + return torch::nullopt; + } +}; +} // namespace datasets +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/src/data/ctf/reader.cpp b/torch/csrc/api/src/data/ctf/reader.cpp new file mode 100644 index 000000000000..1240478d05b5 --- /dev/null +++ b/torch/csrc/api/src/data/ctf/reader.cpp @@ -0,0 +1,85 @@ +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +/* + * Reader class for CTF + * + * RAII pattern was used for file descriptor + */ + +Reader::~Reader() {} + +Reader::Reader(const std::string& filename) + : filename_(filename), + is_eof_(false), + buffer_pos_(0), + buffer_size_(0), + rewinded_char_(false), + previous_char_(0) { + std::FILE* const tmp = fopen(filename_.c_str(), "rbS"); + if (!tmp) { + std::string error_msg( + "Reader could not open the specified file (" + filename + ")"); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + file_ = std::shared_ptr(tmp, std::fclose); + + buffer_.resize(Reader::CTF_MAX_BUFFER_SIZE); + refill(); +} + +bool Reader::refill(void) { + if (!is_buffer_empty()) { +#ifdef CTF_DEBUG + std::cout << "Buffer is not empty yet. Not refilling it" << std::endl; +#endif + return false; + } + if (!can_buffer()) { +#ifdef CTF_DEBUG + std::cout << "Nothing to read from file " << filename_ << ". (" + << strerror(errno) << ")"; +#endif + return false; + } + + buffer_pos_ = 0; + + size_t bytes_read = + std::fread(&buffer_[0], 1, Reader::CTF_MAX_BUFFER_SIZE, file_.get()); + + if (feof(file_.get()) != 0) { + is_eof_ = true; + } + + if (bytes_read != Reader::CTF_MAX_BUFFER_SIZE && !is_eof_) { + std::string error_msg( + "Error reading file " + filename_ + ". " + strerror(errno)); +#ifdef CTF_DEBUG + std::cerr << error_msg << buffer_pos_ << std::endl; +#endif + throw std::runtime_error(error_msg); + } + buffer_size_ = bytes_read; +#ifdef CTF_DEBUG + std::cout << "Buffer refilled. Read " << std::to_string(bytes_read) + << " from file " << filename_ << std::endl; +#endif + return true; +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file
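Taken together, the new pieces are meant to be used roughly as in the dataloader tests above. The following condensed sketch shows the intended flow; it is illustrative only: the file path is a placeholder, template arguments elided in the hunks above are filled in with the defaults declared in ctf_chunk_dataset.h, and the empty-batch caveat noted in the tests still applies.

    // Describe the streams present in the CTF file (sparse "word" features,
    // sparse "tag" labels), then point a configuration at the file.
    std::vector<torch::data::ctf::CTFInputStreamInformation> input_streams;
    input_streams.emplace_back(
        "word", "word", 0,
        torch::data::ctf::CTFInputStreamType::Feature,
        torch::data::ctf::CTFDataStorage::Sparse);
    input_streams.emplace_back(
        "tag", "tag", 0,
        torch::data::ctf::CTFInputStreamType::Label,
        torch::data::ctf::CTFDataStorage::Sparse);
    torch::data::ctf::CTFConfiguration config(
        "path/to/sample.ctf", // placeholder path
        input_streams,
        torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double));

    // Wrap the CTF chunk dataset so it can be shared across worker threads,
    // then build a chunk-aware data loader on top of it.
    auto dataset = torch::data::datasets::make_shared_dataset<
        torch::data::ctf::CTFChunkDataset<double>>(config);
    auto loader = torch::data::make_chunk_data_loader(
        dataset,
        torch::data::DataLoaderOptions()
            .batch_size(1)
            .workers(1)
            .chunk_loading(true));

    // Each non-empty batch is a std::vector<CTFSequenceData>; every sequence
    // holds one entry per configured input stream ("word" first, "tag" second).
    dataset->reset();
    for (auto& batch : *loader) {
      // consume batch ...
    }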