diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index aa9f01a86217..b3cb6d56e3a6 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -3,6 +3,16 @@ set(TORCH_API_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/dataloader.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_classification.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_comments.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_dssm.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_empty_values.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_exponent_values.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_generic.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_learning_to_rank.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_part_of_speech_tagging.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_sequence_classification.cpp + ${TORCH_API_TEST_DIR}/data/ctf/ctf_sample_sequence_to_sequence.cpp ${TORCH_API_TEST_DIR}/expanding-array.cpp ${TORCH_API_TEST_DIR}/integration.cpp ${TORCH_API_TEST_DIR}/jit.cpp diff --git a/test/cpp/api/data/ctf/ctf_sample_classification.cpp b/test/cpp/api/data/ctf/ctf_sample_classification.cpp new file mode 100644 index 000000000000..9126bd24bf97 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_classification.cpp @@ -0,0 +1,126 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_CLASSIFICATION_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "features", + "features", + 5, + CTFInputStreamType::Feature, + CTFDataStorage::Dense); + input_streams.emplace_back( + 1, + "class", + "class", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_classification.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 (implicit) +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto sparse_stream_ptr = + static_cast*>( + sequence[input_stream_id].get()); + // |class 23:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(23); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 2 3 4 5 6 + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 (implicit) +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + 
sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + + { + input_stream_id = 1; + auto sparse_stream_ptr = + static_cast*>( + sequence[input_stream_id].get()); + // |class 13:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(13); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 1 2 0 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_comments.cpp b/test/cpp/api/data/ctf/ctf_sample_comments.cpp new file mode 100644 index 000000000000..726e533a01a6 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_comments.cpp @@ -0,0 +1,189 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_COMMENTS_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "A", "A", 5, CTFInputStreamType::Feature, CTFDataStorage::Dense); + input_streams.emplace_back( + 1, "B", "B", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 2, "C", "C", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_comments.ctf"), + input_streams, + CTFDataType(CTFDataType::Float)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Float, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 (implicit) +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 100:3 123:4 + sparse_stream_ptr->data.push_back(3); + sparse_stream_ptr->indices.push_back(100); + sparse_stream_ptr->data.push_back(4); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C 8 + dense_stream_ptr->data.push_back(8); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 0 1 2 3 4 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + dense_stream_ptr->data.push_back(4); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 (implicit) +#ifdef CTF_DEBUG + 
sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 0 1.1 22 0.3 54 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1.1); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(0.3); + dense_stream_ptr->data.push_back(54); + } + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C 123917 + dense_stream_ptr->data.push_back(123917); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 1134:1.911 13331:0.014 + sparse_stream_ptr->data.push_back(1.911); + sparse_stream_ptr->indices.push_back(1134); + sparse_stream_ptr->data.push_back(0.014); + sparse_stream_ptr->indices.push_back(13331); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 2 (implicit) +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 2; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |C -0.001 + dense_stream_ptr->data.push_back(-0.001); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |A 3.9 1.11 121.2 99.13 0.04 + dense_stream_ptr->data.push_back(3.9); + dense_stream_ptr->data.push_back(1.11); + dense_stream_ptr->data.push_back(121.2); + dense_stream_ptr->data.push_back(99.13); + dense_stream_ptr->data.push_back(0.04); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |B 999:0.001 918918:-9.19 + sparse_stream_ptr->data.push_back(0.001); + sparse_stream_ptr->indices.push_back(999); + sparse_stream_ptr->data.push_back(-9.19); + sparse_stream_ptr->indices.push_back(918918); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_dssm.cpp b/test/cpp/api/data/ctf/ctf_sample_dssm.cpp new file mode 100644 index 000000000000..77abdb3a2687 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_dssm.cpp @@ -0,0 +1,131 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_DSSN_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "src", "src", 0, 
CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "tgt", "tgt", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_dssm.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |src 12:1 23:1 345:2 45001:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(23); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(345); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(45001); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tgt 233:1 766:2 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(233); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(766); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |src 123:1 56:1 10324:1 18001:3 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(56); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10324); + sparse_stream_ptr->data.push_back(3); + sparse_stream_ptr->indices.push_back(18001); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tgt 233:1 2344:2 8889:1 2234:1 253434:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(233); + sparse_stream_ptr->data.push_back(2); + sparse_stream_ptr->indices.push_back(2344); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(8889); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(2234); + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(253434); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff 
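
Note: throughout these test hunks the angle-bracket template arguments appear to have been stripped during extraction (e.g. "std::make_shared>(...)" and "static_cast*>(...)"). For orientation, the following is a minimal sketch of the parse-and-check pattern the DSSM test above follows, with the elided pieces filled in as assumptions: the value type is taken to be double (to match CTFDataType::Double), the stream-description type is given the placeholder name CTFInputStreamInformation (its real name is not visible in this diff), and the header is assumed to be installed as <torch/data/ctf/ctf_parser.h>.

#include <torch/data/ctf/ctf_parser.h> // assumed include path for the new CTF parser

#include <memory>
#include <string>
#include <vector>

using namespace torch::data::ctf;

// Sketch: parse ctf_sample_dssm.ctf and inspect the first "|src 12:1 23:1 345:2 45001:1" entry.
void parse_dssm_sample() {
  std::vector<CTFInputStreamInformation> input_streams; // placeholder type name
  input_streams.emplace_back(
      0, "src", "src", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse);
  input_streams.emplace_back(
      1, "tgt", "tgt", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse);

  CTFConfiguration config(
      std::string("./test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf"),
      input_streams,
      CTFDataType(CTFDataType::Double));

  CTFParser<double> ctf_parser(config); // value type assumed to be double
  ctf_parser.read_from_file();

  // The parser owns a CTFDataset; each CTF line becomes one sequence, and each
  // "index:value" pair lands in the parallel indices/data vectors of the
  // corresponding sparse input stream.
  std::shared_ptr<CTFDataset<double>> dataset = ctf_parser.get_dataset();
  auto* src = static_cast<CTFSparseInputStreamData<double>*>(
      dataset->sequences[0][0].get());
  // Expected from "|src 12:1 23:1 345:2 45001:1":
  //   indices = {12, 23, 345, 45001}, data = {1, 1, 2, 1}.
  (void)src;
}
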
--git a/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp b/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp new file mode 100644 index 000000000000..15acf25c703e --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_empty_values.cpp @@ -0,0 +1,89 @@ +#include + +#include +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_EMPTY_VALUES_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "F0", "F0", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "F1", "F1", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + input_streams.emplace_back( + 2, "F2", "F2", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_empty_values.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + // |F0 + } +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + dataset.sequences.push_back(sequence); + } + { + // 2 +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 2; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + // |F0 |F1 |F2 + } +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + dataset.sequences.push_back(sequence); + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp b/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp new file mode 100644 index 000000000000..6ddb1acbaa75 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_exponent_values.cpp @@ -0,0 +1,93 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_EXPONENT_VALUE_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "F0", "F0", 0, CTFInputStreamType::Feature, CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "T0", "T0", 1, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_exponent_values.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser 
ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |F0 0:0.421826 1:1.42167 2:-4.13626e-000123 5:-1.83832 7:-0.000114865 + // 9:-36288.6 11:113.553 13:4.25123e+009 16:-1.78095e-005 18:-0.00162638 + // 19:-1.07109 + sparse_stream_ptr->indices.push_back(0); + sparse_stream_ptr->data.push_back(0.421826); + sparse_stream_ptr->indices.push_back(1); + sparse_stream_ptr->data.push_back(1.42167); + sparse_stream_ptr->indices.push_back(2); + sparse_stream_ptr->data.push_back(-4.13626e-000123); + sparse_stream_ptr->indices.push_back(5); + sparse_stream_ptr->data.push_back(-1.83832); + sparse_stream_ptr->indices.push_back(7); + sparse_stream_ptr->data.push_back(-0.000114865); + sparse_stream_ptr->indices.push_back(9); + sparse_stream_ptr->data.push_back(-36288.6); + sparse_stream_ptr->indices.push_back(11); + sparse_stream_ptr->data.push_back(113.553); + sparse_stream_ptr->indices.push_back(13); + sparse_stream_ptr->data.push_back(4.25123e+009); + sparse_stream_ptr->indices.push_back(16); + sparse_stream_ptr->data.push_back(-1.78095e-005); + sparse_stream_ptr->indices.push_back(18); + sparse_stream_ptr->data.push_back(-0.00162638); + sparse_stream_ptr->indices.push_back(19); + sparse_stream_ptr->data.push_back(-1.07109); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |T0 1 + dense_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_generic.cpp b/test/cpp/api/data/ctf/ctf_sample_generic.cpp new file mode 100644 index 000000000000..d5e0371314f0 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_generic.cpp @@ -0,0 +1,246 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_GENERIC_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, "a", "a", 3, CTFInputStreamType::Feature, CTFDataStorage::Dense); + input_streams.emplace_back( + 1, "b", "b", 2, CTFInputStreamType::Label, CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_generic.ctf"), + input_streams, + CTFDataType(CTFDataType::Int32)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int32, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 100 +#ifdef CTF_DEBUG + sequence_id = 100; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, 
input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + // a 4 5 6 + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + // |a 7 8 9 + dense_stream_ptr->data.push_back(7); + dense_stream_ptr->data.push_back(8); + dense_stream_ptr->data.push_back(9); + // |a 7 8 9 + dense_stream_ptr->data.push_back(7); + dense_stream_ptr->data.push_back(8); + dense_stream_ptr->data.push_back(9); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + // |b 101 201 + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + // |b 102983 14532 + dense_stream_ptr->data.push_back(102983); + dense_stream_ptr->data.push_back(14532); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 200 +#ifdef CTF_DEBUG + sequence_id = 200; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 300 400 + dense_stream_ptr->data.push_back(300); + dense_stream_ptr->data.push_back(400); + } + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 10 20 30 + dense_stream_ptr->data.push_back(10); + dense_stream_ptr->data.push_back(20); + dense_stream_ptr->data.push_back(30); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 333 +#ifdef CTF_DEBUG + sequence_id = 333; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + + // |b 500 100 + dense_stream_ptr->data.push_back(500); + dense_stream_ptr->data.push_back(100); + // |b 600 -900 + dense_stream_ptr->data.push_back(600); + dense_stream_ptr->data.push_back(-900); + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + } + + { + // 400 +#ifdef CTF_DEBUG + sequence_id = 400; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + 
dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + // |a 4 5 6 + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + // |a 4 5 6 TODO: repeated lines should be considered invalid + dense_stream_ptr->data.push_back(4); + dense_stream_ptr->data.push_back(5); + dense_stream_ptr->data.push_back(6); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + // |b 101 201 + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + // |b 101 201 TODO: repeated lines should be considered invalid + dense_stream_ptr->data.push_back(101); + dense_stream_ptr->data.push_back(201); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 500 +#ifdef CTF_DEBUG + sequence_id = 500; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |a 1 2 3 + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(2); + dense_stream_ptr->data.push_back(3); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |b 100 200 + dense_stream_ptr->data.push_back(100); + dense_stream_ptr->data.push_back(200); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp b/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp new file mode 100644 index 000000000000..d2b288fdcbd1 --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_learning_to_rank.cpp @@ -0,0 +1,226 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_LEARNING_TO_RANK_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "features", + "features", + 12, + CTFInputStreamType::Feature, + CTFDataStorage::Dense); + input_streams.emplace_back( + 1, + "rating", + "rating", + 1, + CTFInputStreamType::Label, + CTFDataStorage::Dense); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_learning_to_rank.ctf"), + input_streams, + CTFDataType(CTFDataType::Int16)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Int16, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, 
input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 23 35 0 0 0 21 2345 0 0 0 0 0 + dense_stream_ptr->data.push_back(23); + dense_stream_ptr->data.push_back(35); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(21); + dense_stream_ptr->data.push_back(2345); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + // |features 0 123 0 22 44 44 290 22 22 22 33 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(123); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(44); + dense_stream_ptr->data.push_back(44); + dense_stream_ptr->data.push_back(290); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(33); + dense_stream_ptr->data.push_back(0); + // |features 0 0 0 0 0 0 1 0 0 0 0 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(1); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + } + + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 4 + dense_stream_ptr->data.push_back(4); + // |rating 2 + dense_stream_ptr->data.push_back(2); + // |rating 1 + dense_stream_ptr->data.push_back(1); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 34 56 0 0 0 45 1312 0 0 0 0 0 + dense_stream_ptr->data.push_back(34); + dense_stream_ptr->data.push_back(56); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(1312); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + // |features 45 45 0 0 0 12 335 0 0 0 0 0 + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(45); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(12); + dense_stream_ptr->data.push_back(335); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + 
dense_stream_ptr->data.push_back(0); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 1 + dense_stream_ptr->data.push_back(1); + // |rating 0 + dense_stream_ptr->data.push_back(0); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 2 +#ifdef CTF_DEBUG + sequence_id = 2; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |features 0 0 0 0 0 0 22 0 0 0 0 0 + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(22); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + dense_stream_ptr->data.push_back(0); + } + { + input_stream_id = 1; + auto dense_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |rating 0 + dense_stream_ptr->data.push_back(0); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp b/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp new file mode 100644 index 000000000000..a1aef3d709eb --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_part_of_speech_tagging.cpp @@ -0,0 +1,129 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_PART_OF_SPEECH_TAGGING_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "word", + "word", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, "tag", "tag", 0, CTFInputStreamType::Label, CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |word 123:1 + 
sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |word 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tag 12:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + // |tag 10:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10); + // |tag 13:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(13); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |word 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |tag 12:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(12); + // |tag 10:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(10); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp b/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp new file mode 100644 index 000000000000..a4990dea097e --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_sequence_classification.cpp @@ -0,0 +1,125 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_SEQUENCE_CLASSIFICATION_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "word", + "word", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, + "class", + "class", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_sequence_classification.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto 
sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 234:1 + sparse_stream_ptr->indices.push_back(234); + sparse_stream_ptr->data.push_back(1); + // |word 123:1 + sparse_stream_ptr->indices.push_back(123); + sparse_stream_ptr->data.push_back(1); + // |word 890:1 + sparse_stream_ptr->indices.push_back(890); + sparse_stream_ptr->data.push_back(1); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |class 3:1 + sparse_stream_ptr->indices.push_back(3); + sparse_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |word 11:1 + sparse_stream_ptr->indices.push_back(11); + sparse_stream_ptr->data.push_back(1); + // |word 344:1 + sparse_stream_ptr->indices.push_back(344); + sparse_stream_ptr->data.push_back(1); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |class 2:1 + sparse_stream_ptr->indices.push_back(2); + sparse_stream_ptr->data.push_back(1); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp b/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp new file mode 100644 index 000000000000..49eb060001bd --- /dev/null +++ b/test/cpp/api/data/ctf/ctf_sample_sequence_to_sequence.cpp @@ -0,0 +1,124 @@ +#include + +#include +#include + +/// Tests must be executed from root directory of the repo +/// Order of CTFValues inside CTFSample are important + +namespace torch { +namespace data { +namespace ctf { + +TEST(DataTest, CTF_SAMPLE_SEQUENCE_TO_SEQUENCE_SUCCESS) { + /// Actual data + std::vector input_streams; + input_streams.emplace_back( + 0, + "sourceWord", + "sourceWord", + 0, + CTFInputStreamType::Feature, + CTFDataStorage::Sparse); + input_streams.emplace_back( + 1, + "targetWord", + "targetWord", + 0, + CTFInputStreamType::Label, + CTFDataStorage::Sparse); + CTFConfiguration config( + std::string(CTF_SAMPLE_DIR + "/ctf_sample_sequence_to_sequence.ctf"), + input_streams, + CTFDataType(CTFDataType::Double)); + + CTFParser ctf_parser(config); + ctf_parser.read_from_file(); + + /// Expected data + CTFDataset dataset(CTFDataType::Double, input_streams); +#ifdef CTF_DEBUG + size_t sequence_id = 0; +#endif + size_t input_stream_id = 0; + { + // 0 +#ifdef CTF_DEBUG + sequence_id = 0; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |sourceWord 234:1 + 
sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(234); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + // |sourceWord 11:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(11); + } + { + input_stream_id = 1; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |targetWord 344:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(344); + // |targetWord 456:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(456); + // |targetWord 2222:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(2222); + } + + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + { + // 1 +#ifdef CTF_DEBUG + sequence_id = 1; +#endif + CTFSequenceData sequence; + input_stream_id = 0; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + input_stream_id = 1; + sequence.emplace_back(std::make_shared>( + input_stream_id, input_streams[input_stream_id].dimension)); + { + input_stream_id = 0; + auto sparse_stream_ptr = static_cast*>( + sequence[input_stream_id].get()); + // |sourceWord 123:1 + sparse_stream_ptr->data.push_back(1); + sparse_stream_ptr->indices.push_back(123); + } + dataset.sequences.push_back(sequence); +#ifdef CTF_DEBUG + dataset.sequences_id.push_back(sequence_id); +#endif + } + + EXPECT_TRUE(*ctf_parser.get_dataset() == dataset); +} +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf new file mode 100644 index 000000000000..55cb3984d67c --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_classification.ctf @@ -0,0 +1,2 @@ +|class 23:1 |features 2 3 4 5 6 +|class 13:1 |features 1 2 0 2 3 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf new file mode 100644 index 000000000000..ffdfdc06ea5c --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_comments.ctf @@ -0,0 +1,3 @@ +|B 100:3 123:4 |C 8 |A 0 1 2 3 4 |# a CTF comment +|# another comment |A 0 1.1 22 0.3 54 |C 123917 |B 1134:1.911 13331:0.014 +|C -0.001 |# a comment with an escaped pipe: '|#' |A 3.9 1.11 121.2 99.13 0.04 |B 999:0.001 918918:-9.19 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf new file mode 100644 index 000000000000..c9e188a4d4fa --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_dssm.ctf @@ -0,0 +1,2 @@ +|src 12:1 23:1 345:2 45001:1 |tgt 233:1 766:2 234:1 +|src 123:1 56:1 10324:1 18001:3 |tgt 233:1 2344:2 8889:1 2234:1 253434:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf new file mode 100644 index 000000000000..0fc1eb65c623 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_empty_values.ctf @@ -0,0 +1,2 @@ +1|F0 +2|F0 |F1 |F2 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf new file mode 100644 index 000000000000..8fead46727a9 --- /dev/null +++ 
b/test/cpp/api/data/ctf/samples/ctf_sample_exponent_values.ctf
@@ -0,0 +1 @@
+0 |F0 0:0.421826 1:1.42167 2:-4.13626e-000123 5:-1.83832 7:-0.000114865 9:-36288.6 11:113.553 13:4.25123e+009 16:-1.78095e-005 18:-0.00162638 19:-1.07109 |T0 1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf
new file mode 100644
index 000000000000..66f0dcf1c40d
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_generic.ctf
@@ -0,0 +1,11 @@
+100 |a 1 2 3 |b 100 200 |# comment at the end of line.
+100 |a 4 5 6 |b 101 201
+100 |b 102983 14532 |a 7 8 9
+100 |a 7 8 9
+200 |b 300 400 |a 10 20 30
+333 |b 500 100
+333 |b 600 -900
+400 |a 1 2 3 |b 100 200
+|a 4 5 6 |b 101 201
+|a 4 5 6 |b 101 201
+500 |a 1 2 3 |b 100 200
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf
new file mode 100644
index 000000000000..9c8eb9c8cbf8
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_learning_to_rank.ctf
@@ -0,0 +1,6 @@
+0 |rating 4 |features 23 35 0 0 0 21 2345 0 0 0 0 0
+0 |rating 2 |features 0 123 0 22 44 44 290 22 22 22 33 0
+0 |rating 1 |features 0 0 0 0 0 0 1 0 0 0 0 0
+1 |rating 1 |features 34 56 0 0 0 45 1312 0 0 0 0 0
+1 |rating 0 |features 45 45 0 0 0 12 335 0 0 0 0 0
+2 |rating 0 |features 0 0 0 0 0 0 22 0 0 0 0 0
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf
new file mode 100644
index 000000000000..74758fa30dfa
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0000.ctf
@@ -0,0 +1,3 @@
+1 |word 134:1 |tag 12:1
+1 |word 123:1 |tag 10:1
+1 |word 123:1 |tag 13:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf
new file mode 100644
index 000000000000..6d4681fb0b56
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0001.ctf
@@ -0,0 +1,3 @@
+2 |word 234:1 |tag 22:1
+2 |word 223:1 |tag 20:1
+2 |word 223:1 |tag 23:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf
new file mode 100644
index 000000000000..550d09c030b1
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0002.ctf
@@ -0,0 +1,3 @@
+3 |word 334:1 |tag 32:1
+3 |word 323:1 |tag 30:1
+3 |word 323:1 |tag 33:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf
new file mode 100644
index 000000000000..8410928bde0b
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0003.ctf
@@ -0,0 +1,3 @@
+4 |word 434:1 |tag 42:1
+4 |word 423:1 |tag 40:1
+4 |word 423:1 |tag 43:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf
new file mode 100644
index 000000000000..ea7cb6344531
--- /dev/null
+++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0004.ctf
@@ -0,0 +1,3 @@
+5 |word 534:1 |tag 52:1
+5 |word 523:1 |tag 50:1
+5 |word 523:1 |tag 53:1
diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf
new file mode 100644
index 000000000000..023b6c69d1de
--- /dev/null
+++
b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0005.ctf @@ -0,0 +1,2 @@ +6 |word 634:1 |tag 62:1 +6 |word 623:1 |tag 60:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf new file mode 100644 index 000000000000..263aba8a7d17 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_multiple_chunks_0006.ctf @@ -0,0 +1 @@ +7 |word 734:1 |tag 72:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf new file mode 100644 index 000000000000..4f325f78aba6 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_part_of_speech_tagging.ctf @@ -0,0 +1,5 @@ +0 |word 234:1 |tag 12:1 +0 |word 123:1 |tag 10:1 +0 |word 123:1 |tag 13:1 +1 |word 234:1 |tag 12:1 +1 |word 123:1 |tag 10:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf new file mode 100644 index 000000000000..61ea12fa60b1 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_classification.ctf @@ -0,0 +1,5 @@ +0 |word 234:1 |class 3:1 +0 |word 123:1 +0 |word 890:1 +1 |word 11:1 |class 2:1 +1 |word 344:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf new file mode 100644 index 000000000000..342045e9ea73 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_sample_sequence_to_sequence.ctf @@ -0,0 +1,5 @@ +0 |sourceWord 234:1 |targetWord 344:1 +0 |sourceWord 123:1 |targetWord 456:1 +0 |sourceWord 123:1 |targetWord 2222:1 +0 |sourceWord 11:1 +1 |sourceWord 123:1 diff --git a/test/cpp/api/data/ctf/samples/ctf_samples.h b/test/cpp/api/data/ctf/samples/ctf_samples.h new file mode 100644 index 000000000000..35721b94ca43 --- /dev/null +++ b/test/cpp/api/data/ctf/samples/ctf_samples.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +static const std::string CTF_SAMPLE_DIR("./test/cpp/api/data/ctf/samples"); + +#ifdef CTF_DEBUG +template +void print_data(CTFDataset dataset) { + + size_t index = 0; + for (const auto& sequence_data : dataset.sequences) { + std::cerr << dataset.sequences_id[index] << " "; + for (const auto input_stream : sequence_data) { + auto input_stream_id = input_stream.get()->input_stream_id; + const auto& input_stream_info = dataset.input_streams[input_stream_id]; + + std::string input_stream_type; + if (input_stream_info.type == CTFInputStreamType::Feature) { + input_stream_type = "F"; + } else { + input_stream_type = "L"; + } + std::cerr << " |" << input_stream_info.name << "(" << input_stream_type + << ")"; + + if (input_stream_info.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + reinterpret_cast*>( + input_stream.get()); + + if (dense_data->data.empty()) { + std::cerr << " "; + } else { + for (const auto& value : dense_data->data) { + std::cerr << " " << value; + } + } + } else { + // TODO: print row start somewhere + CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + input_stream.get()); + + if (sparse_data->data.empty()) { + std::cerr << " "; + } else { + size_t col_index = 0; + for (const auto& value : sparse_data->data) { + std::cerr << " " << sparse_data->indices[col_index++] << ":" + << value; + } + } + } + } + std::cerr << std::endl; + ++index; + } +} +#endif + +} // namespace ctf +} // 
namespace data +} // namespace torch diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index 461dfe56338b..1eb2b09108a5 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -1,10 +1,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1274,3 +1277,310 @@ TEST(DataLoaderTest, StatefulDatasetWithCollate) { ASSERT_TRUE(batch->data[0].allclose(torch::ones(kBatchSize + 1))); ASSERT_TRUE(batch->target[0].allclose(torch::zeros(kBatchSize - 1))); } + +class DummyChunkDataSet : public datasets::ChunkDataSet< + DummyChunkDataSet, + std::vector, + samplers::SequentialSampler, + samplers::SequentialSampler> { + public: + using BatchType = torch::optional>; + using BatchRequestType = size_t; + DummyChunkDataSet(size_t num_chunks, size_t batch_size) + : datasets::ChunkDataSet< + DummyChunkDataSet, + std::vector, + samplers::SequentialSampler, + samplers::SequentialSampler>(), + num_chunks_(num_chunks), + batch_size_(batch_size), + chunk_sampler_(std::move(samplers::SequentialSampler(num_chunks))), + example_sampler_(std::move(samplers::SequentialSampler(batch_size))) {} + + std::vector read_chunk(size_t chunk_index) override { + std::vector batch(batch_size_); + size_t counter = chunk_index * batch_size_; + for (auto& i : batch) { + i = counter++; + } + return batch; + } + + /// Simply returns an entire chunk to test the API for now. + torch::optional> get_batch(size_t batch_size) override { + int index = chunk_index_.fetch_add(1); + if (index < num_chunks_) { + return read_chunk(index); + } + return torch::nullopt; + } + + samplers::SequentialSampler get_chunk_sampler() override { + return chunk_sampler_; + } + + samplers::SequentialSampler get_example_sampler() override { + return example_sampler_; + } + + size_t get_chunk_count() override { + return num_chunks_; + } + + private: + std::atomic chunk_index_{0}; + size_t num_chunks_; + size_t batch_size_; + samplers::SequentialSampler chunk_sampler_; + samplers::SequentialSampler example_sampler_; +}; + +TEST(DataTest, DataLoaderWithChunkSupportSingleWorker) { + const size_t kBatchSize = 13; + const size_t kNumChunks = 10; + + auto dataset = torch::data::datasets::make_shared_dataset( + kNumChunks, kBatchSize) + .map(transforms::BatchLambda, int>( + [](const std::vector& x) { + return std::accumulate(x.begin(), x.end(), 0); + })); + auto data_loader = + torch::data::make_data_loader(dataset, DataLoaderOptions(kBatchSize)); + + int count = 0; + for (int sum : *data_loader) { + int res = 0; + for (int i = 0; i < kBatchSize; ++i) { + res += count * kBatchSize + i; + } + ASSERT_EQ(sum, res); + count++; + } + ASSERT_EQ(count, 10); +} + +TEST(DataTest, DataLoaderWithChunkSupportMultiWorker) { + const size_t kBatchSize = 13; + const size_t kNumChunks = 10; + + auto dataset = torch::data::datasets::make_shared_dataset( + kNumChunks, kBatchSize) + .map(transforms::BatchLambda, int>( + [](const std::vector& x) { + return std::accumulate(x.begin(), x.end(), 0); + })); + auto data_loader = + torch::data::make_data_loader(dataset, DataLoaderOptions(kBatchSize)); + + int count = 0; + int result_sum = 0; + int expected_sum = 0; + for (int sum : *data_loader) { + result_sum += sum; + for (int i = 0; i < kBatchSize; ++i) { + expected_sum += count * kBatchSize + i; + } + count++; + } + ASSERT_EQ(result_sum, expected_sum); +} + +/// ctf_sample_part_of_speech_tagging.ctf has 2 batches with 1 example each 
+TEST(DataTest, CTFDataLoaderWithChunkSupportSingleWorkerSingleChunk) { + const size_t batch_size = 1; + const size_t total_workers = 1; + const size_t total_example = 2; + const size_t max_jobs = 2 * total_workers; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(config); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .batch_size(batch_size) + .chunk_loading(true) + .workers(total_workers) + .max_jobs(max_jobs)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + if (batch.size() != 0) { + count_example += batch.size(); + ASSERT_EQ(batch.size(), batch_size); + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... + } + } + ASSERT_EQ(total_example, count_example); +} + +// ctf_sample_part_of_speech_tagging.ctf has a single batch with 2 examples +TEST( + DataTest, + CTFDataLoaderWithChunkSupportSingleWorkerSingleChunkTwoExamplePerBatch) { + const size_t batch_size = 2; + const size_t total_example = 2; + const size_t total_worker = 1; + const size_t max_jobs = 2 * total_worker; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_part_of_speech_tagging.ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(config); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .workers(total_worker) + .max_jobs(max_jobs) + .batch_size(batch_size) + .chunk_loading(true)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + if (batch.size() != 0) { + ASSERT_EQ(batch.size(), batch_size); + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... 
+ count_example += batch.size(); + } + } + ASSERT_EQ(total_example, count_example); +} + +// ctf_sample_multiple_chunks_0000[0...6].ctf has 7 chunks with 3 batches each +// (last chunk has 1 batch) and one example per batch +TEST(DataTest, CTFDataLoaderWithChunkSupportMultipleWorkersMultipleChunks) { + const size_t batch_size = 3; + const size_t total_example = 7; + const size_t total_prefetch = 2; + const size_t total_worker = 10; + const size_t max_jobs = 2 * total_worker; + + std::vector configs; + std::vector input_streams; + input_streams.emplace_back( + "word", + "word", + 0, + torch::data::ctf::CTFInputStreamType::Feature, + torch::data::ctf::CTFDataStorage::Sparse); + input_streams.emplace_back( + "tag", + "tag", + 0, + torch::data::ctf::CTFInputStreamType::Label, + torch::data::ctf::CTFDataStorage::Sparse); + + for (size_t i = 0; i < total_example; ++i) { + torch::data::ctf::CTFConfiguration config( + std::string( + torch::data::ctf::CTF_SAMPLE_DIR + + "/ctf_sample_multiple_chunks_000" + std::to_string(i) + ".ctf"), + input_streams, + torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double)); + + configs.push_back(config); + } + + datasets::SharedBatchDataset> + shared_dataset = datasets::make_shared_dataset>(configs, total_prefetch); + auto data_loader = torch::data::make_chunk_data_loader( + shared_dataset, + DataLoaderOptions() + .workers(total_worker) + .max_jobs(max_jobs) + .batch_size(batch_size) + .chunk_loading(true)); + + shared_dataset->reset(); + auto iterator = data_loader->begin(); + size_t count_example = 0; + // TODO: Because current DataLoader can return empty batches, + // batch max_jobs to ensure chunk is fully read. + // Empty batches are ignored by the tests + for (size_t i = 0; i < max_jobs; ++i, ++iterator) { + std::vector batch = *iterator; + count_example += batch.size(); + for (size_t b = 0; b < batch.size(); ++b) { + torch::data::ctf::CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + batch[0][0].get()); + ASSERT_EQ(sparse_data->data[0], 1); + // TODO: Add more checks after using new ChunkDataSet... 
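      // Hedged sketch of a per-sequence check (assumption: each element of
      // `batch` is a CTFSequenceData holding one entry per configured input
      // stream, i.e. "word" and "tag" as set up above):
      ASSERT_EQ(batch[b].size(), input_streams.size());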
+ } + } + ASSERT_EQ(total_example, count_example); +} diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7de5815c5c55..342c5391ea68 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -239,6 +239,7 @@ if (NOT NO_API) ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/ctf/reader.cpp ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp diff --git a/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h b/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h new file mode 100644 index 000000000000..c1eaa0e2cc42 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/ctf_chunk_dataset.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace torch { +namespace data { +namespace ctf { + + +template < + typename DataType = double, + typename ChunkSampler = samplers::RandomSampler, + typename ExampleSampler = samplers::RandomSampler> +class CTFChunkDataset + : public datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler> { + public: + using BatchType = std::vector; + using ChunkSamplerType = ChunkSampler; + using ExampleSamplerType = ExampleSampler; + + /// Loads multiple CTF files on multiple chunks with parallelization + /// TODO: CTF files are not splitted, so they must fit in memory + explicit CTFChunkDataset( + std::vector configs, + size_t prefetch_count) + : datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler>(prefetch_count, false), + config_(configs), + chunk_sampler_(std::move(ChunkSampler(0))), + example_sampler_(std::move(ExampleSampler(0))) { + num_chunks_ = configs.size(); + } + + /// Loads a single CTF file on a single chunk without parallelization + /// TODO: CTF files are not splitted, so they must fit in memory + explicit CTFChunkDataset(ctf::CTFConfiguration config) + : datasets::ChunkDataSet< + CTFChunkDataset, + std::vector, + ChunkSampler, + ExampleSampler>(1, false), + chunk_sampler_(std::move(ChunkSampler(0))), + example_sampler_(std::move(ExampleSampler(0))) { + num_chunks_ = 1; + config_.push_back(config); + } + + std::vector read_chunk(size_t chunk_index) override { + // read file (which is a full chunk) + ctf::CTFParser ctf_parser(config_[chunk_index]); + ctf_parser.read_from_file(); + std::shared_ptr> ctf_dataset = + ctf_parser.get_dataset(); + + return std::move(ctf_dataset->sequences); + } + + ChunkSampler get_chunk_sampler() override { + return chunk_sampler_; + } + + ExampleSampler get_example_sampler() override { + return example_sampler_; + } + + size_t get_chunk_count() override { + return num_chunks_; + } + + + private: + std::vector config_; + size_t num_chunks_; + ChunkSampler chunk_sampler_; + ExampleSampler example_sampler_; +}; + + +} // namespace ctf +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/include/torch/data/ctf/ctf_parser.h b/torch/csrc/api/include/torch/data/ctf/ctf_parser.h new file mode 100644 index 000000000000..ed0ff9311929 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/ctf_parser.h @@ -0,0 +1,931 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#ifdef CTF_DEBUG +#include +#endif +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +/* + * CTF general 
format + * [Sequence_Id](Sample or Comment)+ + * where + * sequence_Id=(empty|[0-9]+) + * Sample=|Input_Name (Value )* + * Comment=|# some content + * Example: + * 100 |a 1 2 3 |b 100 200 + * 100 |a 4 5 6 |b 101 201 + * 100 |b 102983 14532 |a 7 8 9 + * 100 |a 7 8 9 + * 200 |b 300 400 |a 10 20 30 + * 333 |b 500 100 + * 333 |b 600 -900 + * 400 |a 1 2 3 |b 100 200 + * |a 4 5 6 |b 101 201 + * |a 4 5 6 |b 101 201 + * 500 |a 1 2 3 |b 100 200 + */ + +/// +/// Beginning of type definitions +/// + +/// +/// Enumeration type denoting data type of symbolic data entities or actual +/// data. +/// +enum class CTFDataType : unsigned int { + Unknown = 0, + Float = 1, + Double = 2, + UChar = 3, // So far only used internally in deserializers. + Float16 = 4, + Int8 = 5, + Int16 = 6, + Int32 = 7, +}; + +/// +/// Enumeration type denoting the format of storage +/// +enum class CTFDataStorage { Dense, Sparse }; +enum class CTFInputStreamType { Feature, Label }; + +/// +/// Input Stream information +/// +struct CTFInputStreamInformation { + // Self-assigned Unique ID of the input stream (do not assign it!) + // TODO: ugly, fix this! + size_t __id__; + // Unique name of the input stream + std::string name; + // Unique alias of the input + // Useful when the name is long + std::string alias; + // expected number of elements in a sample + // TODO: Only useful if number of samples is known + size_t dimension; + // Input streams belong to either Feature or Label + CTFInputStreamType type; + // Data storage of the stream + CTFDataStorage storage; + + CTFInputStreamInformation( + std::string name, + std::string alias, + size_t dimension, + CTFInputStreamType type, + CTFDataStorage storage) + : name(std::move(name)), + alias(std::move(alias)), + dimension(dimension), + type(type), + storage(storage){}; + + // Used for unit tests + CTFInputStreamInformation( + size_t id, + std::string name, + std::string alias, + size_t dimension, + CTFInputStreamType type, + CTFDataStorage storage) + : __id__(id), + name(std::move(name)), + alias(std::move(alias)), + dimension(dimension), + type(type), + storage(storage){}; +}; +inline bool operator==( + const CTFInputStreamInformation& lhs, + const CTFInputStreamInformation& rhs) { + return ( + lhs.__id__ == rhs.__id__ && lhs.name == rhs.name && + lhs.alias == rhs.alias && lhs.dimension == rhs.dimension && + lhs.type == rhs.type && lhs.storage == rhs.storage); +} + +inline bool operator!=( + const CTFInputStreamInformation& lhs, + const CTFInputStreamInformation& rhs) { + return !(lhs == rhs); +} + +/// +/// Helper to centralize all input information in a single object +/// +class CTFConfiguration { + public: + explicit CTFConfiguration( + const std::string& filepath, + const std::vector& input_streams_info, + CTFDataType data_type) + : filepath_(std::move(filepath)), + input_streams_info_(std::move(input_streams_info)), + data_type_(data_type){}; + + const std::vector& get_input_streams_info() const { + return input_streams_info_; + } + + const std::string& get_file_path() const { + return filepath_; + } + CTFDataType get_ctf_data_type() const { + return data_type_; + } + + private: + std::string filepath_; + std::vector input_streams_info_; + CTFDataType data_type_; +}; + +/// +/// Sequence ID type +/// -1 is used to flag an uninitialized Sequence ID +/// +typedef long int CTFSequenceID; + +#ifdef CTF_DEBUG +/// +/// Maps Sequenced ID to index at vector +/// +typedef std::map CTFSequenceMap; +#endif + +/// +/// Input Stream ID type +/// All Input Streamsare stored on a vector +/// 
and CTFInputStreamID is the index of a particular stream +/// +typedef size_t CTFInputStreamID; + +/// +/// Maps Input Stream names to a unique index +/// +typedef std::unordered_map + CTFInputStreamMapByName; + +/// +/// Used during sparse data parsing +/// +const size_t CTFValueIndexUninitialized = SIZE_MAX; +typedef size_t CTFValueIndex; + +/// +/// Sequence data type +/// The global vector of sequences and the vector of samples will use it +/// +struct CTFInpuStreamDataBase { + explicit CTFInpuStreamDataBase(size_t input_stream_id) + : input_stream_id(input_stream_id) {} + size_t input_stream_id; +}; +typedef std::shared_ptr CTFInpuStreamDataBasePtr; +typedef std::vector CTFSequenceData; + +/// +/// Dense data +/// +template +struct CTFDenseInputStreamData : CTFInpuStreamDataBase { + explicit CTFDenseInputStreamData(size_t input_stream_id, size_t capacity = 0) + : CTFInpuStreamDataBase(input_stream_id) { + if (capacity > 0) { + // TODO: On a per input stream storage, sample dimension is not useful + // data.reserve(capacity); + } + } + + std::vector data; +}; + +/// +/// Sparse data +/// +template +struct CTFSparseInputStreamData : CTFInpuStreamDataBase { + explicit CTFSparseInputStreamData( + size_t input_stream_id, + size_t dimension = 0) + : CTFInpuStreamDataBase(input_stream_id) { + if (dimension > 0) { + /// TODO: Reserve something for sparse input? 1%? 0.05% of dimension? + // data.reserve(dimension); + } + } + + std::vector indices; + std::vector data; +}; + +/// +/// CTFDataset centralizes all parsed data +/// +template +struct CTFDataset { + explicit CTFDataset( + CTFDataType data_type, + const std::vector& input_streams_info) + : data_type(data_type), input_streams_info(input_streams_info) {} + + bool operator==(const CTFDataset& rhs) const { + // Datasets must have the same type and number of sequences + if (this->data_type != rhs.data_type || + this->sequences.size() != rhs.sequences.size()) { + return false; + } + + for (size_t sequence_index = 0; sequence_index < this->sequences.size(); + ++sequence_index) { + // Each sequence buffer must have the same number of input streams + if (this->sequences[sequence_index].size() != + rhs.sequences[sequence_index].size()) { + return false; + } + // Each input stream must have the same number of values + for (size_t sequence_data_index = 0; + sequence_data_index < this->sequences[sequence_index].size(); + ++sequence_data_index) { + auto this_stream_ptr = + this->sequences[sequence_index][sequence_data_index].get(); + auto this_stream_id = this_stream_ptr->input_stream_id; + auto rhs_stream_ptr = + rhs.sequences[sequence_index][sequence_data_index].get(); + auto rhs_stream_id = rhs_stream_ptr->input_stream_id; + // Input streams IDs must match + if (this_stream_id != rhs_stream_id) { + return false; + } + + // Input stream metadata must match + const auto& this_input_stream_info = + this->input_streams_info[this_stream_id]; + const auto& rhs_input_stream_info = + rhs.input_streams_info[rhs_stream_id]; + if (this_input_stream_info != rhs_input_stream_info) { + return false; + } + + // Values inside each input stream must match + if (rhs_input_stream_info.storage == CTFDataStorage::Dense) { + auto this_dense_stream_ptr = + static_cast*>(this_stream_ptr); + auto rhs_dense_stream_ptr = + static_cast*>(rhs_stream_ptr); + if (this_dense_stream_ptr->data != rhs_dense_stream_ptr->data) { + return false; + } + } else { + auto this_sparse_stream_ptr = + static_cast*>(this_stream_ptr); + auto rhs_sparse_stream_ptr = + 
static_cast*>(rhs_stream_ptr); + if ((this_sparse_stream_ptr->indices != + rhs_sparse_stream_ptr->indices) || + (this_sparse_stream_ptr->data != rhs_sparse_stream_ptr->data)) { + return false; + } + } + } + } + + return true; + } + + bool operator!=(const CTFDataset& rhs) const { + return !(this == rhs); + } + + // TODO: Do we need this? Maybe for logging, only + CTFDataType data_type; + // Contains all sequences + // TODO: Performance consideration: CNTK knows the number of sequences in the + // chunk, allowing accurate memory reservation. Pytorch approach doesn't + std::vector sequences; + + // CTF Input Stream definitions for features and labels + std::vector input_streams_info; + + // Input stream map (maps input stream name to a unique ID) + CTFInputStreamMapByName input_streams_map; +#ifdef CTF_DEBUG + std::vector sequences_id; +#endif +}; + +/// +/// Beginning of implementation +/// + +template +class CTFParser { + public: + explicit CTFParser(const CTFConfiguration& config) + : data_type_(config.get_ctf_data_type()), + dataset_(std::make_shared>( + config.get_ctf_data_type(), + config.get_input_streams_info())), + scratch_(CTF_SCRATCH_LENGTH, '\0'), + reader_(std::make_shared(config.get_file_path())), + has_initial_sequence_id_(false), + previous_sequence_id_(-1) { + // TODO: Improve validation by iterating all streams checking + // CTFInputStreamType? + if (dataset_->input_streams_info.size() < 2) { + std::string error_msg( + "Missing 'features' or 'labels' CTF stream definitions!"); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // TODO: Improve it for unit testing too + // Creating unique IDs for all input streams + for (size_t i = 0; i < dataset_->input_streams_info.size(); ++i) { + CTFInputStreamInformation& stream = dataset_->input_streams_info[i]; + const std::string& name = stream.name; + dataset_->input_streams_map[name] = i; + dataset_->input_streams_info[i].__id__ = i; + } + } + +#ifdef CTF_DEBUG + void print_data(void) const { + size_t index = 0; + for (const auto& sequence_data : dataset_->sequences) { + std::cerr << dataset_->sequences_id[index] << " "; + for (const auto input_stream : sequence_data) { + auto input_stream_id = input_stream.get()->input_stream_id; + const auto& input_stream_info = + dataset_->input_streams_info[input_stream_id]; + + std::string input_stream_type; + if (input_stream_info.type == CTFInputStreamType::Feature) { + input_stream_type = "F"; + } else { + input_stream_type = "L"; + } + std::cerr << " |" << input_stream_info.name << "(" << input_stream_type + << ")"; + + if (input_stream_info.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + reinterpret_cast*>( + input_stream.get()); + + if (dense_data->data.empty()) { + std::cerr << " "; + } else { + for (const auto& value : dense_data->data) { + std::cerr << " " << value; + } + } + } else { + CTFSparseInputStreamData* sparse_data = + reinterpret_cast*>( + input_stream.get()); + + if (sparse_data->data.empty()) { + std::cerr << " "; + } else { + size_t col_index = 0; + for (const auto& value : sparse_data->data) { + std::cerr << " " << sparse_data->indices[col_index++] << ":" + << value; + } + } + } + } + std::cerr << std::endl; + ++index; + } + } +#endif + + std::shared_ptr> get_dataset() { + return dataset_; + } + + void read_from_file() { +#ifdef CTF_DEBUG + size_t read_count = 0; +#endif + + do { +#ifdef CTF_DEBUG + std::cout << "Read count: " << ++read_count << " starting at " + << 
reader_->get_position() << std::endl; +#endif + // CTF files start with valid alpha-numeric characters + if (is_non_printable(reader_->peek_char())) { + std::string error_msg( + "Non printable character anon print CTF file at position " + + std::to_string(reader_->get_position()) + "(" + + std::to_string(static_cast(reader_->peek_char())) + ")"); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // There can be an explicit sequence ID at the beginning of the line or + // the last known is used implicitly + CTFSequenceID sequence_id; + bool is_new_sequence = get_sequence_id(sequence_id); + + while (!is_eol(reader_->peek_char())) { + // After the sequence ID, there can be many input streams/comments + if (!get_input_stream(sequence_id, is_new_sequence)) { + if (!discard_comment()) { + std::string error_msg( + "Invalid CTF File. Neither a CTF Value nor a " + "CTF Comment was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } + } + // Discard EOL + reader_->get_char(); + } while (reader_->can_read()); + } + + private: + CTFParser() = delete; + DISALLOW_COPY_AND_ASSIGN(CTFParser); + + bool get_sequence_id(CTFSequenceID& sequence_id) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + + // Flag to identify when a new Sequence ID is found + bool is_new = false; + + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // Sequence ID must start with a digit + char c = reader_->peek_char(); + if (!is_digit(c)) { +#ifdef CTF_DEBUG + std::cout << "Not a Sequence ID at position " << initial_pos << std::endl; +#endif + if (has_initial_sequence_id_) { + sequence_id = previous_sequence_id_; +#ifdef CTF_DEBUG + std::cout << "Using previous Sequence ID (" << previous_sequence_id_ + << ")" << std::endl; +#endif + } else { + is_new = true; + sequence_id = previous_sequence_id_ + 1; + +#ifdef CTF_DEBUG + std::cout << "Incremented previous Sequence ID (" << sequence_id << ")" + << std::endl; +#endif + } + previous_sequence_id_ = sequence_id; + return is_new; + } + + // Get all consecutive digits + while (is_digit(reader_->peek_char())) { + c = reader_->get_char(); + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the ID + while (is_value_delimiter(reader_->peek_char())) { + reader_->get_char(); + } + + // After Sequence ID, there must be a '|' + if (!is_name_prefix(reader_->peek_char())) { + std::string error_msg( + "Missing name delimiter for one of the sequences at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Convert string and return integral value + sequence_id = static_cast(std::stoull(scratch_.data())); +#ifdef CTF_DEBUG + std::cout << "Found Sequence ID '" << std::to_string(sequence_id) + << "' at position " << std::to_string(initial_pos) << std::endl; +#endif + + // Decides whether this is a new example or an existing one + if (previous_sequence_id_ != sequence_id && sequence_id != LONG_MAX) { + is_new = true; + } + + previous_sequence_id_ = sequence_id; + has_initial_sequence_id_ = true; + return is_new; + } + + bool get_input_stream( + const CTFSequenceID& sequence_id, + bool& is_new_sequence) { + // Create a new sequence with input_streams_info pre-allocated + if 
(is_new_sequence) { + is_new_sequence = false; + +#ifdef CTF_DEBUG + dataset_->sequences_id.emplace_back(sequence_id); +#endif + + // New sequence to be appended to dataset_->sequences + CTFSequenceData sequence; + + for (auto const& stream : dataset_->input_streams_info) { + CTFInputStreamID input_stream_id = + dataset_->input_streams_map[stream.name]; + if (stream.storage == CTFDataStorage::Dense) { + // TODO: Performance consideration: CNTK knows the number of samples + // in the sequence, allowing accurate memory reservation (index built + // during init) + sequence.emplace_back( + std::make_shared>( + input_stream_id, stream.dimension)); + } else { + sequence.emplace_back( + std::make_shared>( + input_stream_id, stream.dimension)); + } + } + + dataset_->sequences.emplace_back(sequence); + } + + // Reads the Input Stream name and lookup its input stream reference + CTFInputStreamID input_stream_id; + if (!get_input_stream_name(input_stream_id)) { + return false; + } + const CTFInputStreamInformation& input_stream = + dataset_->input_streams_info[input_stream_id]; + + // Appends all values to the input stream + if (!get_input_stream(input_stream)) { + return false; + } + + // TODO: Check actual number of values records of the stream + return true; + } + + // Parses input name from buffer and returns both CTFInputStreamInformation + // reference and true if the input name belongs to an existing Input Stream + bool get_input_stream_name(CTFInputStreamID& input_stream_id) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // CTF Name must start with a '|' + if (!is_name_prefix(reader_->peek_char())) { +#ifdef CTF_DEBUG + std::cout << "Not a CTF Name at position " << initial_pos << std::endl; +#endif + return false; + } + + // Discard | and get all consecutive digits and alpha characters + char c = reader_->get_char(); + while (is_digit(reader_->peek_char()) || is_alpha(reader_->peek_char())) { + c = reader_->get_char(); + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the CTF Name + while (is_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + } + + // After CTF Name, there must be a CTF value or another CTF Name + c = reader_->peek_char(); + if (!is_number(c) && !is_name_prefix(c) && !is_eol(c)) { +#ifdef CTF_DEBUG + std::cerr << "Unexpected symbol '" << c << "' after CTF Name at position " + << reader_->get_position() << std::endl; +#endif + reader_->rewind_char(); + return false; + } + + // Return the CTF Name + // TODO: Can be done better? 
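    // One possible answer to the TODO above (sketch only, not wired in):
    // reuse a member std::string, e.g. a hypothetical `name_buffer_`, and
    // assign into it to avoid a fresh allocation per stream name:
    //   name_buffer_.assign(scratch_.data(), idx);
    //   auto it = dataset_->input_streams_map.find(name_buffer_);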
+ std::string name = std::string(scratch_.begin(), scratch_.begin() + idx); +#ifdef CTF_DEBUG + std::cout << "Found CTF Name '" << name << "' at position " << initial_pos + << std::endl; +#endif + + /// Match input name with the ones at 'features' and 'labels' + bool found = false; + auto it = dataset_->input_streams_map.find(name); + if (it != dataset_->input_streams_map.end()) { + input_stream_id = it->second; + found = true; + } + + if (!found) { + std::string error_msg( + "CTF Stream not found for input name '" + name + "'."); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + return true; + } + + bool get_input_stream_value(const CTFInputStreamInformation& input_stream) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + // idx will be used to iterate through scratch_ for local string parsing + size_t idx = 0; + + // Temporary data/index holders + CTFValueIndex ctf_index = CTFValueIndexUninitialized; + DataType ctf_value; + + // CTF Value must start with a digit, dot, signal or exponent symbol + char c = reader_->peek_char(); + if (!is_number(c)) { +#ifdef CTF_DEBUG + std::cerr << "Unexpected symbol '" << c << "' at position " << initial_pos + << std::endl; +#endif + return false; + } + + // Get all consecutive digits and decimal point, if any + bool is_float = false; + size_t sign_count = 0; + bool has_exponent = false; + while (is_number(reader_->peek_char()) || + is_sparse_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + if (is_exponent(c)) { + has_exponent = true; + } + if (is_sign(c)) { + if ((sign_count > 1 && !has_exponent) || (sign_count > 2)) { + std::string error_msg( + "Invalid CTF Value. CTF value with more than one " + "positive or negative sign at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + ++sign_count; + } + if (is_decimal_point(c)) { + if (is_float) { + std::string error_msg( + "Invalid CTF Value. CTF value with more than one " + "decimal point at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + is_float = true; + } + if (is_sparse_value_delimiter(c)) { + if (input_stream.storage == CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected sparse index delimiter ':' at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + // Validate found ctf value index + if (is_float) { + std::string error_msg( + "Unexpected symbol '.' 
at index of CTF Value at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } else { + // Discard colon, grab cft index value and reset ctf value string + c = reader_->get_char(); + ctf_index = static_cast(std::stoull( + std::string(scratch_.begin(), scratch_.begin() + idx))); + idx = 0; +#ifdef CTF_DEBUG + std::cout << "Found CTF Value Index '" << ctf_index + << "' at position " << reader_->get_position() << std::endl; +#endif + } + } + scratch_[idx++] = c; + } + scratch_[idx] = '\0'; + + // Discard delimiters after the CTF Value + while (is_value_delimiter(reader_->peek_char())) { + c = reader_->get_char(); + } + + // After CTF Value, there must be another CTF Value or CTF Comment + c = reader_->peek_char(); + if (!is_number(c) && !is_comment_prefix(c) && !is_eol(c)) { + std::string error_msg( + "Unexpected symbol '" + std::to_string(c) + + "' after CTF Value at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Grab CTF value + if (ctf_index != CTFValueIndexUninitialized) { + if (input_stream.storage == CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected CTF Value format. Dense format was expected but " + "a sparse one was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } else { + if (input_stream.storage != CTFDataStorage::Dense) { + std::string error_msg( + "Unexpected CTF Value format. Sparse format was expected but " + "a dense one was found at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + } + ctf_value = static_cast( + std::stod(std::string(scratch_.begin(), scratch_.begin() + idx))); +#ifdef CTF_DEBUG + std::cout << "Found CTF Value '" << ctf_value << "' at position " + << reader_->get_position() << std::endl; +#endif + + if (input_stream.storage == CTFDataStorage::Dense) { + CTFDenseInputStreamData* dense_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + dense_data->data.emplace_back(ctf_value); + } else { + CTFSparseInputStreamData* sparse_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + // std::cerr << "data.emplace_back(" << ctf_value << ") for input stream + // id " << input_stream.__id__ << std::endl; + sparse_data->data.emplace_back(ctf_value); + sparse_data->indices.emplace_back(ctf_index); + } + return true; + } + + bool discard_comment(void) { +#ifdef CTF_DEBUG + // For logging purposes + size_t initial_pos = reader_->get_position(); +#endif + + // Used for matching quotes inside a comment + // Helps detecting end of comment + size_t quote_count = 0; + + // CTF Comment must start with |# + char c = reader_->get_char(); + if (!is_comment_prefix(c)) { +#ifdef CTF_DEBUG + std::cout << "Not a CTF Comment at position " << initial_pos << std::endl; +#endif + reader_->rewind_char(); + return false; + } + + c = reader_->get_char(); + if (!is_comment_suffix(c)) { + std::string error_msg( + "Not a CTF Comment at position " + + std::to_string(reader_->get_position())); +#ifdef CTF_DEBUG + std::cout << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + + // Get all consecutive digits and alpha 
characters + while (!is_eol(reader_->peek_char())) { + c = reader_->peek_char(); + // Comment symbol can show up when properly escaped + if (is_escape_delimiter(c)) { + ++quote_count; + } + + // If new ctf sample is found, end current comment + if (is_name_prefix(c) && (quote_count % 2 == 0)) { + break; + } + + c = reader_->get_char(); + } + +#ifdef CTF_DEBUG + std::cout << "Skipping CTF Comment at position " << reader_->get_position() + << std::endl; +#endif + return true; + } + + bool get_input_stream(const CTFInputStreamInformation& input_stream) { + // Adds a new row start for the input stream + if (input_stream.storage == CTFDataStorage::Sparse) { + CTFSparseInputStreamData* sparse_data = + static_cast*>( + (dataset_->sequences.back())[input_stream.__id__].get()); + } + + // Get them all and push to the right stream + while (!is_name_prefix(reader_->peek_char()) && + !is_comment_prefix(reader_->peek_char()) && + !is_eol(reader_->peek_char())) { + if (!get_input_stream_value(input_stream)) { +#ifdef CTF_DEBUG + std::cout << "CTF Value not found. An empty one will be used." + << std::endl; +#endif + } + } + + return true; + } + + // type for CTF values + CTFDataType data_type_; + // dataset holding all parsed entries + std::shared_ptr> dataset_; + // responsible for reading the CTF file + std::shared_ptr reader_; + // Local buffer for string parsing + const size_t CTF_SCRATCH_LENGTH = 128; + std::vector scratch_; + // Used to decide whether the first row of the CTF file has a Sequence ID + bool has_initial_sequence_id_; + // Used to detect when a sequence is over + CTFSequenceID previous_sequence_id_; +}; + +} // namespace ctf +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/include/torch/data/ctf/reader.h b/torch/csrc/api/include/torch/data/ctf/reader.h new file mode 100644 index 000000000000..2974a6e885e8 --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/reader.h @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +// TODO: Should we use memory-mapped files to speed up buffering? + +/// A sequential text reader to feed the CTF parser +/// +/// The C file API was used due to performance constraints +/// Current implementation caches chunks of data from file in memory +/// and parses CTF from it.
When it gets empty, buffer is refilled and the cycle +/// is repeated until EOF is reached +/// +class Reader { + public: + virtual ~Reader(); + explicit Reader(const std::string& filename); + + inline bool can_read(void) const { + return (!is_buffer_empty() || can_buffer()); + } + inline const char& peek_char(void) { + if (is_buffer_empty()) { + refill(); + } + if (rewinded_char_) { + return previous_char_; + } else { + return buffer_[buffer_pos_]; + } + } + inline const char& get_char(void) { + if (buffer_pos_ > 0) { + previous_char_ = buffer_[buffer_pos_ - 1]; + } + if (is_buffer_empty()) { + refill(); + } + if (rewinded_char_) { + rewinded_char_ = false; + return previous_char_; + } else { + return buffer_[buffer_pos_++]; + } + } + inline const size_t& get_position(void) const { + return buffer_pos_; + } + inline void rewind_char(void) { + rewinded_char_ = true; + } + + private: + /// File handling + bool refill(void); + inline bool can_buffer(void) const { + return (!is_eof_); + } + inline bool is_buffer_empty(void) const { + return ((buffer_size_ == 0) || (buffer_size_ == buffer_pos_)); + } + std::string filename_; + std::shared_ptr file_; + bool is_eof_; + + /// Buffer handling buffer_size must be big enough + /// to fit a really long line on the CTF file + const size_t CTF_MAX_BUFFER_SIZE = 2 * 1024 * 1024; + std::vector buffer_; + size_t buffer_pos_; + size_t buffer_size_; + bool rewinded_char_; + char previous_char_; + + Reader() = delete; + DISALLOW_COPY_AND_ASSIGN(Reader); +}; + +static const char SPACE_CHAR = ' '; +static const char TAB_CHAR = '\t'; +static const char NAME_PREFIX = '|'; +static const char INDEX_DELIMITER = ':'; +static const char ESCAPE_SYMBOL = '#'; + +inline bool is_name_prefix(const char& c) { + return (c == NAME_PREFIX); +} + +inline bool is_comment_prefix(const char& c) { + return (is_name_prefix(c)); +} + +inline bool is_comment_suffix(const char& c) { + return (c == '#'); +} + +inline bool is_decimal_point(const char& c) { + return (c == '.'); +} + +inline bool is_sparse_value_delimiter(const char& c) { + return (c == ':'); +} + +inline bool is_digit(const char& c) { + return (c >= '0' && c <= '9'); +} + +inline bool is_alpha(const char& c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +inline bool is_sign(const char& c) { + return c == '+' || c == '-'; +} + +inline bool is_exponent(const char& c) { + return c == 'e' || c == 'E'; +} + +inline bool is_number(const char& c) { + return (is_digit(c) || is_decimal_point(c) || is_sign(c) || is_exponent(c)); +} + +inline bool is_printable(const char& c) { + return c >= SPACE_CHAR; +} + +inline bool is_non_printable(const char& c) { + return !is_printable(c); +} + +inline bool is_value_delimiter(const char& c) { + return c == SPACE_CHAR || c == TAB_CHAR; +} + +inline bool is_eol(const char& c) { + return (c == '\r' || c == '\n'); +} + +inline bool is_escape_delimiter(const char& c) { + return (c == '\'' || c == '"'); +} + +inline bool is_column_delimiter(const char& c) { + return is_value_delimiter(c) || (is_non_printable(c) && !is_eol(c)); +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/torch/csrc/api/include/torch/data/ctf/utils.h b/torch/csrc/api/include/torch/data/ctf/utils.h new file mode 100644 index 000000000000..e57ba0109f6c --- /dev/null +++ b/torch/csrc/api/include/torch/data/ctf/utils.h @@ -0,0 +1,19 @@ +#pragma once + +namespace torch { +namespace data { +namespace ctf { + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + 
TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#define DISABLE_COPY_AND_MOVE(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName& operator=(const TypeName&) = delete; \ + TypeName(TypeName&&) = delete; \ + TypeName& operator=(TypeName&&) = delete + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file diff --git a/torch/csrc/api/include/torch/data/datasets.h b/torch/csrc/api/include/torch/data/datasets.h index 82c31fe96a58..df565e972358 100644 --- a/torch/csrc/api/include/torch/data/datasets.h +++ b/torch/csrc/api/include/torch/data/datasets.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/torch/csrc/api/include/torch/data/datasets/chunk.h b/torch/csrc/api/include/torch/data/datasets/chunk.h new file mode 100644 index 000000000000..04f4bb967ff2 --- /dev/null +++ b/torch/csrc/api/include/torch/data/datasets/chunk.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace datasets { + +/// A stateful dataset that supports hierarchical sampling and prefetching of +/// entire chunks. +/// +/// A chunk could be an entire file, such as an audio data file or an image, +/// or part of a file in the case of a large text file split based on seek +/// positions. +/// +/// Unlike a regular dataset, a chunk dataset requires two samplers to operate +/// and keeps internal state. The `ChunkSampler` selects which chunk to load +/// next, while the `ExampleSampler` determines the order of Examples returned +/// in each `get_batch` call. The hierarchical sampling approach used here is +/// inspired by this paper: http://martin.zinkevich.org/publications/nips2010.pdf +template < + typename Self, + typename Batch = std::vector>, + typename ChunkSampler = samplers::RandomSampler, + typename ExampleSampler = samplers::RandomSampler> +class ChunkDataSet : public StatefulDataset { + public: + using SelfType = Self; + using BatchType = Batch; + using ChunkSamplerType = ChunkSampler; + using ExampleSamplerType = ExampleSampler; + + /// Reads an entire chunk. A derived class needs to override this method. + virtual Batch read_chunk(size_t chunk_index) = 0; + + /// Returns the chunk sampler for this dataset. + virtual ChunkSampler get_chunk_sampler() = 0; + + /// Returns the example sampler for this dataset. + virtual ExampleSampler get_example_sampler() = 0; + + /// Returns the number of chunks available in this dataset. + virtual size_t get_chunk_count() = 0; + + /// Default get_batch method of BatchDataSet. This method returns Example + /// batches created from the preloaded chunks. The implementation is dataset + /// agnostic and does not need overriding in different chunk datasets. + optional get_batch(size_t batch_size) override { + // Temporary: tests will have a simple implementation. + return torch::nullopt; + } + + /// This will clear any internal state and start the internal prefetching + /// mechanism for the chunk dataset. + virtual void reset() {} + + /// size() is not used for chunk datasets.
+ optional size() const override { + return torch::nullopt; + } +}; +} // namespace datasets +} // namespace data +} // namespace torch diff --git a/torch/csrc/api/src/data/ctf/reader.cpp b/torch/csrc/api/src/data/ctf/reader.cpp new file mode 100644 index 000000000000..1240478d05b5 --- /dev/null +++ b/torch/csrc/api/src/data/ctf/reader.cpp @@ -0,0 +1,85 @@ +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace data { +namespace ctf { + +/* + * Reader class for CTF + * + * RAII pattern was used for file descriptor + */ + +Reader::~Reader() {} + +Reader::Reader(const std::string& filename) + : filename_(filename), + is_eof_(false), + buffer_pos_(0), + buffer_size_(0), + rewinded_char_(false), + previous_char_(0) { + std::FILE* const tmp = fopen(filename_.c_str(), "rbS"); + if (!tmp) { + std::string error_msg( + "Reader could not open the specified file (" + filename + ")"); +#ifdef CTF_DEBUG + std::cerr << error_msg << std::endl; +#endif + throw std::runtime_error(error_msg); + } + file_ = std::shared_ptr(tmp, std::fclose); + + buffer_.resize(Reader::CTF_MAX_BUFFER_SIZE); + refill(); +} + +bool Reader::refill(void) { + if (!is_buffer_empty()) { +#ifdef CTF_DEBUG + std::cout << "Buffer is not empty yet. Not refilling it" << std::endl; +#endif + return false; + } + if (!can_buffer()) { +#ifdef CTF_DEBUG + std::cout << "Nothing to read from file " << filename_ << ". (" + << strerror(errno) << ")"; +#endif + return false; + } + + buffer_pos_ = 0; + + size_t bytes_read = + std::fread(&buffer_[0], 1, Reader::CTF_MAX_BUFFER_SIZE, file_.get()); + + if (feof(file_.get()) != 0) { + is_eof_ = true; + } + + if (bytes_read != Reader::CTF_MAX_BUFFER_SIZE && !is_eof_) { + std::string error_msg( + "Error reading file " + filename_ + ". " + strerror(errno)); +#ifdef CTF_DEBUG + std::cerr << error_msg << buffer_pos_ << std::endl; +#endif + throw std::runtime_error(error_msg); + } + buffer_size_ = bytes_read; +#ifdef CTF_DEBUG + std::cout << "Buffer refilled. Read " << std::to_string(bytes_read) + << " from file " << filename_ << std::endl; +#endif + return true; +} + +} // namespace ctf +} // namespace data +} // namespace torch \ No newline at end of file
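Taken together, the new pieces are meant to be used roughly as in the dataloader tests above. The following condensed sketch shows the intended flow; it is illustrative only: the file path is a placeholder, template arguments elided in the hunks above are filled in with the defaults declared in ctf_chunk_dataset.h, and the empty-batch caveat noted in the tests still applies.

    // Describe the streams present in the CTF file (sparse "word" features,
    // sparse "tag" labels), then point a configuration at the file.
    std::vector<torch::data::ctf::CTFInputStreamInformation> input_streams;
    input_streams.emplace_back(
        "word", "word", 0,
        torch::data::ctf::CTFInputStreamType::Feature,
        torch::data::ctf::CTFDataStorage::Sparse);
    input_streams.emplace_back(
        "tag", "tag", 0,
        torch::data::ctf::CTFInputStreamType::Label,
        torch::data::ctf::CTFDataStorage::Sparse);
    torch::data::ctf::CTFConfiguration config(
        "path/to/sample.ctf", // placeholder path
        input_streams,
        torch::data::ctf::CTFDataType(torch::data::ctf::CTFDataType::Double));

    // Wrap the CTF chunk dataset so it can be shared across worker threads,
    // then build a chunk-aware data loader on top of it.
    auto dataset = torch::data::datasets::make_shared_dataset<
        torch::data::ctf::CTFChunkDataset<double>>(config);
    auto loader = torch::data::make_chunk_data_loader(
        dataset,
        torch::data::DataLoaderOptions()
            .batch_size(1)
            .workers(1)
            .chunk_loading(true));

    // Each non-empty batch is a std::vector<CTFSequenceData>; every sequence
    // holds one entry per configured input stream ("word" first, "tag" second).
    dataset->reset();
    for (auto& batch : *loader) {
      // consume batch ...
    }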