diff --git a/Makefile b/Makefile index c77ad2a..6c586a9 100644 --- a/Makefile +++ b/Makefile @@ -3,18 +3,29 @@ CXXFLAGS = -std=c++17 -Wall -Wextra -O2 -I. BUILD_DIR = build -DBMS_OBJS = $(BUILD_DIR)/main.o $(BUILD_DIR)/src/parser.o +DBMS_OBJS = $(BUILD_DIR)/main.o \ + $(BUILD_DIR)/src/parser.o \ + $(BUILD_DIR)/src/storage/disk_manager.o \ + $(BUILD_DIR)/src/storage/buffer_pool.o \ + $(BUILD_DIR)/src/storage/slotted_page.o \ + $(BUILD_DIR)/src/storage/heap_file.o \ + $(BUILD_DIR)/src/sql/tuple.o \ + $(BUILD_DIR)/src/sql/catalog.o TEST_OBJS = $(BUILD_DIR)/tests/test_parser.o \ $(BUILD_DIR)/tests/storage/test_disk_manager.o \ $(BUILD_DIR)/tests/storage/test_buffer_pool.o \ $(BUILD_DIR)/tests/storage/test_slotted_page.o \ $(BUILD_DIR)/tests/storage/test_heap_file.o \ $(BUILD_DIR)/tests/storage/test_integration.o \ + $(BUILD_DIR)/tests/sql/test_tuple.o \ + $(BUILD_DIR)/tests/sql/test_catalog.o \ $(BUILD_DIR)/src/parser.o \ $(BUILD_DIR)/src/storage/disk_manager.o \ $(BUILD_DIR)/src/storage/buffer_pool.o \ $(BUILD_DIR)/src/storage/slotted_page.o \ - $(BUILD_DIR)/src/storage/heap_file.o + $(BUILD_DIR)/src/storage/heap_file.o \ + $(BUILD_DIR)/src/sql/tuple.o \ + $(BUILD_DIR)/src/sql/catalog.o dbms: $(DBMS_OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ diff --git a/main.cpp b/main.cpp index 784c7d9..67be420 100644 --- a/main.cpp +++ b/main.cpp @@ -1,11 +1,22 @@ #include "src/parser.h" +#include "src/sql/catalog.h" +#include "src/sql/tuple.h" +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/heap_file.h" +#include +#include #include #include #include +#include #include -// Pretty-print a parsed SelectQuery to stdout in a stable, debuggable format. +// ============================================================================= +// Parser demo — parse a handful of SQL strings and print the resulting AST. 
+// ============================================================================= + static void printQuery(const SelectQuery& q) { std::cout << " columns: "; if (q.select_all) { @@ -34,9 +45,8 @@ static void printQuery(const SelectQuery& q) { } } -// Driver: parse a handful of example queries and print the resulting AST. -// The last query is intentionally malformed to exercise the error path. -int main() { +static void runParserDemo() { + std::cout << "=== Parser demo ==========================================\n\n"; const std::vector queries = { "SELECT id, name FROM users WHERE age > 18", "SELECT * FROM products", @@ -46,7 +56,6 @@ int main() { "SELECT * FROM a JOIN b ON a.x = b.x JOIN c ON b.y = c.y WHERE c.z > 0", "SELECT FROM users", }; - for (const auto& sql : queries) { std::cout << "SQL: " << sql << "\n"; try { @@ -54,11 +63,112 @@ int main() { SelectQuery q = p.parse(); printQuery(q); } catch (const std::exception& e) { - // Lex or parse error — keep going so remaining examples still run. std::cout << " error: " << e.what() << "\n"; } std::cout << "\n"; } +} + +// ============================================================================= +// Storage demo — exercise the full storage + catalog stack end-to-end: +// +// 1. open a brand-new database file +// 2. create the catalog (allocates __tables and __columns at pages 0/1) +// 3. createTable("users", schema) +// 4. insert a few rows via TupleCodec → HeapFile +// 5. flush, drop the in-memory state, reopen +// 6. 
open the catalog with no extra information, look up "users", +// open its heap file, scan and print every row +// ============================================================================= + +namespace { + +void seedUsers(BufferPool& bp, const Catalog::TableInfo& info) { + const std::vector> rows = { + {1, "alice", 30}, + {2, "bob", 25}, + {3, "carol", 40}, + {4, "dave", 19}, + {5, "eve", 33}, + }; + HeapFile hf(&bp, info.root_page); + for (const auto& [id, name, age] : rows) { + const auto bytes = TupleCodec::encode(info.schema, { + Value::Int32(id), + Value::Text(name), + Value::Int32(age), + }); + hf.insert(bytes.data(), bytes.size()); + } + std::cout << " inserted " << rows.size() << " rows into 'users'\n"; +} + +void scanUsers(BufferPool& bp, const Catalog::TableInfo& info) { + HeapFile hf(&bp, info.root_page); + for (const auto& [rid, bytes] : hf) { + const auto vals = TupleCodec::decode(info.schema, bytes.data(), bytes.size()); + std::cout << " rid=(" << rid.page_id << "," << rid.slot_id << ")" + << " id=" << vals[0].i32 + << " name=" << vals[1].text + << " age=" << vals[2].i32 << "\n"; + } +} + +} // namespace +static void runStorageDemo() { + std::cout << "=== Storage demo =========================================\n\n"; + const std::string path = "/tmp/dbms_demo.db"; + + // Start from a clean slate so the demo is reproducible. 
+ std::error_code ec; + std::filesystem::remove(path, ec); + + // ----- Phase 1: create + populate ------------------------------------ + { + DiskManager dm(path); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + const Schema users_schema{{ + {"id", Type::Int32, false}, + {"name", Type::Text, false}, + {"age", Type::Int32, true}, + }}; + cat.createTable("users", users_schema); + + std::cout << "[phase 1] created catalog + table 'users'\n"; + std::cout << " __tables is at page " << Catalog::TABLES_ROOT << "\n"; + std::cout << " __columns is at page " << Catalog::COLUMNS_ROOT << "\n"; + std::cout << " users heap is at page " + << cat.getTable("users")->root_page << "\n"; + + seedUsers(bp, *cat.getTable("users")); + bp.flushAll(); + std::cout << " flushed; file size = " + << std::filesystem::file_size(path) << " bytes\n\n"; + } + + // ----- Phase 2: cold reopen + scan ----------------------------------- + DiskManager dm(path); + BufferPool bp(8, &dm); + Catalog cat(&bp); // bootstrap from pages 0 + 1 + + std::cout << "[phase 2] reopened database; tables in catalog:"; + for (const auto& n : cat.tableNames()) std::cout << " " << n; + std::cout << "\n"; + + const auto* info = cat.getTable("users"); + std::cout << " scanning 'users':\n"; + scanUsers(bp, *info); + std::cout << "\n"; + + // Tidy up so successive runs always start clean. 
+ std::filesystem::remove(path, ec); +} + +int main() { + // runParserDemo(); + runStorageDemo(); return 0; } diff --git a/src/sql/catalog.cpp b/src/sql/catalog.cpp new file mode 100644 index 0000000..753527e --- /dev/null +++ b/src/sql/catalog.cpp @@ -0,0 +1,161 @@ +#include "src/sql/catalog.h" + +#include +#include +#include +#include +#include + +Schema Catalog::tablesSchema() { + return Schema{{ + {"table_id", Type::Int32, false}, + {"name", Type::Text, false}, + {"first_page_id", Type::Int64, false}, + }}; +} + +Schema Catalog::columnsSchema() { + return Schema{{ + {"table_id", Type::Int32, false}, + {"position", Type::Int32, false}, + {"name", Type::Text, false}, + {"type", Type::Int32, false}, + {"nullable", Type::Bool, false}, + }}; +} + +Catalog::Catalog(BufferPool* bp) + : bp_(bp), + tables_hf_(bp, TABLES_ROOT), + columns_hf_(bp, COLUMNS_ROOT), + next_table_id_(0) { + loadFromDisk(); +} + +Catalog Catalog::create(BufferPool* bp) { + // Allocating __tables must yield page 0; __columns must yield page 1. + // Anything else means the database already had data, in which case the + // bootstrap convention is broken and we'd silently mis-read on reopen. + HeapFile tables_hf = HeapFile::create(bp); + if (tables_hf.firstPageId() != TABLES_ROOT) { + throw std::runtime_error( + "Catalog::create: __tables did not land at page 0; " + "is the database already initialized?"); + } + HeapFile columns_hf = HeapFile::create(bp); + if (columns_hf.firstPageId() != COLUMNS_ROOT) { + throw std::runtime_error( + "Catalog::create: __columns did not land at page 1; " + "is the database already initialized?"); + } + return Catalog(bp); +} + +void Catalog::loadFromDisk() { + const Schema tables_s = tablesSchema(); + const Schema columns_s = columnsSchema(); + + // Pass 1: pull every row out of __tables. Schemas are filled in during pass 3. 
+ std::vector infos; + for (auto it = tables_hf_.begin(); it != tables_hf_.end(); ++it) { + const auto& bytes = it->second; + auto vals = TupleCodec::decode(tables_s, bytes.data(), bytes.size()); + TableInfo info; + info.table_id = vals[0].i32; + info.name = vals[1].text; + info.root_page = static_cast(vals[2].i64); + infos.push_back(std::move(info)); + } + + // Pass 2: collect column rows from __columns, group by table_id, sort by + // position. Sorting by position lets us reconstruct the user's column + // order even if the catalog was edited out of order across crashes. + struct ColRow { + int32_t position; + std::string name; + Type type; + bool nullable; + }; + std::map> by_table; + for (auto it = columns_hf_.begin(); it != columns_hf_.end(); ++it) { + const auto& bytes = it->second; + auto vals = TupleCodec::decode(columns_s, bytes.data(), bytes.size()); + const int32_t tid = vals[0].i32; + ColRow c{vals[1].i32, vals[2].text, typeFromCode(vals[3].i32), vals[4].b}; + by_table[tid].push_back(std::move(c)); + } + + // Pass 3: stitch each table's columns into its Schema, install in cache, + // and track the next free table_id. + int32_t max_id = -1; + for (auto& info : infos) { + auto cols_it = by_table.find(info.table_id); + if (cols_it != by_table.end()) { + auto& cols = cols_it->second; + std::sort(cols.begin(), cols.end(), + [](const ColRow& a, const ColRow& b) { + return a.position < b.position; + }); + info.schema.columns.reserve(cols.size()); + for (auto& c : cols) { + info.schema.columns.push_back({std::move(c.name), c.type, c.nullable}); + } + } + max_id = std::max(max_id, info.table_id); + tables_.emplace(info.name, std::move(info)); + } + next_table_id_ = max_id + 1; +} + +void Catalog::createTable(const std::string& name, Schema schema) { + if (hasTable(name)) { + throw std::runtime_error("Catalog: table '" + name + "' already exists"); + } + + // Allocate the user heap file first. 
If we crash before recording in + // the catalog, the worst case is one orphan page; the alternative + // (record-first, allocate-after) leaves dangling rows pointing at + // nothing. + HeapFile new_hf = HeapFile::create(bp_); + const PageId root = new_hf.firstPageId(); + const int32_t table_id = next_table_id_++; + + // Append to __tables. + { + const auto bytes = TupleCodec::encode(tablesSchema(), { + Value::Int32(table_id), + Value::Text(name), + Value::Int64(static_cast(root)), + }); + tables_hf_.insert(bytes.data(), bytes.size()); + } + + // Append one row per column to __columns. Position is the column's + // index in the user's schema and is what we sort by on reload. + const Schema cs = columnsSchema(); + for (size_t i = 0; i < schema.columns.size(); ++i) { + const auto& col = schema.columns[i]; + const auto bytes = TupleCodec::encode(cs, { + Value::Int32(table_id), + Value::Int32(static_cast(i)), + Value::Text(col.name), + Value::Int32(typeToCode(col.type)), + Value::Bool(col.nullable), + }); + columns_hf_.insert(bytes.data(), bytes.size()); + } + + tables_.emplace(name, TableInfo{table_id, name, std::move(schema), root}); +} + +const Catalog::TableInfo* Catalog::getTable(const std::string& name) const { + auto it = tables_.find(name); + return it == tables_.end() ? 
nullptr : &it->second; +} + +std::vector Catalog::tableNames() const { + std::vector out; + out.reserve(tables_.size()); + for (const auto& kv : tables_) out.push_back(kv.first); + return out; +} diff --git a/src/sql/catalog.h b/src/sql/catalog.h new file mode 100644 index 0000000..4f33fd5 --- /dev/null +++ b/src/sql/catalog.h @@ -0,0 +1,88 @@ +#pragma once + +#include "src/sql/tuple.h" +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/heap_file.h" + +#include +#include +#include +#include +#include + +// Persistent table catalog, stored as two ordinary heap files: +// +// __tables (table_id, name, first_page_id) +// __columns(table_id, position, name, type, nullable) +// +// One row per user table in __tables, one row per column in __columns. +// These system tables live at hard-coded page ids — page 0 is __tables, +// page 1 is __columns. That hard-coding is the bootstrap: opening a +// database is "open those two heap files at known offsets and walk them." +// +// On startup the catalog is fully reconstructed in memory and serves as +// a read-through cache. Mutations (createTable) update both system +// tables on disk and the in-memory cache. +// +// __tables and __columns are not themselves listed in __tables — they +// live below the catalog. This avoids the chicken-and-egg of needing +// the catalog to read the catalog. +// +// Single-process. If you have two Catalog instances open on the same +// database file they will both load on construction but won't see each +// other's edits. +class Catalog { +public: + // Hard-coded bootstrap pages. They are the first two pages allocated + // on a brand-new database file by Catalog::create. + static constexpr PageId TABLES_ROOT = 0; + static constexpr PageId COLUMNS_ROOT = 1; + + struct TableInfo { + int32_t table_id; + std::string name; + Schema schema; + PageId root_page; + }; + + // Open an existing catalog. 
Reads __tables and __columns at the + // bootstrap page ids and rebuilds the in-memory cache. + explicit Catalog(BufferPool* bp); + + // Allocate the bootstrap pages on a brand-new database and return a + // ready-to-use Catalog. Throws if the disk already has pages + // allocated, since that would leave __tables and __columns at the + // wrong page ids. + static Catalog create(BufferPool* bp); + + // Allocate a heap file for `name`, append one row to __tables and + // schema.columns.size() rows to __columns, update the cache. + // Throws if `name` already exists. + void createTable(const std::string& name, Schema schema); + + bool hasTable(const std::string& name) const { + return tables_.count(name) != 0; + } + + // Returns nullptr when absent. The pointer is stable across later + // createTable calls (unordered_map insertions don't invalidate + // references) and remains valid for this Catalog's lifetime. + const TableInfo* getTable(const std::string& name) const; + + std::vector tableNames() const; + +private: + BufferPool* bp_; + HeapFile tables_hf_; + HeapFile columns_hf_; + std::unordered_map tables_; + int32_t next_table_id_; + + // Hard-coded schemas of the two system tables. + static Schema tablesSchema(); + static Schema columnsSchema(); + + // Walk __tables + __columns and populate `tables_` and `next_table_id_`. + void loadFromDisk(); +}; diff --git a/src/sql/tuple.cpp b/src/sql/tuple.cpp new file mode 100644 index 0000000..f963a45 --- /dev/null +++ b/src/sql/tuple.cpp @@ -0,0 +1,287 @@ +#include "src/sql/tuple.h" + +#include +#include +#include +#include + +namespace { + +// Byte width of a fixed-width column type. Text is variable-length and is +// rejected here; tupleSize() (the only caller) is documented as fixed-only. 
+size_t widthOf(Type t) { + switch (t) { + case Type::Int32: return 4; + case Type::Int64: return 8; + case Type::Bool: return 1; + case Type::Text: + throw std::runtime_error("tuple codec: tupleSize() is undefined for " + "schemas containing Text columns"); + } + throw std::runtime_error("tuple codec: unknown type"); +} + +} // namespace + +size_t Schema::indexOf(const std::string& name) const { + for (size_t i = 0; i < columns.size(); ++i) { + if (columns[i].name == name) return i; + } + return kNotFound; +} + +size_t Schema::tupleSize() const { + const size_t bitmap = (columns.size() + 7) / 8; + size_t data = 0; + for (const auto& c : columns) data += widthOf(c.type); + return bitmap + data; +} + +Value Value::Int32(int32_t v) { + Value r; + r.type = Type::Int32; + r.is_null = false; + r.i32 = v; + return r; +} + +Value Value::Int64(int64_t v) { + Value r; + r.type = Type::Int64; + r.is_null = false; + r.i64 = v; + return r; +} + +Value Value::Bool(bool v) { + Value r; + r.type = Type::Bool; + r.is_null = false; + r.b = v; + return r; +} + +Value Value::Text(std::string v) { + Value r; + r.type = Type::Text; + r.is_null = false; + r.text = std::move(v); + return r; +} + +Value Value::Null(Type t) { + Value r; + r.type = t; + r.is_null = true; + return r; +} + +int32_t typeToCode(Type t) { + switch (t) { + case Type::Int32: return 0; + case Type::Int64: return 1; + case Type::Bool: return 2; + case Type::Text: return 3; + } + throw std::runtime_error("typeToCode: unknown Type"); +} + +Type typeFromCode(int32_t c) { + switch (c) { + case 0: return Type::Int32; + case 1: return Type::Int64; + case 2: return Type::Bool; + case 3: return Type::Text; + default: + throw std::runtime_error("typeFromCode: invalid type code " + + std::to_string(c)); + } +} + +bool operator==(const Value& a, const Value& b) { + if (a.type != b.type) return false; + if (a.is_null != b.is_null) return false; + if (a.is_null) return true; + switch (a.type) { + case Type::Int32: return a.i32 
== b.i32; + case Type::Int64: return a.i64 == b.i64; + case Type::Bool: return a.b == b.b; + case Type::Text: return a.text == b.text; + } + return false; +} + +std::vector TupleCodec::encode(const Schema& s, const std::vector& vals) { + if (vals.size() != s.columns.size()) { + throw std::runtime_error( + "TupleCodec::encode: column count mismatch (got " + + std::to_string(vals.size()) + ", expected " + + std::to_string(s.columns.size()) + ")"); + } + + const size_t n = s.columns.size(); + const size_t bitmap_bytes = (n + 7) / 8; + + // Validate everything before allocating so encode either fully succeeds + // or fully fails with a clear error. + for (size_t i = 0; i < n; ++i) { + const auto& col = s.columns[i]; + const auto& v = vals[i]; + if (v.is_null && !col.nullable) { + throw std::runtime_error( + "TupleCodec::encode: column '" + col.name + "' is not nullable"); + } + if (!v.is_null && v.type != col.type) { + throw std::runtime_error( + "TupleCodec::encode: column '" + col.name + "' type mismatch"); + } + if (col.type == Type::Text && !v.is_null && + v.text.size() > std::numeric_limits::max()) { + throw std::runtime_error( + "TupleCodec::encode: Text value exceeds 4 GB length limit"); + } + } + + // Compute total size once, so we allocate exactly. + size_t total = bitmap_bytes; + for (size_t i = 0; i < n; ++i) { + const auto& col = s.columns[i]; + const auto& v = vals[i]; + switch (col.type) { + case Type::Int32: total += 4; break; + case Type::Int64: total += 8; break; + case Type::Bool: total += 1; break; + case Type::Text: total += 4 + (v.is_null ? 0u : v.text.size()); break; + } + } + + // Zero-init so null slots have deterministic bytes. + std::vector out(total, 0); + + // Null bitmap. + for (size_t i = 0; i < n; ++i) { + if (vals[i].is_null) { + out[i / 8] |= static_cast(1u << (i % 8)); + } + } + + // Payload. 
+ char* p = out.data() + bitmap_bytes; + for (size_t i = 0; i < n; ++i) { + const auto& col = s.columns[i]; + const auto& v = vals[i]; + switch (col.type) { + case Type::Int32: { + if (!v.is_null) std::memcpy(p, &v.i32, 4); + p += 4; + break; + } + case Type::Int64: { + if (!v.is_null) std::memcpy(p, &v.i64, 8); + p += 8; + break; + } + case Type::Bool: { + if (!v.is_null) { + uint8_t x = v.b ? 1 : 0; + std::memcpy(p, &x, 1); + } + p += 1; + break; + } + case Type::Text: { + const uint32_t text_len = v.is_null + ? 0u + : static_cast(v.text.size()); + std::memcpy(p, &text_len, 4); + p += 4; + if (!v.is_null && text_len > 0) { + std::memcpy(p, v.text.data(), text_len); + p += text_len; + } + break; + } + } + } + return out; +} + +std::vector TupleCodec::decode(const Schema& s, const char* bytes, size_t len) { + const size_t n = s.columns.size(); + const size_t bitmap_bytes = (n + 7) / 8; + + if (len < bitmap_bytes) { + throw std::runtime_error( + "TupleCodec::decode: input shorter than null bitmap"); + } + + const char* end = bytes + len; + const char* p = bytes + bitmap_bytes; + + std::vector out; + out.reserve(n); + + for (size_t i = 0; i < n; ++i) { + const auto& col = s.columns[i]; + const bool is_null = + (static_cast(bytes[i / 8]) >> (i % 8)) & 1u; + + Value v; + v.type = col.type; + v.is_null = is_null; + + switch (col.type) { + case Type::Int32: { + if (end - p < 4) { + throw std::runtime_error("TupleCodec::decode: truncated Int32"); + } + if (!is_null) std::memcpy(&v.i32, p, 4); + p += 4; + break; + } + case Type::Int64: { + if (end - p < 8) { + throw std::runtime_error("TupleCodec::decode: truncated Int64"); + } + if (!is_null) std::memcpy(&v.i64, p, 8); + p += 8; + break; + } + case Type::Bool: { + if (end - p < 1) { + throw std::runtime_error("TupleCodec::decode: truncated Bool"); + } + if (!is_null) { + uint8_t x; + std::memcpy(&x, p, 1); + v.b = (x != 0); + } + p += 1; + break; + } + case Type::Text: { + if (end - p < 4) { + throw std::runtime_error( + 
"TupleCodec::decode: truncated Text length prefix"); + } + uint32_t text_len; + std::memcpy(&text_len, p, 4); + p += 4; + if (static_cast(end - p) < text_len) { + throw std::runtime_error( + "TupleCodec::decode: truncated Text payload"); + } + if (!is_null) v.text.assign(p, text_len); + p += text_len; + break; + } + } + out.push_back(std::move(v)); + } + + if (p != end) { + throw std::runtime_error( + "TupleCodec::decode: trailing bytes after schema"); + } + return out; +} diff --git a/src/sql/tuple.h b/src/sql/tuple.h new file mode 100644 index 0000000..01c263d --- /dev/null +++ b/src/sql/tuple.h @@ -0,0 +1,88 @@ +#pragma once + +#include +#include +#include +#include + +// SQL column types we know how to serialize. +// Int32 / Int64 / Bool — fixed width. +// Text — variable width, length-prefixed (uint32 + bytes). +enum class Type { Int32, Int64, Bool, Text }; + +struct Column { + std::string name; + Type type; + bool nullable; +}; + +// A row's schema: an ordered list of columns. Pure value type; no I/O. +struct Schema { + std::vector columns; + + // Sentinel returned by indexOf when the name isn't present. Mirrors + // std::string::npos's "size_t with all bits set" convention. + static constexpr size_t kNotFound = static_cast(-1); + + // Find a column by name. Returns kNotFound if absent. + size_t indexOf(const std::string& name) const; + + // Bytes a tuple of this schema occupies on disk *if every column is + // fixed-width*. Throws when the schema contains a Text column, since + // the encoded size then depends on the actual values. Use the size of + // TupleCodec::encode(...) instead for variable-length schemas. + size_t tupleSize() const; +}; + +// A runtime value. The union is only safe to read for the discriminator's +// type when is_null is false; otherwise the payload is unset. 
+struct Value { + Type type; + bool is_null; + union { + int32_t i32; + int64_t i64; + bool b; + }; + std::string text; // kept outside the union (std::string is non-trivial); payload for Type::Text + + // Convenience builders. Make tests and call-sites read cleanly. + static Value Int32(int32_t v); + static Value Int64(int64_t v); + static Value Bool(bool v); + static Value Text(std::string v); + static Value Null(Type t); +}; + +bool operator==(const Value& a, const Value& b); +inline bool operator!=(const Value& a, const Value& b) { return !(a == b); } + +// Stable on-disk integer encoding of a Type, used anywhere the type +// itself needs to be persisted (the catalog's __columns table is the +// current consumer). The numeric values are part of the file format — +// never reorder or reuse them. +int32_t typeToCode(Type t); +Type typeFromCode(int32_t c); + +// Stateless codec converting a row of Values to/from the byte sequence +// stored in a SlottedPage's tuple area. Layout: +// +// [ null bitmap : ceil(N/8) bytes ][ col0 bytes ][ col1 bytes ] ... +// +// Each column's slot: +// Int32/Int64/Bool — full fixed width regardless of null status; the +// bitmap is authoritative for nullness. +// Text — uint32_t length prefix followed by `length` bytes. +// Null is encoded as length 0 with no payload (and +// the bitmap bit set); empty-but-non-null Text is +// length 0 with the bitmap bit clear. +class TupleCodec { +public: + // Throws std::runtime_error on column-count mismatch, type mismatch, + // null in a non-nullable column, or a Text value over the 4 GB length limit. + static std::vector encode(const Schema& s, const std::vector& vals); + + // Throws if the bytes are truncated mid-column, contain trailing bytes + // after the schema is satisfied, or are too short for the null bitmap. 
+ static std::vector decode(const Schema& s, const char* bytes, size_t len); +}; diff --git a/src/storage/README.md b/src/storage/README.md index 5a1bd08..6a3da1b 100644 --- a/src/storage/README.md +++ b/src/storage/README.md @@ -62,3 +62,7 @@ Each layer has its own test file under `tests/storage/`. The end-to-end persistence test lives in `tests/storage/test_integration.cpp` (loads 1000 rows, simulates a program restart by destroying every storage object, then scans the rows back). + + +## Data flow +![Data flow diagram](image.png) \ No newline at end of file diff --git a/src/storage/image.png b/src/storage/image.png new file mode 100644 index 0000000..722f94e Binary files /dev/null and b/src/storage/image.png differ diff --git a/tests/sql/test_catalog.cpp b/tests/sql/test_catalog.cpp new file mode 100644 index 0000000..bca5e70 --- /dev/null +++ b/tests/sql/test_catalog.cpp @@ -0,0 +1,334 @@ +#include "tests/vendor/doctest.h" + +#include "src/sql/catalog.h" +#include "src/sql/tuple.h" +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/heap_file.h" +#include "tests/test_util.h" + +#include +#include +#include +#include + +namespace { + +Schema makePersonSchema() { + return Schema{{ + {"id", Type::Int32, false}, + {"name", Type::Text, false}, + {"age", Type::Int32, true}, + }}; +} + +Schema makeOrderSchema() { + return Schema{{ + {"id", Type::Int64, false}, + {"customer", Type::Text, false}, + {"total_cents", Type::Int32, false}, + {"shipped", Type::Bool, false}, + }}; +} + +} // namespace + +TEST_CASE("Catalog::create allocates pages 0 and 1 as the bootstrap heap files") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + CHECK(Catalog::TABLES_ROOT == 0); + CHECK(Catalog::COLUMNS_ROOT == 1); + // Both system tables have been allocated. 
+ CHECK(dm.numPages() == 2); +} + +TEST_CASE("Catalog::create on a non-empty disk throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + + // Pretend something else allocated page 0 already. + dm.allocatePage(); + CHECK_THROWS_AS(Catalog::create(&bp), std::runtime_error); +} + +TEST_CASE("a fresh catalog reports no user tables") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + CHECK_FALSE(cat.hasTable("anything")); + CHECK(cat.getTable("anything") == nullptr); + CHECK(cat.tableNames().empty()); +} + +TEST_CASE("createTable registers a table that hasTable / getTable can find") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + cat.createTable("people", makePersonSchema()); + + CHECK(cat.hasTable("people")); + const auto* info = cat.getTable("people"); + REQUIRE(info != nullptr); + CHECK(info->name == "people"); + CHECK(info->table_id == 0); + REQUIRE(info->schema.columns.size() == 3); + CHECK(info->schema.columns[0].name == "id"); + CHECK(info->schema.columns[0].type == Type::Int32); + CHECK_FALSE(info->schema.columns[0].nullable); + CHECK(info->schema.columns[1].type == Type::Text); + CHECK(info->schema.columns[2].nullable); + // Brand-new table sits past the bootstrap pages. 
+ CHECK(info->root_page > Catalog::COLUMNS_ROOT); +} + +TEST_CASE("createTable with a duplicate name throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + cat.createTable("people", makePersonSchema()); + CHECK_THROWS_AS(cat.createTable("people", makePersonSchema()), + std::runtime_error); +} + +TEST_CASE("table_ids are assigned monotonically in creation order") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + cat.createTable("a", makePersonSchema()); + cat.createTable("b", makeOrderSchema()); + cat.createTable("c", makePersonSchema()); + + CHECK(cat.getTable("a")->table_id == 0); + CHECK(cat.getTable("b")->table_id == 1); + CHECK(cat.getTable("c")->table_id == 2); +} + +TEST_CASE("multiple tables coexist with distinct heap files") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + cat.createTable("people", makePersonSchema()); + cat.createTable("orders", makeOrderSchema()); + + auto names = cat.tableNames(); + REQUIRE(names.size() == 2); + std::sort(names.begin(), names.end()); + CHECK(names[0] == "orders"); + CHECK(names[1] == "people"); + + const auto* p = cat.getTable("people"); + const auto* o = cat.getTable("orders"); + REQUIRE(p); + REQUIRE(o); + CHECK(p->root_page != o->root_page); + CHECK(p->root_page != Catalog::TABLES_ROOT); + CHECK(p->root_page != Catalog::COLUMNS_ROOT); + CHECK(o->root_page != Catalog::TABLES_ROOT); + CHECK(o->root_page != Catalog::COLUMNS_ROOT); +} + +TEST_CASE("schema with all four column types round-trips through the catalog") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + + Schema mixed{{ + {"a", Type::Int32, true}, + {"b", Type::Int64, false}, + {"c", Type::Bool, true}, + {"d", Type::Text, false}, + }}; + cat.createTable("mixed", mixed); + + const auto* info = 
cat.getTable("mixed"); + REQUIRE(info); + REQUIRE(info->schema.columns.size() == 4); + CHECK(info->schema.columns[0].type == Type::Int32); + CHECK(info->schema.columns[0].nullable); + CHECK(info->schema.columns[1].type == Type::Int64); + CHECK_FALSE(info->schema.columns[1].nullable); + CHECK(info->schema.columns[2].type == Type::Bool); + CHECK(info->schema.columns[2].nullable); + CHECK(info->schema.columns[3].type == Type::Text); + CHECK_FALSE(info->schema.columns[3].nullable); +} + +TEST_CASE("catalog persists across DiskManager / BufferPool restart") { + TempFile tf; + { + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("people", makePersonSchema()); + cat.createTable("orders", makeOrderSchema()); + bp.flushAll(); + } + + // Cold reopen: only `tf.path()` survives across the boundary. + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat(&bp); + + CHECK(cat.hasTable("people")); + CHECK(cat.hasTable("orders")); + + const auto* p = cat.getTable("people"); + REQUIRE(p); + CHECK(p->table_id == 0); + REQUIRE(p->schema.columns.size() == 3); + CHECK(p->schema.columns[1].name == "name"); + CHECK(p->schema.columns[1].type == Type::Text); + CHECK(p->schema.columns[2].nullable); + + const auto* o = cat.getTable("orders"); + REQUIRE(o); + CHECK(o->table_id == 1); + REQUIRE(o->schema.columns.size() == 4); + CHECK(o->schema.columns[3].type == Type::Bool); +} + +TEST_CASE("table_ids do not collide with new tables created after restart") { + TempFile tf; + { + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("a", makePersonSchema()); + cat.createTable("b", makePersonSchema()); + bp.flushAll(); + } + + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat(&bp); + cat.createTable("c", makePersonSchema()); + + CHECK(cat.getTable("a")->table_id == 0); + CHECK(cat.getTable("b")->table_id == 1); + CHECK(cat.getTable("c")->table_id == 2); +} + 
+TEST_CASE("a registered table's heap file is independently usable") {
+    TempFile tf;
+    DiskManager dm(tf.path());
+    BufferPool bp(4, &dm);
+    Catalog cat = Catalog::create(&bp);
+    cat.createTable("people", makePersonSchema());
+
+    const auto* info = cat.getTable("people");
+    REQUIRE(info);
+
+    HeapFile people(&bp, info->root_page);  // heap file opened at the root page recorded in the catalog entry
+
+    auto bytes = TupleCodec::encode(info->schema, {
+        Value::Int32(1),
+        Value::Text("alice"),
+        Value::Int32(30),
+    });
+    RID r = people.insert(bytes.data(), bytes.size());
+
+    std::string out;
+    REQUIRE(people.get(r, &out));  // read the row back through the RID that insert() returned
+    auto vals = TupleCodec::decode(info->schema, out.data(), out.size());
+    CHECK(vals[0].i32 == 1);
+    CHECK(vals[1].text == "alice");
+    CHECK(vals[2].i32 == 30);
+}
+
+TEST_CASE("end-to-end: catalog + 100 rows, restart, full scan") {
+    TempFile tf;
+    {  // First session: dm/bp/cat and the heap file are destroyed at the closing brace.
+        DiskManager dm(tf.path());
+        BufferPool bp(4, &dm);
+        Catalog cat = Catalog::create(&bp);
+        cat.createTable("people", makePersonSchema());
+
+        const auto* info = cat.getTable("people");
+        REQUIRE(info);
+        HeapFile people(&bp, info->root_page);
+        for (int i = 0; i < 100; ++i) {
+            auto bytes = TupleCodec::encode(info->schema, {
+                Value::Int32(i),
+                Value::Text("name_" + std::to_string(i)),
+                Value::Int32(20 + (i % 50)),  // third field cycles through 20..69
+            });
+            people.insert(bytes.data(), bytes.size());
+        }
+        bp.flushAll();  // presumably writes buffered pages out to the file — confirm in BufferPool
+    }
+
+    DiskManager dm(tf.path());
+    BufferPool bp(4, &dm);
+    Catalog cat(&bp);
+
+    const auto* info = cat.getTable("people");
+    REQUIRE(info);
+    HeapFile people(&bp, info->root_page);
+
+    int count = 0;
+    int sum_ids = 0;
+    for (const auto& [rid, bytes] : people) {
+        (void)rid;  // the RID is not needed here; the cast marks the binding as deliberately unused
+        auto vals = TupleCodec::decode(info->schema, bytes.data(), bytes.size());
+        REQUIRE(vals.size() == 3);
+        sum_ids += vals[0].i32;
+        ++count;
+    }
+    CHECK(count == 100);
+    CHECK(sum_ids == 100 * 99 / 2);  // 0 + 1 + ... + 99
+}
+
+TEST_CASE("getTable pointer remains valid after subsequent createTable calls") {
+    TempFile tf;
+    DiskManager dm(tf.path());
+    BufferPool bp(4, &dm);
+    Catalog cat = Catalog::create(&bp);
+
+    cat.createTable("first", makePersonSchema());
+    const auto* first = cat.getTable("first");
+    REQUIRE(first);
+    const PageId first_root = first->root_page;  // snapshot taken before more tables are registered
+
+    cat.createTable("second", makeOrderSchema());
+    cat.createTable("third", makePersonSchema());
+
+    CHECK(first->name == "first");  // reads through the pointer obtained before the two later createTable calls
+    CHECK(first->root_page == first_root);
+}
+
+TEST_CASE("__tables and __columns hold the expected number of rows") {
+    TempFile tf;
+    DiskManager dm(tf.path());
+    BufferPool bp(4, &dm);
+    Catalog cat = Catalog::create(&bp);
+
+    cat.createTable("people", makePersonSchema());  // 3 columns
+    cat.createTable("orders", makeOrderSchema());   // 4 columns
+
+    // Reach into the system tables directly to confirm row counts.
+    HeapFile tables_hf(&bp, Catalog::TABLES_ROOT);
+    HeapFile columns_hf(&bp, Catalog::COLUMNS_ROOT);
+
+    int n_tables = 0;
+    for (auto it = tables_hf.begin(); it != tables_hf.end(); ++it) ++n_tables;
+    CHECK(n_tables == 2);  // one row per createTable call above
+
+    int n_columns = 0;
+    for (auto it = columns_hf.begin(); it != columns_hf.end(); ++it) ++n_columns;
+    CHECK(n_columns == 3 + 4);
+}
diff --git a/tests/sql/test_tuple.cpp b/tests/sql/test_tuple.cpp
new file mode 100644
index 0000000..0bf910e
--- /dev/null
+++ b/tests/sql/test_tuple.cpp
@@ -0,0 +1,298 @@
+#include "tests/vendor/doctest.h"
+
+#include "src/sql/tuple.h"
+
+#include <cstdint>    // int64_t
+#include <random>     // std::mt19937, std::uniform_int_distribution
+#include <stdexcept>  // std::runtime_error
+#include <string>     // std::string, std::to_string
+#include <vector>     // std::vector
+
+TEST_CASE("Schema::indexOf finds columns by name") {
+    Schema s{{
+        {"id", Type::Int32, false},
+        {"name", Type::Bool, true},
+        {"age", Type::Int64, true},
+    }};
+    CHECK(s.indexOf("id") == 0);
+    CHECK(s.indexOf("name") == 1);
+    CHECK(s.indexOf("age") == 2);
+    CHECK(s.indexOf("missing") == Schema::kNotFound);  // an absent name yields the sentinel
+}
+
+TEST_CASE("Schema::tupleSize matches the layout: bitmap + sum of widths") {
+    Schema s{{
+        {"a", Type::Int32, false},  // 4 bytes
+        {"b", Type::Int64, true},   // 8 bytes
+        {"c", Type::Bool, false},   // 1 byte
+    }};
+    // bitmap: ceil(3/8) = 1
+    // payload: 4 + 8 + 1 = 13
+    CHECK(s.tupleSize() == 1 + 13);
+}
+
+TEST_CASE("Schema::tupleSize counts a full bitmap byte every 8 columns") {
+    Schema s;
+    for (int i = 0; i < 9; ++i) {
+        s.columns.push_back({"c" + std::to_string(i), Type::Bool, true});
+    }
+    // 9 columns → bitmap = 2 bytes; 9 bools → 9 bytes payload.
+    CHECK(s.tupleSize() == 2 + 9);
+}
+
+TEST_CASE("Value factories build values of the expected kind") {
+    auto i = Value::Int32(42);
+    CHECK(i.type == Type::Int32);
+    CHECK_FALSE(i.is_null);
+    CHECK(i.i32 == 42);
+
+    auto j = Value::Int64(0x0123456789ABCDEFLL);  // needs more than 32 bits, so truncation would show
+    CHECK(j.type == Type::Int64);
+    CHECK(j.i64 == 0x0123456789ABCDEFLL);
+
+    auto t = Value::Bool(true);
+    CHECK(t.type == Type::Bool);
+    CHECK(t.b == true);
+
+    auto n = Value::Null(Type::Int32);
+    CHECK(n.type == Type::Int32);
+    CHECK(n.is_null);
+}
+
+TEST_CASE("Value equality compares type, null, and payload") {
+    CHECK(Value::Int32(1) == Value::Int32(1));
+    CHECK(Value::Int32(1) != Value::Int32(2));
+    CHECK(Value::Int32(1) != Value::Int64(1));  // different type
+    CHECK(Value::Bool(true) != Value::Bool(false));
+    CHECK(Value::Null(Type::Int32) == Value::Null(Type::Int32));
+    CHECK(Value::Null(Type::Int32) != Value::Null(Type::Bool));
+    CHECK(Value::Null(Type::Int32) != Value::Int32(0));  // null vs zero
+}
+
+TEST_CASE("encode/decode round trips a non-null tuple") {
+    Schema s{{
+        {"id", Type::Int32, false},
+        {"big", Type::Int64, false},
+        {"flag", Type::Bool, false},
+    }};
+    std::vector<Value> vals = {
+        Value::Int32(0x12345678),
+        Value::Int64(static_cast<int64_t>(0xCAFEBABEDEADBEEFLL)),  // top bit set: a negative 64-bit value
+        Value::Bool(true),
+    };
+
+    auto bytes = TupleCodec::encode(s, vals);
+    REQUIRE(bytes.size() == s.tupleSize());
+
+    auto back = TupleCodec::decode(s, bytes.data(), bytes.size());
+    REQUIRE(back.size() == vals.size());
+    for (size_t i = 0; i < vals.size(); ++i) {
+        CHECK(back[i] == vals[i]);
+    }
+}
+
+TEST_CASE("encode/decode round trip with mixed null and non-null columns") {
+    Schema s{{
+        {"a", Type::Int32, true},
+        {"b", Type::Int64, true},
+        {"c", Type::Bool, false},
+        {"d", Type::Int32, true},
+    }};
+    std::vector<Value> vals = {
+        Value::Null(Type::Int32),
+        Value::Int64(99),
+        Value::Bool(false),
+        Value::Null(Type::Int32),
+    };
+    auto bytes = TupleCodec::encode(s, vals);
+    auto back = TupleCodec::decode(s, bytes.data(), bytes.size());
+    CHECK(back == vals);
+}
+
+TEST_CASE("empty schema round trips an empty tuple") {
+    Schema s{};
+    auto bytes = TupleCodec::encode(s, {});
+    CHECK(bytes.empty());
+    // decode with empty bytes works because tupleSize() == 0.
+    auto back = TupleCodec::decode(s, bytes.data(), 0);
+    CHECK(back.empty());
+}
+
+TEST_CASE("encode rejects column count mismatch") {
+    Schema s{{{"a", Type::Int32, false}, {"b", Type::Bool, false}}};
+    CHECK_THROWS_AS(TupleCodec::encode(s, {Value::Int32(1)}), std::runtime_error);  // one value for two columns
+}
+
+TEST_CASE("encode rejects mismatched value type") {
+    Schema s{{{"a", Type::Int32, false}}};
+    CHECK_THROWS_AS(TupleCodec::encode(s, {Value::Int64(1)}), std::runtime_error);
+    CHECK_THROWS_AS(TupleCodec::encode(s, {Value::Bool(false)}), std::runtime_error);
+}
+
+TEST_CASE("encode rejects null in a non-nullable column") {
+    Schema s{{{"a", Type::Int32, false}}};
+    CHECK_THROWS_AS(TupleCodec::encode(s, {Value::Null(Type::Int32)}), std::runtime_error);
+}
+
+TEST_CASE("typeToCode and typeFromCode round trip every Type and reject garbage") {
+    for (Type t : {Type::Int32, Type::Int64, Type::Bool, Type::Text}) {
+        CHECK(typeFromCode(typeToCode(t)) == t);
+    }
+    CHECK_THROWS_AS(typeFromCode(99), std::runtime_error);
+    CHECK_THROWS_AS(typeFromCode(-1), std::runtime_error);
+}
+
+TEST_CASE("tupleSize is undefined when the schema contains Text columns") {
+    // tupleSize is for fixed-only schemas; Text is variable-length, so its
+    // size depends on the actual values. Throw rather than guess.
+    Schema s{{{"a", Type::Text, true}}};
+    CHECK_THROWS_AS(s.tupleSize(), std::runtime_error);
+}
+
+TEST_CASE("Text round trips a non-null value of varying lengths") {
+    Schema s{{{"name", Type::Text, false}}};
+
+    for (const std::string& sample : {std::string(""),
+                                      std::string("hi"),
+                                      std::string("a longer text value"),
+                                      std::string(1000, 'x')}) {
+        std::vector<Value> vals = {Value::Text(sample)};
+        auto bytes = TupleCodec::encode(s, vals);
+        // Bitmap (1 byte) + length prefix (4 bytes) + payload.
+        REQUIRE(bytes.size() == 1 + 4 + sample.size());
+        auto back = TupleCodec::decode(s, bytes.data(), bytes.size());
+        REQUIRE(back.size() == 1);
+        CHECK_FALSE(back[0].is_null);
+        CHECK(back[0].text == sample);
+    }
+}
+
+TEST_CASE("Text distinguishes null from empty string via the bitmap") {
+    Schema s{{{"name", Type::Text, true}}};
+
+    // Empty string: not null, length 0.
+    auto e_bytes = TupleCodec::encode(s, {Value::Text("")});
+    auto e_back = TupleCodec::decode(s, e_bytes.data(), e_bytes.size());
+    CHECK_FALSE(e_back[0].is_null);
+    CHECK(e_back[0].text == "");
+
+    // Null: bitmap bit set, length still 0.
+    auto n_bytes = TupleCodec::encode(s, {Value::Null(Type::Text)});
+    auto n_back = TupleCodec::decode(s, n_bytes.data(), n_bytes.size());
+    CHECK(n_back[0].is_null);
+
+    // Same byte length, different bitmap.
+    REQUIRE(e_bytes.size() == n_bytes.size());
+    CHECK(e_bytes[0] != n_bytes[0]);  // byte 0 is compared as the bitmap byte
+}
+
+TEST_CASE("mixed schema with fixed and Text columns round-trips") {
+    Schema s{{
+        {"id", Type::Int32, false},
+        {"name", Type::Text, false},
+        {"score", Type::Int64, true},
+        {"label", Type::Text, true},
+        {"active", Type::Bool, false},
+    }};
+    std::vector<Value> vals = {
+        Value::Int32(7),
+        Value::Text("alice"),
+        Value::Null(Type::Int64),
+        Value::Text(""),  // non-null empty text
+        Value::Bool(true),
+    };
+
+    auto bytes = TupleCodec::encode(s, vals);
+    auto back = TupleCodec::decode(s, bytes.data(), bytes.size());
+    CHECK(back == vals);
+}
+
+TEST_CASE("decode rejects truncated Text length prefix or payload") {
+    Schema s{{{"name", Type::Text, false}}};
+    auto good = TupleCodec::encode(s, {Value::Text("hello")});
+
+    // Lop off the payload — length says 5 but only 3 bytes follow.
+    auto truncated_payload = good;
+    truncated_payload.resize(truncated_payload.size() - 2);
+    CHECK_THROWS_AS(TupleCodec::decode(s, truncated_payload.data(),
+                                       truncated_payload.size()),
+                    std::runtime_error);
+
+    // Also lop off most of the length prefix itself.
+    auto truncated_prefix = good;
+    truncated_prefix.resize(2);  // bitmap byte + 1 byte of length
+    CHECK_THROWS_AS(TupleCodec::decode(s, truncated_prefix.data(),
+                                       truncated_prefix.size()),
+                    std::runtime_error);
+}
+
+TEST_CASE("decode rejects wrong byte length") {
+    Schema s{{{"a", Type::Int32, false}}};
+    auto good = TupleCodec::encode(s, {Value::Int32(1)});
+    CHECK_THROWS_AS(TupleCodec::decode(s, good.data(), good.size() - 1), std::runtime_error);
+    CHECK_THROWS_AS(TupleCodec::decode(s, good.data(), good.size() + 1), std::runtime_error);
+}
+
+TEST_CASE("randomized round-trip stress: 200 random schemas/tuples") {
+    std::mt19937 rng(0xBEEFu);  // fixed seed keeps the generated cases reproducible
+    std::uniform_int_distribution<int> n_cols_dist(1, 8);
+    std::uniform_int_distribution<int> type_dist(0, 3);  // includes Text
+    std::uniform_int_distribution<int> coin(0, 1);
+    std::uniform_int_distribution<int> text_len_dist(0, 40);
+    std::uniform_int_distribution<int> byte_dist(0, 255);  // any byte value, '\0' included
+
+    auto type_for = [](int t) {
+        switch (t) {
+            case 0: return Type::Int32;
+            case 1: return Type::Int64;
+            case 2: return Type::Bool;
+            default: return Type::Text;
+        }
+    };
+
+    for (int trial = 0; trial < 200; ++trial) {
+        Schema s;
+        const int n_cols = n_cols_dist(rng);
+        for (int i = 0; i < n_cols; ++i) {
+            s.columns.push_back({"c" + std::to_string(i),
+                                 type_for(type_dist(rng)),
+                                 coin(rng) != 0});
+        }
+
+        std::vector<Value> vals;
+        for (const auto& col : s.columns) {
+            const bool make_null = col.nullable && coin(rng) == 0;  // NULLs only where the schema allows them
+            if (make_null) {
+                vals.push_back(Value::Null(col.type));
+                continue;
+            }
+            switch (col.type) {
+                case Type::Int32:
+                    vals.push_back(Value::Int32(static_cast<int32_t>(rng())));
+                    break;
+                case Type::Int64: {
+                    const int64_t hi = static_cast<int64_t>(rng()) << 32;  // two 32-bit draws form one 64-bit value
+                    const int64_t lo = static_cast<int64_t>(rng());
+                    vals.push_back(Value::Int64(hi | lo));
+                    break;
+                }
+                case Type::Bool:
+                    vals.push_back(Value::Bool(coin(rng) != 0));
+                    break;
+                case Type::Text: {
+                    std::string t(text_len_dist(rng), '\0');
+                    for (auto& c : t) c = static_cast<char>(byte_dist(rng));
+                    vals.push_back(Value::Text(std::move(t)));
+                    break;
+                }
+            }
+        }
+
+        auto bytes = TupleCodec::encode(s, vals);
+        auto back = TupleCodec::decode(s, bytes.data(), bytes.size());
+        REQUIRE(back.size() == vals.size());
+        for (size_t i = 0; i < vals.size(); ++i) {
+            REQUIRE(back[i] == vals[i]);  // REQUIRE: stop this case at the first mismatch
+        }
+    }
+}