Introduce preprocessing step

Decided to introduce a new class for better functional encapsulation and a clearer distribution of duties. The first step is to handle comments in the preprocessor, since that is where they are actually stripped out; this results in slightly different tokenisation behaviour (tests added to capture this). On the road to #13.
kymckay · May 13, 2021 · fba0d9a · fba0d9a
1 parent 59268e7
commit fba0d9a
Show file tree

Hide file tree

Showing 18 changed files with 282 additions and 93 deletions.
diff --git a/README.md b/README.md
@@ -16,10 +16,17 @@ This is the first C++ I have ever written. My intentions for this project are:
 
 If you'd like to contribute please read [CONTRIBUTING.md](CONTRIBUTING.md).
 
+## Preprocessing
+
+- A comment starts with a `//` or `/*` character pair that is not part of a string literal, and ends at the end of the physical line or with a `*/` character pair, respectively. These are stripped during preprocessing and thus do not act as token delimiters.
+- SQF scripts support C-like preprocessor directives. If the `#` character appears at the start of a line (ignoring leading whitespace) the line is preprocessed.
+- The directive must immediately follow the `#` character (no whitespace) and ends at the end of the logical line.
+- The logical line can be extended by a `\` character immediately preceding the end of the physical line.
+- Preprocessing will fail and the script cannot be tokenised if an unrecognised directive is present.
+
 ## Lexical Analysis
 
 - In SQF a program consists of a sequence of statements. The end of a statement (except from the very last) is marked by a `,` or `;` character except where such characters are allowed by syntax (e.g. commas within an array literal).
-- A comment starts with a `//` or `/*` character pair that is not part of a string literal, and ends at the end of the physical line or with a `*/` character pair, respectively.
 - Outside of string literals, whitespace characters are used to seperate tokens (only necessary if their concatenation could otherwise be interpreted as a different token).
 
 ### Identifiers and Keywords

diff --git a/src/lexer/BUILD b/src/lexer/BUILD
@@ -8,6 +8,6 @@ cc_library(
     name = "sqf-lexer",
     srcs = ["lexer.cpp"],
     hdrs = ["lexical_error.h", "lexer.h"],
-    deps = [":sqf-tokens", "//src/sqf:sqf-keywords"],
+    deps = [":sqf-tokens", "//src/sqf:sqf-keywords", "//src/preprocessor:sqf-preprocessor"],
     visibility = ["//src/parser:__pkg__"],
 )
diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp
@@ -6,7 +6,7 @@
 #include <algorithm>
 #include <cctype>
 
-Lexer::Lexer(std::istream &to_read) : stream_(to_read)
+Lexer::Lexer(Preprocessor &pre) : preproc_(pre)
 {
     // Immediately read in the first character
     advance();
@@ -20,29 +20,12 @@ void Lexer::error(Token t, std::string msg)
 // Preview the next character in order to differentiate tokens that start the same
 char Lexer::peek()
 {
-    return stream_.peek();
+    return preproc_.peek();
 }
 
 void Lexer::advance()
 {
-    // Increment the line whenever a newline is passed
-    if (current_char_ == '\n')
-    {
-        lineno_++;
-        column_ = 0;
-    }
-
-    stream_.get(current_char_);
-
-    // When end of stream is reached return EOF character
-    if (stream_.eof())
-    {
-        current_char_ = '\0';
-    }
-    else
-    {
-        column_++;
-    }
+    current_char_ = preproc_.get();
 }
 
 void Lexer::skipWhitespace()
@@ -53,31 +36,6 @@ void Lexer::skipWhitespace()
     }
 }
 
-void Lexer::skipComment()
-{
-    if (peek() == '/')
-    {
-        while (current_char_ != '\0' && current_char_ != '\n')
-        {
-            advance();
-        }
-
-        // Skip past the EOL
-        advance();
-    }
-    else if (peek() == '*')
-    {
-        while (current_char_ != '\0' && !(current_char_ == '*' && peek() == '/'))
-        {
-            advance();
-        }
-
-        // Skip past the block end: */
-        advance();
-        advance();
-    }
-}
-
 Token Lexer::_id()
 {
     // Token position at first character
@@ -232,8 +190,8 @@ Token Lexer::makeToken(TokenType type, std::string raw)
     Token t;
     t.type = type;
     t.raw = raw;
-    t.line = lineno_;
-    t.column = column_;
+    t.line = current_char_.line;
+    t.column = current_char_.column;
     return t;
 }
 
@@ -248,13 +206,6 @@ Token Lexer::nextToken()
             continue;
         }
 
-        // Comments are irrelevant (block and line)
-        if (current_char_ == '/' && (peek() == '/' || peek() == '*'))
-        {
-            skipComment();
-            continue;
-        }
-
         if (std::isalpha(current_char_) || current_char_ == '_')
         {
             return _id();

diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h
@@ -1,31 +1,27 @@
 #pragma once
+#include "src/preprocessor/preprocessor.h"
 #include "src/lexer/token.h"
 #include "src/lexer/lexical_error.h"
 #include <string>
 #include <istream>
 
 class Lexer
 {
-    // Reference member, stream has no copy constructor and lexer doesn't care what kind of stream it is (file or string)
-    std::istream &stream_;
+    // Reference member, no need to copy the supplied preprocessor
+    Preprocessor &preproc_;
 
-    char current_char_;
-    // Current position lexer has reached in the text
-    // Used to give a position to errors
-    int lineno_ = 1;
-    int column_ = 1;
+    PosChar current_char_;
 
     void error(Token, std::string);
     char peek();
     void advance();
     void skipWhitespace();
-    void skipComment();
     Token _id();
     Token number();
     Token string();
     Token makeToken(TokenType, std::string);
 
 public:
-    Lexer(std::istream&);
+    Lexer(Preprocessor&);
     Token nextToken();
 };
diff --git a/src/main/sqwhiff.cpp b/src/main/sqwhiff.cpp
@@ -1,3 +1,4 @@
+#include "src/preprocessor/preprocessor.h"
 #include "src/lexer/lexer.h"
 #include "src/parser/parser.h"
 #include "src/analyzer/analyzer.h"
@@ -10,7 +11,8 @@ int main()
 
     if (file_in.is_open())
     {
-        Lexer lex(file_in);
+        Preprocessor preproc(file_in);
+        Lexer lex(preproc);
         Parser p(lex);
         Analyzer a(p);
         a.analyze(std::cout);

diff --git a/src/preprocessor/BUILD b/src/preprocessor/BUILD
@@ -0,0 +1,6 @@
+cc_library(
+    name = "sqf-preprocessor",
+    srcs = ["preprocessor.cpp"],
+    hdrs = ["pos_char.h", "preprocessing_error.h", "preprocessor.h"],
+    visibility = ["//src/lexer:__pkg__"],
+)
diff --git a/src/preprocessor/pos_char.h b/src/preprocessor/pos_char.h
@@ -0,0 +1,15 @@
+#pragma once
+
+// A single character paired with the file position it is attributed to.
+// The position may not correspond to the physical position because of macros.
+struct PosChar
+{
+    // The character itself ('\0' by default)
+    char c = '\0';
+    // Line attributed to the character (defaults to 1)
+    int line = 1;
+    // Column attributed to the character (defaults to 1)
+    int column = 1;
+
+    // Deliberately implicit: lets a PosChar be compared with, or passed as, a plain char
+    operator char() const { return c; }
+};
diff --git a/src/preprocessor/preprocessing_error.h b/src/preprocessor/preprocessing_error.h
@@ -0,0 +1,9 @@
+#pragma once
+#include <string>
+#include <stdexcept>
+
+// Exception thrown when preprocessing cannot continue.
+// what() reads "<line>:<col> PreprocessingError: <msg>".
+class PreprocessingError : public std::runtime_error
+{
+    // Assembles the text reported through what()
+    static std::string describe(int line, int col, const std::string &msg)
+    {
+        std::string text = std::to_string(line);
+        text += ":";
+        text += std::to_string(col);
+        text += " PreprocessingError: ";
+        text += msg;
+        return text;
+    }
+
+public:
+    PreprocessingError(int line, int col, std::string msg)
+        : std::runtime_error(describe(line, col, msg))
+    {
+    }
+};
diff --git a/src/preprocessor/preprocessor.cpp b/src/preprocessor/preprocessor.cpp
@@ -0,0 +1,91 @@
+#include "src/preprocessor/preprocessor.h"
+#include <string>
+#include <istream>
+#include <algorithm>
+#include <cctype>
+
+// Binds the preprocessor to the stream it will read from
+Preprocessor::Preprocessor(std::istream &to_read)
+    : stream_(to_read)
+{
+    // Load the first character straight away so current_char_ holds real input
+    advance();
+}
+
+// Raises a PreprocessingError located at the line and column carried by the given character
+void Preprocessor::error(PosChar p, std::string msg)
+{
+    const int at_line = p.line;
+    const int at_column = p.column;
+    throw PreprocessingError(at_line, at_column, msg);
+}
+
+// Moves current_char_ on to the next character of the stream, keeping the line/column counters
+// and the line-start flag in step. Once nothing more can be read current_char_ becomes '\0'.
+void Preprocessor::advance()
+{
+    if (current_char_ == '\n')
+    {
+        // Passing a newline starts a new physical line
+        lineno_++;
+        column_ = 0;
+        line_start_ = true;
+    }
+    else if (line_start_ && current_char_ != '\0' && !std::isspace(static_cast<unsigned char>(current_char_)))
+    {
+        // Only a real, non-whitespace character ends the line-start region. The '\0' held before
+        // the first read is not input, so it must not clear the flag. The cast is required because
+        // std::isspace has undefined behaviour for negative values of a plain char.
+        line_start_ = false;
+    }
+
+    if (stream_.get(current_char_))
+    {
+        column_++;
+    }
+    else
+    {
+        // Any failed read (end of input or a stream error) is reported as the end-of-input
+        // character, so the previous character is never handed out a second time
+        current_char_ = '\0';
+    }
+}
+
+// Skips every comment that begins at the current character, leaving current_char_ on the first
+// character that is not part of a comment. Does nothing when no comment starts here.
+// NOTE(review): string literals are not tracked here, so a "//" or "/*" inside a string is also
+// treated as a comment opener - confirm whether that needs handling at this level.
+void Preprocessor::skipComment()
+{
+    // A block comment can be followed directly by another comment, hence the loop
+    while (current_char_ == '/')
+    {
+        const int next = stream_.peek();
+
+        if (next == '/')
+        {
+            // Intentionally don't skip the newline at the end (acts as a delimiter)
+            while (current_char_ != '\0' && current_char_ != '\n')
+            {
+                advance();
+            }
+        }
+        else if (next == '*')
+        {
+            // Step over the opening pair first so that its '*' cannot be mistaken for the start
+            // of the closing pair (as in "/*/")
+            advance();
+            advance();
+
+            while (current_char_ != '\0' && !(current_char_ == '*' && stream_.peek() == '/'))
+            {
+                advance();
+            }
+
+            // Skip past the block end: */ (an unterminated comment simply runs to end of input)
+            advance();
+            advance();
+        }
+        else
+        {
+            // A lone '/' is an ordinary character
+            break;
+        }
+    }
+}
+
+// Hands out the next character that is not part of a comment, tagged with its physical position,
+// and moves on through the input. Returns '\0' once the input is exhausted.
+PosChar Preprocessor::get()
+{
+    // Comments are irrelevant (block and line)
+    skipComment();
+
+    PosChar c;
+    c.line = lineno_;
+    c.column = column_;
+    c.c = current_char_;
+
+    // Remember to actually progress through the input
+    advance();
+
+    return c;
+}
+
+// Preview the character the next call to get() will return, without consuming it.
+// current_char_ is already one character ahead of whatever get() last handed out, so it (and not
+// the stream's own lookahead, which is one further on) is the character to report.
+char Preprocessor::peek()
+{
+    // Comments are stripped here too so that peek() always agrees with the following get()
+    skipComment();
+    return current_char_;
+}
diff --git a/src/preprocessor/preprocessor.h b/src/preprocessor/preprocessor.h
@@ -0,0 +1,29 @@
+#pragma once
+#include "src/preprocessor/pos_char.h"
+#include "src/preprocessor/preprocessing_error.h"
+#include <string>
+#include <istream>
+
+// Sits between the raw input stream and the lexer: reads the stream one character at a time,
+// strips comments and hands out each remaining character together with its position.
+class Preprocessor
+{
+    // Reference member for polymorphism
+    std::istream &stream_;
+
+    // Character most recently read from the stream ('\0' before the first read and at end of input)
+    char current_char_ = '\0';
+    // Current physical position preprocessor has reached
+    // Used to give a position to errors and macros
+    int lineno_ = 1;
+    // Starts at 0 because reading a character increments it: this puts the first character of the
+    // input at column 1, matching the first character of every later line
+    int column_ = 0;
+
+    // Preprocessor directives must appear at a line start (ignoring whitespace)
+    bool line_start_ = true;
+
+    // Throws a PreprocessingError positioned at the given character
+    void error(PosChar, std::string);
+    // Reads the next character from the stream and updates the position
+    void advance();
+    // Skips over a comment starting at the current character
+    void skipComment();
+
+public:
+    // Starts reading from the given stream
+    Preprocessor(std::istream&);
+    // Returns the next character with its position and moves past it
+    PosChar get();
+    // Previews an upcoming character without consuming it
+    char peek();
+};
diff --git a/test/BUILD b/test/BUILD
@@ -42,9 +42,17 @@ cc_test(
   srcs = ["parser_private_test.cpp"],
   deps = [":sqf-ast-tester", "@com_google_googletest//:gtest_main"],
 )
+
 cc_test(
   name = "parser-program",
   size = "small",
   srcs = ["parser_program_test.cpp"],
   deps = [":sqf-ast-tester", "@com_google_googletest//:gtest_main"],
+)
+
+cc_test(
+  name = "preprocessor-comments",
+  size = "small",
+  srcs = ["preprocessor_comments_test.cpp"],
+  deps = [":sqf-ast-tester", "@com_google_googletest//:gtest_main"],
 )