Permalink
Browse files

initial commit

  • Loading branch information...
Julien Desrosiers
Julien Desrosiers committed Aug 17, 2009
0 parents commit 55605294784e91c77e2f10e21d58254660f67164
Showing with 1,340 additions and 0 deletions.
  1. +75 −0 bracket_lexer.rb
  2. +18 −0 example.rb
  3. +145 −0 grammar.y
  4. +120 −0 lexer.rb
  5. +20 −0 lexer_test.rb
  6. +173 −0 nodes.rb
  7. +456 −0 parser.rb
  8. +35 −0 parser_output.txt
  9. +40 −0 parser_test.rb
  10. +146 −0 runtime.rb
  11. +112 −0 spartan.rb
@@ -0,0 +1,75 @@
require "strscan"

# Tokenizer for the brace-delimited flavour of the language.
#
# Blocks are written with "{" ... "}", so there is no indentation tracking:
# every line break simply becomes a NEWLINE token and braces fall through to
# the generic single-character rule.
class BracketLexer
  # Identifiers that get their own token type (e.g. "if" => :IF).
  KEYWORDS = %w[def class if else true false nil].freeze

  # Splits +code+ into a flat list of tokens.
  #
  # The argument is never modified; one trailing line break is ignored.
  #
  # @param code [String] source text to tokenize
  # @return [Array<Array>] pairs of [type, value]; type is a Symbol for
  #   named tokens or the one-character String itself for punctuation
  def tokenize(code)
    # Scan a chomped copy so the caller's string stays untouched (the
    # scanner also avoids re-slicing the remaining input for each token).
    scanner = StringScanner.new(code.chomp)
    tokens = []

    until scanner.eos?
      if (identifier = scanner.scan(/[a-z]\w*/))
        # Keywords are tagged with their own upper-cased name.
        type = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
        tokens << [type, identifier]
      elsif (constant = scanner.scan(/[A-Z]\w*/))
        tokens << [:CONSTANT, constant]
      elsif (number = scanner.scan(/[0-9]+/))
        tokens << [:NUMBER, number.to_i]
      elsif scanner.scan(/"(.*?)"/)
        # The token value is the text between the quotes.
        tokens << [:STRING, scanner[1]]
      elsif scanner.skip(/\n/)
        # One NEWLINE token per line break; blank lines are not collapsed.
        tokens << [:NEWLINE, "\n"]
      elsif scanner.skip(/ /)
        # Spaces only separate tokens.
      else
        # Any other single character is its own token, e.g. { } ( ) , .
        char = scanner.getch
        tokens << [char, char]
      end
    end

    tokens
  end
end
+
+code = <<-EOS # Sample program written in the brace-delimited syntax.
+if 1 {
+ print "..."
+ if false {
+ pass
+ }
+ print "done!"
+}
+print "The End"
+EOS
+# Print the token list for the sample; the expected output is shown below.
+p BracketLexer.new.tokenize(code)
+# [[:IF, "if"], [:NUMBER, 1], ["{", "{"], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "print"], [:STRING, "..."], [:NEWLINE, "\n"],
+# [:IF, "if"], [:FALSE, "false"], ["{", "{"], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "pass"], [:NEWLINE, "\n"],
+# ["}", "}"], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "print"], [:STRING, "done!"], [:NEWLINE, "\n"],
+# ["}", "}"], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "print"], [:STRING, "The End"]]
@@ -0,0 +1,18 @@
+require "parser"
+require "runtime"
+
+code = <<-EOS # Sample program in the indentation-based syntax: a class, an assignment and an if/else.
+class Awesome:
+ def does_it_work:
+ "yeah!"
+
+awesome_object = Awesome.new
+if awesome_object:
+ print("awesome_object.does_it_work: ")
+ print(awesome_object.does_it_work)
+else:
+ print("Something is wrong...")
+EOS
+# Parse the sample into nodes, then evaluate them against Runtime (Runtime is not defined in this file; presumably it comes from the required "runtime").
+nodes = Parser.new.parse(code)
+nodes.eval(Runtime)
145 grammar.y
@@ -0,0 +1,145 @@
+class Parser
+# Grammar for the language (racc-style .y file): turns the lexer's tokens into node objects.
+# Declare tokens produced by the lexer
+token IF ELSE
+token DEF
+token CLASS
+token NEWLINE
+token NUMBER
+token STRING
+token TRUE FALSE NIL
+token IDENTIFIER
+token CONSTANT
+token INDENT DEDENT
+
+rule
+ # All rules are declared in this format:
+ #
+ # RuleName:
+ # OtherRule TOKEN AnotherRule { code to run when this matches }
+ # | OtherRule { ... }
+ # ;
+ #
+ # In the code section ({...} on the right):
+ # - Assign to "result" the value returned by the rule.
+ # - Use val[index of expression] to reference expressions on the left.
+
+
+ # All parsing ends in this rule; its value is the trunk of the AST.
+ Root:
+ /* nothing */ { result = Nodes.new([]) }
+ | Expressions { result = val[0] }
+ ;
+
+ # Any list of expressions: top level, class body or method body.
+ Expressions:
+ Expression { result = Nodes.new(val) }
+ | Expressions Terminator Expression { result = val[0] << val[2] }
+ # To ignore trailing line breaks. NOTE(review): this wraps the existing Expressions value in a new Nodes instead of returning val[0] - confirm the nesting is intended.
+ | Expressions Terminator { result = Nodes.new([val[0]]) }
+ ;
+
+ # All types of expressions in our language
+ Expression:
+ Literal
+ | Call
+ | Constant
+ | Assign
+ | Def
+ | Class
+ | If
+ ;
+
+ # All tokens that can terminate an expression
+ Terminator:
+ NEWLINE
+ | ";"
+ ;
+ # Literal values: numbers, strings, true, false and nil.
+ Literal:
+ NUMBER { result = LiteralNode.new(val[0]) }
+ | STRING { result = LiteralNode.new(val[0]) }
+ | TRUE { result = LiteralNode.new(true) }
+ | FALSE { result = LiteralNode.new(false) }
+ | NIL { result = LiteralNode.new(nil) }
+ ;
+
+ # A method call
+ Call:
+ # method
+ IDENTIFIER { result = CallNode.new(nil, val[0]) }
+ # method(arguments)
+ | IDENTIFIER "(" ArgList ")" { result = CallNode.new(nil, val[0], val[2]) }
+ # receiver.method
+ | Expression "." IDENTIFIER { result = CallNode.new(val[0], val[2]) }
+ # receiver.method(arguments)
+ | Expression "."
+ IDENTIFIER "(" ArgList ")" { result = CallNode.new(val[0], val[2], val[4]) }
+ ;
+ # Comma-separated call arguments, collected into a plain Ruby array.
+ ArgList:
+ /* nothing */ { result = [] }
+ | Expression { result = val }
+ | ArgList "," Expression { result = val[0] << val[2] }
+ ;
+ # Reading a constant by name.
+ Constant:
+ CONSTANT { result = GetConstantNode.new(val[0]) }
+ ;
+
+ # Assignment to local variables or constants
+ Assign:
+ IDENTIFIER "=" Expression { result = SetLocalNode.new(val[0], val[2]) }
+ | CONSTANT "=" Expression { result = SetConstantNode.new(val[0], val[2]) }
+ ;
+
+ # Method definition, with or without a parameter list
+ Def:
+ DEF IDENTIFIER Block { result = DefNode.new(val[1], [], val[2]) }
+ | DEF IDENTIFIER
+ "(" ParamList ")" Block { result = DefNode.new(val[1], val[3], val[5]) }
+ ;
+ # Comma-separated parameter names, collected into a plain Ruby array.
+ ParamList:
+ /* nothing */ { result = [] }
+ | IDENTIFIER { result = val }
+ | ParamList "," IDENTIFIER { result = val[0] << val[2] }
+ ;
+
+ # Class definition
+ Class:
+ CLASS CONSTANT Block { result = ClassNode.new(val[1], val[2]) }
+ ;
+
+ # if and if-else blocks. A NEWLINE token is required between the first block and ELSE.
+ If:
+ IF Expression Block { result = IfNode.new(val[1], val[2]) }
+ | IF Expression Block NEWLINE
+ ELSE Block { result = IfNode.new(val[1], val[2], val[5]) }
+ ;
+
+ # A block of indented code. You see here that all the hard work was done
+ # by the lexer.
+ Block:
+ INDENT Expressions DEDENT { result = val[1] }
+ # If you don't like indentation you could replace the previous rule with
+ # the following one to separate blocks w/ "{" ... "}".
+ # (You'll need to remove the indentation magic section in the lexer too)
+ # "{" Expressions "}" { result = val[1] }
+ ;
+end
+
+---- header
+ require "lexer"
+ require "nodes"
+
+---- inner
+ def parse(code, show_tokens=false) # Tokenizes +code+ and parses it; pass show_tokens=true to print the token list first.
+ @tokens = Lexer.new.tokenize(code) # Kept in an instance variable so next_token can consume it.
+ puts @tokens.inspect if show_tokens
+ do_parse # Not defined in this file; presumably supplied by the racc runtime - its result is what parse returns.
+ end
+
+ def next_token # Hands out the next pending token; presumably called by do_parse.
+ @tokens.shift # Removes and returns the first token, or nil once the list is empty.
+ end
120 lexer.rb
@@ -0,0 +1,120 @@
require "strscan"

# Indentation-aware tokenizer.
#
# Blocks are opened by a ":" at the end of a line followed by a deeper
# indentation level, and closed by returning to a shallower one. The lexer
# turns those layout changes into explicit INDENT / DEDENT tokens so the
# parser never has to look at whitespace.
class Lexer
  # Identifiers that get their own token type (e.g. "if" => :IF).
  KEYWORDS = %w[def class if else true false nil].freeze

  # Splits +code+ into a flat list of tokens.
  #
  # The argument is never modified; one trailing line break is ignored.
  # Known limitation (unchanged): a completely blank line is read as a line
  # with zero indentation, so it closes every open block.
  #
  # @param code [String] source text to tokenize
  # @return [Array<Array>] pairs of [type, value]; type is a Symbol for
  #   named tokens or the one-character String itself for punctuation.
  #   INDENT carries the new indentation width, DEDENT the width of the
  #   block that becomes current again.
  # @raise [RuntimeError] when a ":" block is not indented deeper than its
  #   parent, when indentation increases without a ":", or when a dedent
  #   does not land on an enclosing block's indentation
  def tokenize(code)
    # Scan a chomped copy so the caller's string stays untouched (the
    # scanner also avoids re-slicing the remaining input for each token).
    scanner = StringScanner.new(code.chomp)
    tokens = []

    # Width (in spaces) of the innermost open block.
    current_indent = 0
    # Widths of every open block, outermost first.
    indent_stack = []

    until scanner.eos?
      if (identifier = scanner.scan(/[a-z]\w*/))
        # Keywords are tagged with their own upper-cased name.
        type = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
        tokens << [type, identifier]
      elsif (constant = scanner.scan(/[A-Z]\w*/))
        tokens << [:CONSTANT, constant]
      elsif (number = scanner.scan(/[0-9]+/))
        tokens << [:NUMBER, number.to_i]
      elsif scanner.scan(/"(.*?)"/)
        # The token value is the text between the quotes.
        tokens << [:STRING, scanner[1]]
      elsif scanner.scan(/:\n( +)/)
        # Case 1: ":" + line break + spaces opens a new block.
        indent = scanner[1].size
        if indent <= current_indent
          raise "Bad indent level, got #{indent} indents, " \
                "expected > #{current_indent}"
        end
        current_indent = indent
        indent_stack.push(current_indent)
        tokens << [:INDENT, indent]
      elsif scanner.scan(/\n( *)/)
        # Cases 2 and 3: a new line inside the same block, or a dedent.
        indent = scanner[1].size
        # Indentation may only grow through a ":" (handled above).
        raise "Missing ':'" if indent > current_indent

        # Close one block per level we step out of; a single line can end
        # several nested blocks at once. The innermost remaining level is
        # the LAST stack entry.
        while indent < current_indent
          indent_stack.pop
          current_indent = indent_stack.last || 0
          tokens << [:DEDENT, current_indent]
        end
        # After closing blocks we must sit exactly on an enclosing level.
        if indent != current_indent
          raise "Bad dedent level, got #{indent} indents, " \
                "expected #{current_indent}"
        end
        tokens << [:NEWLINE, "\n"]
      elsif scanner.skip(/ /)
        # Spaces only separate tokens.
      else
        # Any other single character is its own token, e.g. ( ) , . !
        char = scanner.getch
        tokens << [char, char]
      end
    end

    # Close every block still open at the end of the input.
    until indent_stack.empty?
      indent_stack.pop
      tokens << [:DEDENT, indent_stack.last || 0]
    end

    tokens
  end
end
@@ -0,0 +1,20 @@
+require "lexer"
+
+code = <<-EOS # Sample program in the indentation-based syntax, with two nested if blocks.
+if 1:
+ print "..."
+ if false:
+ pass
+ print "done!"
+print "The End"
+EOS
+# Print the token list for the sample; the expected output is shown below.
+p Lexer.new.tokenize(code)
+# [[:IF, "if"], [:NUMBER, 1],
+# [:INDENT, 2], [:IDENTIFIER, "print"], [:STRING, "..."], [:NEWLINE, "\n"],
+# [:IF, "if"], [:FALSE, "false"],
+# [:INDENT, 4], [:IDENTIFIER, "pass"],
+# [:DEDENT, 2], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "print"], [:STRING, "done!"],
+# [:DEDENT, 0], [:NEWLINE, "\n"],
+# [:IDENTIFIER, "print"], [:STRING, "The End"]]
Oops, something went wrong.

0 comments on commit 5560529

Please sign in to comment.