From 154d1d5aed8e015499f0737cf00442068126d5e3 Mon Sep 17 00:00:00 2001 From: Marcel Laverdet Date: Mon, 22 Jun 2009 14:23:24 -0800 Subject: [PATCH] Quick checkin Summary: I just want a checkin of this somewhere that's not my dev server. Test Plan: None --- Makefile.xhp | 13 + code_rope.cpp | 80 ++++++ code_rope.hpp | 21 ++ config.m4 | 9 + ext.cpp | 208 ++++++++++++++ ext.hpp | 11 + xhp_parser.hpp | 46 +++ xhp_parser.y | 746 +++++++++++++++++++++++++++++++++++++++++++++++++ xhp_scanner.l | 377 +++++++++++++++++++++++++ 9 files changed, 1511 insertions(+) create mode 100644 Makefile.xhp create mode 100644 code_rope.cpp create mode 100644 code_rope.hpp create mode 100644 config.m4 create mode 100644 ext.cpp create mode 100644 ext.hpp create mode 100644 xhp_parser.hpp create mode 100644 xhp_parser.y create mode 100644 xhp_scanner.l diff --git a/Makefile.xhp b/Makefile.xhp new file mode 100644 index 00000000..bb83d4c0 --- /dev/null +++ b/Makefile.xhp @@ -0,0 +1,13 @@ +all: xhp_scanner.lex.cpp xhp_parser.yacc.cpp + +clean: + rm phpx xhp_scanner.lex.cpp xhp_scanner.lex.hpp xhp_parser.yacc.cpp xhp_parser.yacc.hpp xhp_parser.yacc.output + +xhp_scanner.lex.cpp: xhp_scanner.l + flex --header-file=xhp_scanner.lex.hpp --prefix=xhp -o $@ -d $< + +xhp_parser.yacc.cpp: xhp_parser.y + bison --debug --name-prefix=xhp --verbose -d -o $@ $< + +phpx: xhp_scanner.lex.cpp xhp_parser.yacc.cpp parser.cpp lineno_str.cpp + g++ -ggdb -Wall $^ -o $@ diff --git a/code_rope.cpp b/code_rope.cpp new file mode 100644 index 00000000..4f809011 --- /dev/null +++ b/code_rope.cpp @@ -0,0 +1,80 @@ +#include "code_rope.hpp" +using namespace std; + +code_rope::code_rope(const __gnu_cxx::rope str, const size_t no /* = 0 */, const size_t lf /* = 0 */) : str(str), lf(lf), no(no) {} + +code_rope::code_rope(const code_rope& str, const size_t no /* = 0 */, const size_t lf /* = 0 */) : str(str.str), lf(lf), no(no) { + if (str.lf || str.no) { + if (!no && !lf) { + this->lf = str.lf; + this->no = str.no; + } else { 
+ throw std::exception(); /* throw by value: a heap-allocated exception pointer leaks and is not matched by catch (const std::exception&) */ + } + } else { + this->no = no; + this->lf = lf; + } +} + +const char* code_rope::c_str() const { + if (this->no > 1) { + __gnu_cxx::rope whitespace(this->no - 1, '\n'); /* prefix no - 1 newlines so the text begins on line `no` */ + whitespace += this->str; + return whitespace.c_str(); /* NOTE(review): `whitespace` is a local that is destroyed on return, so this pointer looks dangling; fixing it needs storage that outlives the call (declared in code_rope.hpp) -- confirm */ + } else { + return this->str.c_str(); + } +} + +void code_rope::prepend(const char* str) { + this->str = __gnu_cxx::rope(str) + this->str; +} + +const char code_rope::back() const { + return this->str.empty() ? 0 : this->str.back(); /* 0 is returned for an empty rope */ +} + +void code_rope::pop_back() { + this->str.pop_back(); +} + +code_rope code_rope::operator+(const code_rope& right) const { /* concatenation that tracks line numbers: if `right` starts beyond this->no + this->lf, the gap is filled with newlines */ + size_t diff; + size_t no, lf; + __gnu_cxx::rope glue; + if (this->no && right.no) { + no = this->no; + if (right.no > this->no + this->lf) { + diff = right.no - this->no - this->lf; + lf = this->lf + right.lf + diff; + glue = __gnu_cxx::rope(diff, '\n'); + } else { + no = this->no; + lf = this->lf + right.lf; + } + } else if (right.no) { + no = right.no; + lf = this->lf + right.lf; + } else { + no = this->no; + lf = this->lf + right.lf; + } + return code_rope(this->str + glue + right.str, no, lf); +} + +code_rope code_rope::operator+(const char* right) const { + return code_rope(this->str + right, this->no, this->lf); +} + +code_rope& code_rope::operator=(const char* str) { + this->str = str; + this->no = this->lf = 0; /* assigning raw text discards any line bookkeeping */ + return *this; +} + +code_rope operator+(const char* left, const code_rope& right) { + code_rope ret(right); + ret.prepend(left); + return ret; +} diff --git a/code_rope.hpp b/code_rope.hpp new file mode 100644 index 00000000..3ddd8364 --- /dev/null +++ b/code_rope.hpp @@ -0,0 +1,21 @@ +#include +#include + +class code_rope { + protected: + __gnu_cxx::rope str; + size_t lf; /* how many line breaks this code contains */ + size_t no; /* line number this code starts on */ + + public: + code_rope(const __gnu_cxx::rope = "", const size_t = 0, const size_t = 0); + code_rope(const code_rope&, const size_t = 0, const size_t = 0); + const char* 
c_str() const; + void prepend(const char* str); + const char back() const; + void pop_back(); + code_rope operator+(const code_rope& right) const; + code_rope operator+(const char*) const; + code_rope& operator=(const char*); +}; +code_rope operator+(const char*, const code_rope&); diff --git a/config.m4 b/config.m4 new file mode 100644 index 00000000..66a752a5 --- /dev/null +++ b/config.m4 @@ -0,0 +1,9 @@ +PHP_ARG_ENABLE(xhp, xhp, +[ --enable-xhp Enable XHP]) + +PHP_REQUIRE_CXX() +if test "$PHP_XHP" = "yes"; then + PHP_ADD_LIBRARY(stdc++,, XHP_SHARED_LIBADD) + PHP_SUBST(XHP_SHARED_LIBADD) + PHP_NEW_EXTENSION(xhp, xhp_scanner.lex.cpp xhp_parser.yacc.cpp code_rope.cpp ext.cpp, $ext_shared) +fi diff --git a/ext.cpp b/ext.cpp new file mode 100644 index 00000000..9ce2554f --- /dev/null +++ b/ext.cpp @@ -0,0 +1,208 @@ +#include "ext.hpp" +#include "xhp_parser.hpp" +#include "zend.h" +#include "zend_API.h" +#include "zend_compile.h" +#include "zend_hash.h" +#include "zend_extensions.h" +#include + +typedef zend_op_array* (zend_compile_file_t)(zend_file_handle*, int TSRMLS_DC); +typedef zend_op_array* (zend_compile_string_t)(zval*, char* TSRMLS_DC); +static zend_compile_file_t* dist_compile_file; +static zend_compile_string_t* dist_compile_string; + +typedef struct { + const char* str; + size_t pos; + size_t len; +} xhp_stream_t; + +size_t xhp_stream_reader(xhp_stream_t* handle, char* buf, size_t len TSRMLS_DC) { + if (len > handle->len - handle->pos) { + len = handle->len - handle->pos; + } + if (len) { + memcpy(buf, handle->str + handle->pos, len); + buf[len] = 0; + handle->pos += len; + return len; + } else { + return 0; + } +} + +long xhp_stream_fteller(xhp_stream_t* handle TSRMLS_DC) { + return (long)handle->pos; +} + +char* xhp_parse_str(const char* code, const char* filename, int firsttok) { + + // Run it through the php superset parser which may generate valid php code... 
+ void* scanner; + code_rope buf; + xhp_extra_type extra; + extra.firsttoken = firsttok; + +// xhpdebug = 1; + xhplex_init(&scanner); + xhpset_extra(&extra, scanner); + xhp_scan_string(code, scanner); + int ret = xhpparse(scanner, filename, &buf); + xhplex_destroy(scanner); + if (ret) { + zend_error(E_COMPILE_ERROR, "%s", buf.c_str()); /* fixed format string: the parser message must never be interpreted as printf directives */ + return NULL; + } else { + // Create a string stream with the rewritten code and give to compile_file + return estrdup(buf.c_str()); + } +} + +/* Compile hook: reads the whole program from the zend stream and runs it through xhp_parse_str only when the quick tag heuristic below matches. */ static zend_op_array* xhp_compile_file(zend_file_handle* f, int type TSRMLS_DC) { + + if (open_file_for_scanning(f TSRMLS_CC) == FAILURE) { + // If opening the file fails just send it to the original func + return dist_compile_file(f, type TSRMLS_CC); + } + + if (f->type == ZEND_HANDLE_STREAM && f->handle.stream.interactive) { + fprintf(stderr, "Warning: Using PHP + XHP in interactive mode will lead to undesirable behavior; execution will not commence until EOF (^D) is encountered.\n"); + } + + // Read full program from zend stream + std::string str; + char buf[4096]; + size_t len; + while (len = zend_stream_read(f, (char*)&buf, 4095 TSRMLS_CC)) { + buf[len] = 0; + str += buf; + } + + // Run this through xhp? Quick heuristic to determine if we can avoid a 2nd parse stage + bool maybe_xhp = false; + const char* cstr = str.c_str(); + const char* ii; + for (ii = cstr; *ii; ++ii) { + if (*ii == '<') { // + if (ii[1] == '/') { + maybe_xhp = 1; + break; + } + } else if (*ii == '/' && ii[1] == '>') { // + maybe_xhp = 1; + break; + } + } + + // If this file contains xhp run through the xhp parse stage + const char* rewrit; + if (maybe_xhp) { + bool old_in_comp = CG(in_compilation); + CG(in_compilation) = true; + rewrit = xhp_parse_str(cstr, f->filename ? 
f->filename : "", 0); + if (rewrit == NULL) { + zend_bailout(); + } + CG(in_compilation) = old_in_comp; + len = strlen(rewrit); + } else { + rewrit = cstr; + len = ii - cstr; + } + + // Create a fake stream + xhp_stream_t stream_data; + stream_data.str = rewrit; + stream_data.pos = 0; + stream_data.len = len; + + zend_file_handle fake_file; + fake_file.type = ZEND_HANDLE_STREAM; + fake_file.opened_path = f->opened_path ? estrdup(f->opened_path) : NULL; + fake_file.filename = f->filename; + fake_file.free_filename = false; + + fake_file.handle.stream.handle = &stream_data; + fake_file.handle.stream.reader = (zend_stream_reader_t)&xhp_stream_reader; + fake_file.handle.stream.closer = NULL; + fake_file.handle.stream.fteller = (zend_stream_fteller_t)&xhp_stream_fteller; + fake_file.handle.stream.interactive = 0; + + zend_op_array* ret = dist_compile_file(&fake_file, type TSRMLS_CC); + + if (maybe_xhp) { + efree(const_cast(rewrit)); + } + return ret; +} + +static zend_op_array* xhp_compile_string(zval* str, char *filename TSRMLS_DC) { + + // Cast to str + zval tmp; + if (str->type != IS_STRING) { + tmp = *str; + zval_copy_ctor(&tmp); + convert_to_string(&tmp); + str = &tmp; + } + + // Rewrite the string + char* rewrit = xhp_parse_str(str->value.str.val, "", t_PHP_FAKE_OPEN_TAG); + if (str == &tmp) { + zval_dtor(&tmp); + } + if (rewrit == NULL) { + return NULL; + } + + // Create another tmp zval with the rewritten PHP code and pass it to the original function + INIT_ZVAL(tmp); + tmp.type = IS_STRING; + tmp.value.str.val = rewrit; + tmp.value.str.len = strlen(rewrit); + zend_op_array* ret = dist_compile_string(&tmp, filename TSRMLS_CC); + zval_dtor(&tmp); + return ret; +} + +static PHP_MINIT_FUNCTION(xhp) { + + // APC has this crazy magic api you can use to avoid the race condition for when an extension overwrites + // the compile_file function. The desired order here is APC -> xhp -> PHP, that way APC can cache the + // file as usual. 
+ zend_module_entry *apc_lookup; + zend_constant *apc_magic; + if (zend_hash_find(&module_registry, "apc", sizeof("apc"), (void**)&apc_lookup) != FAILURE && + zend_hash_find(EG(zend_constants), "\000apc_magic", 11, (void**)&apc_magic) != FAILURE) { + zend_compile_file_t* (*apc_set_compile_file)(zend_compile_file_t*) = (zend_compile_file_t* (*)(zend_compile_file_t*))apc_magic->value.value.lval; + dist_compile_file = apc_set_compile_file(NULL); + apc_set_compile_file(xhp_compile_file); + } else { + dist_compile_file = zend_compile_file; + zend_compile_file = xhp_compile_file; + } + + // For eval + dist_compile_string = zend_compile_string; + zend_compile_string = xhp_compile_string; + return SUCCESS; +} + +zend_module_entry xhp_module_entry = { + STANDARD_MODULE_HEADER, + PHP_XHP_EXTNAME, + NULL, + PHP_MINIT(xhp), + NULL, + NULL, + NULL, + NULL, + PHP_XHP_VERSION, + STANDARD_MODULE_PROPERTIES +}; + +#ifdef COMPILE_DL_XHP +ZEND_GET_MODULE(xhp) +#endif diff --git a/ext.hpp b/ext.hpp new file mode 100644 index 00000000..c95385cc --- /dev/null +++ b/ext.hpp @@ -0,0 +1,11 @@ +#pragma once +#ifdef HAVE_CONFIG_H +#include "../config.h" +#endif +#include "php.h" + +#define PHP_XHP_VERSION "1.0" +#define PHP_XHP_EXTNAME "xhp" + +extern zend_module_entry xhp_module_entry; +#define phpext_xhp &xhp_module_entry diff --git a/xhp_parser.hpp b/xhp_parser.hpp new file mode 100644 index 00000000..199d9f21 --- /dev/null +++ b/xhp_parser.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include +#include "code_rope.hpp" + +#define YYSTYPE code_rope + +#define YY_HEADER_EXPORT_START_CONDITIONS + +typedef struct { + int firsttoken; + char* heredoc_eom; + size_t heredoc_eom_len; + char* heredoc_data_start; + char* heredoc_data_last; +} xhp_extra_type; +#define YY_EXTRA_TYPE xhp_extra_type* + +#define YYLTYPE_IS_DECLARED +typedef struct YYLTYPE { + int internal_line; + int actual_line_offset; + int first_line; + int first_column; + int last_line; + int last_column; +} YYLTYPE; + + +void 
flexBEGIN(int, void*); +void flex_push_state(int, void*); +void flex_pop_state(void*); + +#include "xhp_parser.yacc.hpp" +#ifndef FLEX_SCANNER +// You can't include flex's header from within flex or shit goes to hell +#include "xhp_scanner.lex.hpp" +#define yy_push_state(a) flex_push_state(a, yyscanner); +#define yy_pop_state() flex_pop_state(yyscanner); +#endif + +extern int xhpdebug; +int xhpparse(void*, const char*, code_rope*); +#ifndef FLEX_SCANNER +void* xhp_scan_string(const char *yy_str, void* yyscanner); +#endif diff --git a/xhp_parser.y b/xhp_parser.y new file mode 100644 index 00000000..3a1f1f25 --- /dev/null +++ b/xhp_parser.y @@ -0,0 +1,746 @@ +%{ + #include "xhp_parser.hpp" + #include "zend_compile.h" + #include +%} + +%{ + #ifdef yylineno + #undef yylineno + #endif + #define yylineno (unsigned int)(yylloc.internal_line) + #define cr(s) code_rope(s, yylineno) + extern int yydebug; + static void yyerror(YYLTYPE* xhplloc, void* yyscanner, const char* filename, code_rope* str, const char* a) { + CG(zend_lineno) = xhplloc->internal_line + xhplloc->actual_line_offset; + zend_set_compiled_filename(const_cast(filename) TSRMLS_CC); + *str = a; + return; + + std::stringstream c; + c << xhplloc->internal_line + xhplloc->actual_line_offset; + *str = (std::string(a) + " in " + filename + " on line " + c.str()).c_str(); + } +%} + +%locations +%pure-parser +%parse-param { void* yyscanner } +%parse-param { const char* filename } +%parse-param { code_rope* root } +%lex-param { void* yyscanner } +%error-verbose + +// Keywords +%token t_IF +%nonassoc p_IF +%left t_ELSEIF +%left t_ELSE +%token t_DO t_WHILE t_FOR t_FOREACH +%token t_SWITCH t_CASE t_DEFAULT t_BREAK t_CONTINUE +%token t_FUNCTION t_RETURN +%token t_NEW t_CLONE +%token t_TRY +%nonassoc t_CATCH +%nonassoc t_FINALLY +%nonassoc p_CATCH; +%token t_ABSTRACT t_FINAL t_PRIVATE t_PROTECTED t_PUBLIC +%token t_CLASS t_INTERFACE t_EXTENDS t_IMPLEMENTS + +// Literals +%token t_LITERAL_STRING t_EVALUATED_STRING 
t_SHELL_EXPRESSION +%token t_NUMBER +%token t_HEREDOC +%token t_XHP_TEXT +%token t_XHP_DIV t_XHP_LESS_THAN_DIV t_XHP_GREATER_THAN + +// Operators +%left t_AS +%left t_COMMA +%left t_LOGICAL_OR +%left t_LOGICAL_XOR +%left t_LOGICAL_AND +%right t_PRINT +%right t_ECHO +%right t_ASSIGN t_APPEND t_PLUS_ASSIGN t_MINUS_ASSIGN t_DIV_ASSIGN t_MULT_ASSIGN t_MOD_ASSIGN t_BIT_AND_ASSIGN t_BIT_OR_ASSIGN t_BIT_XOR_ASSIGN t_LSHIFT_ASSIGN t_RSHIFT_ASSIGN +%left t_PLING t_COLON +%left t_OR +%left t_AND +%left t_BIT_OR +%left t_BIT_XOR +%left t_BIT_AND +%nonassoc t_EQUAL t_NOT_EQUAL t_STRICT_EQUAL t_STRICT_NOT_EQUAL +%nonassoc t_LESS_THAN_EQUAL t_GREATER_THAN_EQUAL t_LESS_THAN t_GREATER_THAN +%left t_LSHIFT t_RSHIFT +%left t_PLUS t_MINUS t_CONCAT +%left t_MULT t_DIV t_MOD +%right t_NOT t_BIT_NOT t_INCR t_DECR // also casting operators and "@" +%right t_INSTANCEOF +%right t_LBRACKET +%token t_RBRACKET +%token t_RCURLY +%nonassoc t_NEW +%nonassoc t_CLONE +%left t_CONST +%left t_GLOBAL +%right t_THROW +%left t_STATIC +%left t_ARROW t_DOUBLE_ARROW +%left t_HEBREW_THING +%left t_LPAREN t_RPAREN +%right t_REQ_ONCE +%right t_INC_ONCE +%right t_REQ +%right t_INC +%right t_DOLLAR +%right t_AT +%right t_LCURLY + +// Misc +%token t_SEMICOLON +%token t_IDENTIFIER t_DOUBLE_QUOTE t_PHP_OPEN_TAG t_PHP_FAKE_OPEN_TAG t_PHP_CLOSE_TAG t_PHP_OPEN_TAG_WITH_ECHO +%token t_INLINE_HTML +%token t_IDK + +%start program +%initial-action { + yylloc.internal_line = 1; + yylloc.actual_line_offset = 0; +} +%% + +// Top level +program: + statement_list { + *root = $1; + } +; + +statement_list: + /* empty */ { + $$ = cr(""); + } +| statement_list statement { + $$ = $1 + $2; + } +; + +// Statements +statement: + t_LCURLY statement_list t_RCURLY { + $$ = "{" + $2 + "}"; + } +| t_PHP_OPEN_TAG { + yy_push_state(PHP); + $$ = cr(""; + } +| t_SEMICOLON { + $$ = ";"; + } +; + +expression_statement: + expression_with_comma semicolon { + $$ = $1 + $2; + } +; + +if_statement: + t_IF t_LPAREN expression t_RPAREN statement { + 
$$ = "if (" + $3 + ") " + $5; + } +| statement t_ELSE statement { + $$ = $1 + " else " + $3; + } +| statement t_ELSEIF t_LPAREN expression t_RPAREN statement { + $$ = $1 + "elseif (" + $4 + ") " + $6; + } +; + +for_statement: + t_FOR t_LPAREN statement statement expression_with_comma t_RPAREN statement { + $$ = "for (" + $3 + $4 + $5 + ") " + $7; + } +| t_FOR t_LPAREN statement statement t_RPAREN statement { + $$ = "for (" + $3 + $4 + ") " + $6; + } +; + +foreach_statement: + t_FOREACH t_LPAREN expression t_RPAREN statement { + $$ = "foreach (" + $3 + ") " + $5; + } +; + +do_statement: + t_DO statement t_WHILE t_LPAREN expression t_RPAREN semicolon { + $$ = "do " + $2 + " while (" + $5 + ")" + $7; + } +| t_WHILE t_LPAREN expression t_RPAREN statement { + $$ = "while (" + $3 + ") " + $5; + } +; + +switch_statement: + t_SWITCH t_LPAREN expression t_RPAREN statement { + $$ = "switch (" + $3 + ") " + $5; + } +| t_CASE expression t_COLON { + $$ = "case " + $2 + ":"; + } +| t_CASE expression semicolon { + $$ = "case " + $2 + $3; + } +| t_DEFAULT t_COLON { + $$ = "default:"; + } +| t_DEFAULT semicolon { + $$ = "default" + $2; + } +| t_BREAK expression semicolon { + $$ = "break " + $2 + $3; + } +| t_BREAK semicolon { + $$ = "break" + $2; + } +| t_CONTINUE expression semicolon { + $$ = "continue" + $2 + $3; + } +| t_CONTINUE semicolon { + $$ = "continue" + $2; + } +; + +return_statement: + t_RETURN expression semicolon { + $$ = "return " + $2 + $3; + } +| t_RETURN semicolon { + $$ = cr("return") + $2; + } +; + +try_statement: + t_TRY t_LCURLY statement_list t_RCURLY catch_blocks { + $$ = "try { " + $3 + "} " + $5; + } +; + +catch_blocks: + catch_blocks catch_blocks %prec p_CATCH { + $$ = $1 + $2; + } +| t_CATCH t_LPAREN identifier expression t_RPAREN t_LCURLY statement_list t_RCURLY { + $$ = "catch(" + $3 + " " + $4 + ") {" + $7 + "}"; + } +| t_FINALLY t_LCURLY statement_list t_RCURLY { + $$ = "finally {" + $3 + "}"; + } +; + +// Argument list +argument_list: + t_LPAREN 
t_RPAREN { + $$ = cr("()"); + } +| t_LPAREN _argument_list t_RPAREN { + $$ = "(" + $2 + ")"; + } +| t_LPAREN _argument_list t_COMMA t_RPAREN { + $$ = "(" + $2 + ",)"; + } +; + +_argument_list: + expression +| t_COMMA { + $$ = ","; + } +| t_COMMA expression { + $$ = "," + $2; + } +| _argument_list t_COMMA expression { + $$ = $1 + "," + $3; + } +| _argument_list t_COMMA { + $$ = $1 + ","; + } +; + +// Literals +numeric_literal: + t_NUMBER { + $$ = cr($1); + } +; + +string_literal: + t_LITERAL_STRING { + $$ = cr($1); + } +| t_EVALUATED_STRING { + $$ = cr($1); + } +; + +heredoc_literal: + t_HEREDOC { + $$ = cr($1); + } +; + +// Expressions +literal_expression: + numeric_literal +| string_literal +| heredoc_literal +; + +identifier: + t_IDENTIFIER { + $$ = cr($1); + } +; + +// XHP extensions +xhp_expression: + xhp_singleton +| xhp_open_tag xhp_children xhp_close_tag { + $$ = $1 + $2 + $3; + } +; + +xhp_children: + xhp_children_ { + if ($1.back() == ',') { + $1.pop_back(); /* strip the trailing "," that xhp_children_ appends after every child */ + $$ = "array(" + $1 + ")"; + } else { + $$ = "null"; + } + } +; + +xhp_children_: + /* empty */ { + $$ = cr(""); + } +| xhp_children_ xhp_child { + $$ = $1 + $2 + ","; + } +; + +xhp_child: + t_XHP_TEXT { + $$ = "\"" + $1 + "\""; + } +| xhp_expression +| t_LCURLY { yy_push_state(PHP); } expression t_RCURLY { yy_pop_state(); } { + $$ = $3; + } +; + +xhp_attributes: + /* empty */ { + $$ = cr(""); + } +| xhp_attributes xhp_attribute { + $$ = $1 + $2 + ","; + } + +xhp_attribute: + t_IDENTIFIER t_ASSIGN { yy_push_state(XHP); } t_DOUBLE_QUOTE t_XHP_TEXT { yy_pop_state(); } t_DOUBLE_QUOTE { + $$ = "\"" + $1 + "\" => \"" + $5 + "\""; /* $5 is the raw t_XHP_TEXT (the mid-rule action takes $3); quote it as xhp_child does, otherwise the value lands in the PHP array as a bare word */ + } +| t_IDENTIFIER t_ASSIGN { yy_push_state(PHP); } t_LCURLY expression { yy_pop_state(); } t_RCURLY { + $$ = "\"" + $1 + "\" => " + $5; + } +; + +xhp_lt: + t_LESS_THAN { + $$ = cr(""); + yy_push_state(XHP_ATTR); + } +; + +xhp_tag_name: + t_IDENTIFIER +| xhp_tag_name t_COLON t_IDENTIFIER { + $$ = $1 + "_" + $3; + } +| xhp_tag_name t_MINUS t_IDENTIFIER { + $$ = $1 + "-" + $3; + } +; + 
+xhp_singleton: + xhp_lt xhp_tag_name xhp_attributes t_XHP_DIV t_XHP_GREATER_THAN { + yy_pop_state(); + $$ = $1 + "new xhp_" + $2 + "(array(" + $3 + "), array())"; + } +; + +xhp_open_tag: + xhp_lt xhp_tag_name xhp_attributes t_XHP_GREATER_THAN { + yy_pop_state(); + yy_push_state(XHP); + $$ = $1 + "new xhp_" + $2 + "(array(" + $3 + "), "; + } +; + +xhp_close_tag: + t_XHP_LESS_THAN_DIV { yy_push_state(XHP_ATTR); } t_IDENTIFIER t_XHP_GREATER_THAN { + yy_pop_state(); + yy_pop_state(); + // TODO: check stack for mismatched tags + $$ = ")"; + } +; + +expression_with_comma: + expression +| expression_with_comma t_COMMA expression { + $$ = $1 + "," + $3; + } +; + +variables_and_stuff: + literal_expression +| t_SHELL_EXPRESSION +| identifier +| xhp_expression +| expression t_LBRACKET expression t_RBRACKET { + $$ = $1 + "[" + $3 + "]"; + } +| expression t_ARROW t_LCURLY expression t_RCURLY { + $$ = $1 + "->{" + $4 + "}"; + } +; + +expression: + variables_and_stuff +| variables_and_stuff argument_list { + $$ = $1 + $2; + } +| identifier identifier { + $$ = $1 + " " + $2; + } +| t_LPAREN expression t_RPAREN { + $$ = "(" + $2 + ")"; + } +| t_LPAREN expression t_RPAREN expression { + $$ = "(" + $2 + ")" + $4; + } +| expression t_LCURLY expression t_RCURLY { + $$ = $1 + "{" + $3 + "}"; + } +| expression t_LBRACKET t_RBRACKET { + $$ = $1 + "[]"; + } +| expression t_PLING expression t_COLON expression { + $$ = $1 + " ? 
" + $3 + " : " + $5; + } +| expression t_ASSIGN expression { + $$ = $1 + "=" + $3; + } +| expression t_APPEND expression { + $$ = $1 + ".=" + $3; + } +| expression t_PLUS_ASSIGN expression { + $$ = $1 + "+=" + $3; + } +| expression t_MINUS_ASSIGN expression { + $$ = $1 + "-=" + $3; + } +| expression t_DIV_ASSIGN expression { + $$ = $1 + "/=" + $3; + } +| expression t_MULT_ASSIGN expression { + $$ = $1 + "*=" + $3; + } +| expression t_MOD_ASSIGN expression { + $$ = $1 + "%=" + $3; + } +| expression t_BIT_AND_ASSIGN expression { + $$ = $1 + "&=" + $3; + } +| expression t_BIT_OR_ASSIGN expression { + $$ = $1 + "|=" + $3; + } +| expression t_BIT_XOR_ASSIGN expression { + $$ = $1 + "^=" + $3; + } +| expression t_LSHIFT_ASSIGN expression { + $$ = $1 + "<<=" + $3; + } +| expression t_RSHIFT_ASSIGN expression { + $$ = $1 + ">>=" + $3; + } +| expression t_OR expression { + $$ = $1 + "||" + $3; + } +| expression t_AND expression { + $$ = $1 + "&&" + $3; + } +| expression t_BIT_OR expression { + $$ = $1 + "|" + $3; + } +| expression t_BIT_AND expression { + $$ = $1 + "&" + $3; + } +| expression t_BIT_XOR expression { + $$ = $1 + "^" + $3; + } +| expression t_LOGICAL_OR expression { + $$ = $1 + " OR " + $3; + } +| expression t_LOGICAL_XOR expression { + $$ = $1 + " XOR " + $3; + } +| expression t_LOGICAL_AND expression { + $$ = $1 + " AND " + $3; + } +| expression t_EQUAL expression { + $$ = $1 + "==" + $3; + } +| expression t_NOT_EQUAL expression { + $$ = $1 + "!=" + $3; + } +| expression t_STRICT_EQUAL expression { + $$ = $1 + "===" + $3; + } +| expression t_STRICT_NOT_EQUAL expression { + $$ = $1 + "!==" + $3; + } +| expression t_LESS_THAN_EQUAL expression { + $$ = $1 + "<=" + $3; + } +| expression t_GREATER_THAN_EQUAL expression { + $$ = $1 + ">=" + $3; + } +| expression t_LESS_THAN expression { + $$ = $1 + "<" + $3; + } +| expression t_GREATER_THAN expression { + $$ = $1 + ">" + $3; + } +| expression t_LSHIFT expression { + $$ = $1 + "<<" + $3; + } +| expression t_RSHIFT 
expression { + $$ = $1 + ">>" + $3; + } +| expression t_PLUS expression { + $$ = $1 + "+" + $3; + } +| expression t_MINUS expression { + $$ = $1 + "-" + $3; + } +| expression t_MULT expression { + $$ = $1 + "*" + $3; + } +| expression t_DIV expression { + $$ = $1 + "/" + $3; + } +| expression t_MOD expression { + $$ = $1 + "%" + $3; + } +| expression t_CONCAT expression { + $$ = $1 + " . " + $3; + } +| expression t_INSTANCEOF expression { + $$ = $1 + " instanceof " + $3; + } +| expression t_AS expression { + $$ = $1 + " as " + $3; + } +| expression t_DOUBLE_ARROW expression { + $$ = $1 + "=>" + $3; + } +| expression t_HEBREW_THING expression { + $$ = $1 + "::" + $3; + } +| expression t_ARROW expression { + $$ = $1 + "->" + $3; + } +| t_INCR expression { + $$ = "++" + $2; + } +| t_DECR expression { + $$ = "--" + $2; + } +| expression t_INCR { + $$ = $1 + "++"; + } +| expression t_DECR { + $$ = $1 + "--"; + } +| t_PLUS expression { + $$ = "+" + $2; + } +| t_MINUS expression { + $$ = "-" + $2; + } +| t_BIT_NOT expression { + $$ = "~" + $2; + } +| t_NOT expression { + $$ = "!" 
+ $2; + } +| t_BIT_AND expression { + $$ = "&" + $2; + } +| t_AT expression { + $$ = "@" + $2; + } +| t_DOLLAR t_LCURLY expression t_RCURLY { + $$ = "${" + $3 + "}"; + } +| t_PRINT expression { + $$ = "print " + $2; + } +| t_ECHO expression { + $$ = "echo " + $2; + } +| t_CLONE expression { + $$ = "clone " + $2; + } +| t_NEW expression { + $$ = "new " + $2; + } +| t_THROW expression { + $$ = "throw " + $2; + } +| t_REQ_ONCE expression { + $$ = "require_once " + $2; + } +| t_REQ expression { + $$ = "require " + $2; + } +| t_INC_ONCE expression { + $$ = "include_once " + $2; + } +| t_INC expression { + $$ = "include " + $2; + } +; + +// Declarations +many_fancy_delarations: + fancy_delarations +| many_fancy_delarations fancy_delarations { + $$ = $1 + " " + $2; + } +; + +fancy_delarations: + t_GLOBAL { + $$ = cr("global"); + } +| t_CONST { + $$ = cr("const"); + } +| t_PUBLIC { + $$ = cr("public"); + } +| t_PROTECTED { + $$ = cr("protected"); + } +| t_PRIVATE { + $$ = cr("private"); + } +| t_STATIC { + $$ = cr("static"); + } +| t_ABSTRACT { + $$ = cr("abstract"); + } +| t_FINAL { + $$ = cr("final"); + } +; + +declaration_statement: + many_fancy_delarations expression_with_comma semicolon { + $$ = $1 + " " + $2 + $3; + } +; + +// Functions +function_name: + identifier +| t_BIT_AND identifier { + $$ = "&" + $2; + } +; + +function: + many_fancy_delarations function { + $$ = $1 + " " + $2; + } +| t_FUNCTION function_name argument_list t_LCURLY statement_list t_RCURLY { + $$ = "function " + $2 + $3 + " {" + $5 + "}"; + } +| t_FUNCTION function_name argument_list semicolon { + $$ = "function " + $2 + $3 + $4; + } +; + +// Classes +class_statement: + t_CLASS classy_stuff t_LCURLY statement_list t_RCURLY { + $$ = "class " + $2 + " {" + $4 + "}"; + } +| many_fancy_delarations t_CLASS classy_stuff t_LCURLY statement_list t_RCURLY { + $$ = $1 + " class " + $3 + " {" + $5 + "}"; + } +| t_INTERFACE classy_stuff t_LCURLY statement_list t_RCURLY { + $$ = "interface " + $2 + " {" + $4 
+ "}"; + } +; + +classy_stuff: + /* empty */ { + $$ = ""; + } +| classy_stuff t_EXTENDS { + $$ = $1 + cr(" extends "); + } +| classy_stuff t_IMPLEMENTS { + $$ = $1 + cr(" implements "); + } +| classy_stuff identifier { + $$ = $1 + " " + $2; + } +| classy_stuff t_COMMA { + $$ = $1 + cr(", "); + } +; diff --git a/xhp_scanner.l b/xhp_scanner.l new file mode 100644 index 00000000..3cd6ca38 --- /dev/null +++ b/xhp_scanner.l @@ -0,0 +1,377 @@ +%{ +#include "xhp_parser.hpp" +#define YY_USER_INIT \ + if (yyextra->firsttoken) { \ + yyg->yy_init = 0; \ + int ft = yyextra->firsttoken; \ + yyextra->firsttoken = 0; \ + return ft; \ + } +#define tok(t) flex_tok(t, yyg); + +static int flex_tok(int t, void* y); +%} + +%option stack +%option noyywrap +%option reentrant +%option bison-bridge +%option bison-locations + +%s PHP +%s PHP_NO_RESERVED_WORDS +%s HEREDOC_LINE_START +%s HEREDOC_DATA +%s XHP +%s XHP_ATTR + +%% +{ + (?i:"{ + "?>"\r?\n { + ++yylloc->internal_line; + flex_pop_state(yyg); + return tok(t_PHP_CLOSE_TAG); + } + "?>" flex_pop_state(yyg); return tok(t_PHP_CLOSE_TAG); +} +{ + ("//"|"#").* | + [ \t\x0b\x0c\xa0\r]+ /* eat it up */ + \n { + ++yylloc->internal_line; + } + "/*" { + char c; + for (;;) { + while ((c = yyinput(yyscanner)) != '*' && c != EOF) { + if (c == '\n') { + ++yylloc->internal_line; + } + } + if (c == '*') { + while ((c = yyinput(yyscanner)) == '*'); + if (c == '/') { + break; + } else if (c == '\n') { + ++yylloc->internal_line; + } + } + if (c == EOF) { + return tok(0); + break; + } + } + } +} +{ + (?i:new) return tok(t_NEW); + (?i:clone) return tok(t_CLONE); + (?i:echo) return tok(t_ECHO); + (?i:print) return tok(t_PRINT); + (?i:if) return tok(t_IF); + (?i:else) return tok(t_ELSE); + (?i:elseif) return tok(t_ELSEIF); + (?i:do) return tok(t_DO); + (?i:while) return tok(t_WHILE); + (?i:for) return tok(t_FOR); + (?i:foreach) return tok(t_FOREACH); + (?i:as) return tok(t_AS); + (?i:switch) return tok(t_SWITCH); + (?i:case) return tok(t_CASE); + 
(?i:default) return tok(t_DEFAULT); + (?i:break) return tok(t_BREAK); + (?i:continue) return tok(t_CONTINUE); + (?i:function) return tok(t_FUNCTION); + (?i:const) return tok(t_CONST); + (?i:instanceof) return tok(t_INSTANCEOF); + (?i:return) return tok(t_RETURN); + (?i:try) return tok(t_TRY); + (?i:catch) return tok(t_CATCH); + (?i:finally) return tok(t_FINALLY); /* was t_CATCH, which made the grammar's t_FINALLY alternative unreachable */ + (?i:throw) return tok(t_THROW); + (?i:global) return tok(t_GLOBAL); + (?i:static) return tok(t_STATIC); + (?i:abstract) return tok(t_ABSTRACT); + (?i:final) return tok(t_FINAL); + (?i:private) return tok(t_PRIVATE); + (?i:protected) return tok(t_PROTECTED); + (?i:public) return tok(t_PUBLIC); + (?i:class) return tok(t_CLASS); + (?i:interface) return tok(t_INTERFACE); + (?i:extends) return tok(t_EXTENDS); + (?i:implements) return tok(t_IMPLEMENTS); + (?i:require_once) return tok(t_REQ_ONCE); + (?i:include_once) return tok(t_INC_ONCE); + (?i:require) return tok(t_REQ); + (?i:include) return tok(t_INC); +} +{ + "===" return tok(t_STRICT_EQUAL); + "!==" return tok(t_STRICT_NOT_EQUAL); + "<<=" return tok(t_LSHIFT_ASSIGN); + ">>=" return tok(t_RSHIFT_ASSIGN); + "<=" return tok(t_LESS_THAN_EQUAL); + ">=" return tok(t_GREATER_THAN_EQUAL); + "==" return tok(t_EQUAL); + "!="|"<>" return tok(t_NOT_EQUAL); + "++" return tok(t_INCR); + "--" return tok(t_DECR); + "<<" return tok(t_LSHIFT); + ">>" return tok(t_RSHIFT); + "+=" return tok(t_PLUS_ASSIGN); + "-=" return tok(t_MINUS_ASSIGN); + "/=" return tok(t_DIV_ASSIGN); + "*=" return tok(t_MULT_ASSIGN); + "%=" return tok(t_MOD_ASSIGN); + "&=" return tok(t_BIT_AND_ASSIGN); + "|=" return tok(t_BIT_OR_ASSIGN); + "^=" return tok(t_BIT_XOR_ASSIGN); + (?i:or) return tok(t_LOGICAL_OR); + (?i:and) return tok(t_LOGICAL_AND); + (?i:xor) return tok(t_LOGICAL_XOR); + ".=" return tok(t_APPEND); + "&&" return tok(t_AND); + "||" return tok(t_OR); + "->" { + int tt = tok(t_ARROW); + flex_push_state(PHP_NO_RESERVED_WORDS, yyg); /* entered after tok() so the arrow itself is unaffected; flex_tok pops this state when the next token is returned -- presumably so a keyword-named member lexes as an identifier */ + return tt; + } + "=>" return tok(t_DOUBLE_ARROW); + "::" 
return tok(t_HEBREW_THING); + "." return tok(t_CONCAT); + "," return tok(t_COMMA); + ";" return tok(t_SEMICOLON); + "?" return tok(t_PLING); + ":" return tok(t_COLON); + "<" return tok(t_LESS_THAN); + "+" return tok(t_PLUS); + "-" return tok(t_MINUS); + "*" return tok(t_MULT); + "%" return tok(t_MOD); + "|" return tok(t_BIT_OR); + "&" return tok(t_BIT_AND); + "^" return tok(t_BIT_XOR); + "!" return tok(t_NOT); + "~" return tok(t_BIT_NOT); + "=" return tok(t_ASSIGN); + "$" return tok(t_DOLLAR); + "@" return tok(t_AT); + "(" return tok(t_LPAREN); + ")" return tok(t_RPAREN); + "{" return tok(t_LCURLY); + "}" return tok(t_RCURLY); + "[" return tok(t_LBRACKET); + "]" return tok(t_RBRACKET); + "\"" return tok(t_DOUBLE_QUOTE); + "?>" BEGIN(INITIAL); +} +{ + "/" return tok(t_DIV); + ">" return tok(t_GREATER_THAN); +} +{ + "/" return tok(t_XHP_DIV); + ">" return tok(t_XHP_GREATER_THAN); +} +{ + '(\\.|\\\n|[^\\']+)*' { + *yylval = yytext; + for (char* ii = yytext; *ii; ++ii) { + if (*ii == '\n') { + ++yylloc->actual_line_offset; + } + } + return tok(t_LITERAL_STRING); + } + \"(\\.|\\\n|[^\\\"]+)*\" { + *yylval = yytext; + for (char* ii = yytext; *ii; ++ii) { + if (*ii == '\n') { + ++yylloc->actual_line_offset; + } + } + return tok(t_EVALUATED_STRING); + } + `[^`]*` { + *yylval = yytext; + for (char* ii = yytext; *ii; ++ii) { + if (*ii == '\n') { + ++yylloc->actual_line_offset; + } + } + return tok(t_SHELL_EXPRESSION); + } + 0x[a-fA-F0-9]+ { + *yylval = yytext; + return tok(t_NUMBER); + } + 0[0-7]+ { + *yylval = yytext; + return tok(t_NUMBER); + } + [0-9]*\.?[0-9]+[eE]-?[0-9]{1,3} { + *yylval = yytext; + return tok(t_NUMBER); + } + [0-9]+ | + [0-9]*\.[0-9]+ { + *yylval = yytext; + return tok(t_NUMBER); + } +} +{ + [a-zA-Z_$][a-zA-Z_$0-9]* { + *yylval = yytext; + return tok(t_IDENTIFIER); + } +} +{ + [^<"{]+ { + *yylval = yytext; + return tok(t_XHP_TEXT); + } + "{" { + return tok(t_LCURLY); + } + \" { + return tok(t_DOUBLE_QUOTE); + } + "<" { + return tok(t_LESS_THAN); + } + 
"{ + "<<<".+\n { + yyextra->heredoc_eom = yytext + 3; + yyextra->heredoc_eom_len = strlen(yyextra->heredoc_eom) - 1; + yyextra->heredoc_data_last = yyextra->heredoc_data_start = yyextra->heredoc_eom + yyextra->heredoc_eom_len + 1; + if (yyextra->heredoc_eom[yyextra->heredoc_eom_len - 1] == '\r') { + --yyextra->heredoc_eom_len; + } + ++yylloc->actual_line_offset; + yymore(); + flex_push_state(HEREDOC_LINE_START, yyg); + BEGIN(HEREDOC_LINE_START); + } +} +{ + [a-zA-Z0-9]+$ { + // copy-pasted below. + if (yyextra->heredoc_eom_len <= strlen(yyextra->heredoc_data_last) + && strncmp(yyextra->heredoc_eom, yyextra->heredoc_data_last, yyextra->heredoc_eom_len) == 0) { + *yylval = yytext; + *yylval = code_rope("<<<") + yyextra->heredoc_eom + "\n"; + flex_pop_state(yyg); + return tok(t_HEREDOC); + } else { + yymore(); + BEGIN(HEREDOC_DATA); + } + } + [a-zA-Z0-9]+/; { + // copy-pasted from above. + if (yyextra->heredoc_eom_len <= strlen(yyextra->heredoc_data_last) + && strncmp(yyextra->heredoc_eom, yyextra->heredoc_data_last, yyextra->heredoc_eom_len) == 0) { + *yylval = yytext; + *yylval = code_rope("<<<") + yyextra->heredoc_eom + "\n"; + flex_pop_state(yyg); + return tok(t_HEREDOC); + } else { + yymore(); + BEGIN(HEREDOC_DATA); + } + } + . { + yymore(); + BEGIN(HEREDOC_DATA); + } + \r?\n { + ++yylloc->actual_line_offset; + ++yyextra->heredoc_data_last; + if (*yyextra->heredoc_data_last == '\n') { + ++yyextra->heredoc_data_last; + } + yymore(); + BEGIN(HEREDOC_LINE_START); + } +} +{ + .*\r?\n { + ++yylloc->actual_line_offset; + yyextra->heredoc_data_last += strlen(yyextra->heredoc_data_last); + yymore(); + BEGIN(HEREDOC_LINE_START); + } +} + +<*>{ + .|\n { + fprintf(stderr,"Unknown text: (%s)\n", yytext); + return tok(t_IDK); + if (0) { + // stop warning me about unused functions! 
+ yyunput(0, NULL, NULL); + yy_top_state(NULL); + } + } +} + +%% + +inline char* findstate(int s) { + switch(s) { + case INITIAL: + return "INITIAL"; + case PHP: + return "PHP"; + case PHP_NO_RESERVED_WORDS: + return "PHP_NO_RESERVED_WORDS"; + case XHP: + return "XHP"; + case XHP_ATTR: + return "XHP_ATTR"; + default: + char* a = new char[12]; + sprintf(a, "%d", s); + return a; + } +} + +void flexBEGIN(int s, void* y) { + yyguts_t* yyg = (yyguts_t*)y; + BEGIN(s); +} + +void flex_push_state(int s, void* y) { + yyguts_t* yyg = (yyguts_t*)y; +// fprintf(stderr, "--> PUSH(%s -> %s)\n", findstate((yyg->yy_start-1)/2), findstate(s)); + yy_push_state(s, yyg); +} + +void flex_pop_state(void* y) { + yyguts_t* yyg = (yyguts_t*)y; + int o = (yyg->yy_start-1)/2; + yy_pop_state(yyg); +// fprintf(stderr, "--> POP(%s -> %s)\n", findstate(o), findstate((yyg->yy_start-1)/2)); +} + +static int flex_tok(int t, void* y) { + yyguts_t* yyg = (yyguts_t*)y; + if (YY_START == PHP_NO_RESERVED_WORDS) { + flex_pop_state(yyg); + } +//printf("\ntok:%d\n", YY_START); + return t; +}