Skip to content
Permalink
Browse files

[language] Restrict character classes allowed in IR

This PR restricts the characters that are allowed in IR files to
one of the following:
* ASCII printable (space through ~).
* \n for newlines
* \t for tabs
All other characters anywhere in the file will result in an error being
raised before the string is even passed to the parser.
  • Loading branch information...
Tim Zakian authored and tzakian committed Aug 19, 2019
1 parent f7577df commit 7efb0221989f17fdf7f8486730898ed947a1e19e
Showing with 147 additions and 17 deletions.
  1. +0 −1 language/compiler/ir_to_bytecode/Cargo.toml
  2. +147 −16 language/compiler/ir_to_bytecode/src/parser.rs
@@ -15,7 +15,6 @@ lalrpop-util = "0.16.3"
log = "0.4.7"
codespan = "0.1.3"
codespan-reporting = "0.1.4"
regex = "1.2.1"

[dev-dependencies]
types = { path = "../../../types", features = ["testing"] }
@@ -6,7 +6,6 @@ use codespan_reporting::{emit, termcolor::Buffer, Diagnostic, Label, Severity};
use failure::*;
use ir_to_bytecode_syntax::syntax;
use lalrpop_util::ParseError;
use regex::Regex;
use std::{
collections::hash_map::DefaultHasher,
hash::{Hash, Hasher},
@@ -16,13 +15,69 @@ use types::account_address::AccountAddress;
// Re-export this to make it convenient for other crates.
pub use ir_to_bytecode_syntax::ast;

// Since lalrpop can't handle comments without a custom lexer, we somewhat hackily remove all the
// comments from the input string before passing it off to lalrpop. We only support single line
// comments for now. Will later on add in other comment types.
fn strip_comments(string: &str) -> String {
// Remove line comments
let line_comments = Regex::new(r"(?m)//.*$").unwrap();
line_comments.replace_all(string, "$1").into_owned()
/// Report whether `c` belongs to the set of visible characters the IR accepts.
///
/// The accepted set is the printable ASCII range (SPACE, 0x20, through `~`, 0x7E) plus the
/// horizontal tab (0x09). Control characters, DEL and everything outside ASCII yield `false`.
pub fn is_permitted_printable_char(c: char) -> bool {
    match c {
        // Tabs are the one control character that is tolerated.
        '\t' => true,
        // SPACE through TILDE: excludes the control range below and DEL above.
        ' '..='~' => true,
        _ => false,
    }
}

/// Report whether `c` is the single line terminator the IR accepts.
///
/// Only the line feed (`\n`, 0x0A) qualifies; every other character yields `false`.
pub fn is_permitted_newline_char(c: char) -> bool {
    c == '\n'
}

/// Report whether `c` is a permitted character.
///
/// A character is permitted when it is either a permitted newline or a permitted printable
/// character; anything else must not appear in the file.
pub fn is_permitted_char(c: char) -> bool {
    if is_permitted_newline_char(c) {
        return true;
    }
    is_permitted_printable_char(c)
}

/// Check that every character of `string` is a permitted character.
///
/// Returns `Ok(())` when all characters pass `is_permitted_char`. Otherwise returns an error
/// naming the first offending character.
fn verify_string(string: &str) -> Result<()> {
    match string.chars().find(|c| !is_permitted_char(*c)) {
        None => Ok(()),
        // The offending character is non-printable or non-ASCII by construction, so it is
        // rendered in escaped (`{:?}`) form to stay legible. The space before the line
        // continuation keeps the two sentences apart: `\` swallows the newline and all
        // leading whitespace on the following line.
        Some(chr) => bail!(
            "Parser Error: invalid character {:?} found when reading file. \
             Only ascii printable, tabs (\\t), and \\n line ending characters are permitted.",
            chr
        ),
    }
}

/// Blank out `//` line comments in `source`.
///
/// Every character from a `//` up to (but not including) the next permitted newline is replaced
/// by a single space, so the result has the same number of characters and the same line
/// structure as the input. Characters outside comments are copied through unchanged.
fn strip_comments(source: &str) -> String {
    let mut stripped = String::with_capacity(source.len());
    let mut inside_comment = false;
    let mut remaining = source.chars().peekable();

    while let Some(current) = remaining.next() {
        if is_permitted_newline_char(current) {
            // A newline always terminates the current comment and is itself preserved.
            inside_comment = false;
        } else if !inside_comment {
            // A comment opens on the first of two consecutive slashes.
            inside_comment = current == '/' && remaining.peek() == Some(&'/');
        }
        stripped.push(if inside_comment { ' ' } else { current });
    }

    stripped
}

// Input is limited to visible ASCII (0x20 <= c <= 0x7E), the tab character (\t) and the one
// permitted newline character (\n). The check runs first; comments are only stripped from
// input that passed it.
fn strip_comments_and_verify(string: &str) -> Result<String> {
    verify_string(string).map(|_| strip_comments(string))
}

/// Given the raw input of a file, creates a `ScriptOrModule` enum
@@ -39,44 +94,44 @@ pub fn parse_script_or_module(s: &str) -> Result<ast::ScriptOrModule> {
/// Given the raw input of a file, creates a `Program` struct.
///
/// The input is first checked for characters that are not permitted and has its `//` line
/// comments blanked out; the result is what gets handed to the parser.
///
/// Fails with `Err(_)` if the text contains a character that is not permitted or cannot be
/// parsed.
pub fn parse_program(program_str: &str) -> Result<ast::Program> {
    let stripped_string = &strip_comments_and_verify(program_str)?;
    let parser = syntax::ProgramParser::new();
    match parser.parse(stripped_string) {
        Ok(program) => Ok(program),
        // Report against the exact text the parser consumed.
        Err(e) => handle_error(e, stripped_string),
    }
}

/// Given the raw input of a file, creates a `Script` struct.
///
/// The input is first checked for characters that are not permitted and has its `//` line
/// comments blanked out; the result is what gets handed to the parser.
///
/// Fails with `Err(_)` if the text contains a character that is not permitted or cannot be
/// parsed.
pub fn parse_script(script_str: &str) -> Result<ast::Script> {
    let stripped_string = &strip_comments_and_verify(script_str)?;
    let parser = syntax::ScriptParser::new();
    match parser.parse(stripped_string) {
        Ok(script) => Ok(script),
        // Report against the exact text the parser consumed.
        Err(e) => handle_error(e, stripped_string),
    }
}

/// Given the raw input of a file, creates a single `ModuleDefinition` struct.
///
/// The input is first checked for characters that are not permitted and has its `//` line
/// comments blanked out; the result is what gets handed to the parser.
///
/// Fails with `Err(_)` if the text contains a character that is not permitted or cannot be
/// parsed.
pub fn parse_module(modules_str: &str) -> Result<ast::ModuleDefinition> {
    let stripped_string = &strip_comments_and_verify(modules_str)?;
    let parser = syntax::ModuleParser::new();
    match parser.parse(stripped_string) {
        Ok(module) => Ok(module),
        // Report against the exact text the parser consumed.
        Err(e) => handle_error(e, stripped_string),
    }
}

/// Given the raw input of a file, creates a single `Cmd` struct.
///
/// The input is first checked for characters that are not permitted and has its `//` line
/// comments blanked out; the result is what gets handed to the parser. `_sender_address` is
/// accepted but not used by this function.
///
/// Fails with `Err(_)` if the text contains a character that is not permitted or cannot be
/// parsed.
pub fn parse_cmd(cmd_str: &str, _sender_address: AccountAddress) -> Result<ast::Cmd> {
    let stripped_string = &strip_comments_and_verify(cmd_str)?;
    let parser = syntax::CmdParser::new();
    match parser.parse(stripped_string) {
        Ok(cmd) => Ok(cmd),
        // Report against the exact text the parser consumed.
        Err(e) => handle_error(e, stripped_string),
    }
}

@@ -124,3 +179,79 @@ where
println!("{}", msg);
bail!("ParserError: {}", e)
}

// Unit tests for the character whitelist (`verify_string`) and for `strip_comments`.
#[cfg(test)]
mod tests {
// Checks that `verify_string` accepts a string made of every whitelisted byte and rejects that
// same string once any single non-whitelisted ASCII byte is appended to it.
#[test]
fn verify_character_whitelist() {
// Whitelist: printable ASCII (SPACE through `~`), line feed and tab.
let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
good_chars.push(0x0A);
good_chars.push(0x09);

// Every remaining ASCII byte: 0x00-0x08, 0x0B-0x1F and DEL (0x7F).
// Bytes >= 0x80 are not covered here.
let mut bad_chars = (0x0..0x09).collect::<Vec<_>>();
bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<_>>());
bad_chars.push(0x7F);

// Test to make sure that all the characters that are in the whitelist pass.
{
let s = std::str::from_utf8(&good_chars)
.expect("Failed to construct string containing an invalid character. This shouldn't happen.");
assert!(super::verify_string(s).is_ok());
}

// Test to make sure that we fail for all characters not in the whitelist.
// Each bad byte is appended to the otherwise valid string, checked, then removed again so that
// the next iteration starts from the valid string.
for bad_char in bad_chars {
good_chars.push(bad_char);
let s = std::str::from_utf8(&good_chars)
.expect("Failed to construct string containing an invalid character. This shouldn't happen.");
assert!(super::verify_string(s).is_err());
good_chars.pop();
}
}

// Checks that `strip_comments` blanks everything after `//` up to the next `\n`, whatever
// characters the comment contains.
#[test]
fn test_strip_comments() {
// Build one line: "//", every printable ASCII character, a tab, then the terminating '\n'.
let mut good_chars = (0x20..=0x7E).map(|x: u8| x as char).collect::<String>();
good_chars.push(0x09 as char);
good_chars.push(0x0A as char);
// 0x2F is '/'; two inserts at the front make the whole line a comment.
good_chars.insert(0, 0x2F as char);
good_chars.insert(0, 0x2F as char);

{
// The entire line is a comment, so only whitespace may remain.
let x = super::strip_comments(&good_chars);
assert!(x.chars().all(|x| x == ' ' || x == '\t' || x == '\n'));
}

// Remove the \n at the end of the line
good_chars.pop();

// Candidate "line break" bytes, encoded as UTF-8. They are decoded below and visited one
// `char` at a time, so the CRLF pair is exercised as a separate CR and LF.
let bad_chars: Vec<u8> = vec![
0x0B, // VT
0x0C, // FF
0x0D, // CR
0x0D, 0x0A, // CRLF
0xC2, 0x85, // NEL
0xE2, 0x80, 0xA8, // LS
0xE2, 0x80, 0xA9, // PS
0x1E, // RS
0x15, // NAK in ASCII (this byte is NL only in EBCDIC)
0x76, // 'v' in ASCII; originally labelled NEWLINE -- NOTE(review): confirm the intended byte
];

let bad_chars = std::str::from_utf8(&bad_chars).expect(
"Failed to construct string containing an invalid character. This shouldn't happen.",
);
for bad_char in bad_chars.chars() {
// Append the candidate inside the still-open comment, then a real '\n' and an 'a' on the
// following line. If the candidate did not end the comment, it is blanked and the 'a' after
// the '\n' is the only non-whitespace character left.
good_chars.push(bad_char);
good_chars.push('\n');
good_chars.push('a');
let x = super::strip_comments(&good_chars);
assert!(x
.chars()
.all(|c| c == ' ' || c == '\t' || c == '\n' || c == 'a'));
// Undo the three pushes so the next candidate starts from the same unterminated comment.
good_chars.pop();
good_chars.pop();
good_chars.pop();
}
}
}

0 comments on commit 7efb022

Please sign in to comment.
You can’t perform that action at this time.