In [287]:
from fuzzingbook.Grammars import Grammar, Expansion, srange

from fuzzingbook.GrammarFuzzer import GrammarFuzzer
from isla.solver import ISLaSolver, SemanticError
from fuzzingbook.Parser import EarleyParser, display_tree
from fuzzingbook.MutationFuzzer import FunctionCoverageRunner
from fuzzingbook.GreyboxGrammarFuzzer import  FragmentMutator, PowerSchedule, LangFuzzer, SeedWithStructure, print_stats  
from fuzzingbook.Coverage import population_coverage
from fuzzingbook.GrammarCoverageFuzzer import GrammarCoverageFuzzer, extend_grammar, duplicate_context
from fuzzingbook.GreyboxFuzzer import Seed
from fuzzingbook.Timeout import Timeout
import time
from typing import List
import string
import csv
import logging
from pandas.errors import ParserError, EmptyDataError
import pandas as pd

from tqdm import tqdm

from ordered_set import OrderedSet




In [288]:
# Every printable ASCII character may appear in a field, except the four that
# are structural in this CSV grammar: double quote, CR, LF and comma.
list_ascii_printable = [
    char for char in string.printable if char not in ('"', "\r", "\n", ",")
]

# Expansions used for the "<character>" rule below.
list_char: List[Expansion] = srange("".join(list_ascii_printable))

# CSV grammar: rows are separated by CRLF, fields by commas. A field is either
# empty, bare text, or a double-quoted string in which a doubled quote may
# appear in front of further characters.
CSV_GRAMMMAR: Grammar = {
    "<start>": ["<csv-file>"],
    # A file is either a header row alone or a sequence of rows.
    "<csv-file>": ["<hdr>", "<rows>"],
    "<rows>": [
        "<row>",
        "<row><crlf><rows>",
        "<row><crlf>",
    ],
    "<hdr>": ["<row>"],
    "<row>": ["<fields>"],
    "<fields>": [
        "<field>",
        "<field><comma><fields>",
    ],
    "<field>": ["<TEXT>", "<STRING>", ""],
    "<TEXT>": [
        "<character>",
        "<character><TEXT>",
    ],
    "<STRING>": [
        "<dblquote><list_character><dblquote>",
        "<dblquote><dblquote>",
    ],
    "<list_character>": [
        "<character>",
        "<character><list_character>",
        "<dblquote><dblquote><list_character>",
    ],
    "<character>": list_char,
    "<dblquote>": ['"'],
    "<comma>": [","],
    "<crlf>": ["\r\n"],
}

# Symbol every derivation starts from.
START_SYMBOL = "<start>"

In [289]:
# Work on a separate grammar object so that CSV_GRAMMMAR itself stays as
# defined above (presumably extend_grammar returns a copy - confirm).
dup_CSV_GRAMMMAR = extend_grammar(CSV_GRAMMMAR)

# Duplicate the context of the recursive expansions. The return values are
# discarded, so each call presumably rewrites dup_CSV_GRAMMMAR in place and
# later calls see the result of earlier ones: do not reorder.
# NOTE(review): "<rows>" has no "<row><rows>" expansion in CSV_GRAMMMAR (its
# expansions are "<row>", "<row><crlf><rows>" and "<row><crlf>"), so this call
# presumably matches nothing - confirm whether it can be dropped.
duplicate_context(dup_CSV_GRAMMMAR, "<rows>", "<row><rows>")
duplicate_context(dup_CSV_GRAMMMAR, "<rows>", "<row><crlf><rows>")
duplicate_context(dup_CSV_GRAMMMAR, "<fields>", "<field><comma><fields>")
duplicate_context(dup_CSV_GRAMMMAR, "<TEXT>", "<character><TEXT>")
duplicate_context(dup_CSV_GRAMMMAR, "<list_character>", "<character><list_character>")
# NOTE(review): "<dblquote><dblquote><list_character>" is listed under
# "<list_character>" in CSV_GRAMMMAR, not under "<STRING>"; the symbol passed
# here looks wrong - confirm the intended target before changing it, since a
# matching call would alter the grammar size and the coverage figures below.
duplicate_context(dup_CSV_GRAMMMAR, "<STRING>", "<dblquote><dblquote><list_character>")

# Coverage-driven fuzzer over the duplicated grammar, starting at START_SYMBOL
# and configured with max_nonterminals=50.
dup_gram_cov_fuzzer: GrammarCoverageFuzzer = GrammarCoverageFuzzer(dup_CSV_GRAMMMAR, start_symbol=START_SYMBOL, max_nonterminals= 50)

In [290]:
# ISLa solver over the base CSV grammar, used below only through solver.parse()
# to classify inputs. The constraint asks for a number nb_comma such that some
# <row> contains nb_comma <comma> elements and every <row> inside <rows> does
# too, i.e. presumably "all rows have the same number of fields" - confirm
# against the ISLa definition of count().
solver = ISLaSolver(CSV_GRAMMMAR, # type: ignore
                    '''    
                    exists int nb_comma :
                        exists <row> r : 
                            (count(r, "<comma>", nb_comma)
                            and 
                            forall <row> row in <rows>:
                                count(row, "<comma>", nb_comma))
                    '''
                    )       

In [291]:
# Inputs that parse_input accepted, wrapped as SeedWithStructure.
seeds: list[SeedWithStructure] = []
# Fuzzed inputs for which solver.parse raised SyntaxError.
syntax_erroneous_inputs: list[str] = []
# Fuzzed inputs for which solver.parse raised SemanticError.
semantic_erroneous_inputs: list[str] = []
def parse_input(input: str)  -> str:
    """Write ``input`` to ``test.csv`` and parse that file with pandas.

    This is the function under test: it is handed to the coverage runner and
    to ``population_coverage``, and called directly by the ``check_*`` helpers.

    Args:
        input: Candidate CSV document.

    Returns:
        ``input`` unchanged, when pandas parsed the file without raising.

    Raises:
        Whatever ``pd.read_csv`` raises for the document; callers in this
        notebook handle ``ParserError`` and ``EmptyDataError``.
    """
    # newline="" disables newline translation on write, so the file contains
    # exactly the characters of ``input`` on every platform (otherwise "\r\n"
    # becomes "\r\r\n" on Windows). The context manager closes the handle even
    # if the write fails.
    with open("test.csv", "w", newline="") as f:
        f.write(input)

    with open("test.csv", newline="") as f:
        # NOTE(review): the reader is created but never iterated, so the csv
        # module does not parse anything here; only pandas below does. Left
        # as-is because iterating could raise csv.Error, which the callers
        # do not handle.
        csv.reader(f)

    pd.read_csv("test.csv", delimiter=",", engine="python")

    return input

def fuzz_for_seeds(fuzzer: GrammarFuzzer) -> None:
    """Generate one input, classify it with the solver and try it as a seed.

    The input is first checked with ``solver.parse``; a ``SyntaxError`` or
    ``SemanticError`` files it in the matching module-level error list. It is
    then passed to ``parse_input`` regardless of that outcome and appended to
    ``seeds`` when pandas accepts it. A pandas ``ParserError`` or
    ``EmptyDataError`` is logged only when the solver had accepted the input.

    NOTE(review): inputs rejected by the solver can still end up in ``seeds``
    - confirm this is intended.

    Args:
        fuzzer: Fuzzer whose ``fuzz()`` yields the candidate input.
    """
    candidate = fuzzer.fuzz()

    accepted_by_solver = False
    try:
        solver.parse(candidate)
    except SyntaxError:
        syntax_erroneous_inputs.append(candidate)
    except SemanticError:
        semantic_erroneous_inputs.append(candidate)
    else:
        accepted_by_solver = True

    try:
        seeds.append(SeedWithStructure(parse_input(candidate)))
    except ParserError as parser_err:
        # Only a disagreement (solver accepts, pandas rejects) is worth a log.
        if accepted_by_solver:
            logging.error(f"ParserError : {parser_err}\n{candidate}\n")
    except EmptyDataError as empty_err:
        if accepted_by_solver:
            logging.error(f"EmptyDataError : {empty_err}\n{candidate}\n")

In [292]:
# Start from empty collections so that re-running this cell does not mix the
# results of two runs (the error lists are filled by fuzz_for_seeds together
# with `seeds`, so all three are reset here).
seeds: list[SeedWithStructure] = []
syntax_erroneous_inputs.clear()
semantic_erroneous_inputs.clear()
# Truncate (or create) tests.log.
open("tests.log", "w").close()

# The set of reachable expansions depends only on the grammar, so compute it
# once instead of on every loop iteration.
max_coverage = dup_gram_cov_fuzzer.max_expansion_coverage()
total_coverage = len(max_coverage)
coverage = 0
# One progress tick per newly covered expansion: keep fuzzing until the number
# of covered expansions moves past the current tick.
# NOTE(review): if some expansion can never be covered, the inner loop does
# not terminate - consider a cap on attempts.
for i in tqdm(range(total_coverage)):
    while coverage == i:
        fuzz_for_seeds(dup_gram_cov_fuzzer)
        coverage = total_coverage - len(max_coverage - dup_gram_cov_fuzzer.expansion_coverage())

  0%|          | 0/5127 [00:00<?, ?it/s]

ERROR:root:EmptyDataError : No columns to parse from file


  4%|▍         | 196/5127 [00:00<00:17, 281.67it/s]ERROR:root:EmptyDataError : No columns to parse from file
""

  8%|▊         | 432/5127 [00:03<00:45, 102.47it/s]ERROR:root:EmptyDataError : No columns to parse from file


 29%|██▊       | 1473/5127 [00:14<00:29, 125.18it/s]ERROR:root:EmptyDataError : No columns to parse from file
	

 37%|███▋      | 1915/5127 [00:19<00:26, 119.20it/s]ERROR:root:EmptyDataError : No columns to parse from file



 40%|████      | 2057/5127 [00:20<00:29, 105.37it/s]ERROR:root:EmptyDataError : No columns to parse from file
 


 94%|█████████▍| 4826/5127 [04:37<01:15,  3.99it/s] ERROR:root:EmptyDataError : No columns to parse from file

"	"


100%|██████████| 5127/5127 [06:18<00:00, 13.56it/s]


In [293]:
# Number of fuzzed inputs that raised SemanticError, then number of seeds kept
# (one value per line).
print(*map(len, (semantic_erroneous_inputs, seeds)), sep="\n")

240
846


In [294]:
# Number of fuzzing trials passed to lang_fuzzer.runs() below.
n=1000
# Runner wrapping parse_input, the function under test.
runner = FunctionCoverageRunner(parse_input)
# Parser for the base CSV grammar; handed to the fragment mutator.
parser = EarleyParser(CSV_GRAMMMAR)
mutator = FragmentMutator(parser)
schedule = PowerSchedule()



# Seed the fuzzer with the raw data of every seed collected earlier.
lang_fuzzer = LangFuzzer([seed.data for seed in seeds], mutator, schedule)

# Wall-clock timing of the campaign.
# NOTE(review): start/end are not reported anywhere in the cells visible here.
start = time.time()
lang_fuzzer.runs(runner, trials=n)
end = time.time()



In [295]:
# Buckets filled by sort_seed(), one per outcome of solver.parse().
# Ordered sets drop duplicate inputs while keeping insertion order, so the
# later verification cells handle each distinct input only once.
syntax_error = OrderedSet([])     # solver.parse raised SyntaxError
semantic_error = OrderedSet([])   # solver.parse raised SemanticError
other_error = OrderedSet([])      # solver.parse raised any other Exception
parsed_inputs = OrderedSet([])    # solver.parse returned without raising

def sort_seed(seed: Seed) -> int:
    """File ``seed.data`` in the bucket matching the outcome of ``solver.parse``.

    Args:
        seed: Object whose ``data`` attribute holds the input string.

    Returns:
        1 if the solver parsed the input without raising (the input is added
        to ``parsed_inputs``), 0 otherwise (the input is added to
        ``syntax_error``, ``semantic_error`` or ``other_error``).
    """
    try:
        solver.parse(seed.data, silent=True)
    except SyntaxError:
        bucket, parsed = syntax_error, 0
    except SemanticError:
        bucket, parsed = semantic_error, 0
    except Exception:
        # Anything unexpected gets its own bucket instead of aborting the run.
        bucket, parsed = other_error, 0
    else:
        bucket, parsed = parsed_inputs, 1

    bucket.add(seed.data)
    return parsed
    
    

# Coverage reached in parse_input over all inputs the fuzzer generated.
coverage, _ = population_coverage(lang_fuzzer.inputs, parse_input)

has_structure = 0
for seed in lang_fuzzer.inputs:
    # sort_seed reads `.data`, so raw strings are wrapped in a Seed first.
    # Every other object is passed through unchanged.
    if isinstance(seed, str):
        seed = Seed(seed)
    has_structure += sort_seed(seed)  # type: ignore

total_inputs = len(lang_fuzzer.inputs)
# Avoid a ZeroDivisionError when the fuzzer produced no input at all.
parsed_percentage = 100 * has_structure / total_inputs if total_inputs else 0.0

print("From the %d generated inputs, %d (%0.2f%%) can be parsed.\n"
        "In total, %d statements are covered." % (
        total_inputs,
        has_structure,
        parsed_percentage,
        len(coverage)))

From the 1000 generated inputs, 732 (73.20%) can be parsed.
In total, 2019 statements are covered.


In [296]:
# Sizes of the de-duplicated buckets, one per line:
# parsed, syntax errors, semantic errors, other errors.
print(
    *map(len, (parsed_inputs, syntax_error, semantic_error, other_error)),
    sep="\n",
)

693
8
229
0


In [297]:

# Inputs on which parse_input disagrees with the solver's classification.
# Filled by the check_* functions below.
wrongly_parsed_inputs: list[str] = []           # filled by check_correct_input
wrongly_parsed_inputs_syntax: list[str] = []    # filled by check_syntax_incorrect_input
wrongly_parsed_inputs_semantic: list[str] = []  # filled by check_semantic_incorrect_input
# Filled by check_other_incorrect_input and, for unexpected exceptions,
# by check_syntax_incorrect_input.
wrongly_parsed_inputs_other: list[str] = []

def check_correct_input(input: str) -> None:
    """Record ``input`` in ``wrongly_parsed_inputs`` if ``parse_input`` raises.

    Meant for inputs the solver accepted: any exception from ``parse_input``
    counts as a disagreement.

    Args:
        input: Input string to feed to ``parse_input``.
    """
    rejected = False
    try:
        parse_input(input)
    except Exception:
        rejected = True

    if rejected:
        wrongly_parsed_inputs.append(input)



def check_semantic_incorrect_input(input: str) -> None:
    """Record ``input`` unless ``parse_input`` rejects it with a pandas error.

    Meant for inputs the solver classified as semantically invalid. The input
    is appended to ``wrongly_parsed_inputs_semantic`` when ``parse_input``
    succeeds or raises anything other than ``ParserError`` / ``EmptyDataError``.

    NOTE(review): ``check_syntax_incorrect_input`` files unexpected exceptions
    in ``wrongly_parsed_inputs_other`` instead - confirm which of the two
    behaviours is intended.

    Args:
        input: Input string to feed to ``parse_input``.
    """
    try:
        parse_input(input)
    except (ParserError, EmptyDataError):
        # Expected rejection: nothing to record.
        return
    except Exception:
        # Unexpected failure: recorded below, like a successful parse.
        pass

    wrongly_parsed_inputs_semantic.append(input)

def check_syntax_incorrect_input(input: str) -> None:
    """Record ``input`` unless ``parse_input`` rejects it with a pandas error.

    Meant for inputs the solver classified as syntactically invalid. A
    successful parse appends the input to ``wrongly_parsed_inputs_syntax``;
    an exception other than ``ParserError`` / ``EmptyDataError`` appends it to
    ``wrongly_parsed_inputs_other``.

    NOTE(review): the semantic checker keeps unexpected exceptions in its own
    list, whereas this one writes to the "other" list - confirm this is
    intended.

    Args:
        input: Input string to feed to ``parse_input``.
    """
    try:
        parse_input(input)
    except (ParserError, EmptyDataError):
        # Expected rejection: nothing to record.
        return
    except Exception:
        wrongly_parsed_inputs_other.append(input)
    else:
        wrongly_parsed_inputs_syntax.append(input)

def check_other_incorrect_input(input: str) -> None:
    """Record ``input`` in ``wrongly_parsed_inputs_other`` if it parses.

    Meant for inputs on which the solver failed with an unexpected exception:
    any exception from ``parse_input`` counts as agreement and is ignored.

    Args:
        input: Input string to feed to ``parse_input``.
    """
    try:
        parse_input(input)
    except Exception:
        return

    wrongly_parsed_inputs_other.append(input)

In [298]:

# The result lists are defined in another cell; empty them first so that
# re-running this cell does not count every input twice.
wrongly_parsed_inputs.clear()
wrongly_parsed_inputs_syntax.clear()
wrongly_parsed_inputs_semantic.clear()
wrongly_parsed_inputs_other.clear()

# Inputs the solver accepted: parse_input should accept them as well.
for correct_input in parsed_inputs:
    check_correct_input(correct_input)
print(len(wrongly_parsed_inputs))
# Inputs the solver rejected with SyntaxError.
for syntax_error_input in syntax_error:
    check_syntax_incorrect_input(syntax_error_input)
print(len(wrongly_parsed_inputs_syntax))
# Inputs the solver rejected with SemanticError.
for semantic_error_input in semantic_error:
    check_semantic_incorrect_input(semantic_error_input)
print(len(wrongly_parsed_inputs_semantic))
# Inputs on which the solver failed with any other exception.
for other_error_input in other_error:
    check_other_incorrect_input(other_error_input)

# Printed last because check_syntax_incorrect_input may also append to it.
print(len(wrongly_parsed_inputs_other))

7
2
229
0


In [299]:
# Dump the four disagreement lists, one list per line.
print(
    wrongly_parsed_inputs,
    wrongly_parsed_inputs_syntax,
    wrongly_parsed_inputs_semantic,
    wrongly_parsed_inputs_other,
    sep="\n",
)




['\r\n', '\r\n" "\r\n', '" "\r\n', '""\r\n', '""', '\x0b', '" "']
[',|"\r\n', '"",Qz"']
['\r\n""\r\n,', 'B,,"",""\r\n,\r\n', 'P\r\n,"5",3t,B\r\n\r\nCj,""', '"9""%"\r\n"",},""\r\n', '"Z",vn|\r\ny!', "'?,.\r\n>P\r\n", 'G\r\n"v""|",e2,i,"t"\r\n', 'O,],*aN,Hi,_\r\nfB', '#o\r\nF0J2)v,U,n,kfn\r\n', '3z,\\C,$\r\ns,r,IN)I,A,n', 'n,|v6,w&\x0c,7,8\r\nO\r\n', '>,%~,{R\r\nx\x0c', 'D\r\n:,_^,tgK,T=,;J3\r\n', '0E%2\r\nz0,6*52,/E,X-', "y/J,IW,8u,',y\r\n\x0b'i\r\na", 'f#R,)$,^B&\r\nA\r\n6\x0c\r\n&\r\nv\r\n', '=\r\nw,P$3,A,C;\r\n', 'X,fF\r\n}', '1V,E,f,\tK(\r\ncN,|tH:', '!#\r\nmdm,(,#\r\n', "[/Y\r\nw,k',p,-$@@,J", '-h>\r\n\\3,8', '>M,WZ,b,5\r\nq\r\n', 'J+\r\nP,lL,\x0c[\r\n', '0,BP,n8,:V\r\n?KT', '~a&J\r\nAE,{,Or,lyb\r\n', 'Q,PA*,-,u_kd,r\r\n<', 'IB\r\n>,bg#,u,>9|', 'e,DT,h,b\r\n`:o', '@.,YX\r\n6@Tey\r\n', '~G,A,li\r\nt<z,_,\x0c7,luB,Z', '1*\r\n(w9,H,%O,V+,N,h\r\n', '^S;,\x0b\t,71,>S\r\n[I1', 'K]t0U,H,\\,Y,C\r\nJ\r\n', 't\r\n),}xW,y', '\x0c6y,4l,( ,m1\r\nol,~,p`\x0c,p]OV\r\n)?,0,LW\r\n@\r\nRK_\r\nU=2', 