In [1]:
import os
import json
from pathlib import Path
from code_analysis import CFGReader
from code_analysis import CFG

# Global variables - output directories (relative paths) where the cfg.json and .dot files
# generated by our code will be stored, one per part of the assignment
part1_output_directory = "output/part_1/"
part2_output_directory = "output/part_2/"

# Utility functions taken from TP1
def get_json_files(extension, directory):
    """Recursively list the files under ``directory`` that match ``extension``.

    Args:
        extension: Glob pattern handed to ``Path.rglob`` (for example ``"*.json"``).
        directory: Root folder to search, as a string or a ``Path``.

    Returns:
        A list with the path of every match, each converted to ``str``.
    """
    root = Path(directory)
    matches = []
    for found in root.rglob(extension):
        matches.append(str(found))
    return matches

def create_output_file(filename, directory):
    """Create a fresh, empty output file and return it opened for appending.

    The target directory is created when missing, and any previous file with
    the same name is deleted first, so the returned handle always starts empty.

    Args:
        filename: Name of the file to create inside ``directory``.
        directory: Folder that will hold the file, with or without a trailing
            path separator. An empty string means the current directory.

    Returns:
        The file object, opened in append (``"a"``) mode. The caller is
        responsible for closing it.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            removed or opened.
    """
    # exist_ok avoids the check-then-create race; skip the empty string because
    # os.makedirs("") raises.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Build the path once and use it for both the cleanup and the open. The
    # previous `directory + filename` pointed somewhere else whenever
    # `directory` had no trailing separator.
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        os.remove(file_path)

    # Open in "append" mode to avoid overwriting the whole file after each modification
    return open(file_path, "a")

def close_output_file(file):
    """Close the given file object by calling its ``close()`` method.

    Args:
        file: An open file-like object.
    """
    file.close()

In [None]:
from typing import Dict, List, Set


class TaintAnalysisAlgorithm:
    """Possibly-tainted-definitions analysis over a control-flow graph.

    The analysis reads a JSON taint description (sources, filters, safes, ...)
    and, for every assignment found in ``cfg``, fills the GEN / IN / OUT
    dictionaries keyed by the node id of the assigned variable.

    ``cfg`` must expose ``get_node_ids()``, ``get_type(id)``, ``get_image(id)``
    and ``get_op_hands(id)``; these are the only calls made on it here.
    """

    # Categories that must be present in the taint description JSON file.
    _PARAM_KEYS = ('defs', 'refs', 'pairs', 'sinks', 'filters', 'safes', 'sources')

    def __init__(self, filename):
        """Create an analysis with empty data-flow dictionaries.

        Args:
            filename: Stored as ``self.filename``; not read by this class.
        """
        # NOTE(review): nothing in this class loads the CFG from `filename`.
        # `cfg` has to be assigned before get_taint_analysis() is called.
        self.cfg = None
        self.filename = filename

        # Dictionary containing all necessary parameters (safe, filter, etc)
        # Format: key = param type (safes/filters/etc), value = array read from the JSON file
        # (presumably node ids - confirm against the taint file format)
        self.tainted_params: Dict[str, List[int]] = dict()

        # GEN and KILL dictionaries
        # Format (for all following dictionaries): key = node_id, value = set of node_ids
        ## Dictionary containing all tainted_gen nodes for a specific node
        self.tainted_gen: Dict[int, Set[int]] = dict()
        ## Dictionary containing all tainted_kill nodes for a specific node
        self.tainted_kill: Dict[int, Set[int]] = dict()

        # IN and OUT dictionaries
        ## Dictionary containing all tainted_in nodes for a specific node
        self.tainted_in: Dict[int, Set[int]] = dict()
        ## Dictionary containing all tainted_out nodes for a specific node
        self.tainted_out: Dict[int, Set[int]] = dict()

    def __init_tainted_params(self, taint_json_filename):
        """Load the taint description file into ``self.tainted_params``.

        Args:
            taint_json_filename: Path of the JSON file to read.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If one of the expected categories is missing.
        """
        # json.load reads the file; json.loads on the name would try to parse
        # the path itself as a JSON document.
        with open(taint_json_filename, "r", encoding="utf-8") as taint_file:
            params = json.load(taint_file)

        self.tainted_params = {key: params[key] for key in self._PARAM_KEYS}

    def _is_assignment(self, node_id):
        """Return True when ``node_id`` is a ``BinOP`` node whose image is ``=``."""
        return self.cfg.get_type(node_id) == "BinOP" and self.cfg.get_image(node_id) == "="

    def get_nodes_tainted_gen(self, var_node_id, expr_node_id):
        """Compute the GEN set of one definition.

        An (initially empty) GEN set is registered for ``var_node_id``. The
        variable's own node id is added to it when ``expr_node_id`` is listed
        as a source and is neither a filter nor a safe.

        Args:
            var_node_id: Node id of the variable being assigned (left-hand side).
            expr_node_id: Node id of the assigned expression (right-hand side).

        Raises:
            KeyError: If the taint parameters have not been loaded yet.
        """
        # TODO: only the top node of the right-hand side is examined. For a
        # definition such as x = y + z + w + 1, each operand (y, z, w) should be
        # checked by walking the BinOP nodes; the definition is tainted if AT
        # LEAST one of them is.
        generated = self.tainted_gen.setdefault(var_node_id, set())

        ## If part of filter or safe, then not tainted: nothing is generated
        if (expr_node_id in self.tainted_params['filters']
                or expr_node_id in self.tainted_params['safes']):
            return

        if expr_node_id in self.tainted_params['sources']:
            generated.add(var_node_id)

    def get_nodes_tainted_kill(self, node_id):
        """Placeholder for the KILL set computation; currently does nothing.

        Args:
            node_id: Node id of the definition whose KILL set is wanted.
        """
        # TODO: not implemented yet - `tainted_kill` is left untouched.
        pass

    def get_taint_analysis(self, taint_json_filename):
        """Initialise the GEN / IN / OUT sets for every assignment in the CFG.

        Args:
            taint_json_filename: Path of the JSON taint description file.

        Raises:
            RuntimeError: If ``self.cfg`` has not been assigned.
            OSError, json.JSONDecodeError, KeyError: See the parameter loading
                errors raised when the taint file is missing or malformed.
        """
        # Fail with an explicit message rather than an AttributeError on None.
        if self.cfg is None:
            raise RuntimeError(
                "No CFG loaded: assign `cfg` before calling get_taint_analysis()"
            )

        # Start by initializing the relevant parameters for the analysis in order
        # to populate the gen and kill dictionaries for each node
        self.__init_tainted_params(taint_json_filename)

        # The algorithm is implemented iteratively (not recursively) to follow the
        # given algorithm as closely as possible, so the node set here is the
        # list of all nodes in the cfg
        node_set = self.cfg.get_node_ids()

        for node_id in node_set:
            # Only check the taint for definitions (since we are implementing the
            # possibly tainted definitions algorithm)
            if not self._is_assignment(node_id):
                continue

            # Left child is the variable, right child is the value/expression
            variable_node_id, expression_node_id = self.cfg.get_op_hands(node_id)

            # Initialize and populate the gen and kill sets for the current node.
            # The kill set uses the same key as the other dictionaries.
            self.get_nodes_tainted_gen(variable_node_id, expression_node_id)
            self.get_nodes_tainted_kill(variable_node_id)

            # Initialize the in and out dictionaries for the current node
            self.tainted_in[variable_node_id] = set()
            self.tainted_out[variable_node_id] = set()

