# Authors
Ikram Kohil, 2019115 \
Johnatan Gao, 2013298

# 1. DataFlow extraction

In this part, we must extract the intra-procedural dataflow [1] of the exmple code that was provided:

- For each variable reference, we must determine the corresponding definition(s) [2]
- For each definition, we must determine the corresponding reference(s) [3]

*For this lab, we must only consider the simple definitions of the form (Variable = Expression ^ Literal)* [4]

We must verify our implementation by insuring that the set of pairs (definition - refefence) corresponds to the set of pairs (reference - definition). We must then extract the sets for the files in the part_1 folder.

[1] Intra-procedural dataflow refers to the analysis of how variables and values flow within a single function or procedure. In other words, it looks at how information moves from one point to another within a piece of code that is defined by a set of procedures or functions. This type of analysis can be useful for identifying common programming errors such as uninitialized variables, null pointer references, and other issues related to the flow of data through a program.
''''
For each variable references, we must determine the corresponding definition (s)

Since it's been instructed to only consider simple definitions:
Variable = Expression ^ Literal

''''

In [165]:
import os
from pathlib import Path
from code_analysis import CFGReader
from code_analysis import CFG

# Global variable - directory where cfg.json and .dot files generated by our code will be stored 
part1_output_directory = "output/part_1/"
part2_output_directory = "output/part_2/"

# Utility functions taken from TP1
def get_json_files(extension, directory):
   directory = Path(directory)
   return [str(file) for file in directory.rglob(extension)]

def create_output_file(filename, directory):
    # Check if output directory exists, if not, create it
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Check if output file already exists, if so, delete and create new file
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        os.remove(file_path)

    # Open in "append" mode to avoid overwriting the whole file after each modification
    return open(directory + filename, "a")

def close_output_file(file):
   file.close()

In [166]:
class LiveVariableVisitor:
    def __init__(self, filename):
        self.cfg = None
        self.filename = filename
        self.test = []

        # Stored in array
        ## Using a set to avoid adding nodes that have already been visited
        self.visited_nodes = set()
        ## Contains all references to variables
        self.gen_live_vars = {} # Format: key = var_id, value=[0: var_name, 1: set of lines in which it was referenced]
        ## Contains all defined variables
        self.kill_live_vars = {} # Format: key = var_id, value=[0: var_name, 1: set of lines in which it was defined]

    def visit(self, cfg: CFG):
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from exit node (hence the +1, since the exit node is generated right after the entry node)
        # Starting from exit node ensures that we find references first, then definitions
        exit_node_id = self.cfg.get_root() + 1
        self.__visit(exit_node_id)

        # Visit all other procedures in file
        defined_func_entry_nodes = self.cfg.get_func_entry_nodes()
        if len(defined_func_entry_nodes) > 0:
            for entry_node in defined_func_entry_nodes:
                func_exit_node = entry_node + 1
                self.__visit(func_exit_node)

    def __visit(self, node_id: int):
        self.visited_nodes.add(node_id)
        # Find all the variable nodes first
        node_type = self.cfg.get_type(node_id)
        node_children = self.cfg.get_children(node_id)

        if(node_type == "Variable"):
            for node in node_children:
                # if self.cfg.get_type(node) in out_of_scope_refs:
                print(self.cfg.get_type(node))
            self.__get_gen_kill_live_vars(node_id)

        # Visit parents
        parents = self.cfg.get_any_parents(node_id)
        for parent_id in parents:
            # Only visit if not previously visited
            if parent_id not in self.visited_nodes:
                self.__visit(parent_id)

    def __get_gen_kill_live_vars(self, node_id):
        # Check if child is (BinOP =);
        ## if not, then reference (so gen)
        ## if yes, then definition (so kill)
        is_binOP_equal = lambda child_node_id: self.cfg.get_type(child_node_id) == "BinOP" and self.cfg.get_image(child_node_id) == "="
        node_children = self.cfg.get_children(node_id)
        node_binop_equal = [child_node_id for child_node_id in node_children if is_binOP_equal(child_node_id)]
    
        # Prepare variables
        var_name = self.cfg.get_image(node_id)
        line_nbr = self.cfg.get_position(node_id)[0] # Only the line number is of interest to us in this output
        var_id = self.cfg.get_var_id(node_id) # Unique identifier common to all references to the variable

        if len(node_binop_equal) > 0:
            # Add lines where it was defined
            if var_id not in self.kill_live_vars:
                self.kill_live_vars[var_id] = [var_name, set([line_nbr])]
            else:
                self.kill_live_vars[var_id][1].add(line_nbr)
        else:
            # Add line where it was referenced
            if var_id not in self.gen_live_vars:
                self.gen_live_vars[var_id] = [var_name, set([line_nbr])]
            else:
                self.gen_live_vars[var_id][1].add(line_nbr)
        
    def get_undefined_refs(self):
        undefined_refs = {}

        for var_id in self.gen_live_vars:
            if var_id not in self.kill_live_vars.keys():
                var_name = self.gen_live_vars[var_id][0]
                line_nbr = self.gen_live_vars[var_id][1]
                undefined_refs[var_id] = [var_name, line_nbr]
        
        return undefined_refs

    def print_refs_defs_pairs(self, output_file):
        for key in self.gen_live_vars: 
            if key in self.kill_live_vars:
                output_file.write(f"Variable {self.gen_live_vars[key][0]} was referenced on lines: {self.gen_live_vars[key][1]} and defined on lines: {self.kill_live_vars[key][1]}\n")
        output_file.write("\n")

    def print_undefined_refs(self, output_file):
        undefined_refs = self.get_undefined_refs()

        for key in undefined_refs: 
            output_file.write(f"Variable {undefined_refs[key][0]} was referenced on lines: {undefined_refs[key][1]} but was never defined.\n") 
        output_file.write("\n")

In [167]:
from typing import Any, Dict, Set


class DefinitionVisitor:
    def __init__(self, filename):
        self.cfg = None
        self.filename = filename
        self.test = []

        # Stored in array
        ## Using a set to avoid adding nodes that have already been visited
        self.visited_nodes = set()
        ## Contains all references to variables
        self.refs: Dict[(str, str), Set[Any]] = dict()
        ## Contains all defined variables (GEN)
        self.gen: Dict[(str, str), Set[Any]] = dict()
        ## Contains  all the redefined variables (KILL)
        self.kills: Dict[(str, str), Set[Any]] = dict()

    def visit(self, cfg: CFG):
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")
        # Visit main program
        self.__visit(self.cfg.get_root())

        # Visit all other procedures in file
        defined_func_entry_nodes = self.cfg.get_func_entry_nodes()
        if len(defined_func_entry_nodes) > 0:
            for entry_node in defined_func_entry_nodes:
                self.__visit(entry_node)

    def __get_unreferenced_defs(self):
        unreferenced_defs = dict()
        for (label, var_id) in self.gen:
            if (label, var_id) not in self.refs:
                unreferenced_defs[((label, var_id))] = self.gen[(label, var_id)]
        
        return unreferenced_defs

    def __visit(self, node_id: int):
        self.visited_nodes.add(node_id)
        self.__get_gen_kills_definition(node_id)
        self.__get_references(node_id)
        # Visit children
        children = self.cfg.get_any_children(node_id)
        for child_id in children:
            # Only visit if not previously visited
            if child_id not in self.visited_nodes:
                self.__visit(child_id)

    def __get_gen_kills_definition(self, node_id: int):
        if(self.__is_binOp_equal(node_id)):
            variable_node_id = self.cfg.get_op_hands(node_id)[0]
            if(self.cfg.get_type(variable_node_id) == "Variable"):
                label = self.cfg.get_image(variable_node_id)
                var_id = self.cfg.get_var_id(variable_node_id)

                if (label, var_id) not in self.refs:
                    self.gen[(label, var_id)] = set([self.cfg.get_position(variable_node_id)[0]])
                else:
                    if (label, var_id) not in self.kills:
                        self.kills[(label, var_id)] = set([self.cfg.get_position(variable_node_id)[0]])
                    else:
                        self.kills[(label, var_id)].add(self.cfg.get_position(variable_node_id)[0])


    def __get_references(self, node_id: int):
        if self.cfg.get_type(node_id) == "Variable" and not self.__is_binOp_equal(self.cfg.get_children(node_id)[0]) and not self.cfg.get_image(node_id) == "this":
            label = self.cfg.get_image(node_id)
            var_id = self.cfg.get_var_id(node_id)

            if (label, var_id) not in self.refs:
                self.refs[(label, var_id)] = set([self.cfg.get_position(node_id)[0]])
            else:
                self.refs[(label, var_id)].add(self.cfg.get_position(node_id)[0])


    def __is_binOp_equal(self, node_id: int):
        return self.cfg.get_type(node_id) == "BinOP" and self.cfg.get_image(node_id) == "="

    def print_defs_refs_pairs(self, output_file):
        print(self.gen)
        print(self.kills)
        print(self.refs)
        for (label, var_id) in self.refs:
            refs_position_arr = set() if self.refs.get((label, var_id)) is None else self.refs.get((label, var_id))
            defs_position_arr = set() if self.gen.get((label, var_id)) is None else self.gen.get((label, var_id))
            if self.kills.get((label, var_id)) is not None:
                defs_position_arr.update(self.kills.get((label, var_id)))
            output_file.write("Variable {} with var id {} was referenced on lines: {} and defined on lines: {} \n".format(
                label,
                var_id,
                refs_position_arr,
                defs_position_arr
            ))
        output_file.write("\n")

    def print_unreferenced_definitions(self, output_file):
        unref_defs = self.__get_unreferenced_defs()
        for (label, var_id) in unref_defs:
            output_file.write("Variable {} with var id {} was defined at line: {}, but was never referenced".format(
                label,
                var_id,
                unref_defs[(label, var_id)]
            ))
            output_file.write("\n")

In [168]:
# Prepare output file and reader
cfg_reader = CFGReader()

def get_refs_defs_pairs(directory):
    part1_output_directory = "output/part_1/"
    part_1_output_file = create_output_file("part_1_output_file.txt", part1_output_directory)

    # Retrieve filenames of all cfg in the specified directory
    cfgFilenames = get_json_files('*.cfg.json', directory)

    # Iterate over the filenames array once to visit all cfgs
    for filename in cfgFilenames:
        print(filename)
        # Load cfg in memory
        cfg = cfg_reader.read_cfg(filename)

        # Visit currently loaded cfg
        live_var_visitor = LiveVariableVisitor(filename)
        live_var_visitor.visit(cfg)

        def_visitor = DefinitionVisitor(filename)
        def_visitor.visit(cfg)

        # Print output
        part_1_output_file.write(f"------------------------ File: {filename} ------------------------\n")

        part_1_output_file.write(f"------ Visitor: Live Variable \n")
        live_var_visitor.print_refs_defs_pairs(part_1_output_file)

        part_1_output_file.write(f"------ Visitor: Definition \n")
        def_visitor.print_defs_refs_pairs(part_1_output_file)

    
    close_output_file(part_1_output_file)

directory_to_analyze = "../tp4/part_1/"
get_refs_defs_pairs(directory_to_analyze)

../tp4/part_1/wordcount.php.cfg.json
Visiting CFG from file ../tp4/part_1/wordcount.php.cfg.json
Visiting CFG from file ../tp4/part_1/wordcount.php.cfg.json
{('inword', 466): {7}, ('nl', 467): {8}, ('nw', 468): {9}, ('nc', 469): {10}, ('ni', 470): {11}, ('fp', 471): {13}, ('c', 472): {14}}
{('nc', 469): {16}, ('nl', 467): {18}, ('ni', 470): {22}, ('inword', 466): {24, 26, 28, 31}, ('nw', 468): {32}, ('c', 472): {34}}
{('fp', 471): {34, 14}, ('c', 472): {15, 17, 19, 20, 21, 29}, ('nc', 469): {16, 39}, ('nl', 467): {18, 37}, ('ni', 470): {40, 22}, ('inword', 466): {30}, ('nw', 468): {32, 38}}
../tp4/part_1/test.php.cfg.json
Visiting CFG from file ../tp4/part_1/test.php.cfg.json
Visiting CFG from file ../tp4/part_1/test.php.cfg.json
{('b', 473): {2}, ('a', 474): {6}}
{}
{('b', 473): {5}, ('a', 474): {8}}


In [169]:
# Prepare output file and reader
cfg_reader = CFGReader()

def get_undefined_and_unreferenced_vars(directory):
    part2_output_directory = "output/part_2/"
    part_2_output_file = create_output_file("part_2_output_file.txt", part2_output_directory)

    # Retrieve filenames of all cfg in the specified directory
    cfgFilenames = get_json_files('*.cfg.json', directory)

    # Iterate over the filenames array once to visit all cfgs
    for filename in cfgFilenames:
        print(filename)
        # Load cfg in memory
        cfg = cfg_reader.read_cfg(filename)

        # Visit currently loaded cfg
        live_var_visitor = LiveVariableVisitor(filename)
        live_var_visitor.visit(cfg)

        def_visitor = DefinitionVisitor(filename)
        def_visitor.visit(cfg)

        # Print output
        part_2_output_file.write(f"------------------------ File: {filename} ------------------------\n")

        part_2_output_file.write(f"------ Visitor: Live Variable \n")
        live_var_visitor.print_undefined_refs(part_2_output_file)

        part_2_output_file.write(f"------ Visitor: Definition \n")
        def_visitor.print_unreferenced_definitions(part_2_output_file)

    
    close_output_file(part_2_output_file)

directory_to_analyze = "../tp4/part_2/"
get_undefined_and_unreferenced_vars(directory_to_analyze)

../tp4/part_2/file3.php.cfg.json
Visiting CFG from file ../tp4/part_2/file3.php.cfg.json
Visiting CFG from file ../tp4/part_2/file3.php.cfg.json
../tp4/part_2/file2.php.cfg.json
Visiting CFG from file ../tp4/part_2/file2.php.cfg.json
Visiting CFG from file ../tp4/part_2/file2.php.cfg.json
../tp4/part_2/file1.php.cfg.json
Visiting CFG from file ../tp4/part_2/file1.php.cfg.json
Visiting CFG from file ../tp4/part_2/file1.php.cfg.json
