# Authors
Ikram Kohil, 2019115 \
Johnatan Gao, 2013298

# 1. DataFlow extraction

In this part, we must extract the intra-procedural dataflow [1] of the example code that was provided:

- For each variable reference, we must determine the corresponding definition(s) [2]
- For each definition, we must determine the corresponding reference(s) [3]

*For this lab, we must only consider the simple definitions of the form (Variable = Expression ^ Literal)* [4]

We must verify our implementation by ensuring that the set of pairs (definition - reference) corresponds to the set of pairs (reference - definition). We must then extract the sets for the files in the part_1 folder.

[1] Intra-procedural dataflow refers to the analysis of how variables and values flow within a single function or procedure. In other words, it looks at how information moves from one point to another within a piece of code that is defined by a set of procedures or functions. This type of analysis can be useful for identifying common programming errors such as uninitialized variables, null pointer references, and other issues related to the flow of data through a program.
''''
For each variable reference, we must determine the corresponding definition(s)

Since it's been instructed to only consider simple definitions:
Variable = Expression ^ Literal

''''

In [None]:
import os
from pathlib import Path
from code_analysis import CFGReader
from code_analysis import CFG

# Output folders for the files generated by this notebook (paths are relative to the working directory).
# NOTE(review): get_refs_defs_pairs declares its own local part1_output_directory, and
# part2_output_directory is not read anywhere in the code visible here - confirm both are still needed.
part1_output_directory = "output/part_1/"
part2_output_directory = "output/part_2/"

# Utility functions taken from TP1
def get_json_files(extension, directory):
    """Return the paths of every file under ``directory`` that matches ``extension``.

    ``extension`` is a glob pattern (for example ``'*.cfg.json'``) and the
    search is recursive. Paths are returned as strings in sorted order, so
    that the files are processed in the same order on every run.
    """
    # rglob gives no ordering guarantee; sorting keeps the generated report reproducible
    return sorted(str(file) for file in Path(directory).rglob(extension))

def create_output_file(filename, directory):
    """Create a fresh, empty output file and return it opened in append mode.

    The directory is created when it does not exist, and a file left over
    from a previous run is deleted first so that every run starts from an
    empty file. The caller is responsible for closing the returned handle.
    """
    # Create the output directory if needed (no error when it already exists)
    os.makedirs(directory, exist_ok=True)

    # Build the path once and use it for both the cleanup and the open call, so the
    # file always lands inside `directory`, even when it has no trailing separator
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        os.remove(file_path)

    # Open in "append" mode to avoid overwriting the whole file after each modification
    return open(file_path, "a")

def close_output_file(file):
    """Close ``file``, an open file object such as the one returned by create_output_file."""
    file.close()

In [None]:
class LiveVariableVisitor:
    """Walk a CFG backwards and record where each variable is referenced and defined.

    A ``Variable`` node that has a ``BinOP`` child whose image is ``=`` is
    counted as a definition (kill); every other ``Variable`` node is counted
    as a reference (gen). Results are grouped per variable id in
    ``refs_to_defs``.
    """

    def __init__(self, filename):
        """Create an empty visitor; ``filename`` only labels the progress message."""
        self.cfg = None
        self.filename = filename
        self.test = []  # Not read by this class; kept so existing attribute access keeps working

        # Nodes already traversed, so that a node is not processed again when reached twice
        self.visited_nodes = set()
        # Node ids of all variable references, in traversal order
        self.gen_live_vars = []
        # Node ids of all variable definitions, in traversal order
        self.kill_live_vars = []
        # key: variable id -> [variable name, set of lines where referenced, set of lines where defined]
        self.refs_to_defs = {}

    def visit(self, cfg: CFG):
        """Traverse the main program and every procedure of ``cfg``, filling the result attributes."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from its exit node. The "+ 1" assumes the exit node is
        # generated right after the entry node. Walking from the exit node towards
        # the parents means references are met before the definitions that precede them.
        self.__visit(self.cfg.get_root() + 1)

        # Visit all other procedures in the file, each from its own exit node
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Depth-first walk over the parents of ``node_id``, recording variable nodes.

        An explicit stack is used instead of recursion so that a long CFG path
        cannot exceed the interpreter's recursion limit. Parents are pushed in
        reverse so that nodes are processed in the same order as a recursive walk.
        """
        pending = [node_id]
        is_start = True
        while pending:
            current = pending.pop()
            # The starting node is always processed; any other node only once
            if not is_start and current in self.visited_nodes:
                continue
            is_start = False
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "Variable":
                self.__gen_kill_live_vars(current)

            pending.extend(reversed(list(self.cfg.get_any_parents(current))))

    def __gen_kill_live_vars(self, node_id):
        """Classify the variable node ``node_id`` as a definition or a reference and record its line."""
        # A child that is (BinOP =) marks a definition (kill); otherwise it is a reference (gen)
        is_definition = any(
            self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="
            for child_id in self.cfg.get_children(node_id)
        )

        var_name = self.cfg.get_image(node_id)
        line_nbr = self.cfg.get_position(node_id)[0]  # Only the line number is of interest in the output
        var_id = self.cfg.get_var_id(node_id)  # Identifier shared by all nodes of the same variable

        entry = self.refs_to_defs.setdefault(var_id, [var_name, set(), set()])
        if is_definition:
            self.kill_live_vars.append(node_id)
            entry[2].add(line_nbr)
        else:
            self.gen_live_vars.append(node_id)
            entry[1].add(line_nbr)

    def print_refs_defs_pairs(self, output_file):
        """Write one line per recorded variable to ``output_file``, followed by a blank line."""
        for var_name, ref_lines, def_lines in self.refs_to_defs.values():
            output_file.write(f"Variable {var_name} was referenced on lines: {ref_lines} and defined on lines: {def_lines}\n")
        output_file.write("\n")

In [None]:
# Shared CFG reader; get_refs_defs_pairs calls its read_cfg method for each file it processes
cfg_reader = CFGReader()

def get_refs_defs_pairs(directory):
    """Write the reference/definition lines of every CFG found under ``directory``.

    Every ``*.cfg.json`` file found recursively under ``directory`` is loaded
    with the module-level ``cfg_reader`` and visited with a new
    ``LiveVariableVisitor``. One section per file is written to
    ``output/part_1/part_1_output_file.txt``, which is recreated on each call.
    """
    part1_output_directory = "output/part_1/"
    part_1_output_file = create_output_file("part_1_output_file.txt", part1_output_directory)

    # try/finally guarantees the report is closed even if a CFG fails to load or to be visited
    try:
        # Iterate once over the filenames of all cfgs in the specified directory
        for filename in get_json_files('*.cfg.json', directory):
            print(filename)
            # Load cfg in memory
            cfg = cfg_reader.read_cfg(filename)

            # Visit currently loaded cfg
            visitor = LiveVariableVisitor(filename)
            visitor.visit(cfg)

            # Print output
            part_1_output_file.write(f"------------------------ File: {filename} ------------------------\n")
            visitor.print_refs_defs_pairs(part_1_output_file)
    finally:
        close_output_file(part_1_output_file)

# Run the extraction on every *.cfg.json file found under the part_1 folder
directory_to_analyze = "../tp4/part_1/"
get_refs_defs_pairs(directory_to_analyze)

In [None]:
# This is Ikram's code (reference, definition)



from code_analysis import CFG, CFGReader


class LiveVariableVisitor:
    """Walk a CFG backwards and record where each variable is referenced and defined.

    A ``Variable`` node that has a ``BinOP`` child whose image is ``=`` is
    counted as a definition (kill); every other ``Variable`` node is counted
    as a reference (gen). Results are grouped per variable id in
    ``refs_to_defs``.
    """

    def __init__(self, filename):
        """Create an empty visitor; ``filename`` only labels the progress message."""
        self.cfg = None
        self.filename = filename
        self.test = []  # Not read by this class; kept so existing attribute access keeps working

        # Nodes already traversed, so that a node is not processed again when reached twice
        self.visited_nodes = set()
        # Node ids of all variable references, in traversal order
        self.gen_live_vars = []
        # Node ids of all variable definitions, in traversal order
        self.kill_live_vars = []
        # key: variable id -> [variable name, list of lines where referenced, list of lines where defined]
        self.refs_to_defs = {}

    def visit(self, cfg: CFG):
        """Traverse the main program and every procedure of ``cfg``, filling the result attributes."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from its exit node. The "+ 1" assumes the exit node is
        # generated right after the entry node. Walking from the exit node towards
        # the parents means references are met before the definitions that precede them.
        self.__visit(self.cfg.get_root() + 1)

        # Visit all other procedures in the file, each from its own exit node
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Depth-first walk over the parents of ``node_id``, recording variable nodes.

        An explicit stack is used instead of recursion so that a long CFG path
        cannot exceed the interpreter's recursion limit. Parents are pushed in
        reverse so that nodes are processed in the same order as a recursive walk.
        """
        pending = [node_id]
        is_start = True
        while pending:
            current = pending.pop()
            # The starting node is always processed; any other node only once
            if not is_start and current in self.visited_nodes:
                continue
            is_start = False
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "Variable":
                self.__record_variable(current)

            pending.extend(reversed(list(self.cfg.get_any_parents(current))))

    def __record_variable(self, node_id):
        """Classify the variable node ``node_id`` as a definition or a reference and record its line."""
        # A child that is (BinOP =) marks a definition (kill); otherwise it is a reference (gen)
        is_definition = any(
            self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="
            for child_id in self.cfg.get_children(node_id)
        )

        var_name = self.cfg.get_image(node_id)
        line_nbr = self.cfg.get_position(node_id)[0]  # Only the line number is of interest in the output
        var_id = self.cfg.get_var_id(node_id)  # Identifier shared by all nodes of the same variable

        entry = self.refs_to_defs.setdefault(var_id, [var_name, [], []])
        if is_definition:
            self.kill_live_vars.append(node_id)
            entry[2].append(line_nbr)
        else:
            self.gen_live_vars.append(node_id)
            entry[1].append(line_nbr)

    def match_def_to_ref(self):
        """Print, once per referenced variable, the lines where it is referenced and defined."""
        print("MAP")
        # Several reference nodes can share a variable id; keep the first occurrence of
        # each id so that a variable is reported once instead of once per reference
        referenced_var_ids = dict.fromkeys(self.cfg.get_var_id(node_id) for node_id in self.gen_live_vars)
        for var_id in referenced_var_ids:
            var_name, ref_lines, def_lines = self.refs_to_defs[var_id]
            print("Variable", var_name, "with var id ", var_id, "was referenced on lines:", ref_lines, "and defined on lines:", def_lines)


# Load a single CFG and run the visitor on it
cfg_reader = CFGReader()
# Relative path (matching the `../tp4/...` paths used elsewhere in this notebook)
# so that the cell does not depend on one machine's home directory
filename = '../tp4/part_1/wordcount.php.cfg.json'
cfg = cfg_reader.read_cfg(filename)

# Visit currently loaded cfg
visitor = LiveVariableVisitor(filename)
visitor.visit(cfg)
visitor.match_def_to_ref()

In [None]:
# This is Ikram's code (reference, definition)



from code_analysis import CFG, CFGReader


class LiveVariableVisitor:
    """Backward CFG walk that sorts variable nodes into references and definitions.

    A ``Variable`` node with a ``BinOP`` child whose image is ``=`` is stored
    as a definition; any other ``Variable`` node is stored as a reference.
    """

    def __init__(self, filename):
        """Start with empty results; ``filename`` only labels the progress message."""
        self.filename = filename
        self.cfg = None
        self.test = []

        # Guards against processing a node again when it is reached a second time
        self.visited_nodes = set()
        # Node ids of variable references / definitions, in the order they are met
        self.gen_live_vars = []
        self.kill_live_vars = []
        # variable id -> [variable name, lines where referenced, lines where defined]
        self.refs_to_defs = {}

    def visit(self, cfg: CFG):
        """Walk the main program, then each procedure, starting from their exit nodes."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Exit node id is taken to be entry node id + 1; going from the exit towards
        # the parents meets references before the definitions that precede them
        self.__visit(self.cfg.get_root() + 1)

        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Record ``node_id`` if it is a variable, then recurse into its unvisited parents."""
        self.visited_nodes.add(node_id)

        if self.cfg.get_type(node_id) == "Variable":
            self.__record_variable(node_id)

        for parent_id in self.cfg.get_any_parents(node_id):
            if parent_id in self.visited_nodes:
                continue
            self.__visit(parent_id)

    def __record_variable(self, node_id):
        """Store the line of variable node ``node_id`` as a definition or as a reference."""

        def is_assignment(child_id):
            return self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="

        assignment_children = [child for child in self.cfg.get_children(node_id) if is_assignment(child)]

        name = self.cfg.get_image(node_id)
        line = self.cfg.get_position(node_id)[0]  # first element of the position is the line number
        var_id = self.cfg.get_var_id(node_id)  # shared by all nodes of the same variable

        if assignment_children:
            # Definition (kill): the line goes in slot 2
            self.kill_live_vars.append(node_id)
            slot = 2
        else:
            # Reference (gen): the line goes in slot 1
            self.gen_live_vars.append(node_id)
            slot = 1

        record = self.refs_to_defs.setdefault(var_id, [name, [], []])
        record[slot].append(line)

    def match_def_to_ref(self):
        """Print the ``MAP`` header; the per-variable output is not implemented in this cell."""
        print("MAP")
        for ref_node_id in self.gen_live_vars:
            # The looked-up id is not used yet: this method currently prints the header only
            self.cfg.get_var_id(ref_node_id)
            


# Load a single CFG and run the visitor on it
cfg_reader = CFGReader()
# Relative path (matching the `../tp4/...` paths used elsewhere in this notebook)
# so that the cell does not depend on one machine's home directory
filename = '../tp4/part_2/file1.php.cfg.json'
cfg = cfg_reader.read_cfg(filename)

# Visit currently loaded cfg
visitor = LiveVariableVisitor(filename)
visitor.visit(cfg)
visitor.match_def_to_ref()

In [None]:
# This is Ikram's code (reference, definition)



from code_analysis import CFG, CFGReader


class LiveVariableVisitor:
    """Walk a CFG backwards and record where each variable is referenced and defined.

    A ``Variable`` node that has a ``BinOP`` child whose image is ``=`` is
    counted as a definition (kill); every other ``Variable`` node is counted
    as a reference (gen). Results are grouped per variable id in
    ``refs_to_defs``.
    """

    def __init__(self, filename):
        """Create an empty visitor; ``filename`` only labels the progress message."""
        self.cfg = None
        self.filename = filename
        self.test = []  # Not read by this class; kept so existing attribute access keeps working

        # Nodes already traversed, so that a node is not processed again when reached twice
        self.visited_nodes = set()
        # Node ids of all variable references, in traversal order
        self.gen_live_vars = []
        # Node ids of all variable definitions, in traversal order
        self.kill_live_vars = []
        # key: variable id -> [variable name, list of lines where referenced, list of lines where defined]
        self.refs_to_defs = {}

    def visit(self, cfg: CFG):
        """Traverse the main program and every procedure of ``cfg``, filling the result attributes."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from its exit node. The "+ 1" assumes the exit node is
        # generated right after the entry node. Walking from the exit node towards
        # the parents means references are met before the definitions that precede them.
        self.__visit(self.cfg.get_root() + 1)

        # Visit all other procedures in the file, each from its own exit node
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Depth-first walk over the parents of ``node_id``, recording variable nodes.

        An explicit stack is used instead of recursion so that a long CFG path
        cannot exceed the interpreter's recursion limit. Parents are pushed in
        reverse so that nodes are processed in the same order as a recursive walk.
        """
        pending = [node_id]
        is_start = True
        while pending:
            current = pending.pop()
            # The starting node is always processed; any other node only once
            if not is_start and current in self.visited_nodes:
                continue
            is_start = False
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "Variable":
                self.__record_variable(current)

            pending.extend(reversed(list(self.cfg.get_any_parents(current))))

    def __record_variable(self, node_id):
        """Classify the variable node ``node_id`` as a definition or a reference and record its line."""
        # A child that is (BinOP =) marks a definition (kill); otherwise it is a reference (gen)
        is_definition = any(
            self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="
            for child_id in self.cfg.get_children(node_id)
        )

        var_name = self.cfg.get_image(node_id)
        line_nbr = self.cfg.get_position(node_id)[0]  # Only the line number is of interest in the output
        var_id = self.cfg.get_var_id(node_id)  # Identifier shared by all nodes of the same variable

        entry = self.refs_to_defs.setdefault(var_id, [var_name, [], []])
        if is_definition:
            self.kill_live_vars.append(node_id)
            entry[2].append(line_nbr)
        else:
            self.gen_live_vars.append(node_id)
            entry[1].append(line_nbr)

    def match_def_to_ref(self):
        """Print, once per referenced variable, the lines where it is referenced and defined."""
        print("MAP")
        # Several reference nodes can share a variable id; keep the first occurrence of
        # each id so that a variable is reported once instead of once per reference
        referenced_var_ids = dict.fromkeys(self.cfg.get_var_id(node_id) for node_id in self.gen_live_vars)
        for var_id in referenced_var_ids:
            var_name, ref_lines, def_lines = self.refs_to_defs[var_id]
            # Leading space before "was" keeps the variable name separate from the sentence
            print("Variable " + var_name + " was referenced on lines: ")
            print(ref_lines)
            print(" and defined on lines: ")
            print(def_lines)


# Load a single CFG and run the visitor on it
cfg_reader = CFGReader()
# Relative path (matching the `../tp4/...` paths used elsewhere in this notebook)
# so that the cell does not depend on one machine's home directory
filename = '../tp4/part_2/file1.php.cfg.json'
cfg = cfg_reader.read_cfg(filename)

# Visit currently loaded cfg
visitor = LiveVariableVisitor(filename)
visitor.visit(cfg)
visitor.match_def_to_ref()

In [None]:
# This is Ikram's code (reference, definition)



from code_analysis import CFG, CFGReader


class LiveVariableVisitor:
    """Walk a CFG backwards and record where each variable is referenced and defined.

    A ``Variable`` node that has a ``BinOP`` child whose image is ``=`` is
    counted as a definition (kill); every other ``Variable`` node is counted
    as a reference (gen). Results are grouped per variable id in
    ``refs_to_defs``.
    """

    def __init__(self, filename):
        """Create an empty visitor; ``filename`` only labels the progress message."""
        self.cfg = None
        self.filename = filename
        self.test = []  # Not read by this class; kept so existing attribute access keeps working

        # Nodes already traversed, so that a node is not processed again when reached twice
        self.visited_nodes = set()
        # Node ids of all variable references, in traversal order
        self.gen_live_vars = []
        # Node ids of all variable definitions, in traversal order
        self.kill_live_vars = []
        # key: variable id -> [variable name, list of lines where referenced, list of lines where defined]
        self.refs_to_defs = {}

    def visit(self, cfg: CFG):
        """Traverse the main program and every procedure of ``cfg``, filling the result attributes."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from its exit node. The "+ 1" assumes the exit node is
        # generated right after the entry node. Walking from the exit node towards
        # the parents means references are met before the definitions that precede them.
        self.__visit(self.cfg.get_root() + 1)

        # Visit all other procedures in the file, each from its own exit node
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Depth-first walk over the parents of ``node_id``, recording variable nodes.

        An explicit stack is used instead of recursion so that a long CFG path
        cannot exceed the interpreter's recursion limit. Parents are pushed in
        reverse so that nodes are processed in the same order as a recursive walk.
        """
        pending = [node_id]
        is_start = True
        while pending:
            current = pending.pop()
            # The starting node is always processed; any other node only once
            if not is_start and current in self.visited_nodes:
                continue
            is_start = False
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "Variable":
                self.__record_variable(current)

            pending.extend(reversed(list(self.cfg.get_any_parents(current))))

    def __record_variable(self, node_id):
        """Classify the variable node ``node_id`` as a definition or a reference and record its line."""
        # A child that is (BinOP =) marks a definition (kill); otherwise it is a reference (gen)
        is_definition = any(
            self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="
            for child_id in self.cfg.get_children(node_id)
        )

        var_name = self.cfg.get_image(node_id)
        line_nbr = self.cfg.get_position(node_id)[0]  # Only the line number is of interest in the output
        var_id = self.cfg.get_var_id(node_id)  # Identifier shared by all nodes of the same variable

        entry = self.refs_to_defs.setdefault(var_id, [var_name, [], []])
        if is_definition:
            self.kill_live_vars.append(node_id)
            entry[2].append(line_nbr)
        else:
            self.gen_live_vars.append(node_id)
            entry[1].append(line_nbr)

    def match_def_to_ref(self):
        """Print, once per referenced variable, the lines where it is referenced and defined."""
        print("MAP")
        # Several reference nodes can share a variable id; keep the first occurrence of
        # each id so that a variable is reported once instead of once per reference
        referenced_var_ids = dict.fromkeys(self.cfg.get_var_id(node_id) for node_id in self.gen_live_vars)
        for var_id in referenced_var_ids:
            var_name, ref_lines, def_lines = self.refs_to_defs[var_id]
            # Leading space before "was" keeps the variable name separate from the sentence
            print("Variable " + var_name + " was referenced on lines: ")
            print(ref_lines)
            print(" and defined on lines: ")
            print(def_lines)


# Load a single CFG and run the visitor on it
cfg_reader = CFGReader()
# Relative path (matching the `../tp4/...` paths used elsewhere in this notebook)
# so that the cell does not depend on one machine's home directory
filename = '../tp4/part_2/file1.php.cfg.json'
cfg = cfg_reader.read_cfg(filename)

# Visit currently loaded cfg
visitor = LiveVariableVisitor(filename)
visitor.visit(cfg)
visitor.match_def_to_ref()

In [None]:
# This is Gao's code (reference, definition)



from code_analysis import CFG, CFGReader


class DefinitionVisitor:
    """Walk a CFG forwards from its root and collect every ``=`` assignment.

    Each ``BinOP`` node whose image is ``=`` adds one entry to ``defs``
    describing the two nodes returned by ``get_op_hands`` for that operator.
    Only the main program is traversed; procedure entry nodes are not visited.
    """

    def __init__(self, filename):
        """Create an empty visitor; ``filename`` only labels the progress message."""
        self.cfg = None
        self.filename = filename
        self.test = []  # Not read by this class; kept so existing attribute access keeps working

        # Nodes already traversed, so that a node is not processed again when reached twice
        self.visited_nodes = set()
        # Reserved for variable references; nothing is added to it by this class
        self.refs = []
        # One dict per assignment found, in traversal order
        self.defs = []

    def visit(self, cfg: CFG):
        """Traverse ``cfg`` from its root node, then print the collected definitions."""
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program
        self.__visit(self.cfg.get_root())

        print(self.defs)

    def __visit(self, node_id: int):
        """Depth-first walk over the children of ``node_id``, recording assignments.

        An explicit stack is used instead of recursion so that a long CFG path
        cannot exceed the interpreter's recursion limit. Children are pushed in
        reverse so that nodes are processed in the same order as a recursive walk.
        """
        pending = [node_id]
        is_start = True
        while pending:
            current = pending.pop()
            # The starting node is always processed; any other node only once
            if not is_start and current in self.visited_nodes:
                continue
            is_start = False
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "BinOP" and self.cfg.get_image(current) == "=":
                # Presumably the left-hand (assigned variable) and right-hand (assigned
                # value) operands of the assignment - confirm against get_op_hands
                variable_node_id, definition_node_id = self.cfg.get_op_hands(current)
                self.defs.append({
                    "label": self.cfg.get_image(variable_node_id),
                    "label_pos": self.cfg.get_position(variable_node_id)[0],
                    "definition_val": self.cfg.get_image(definition_node_id),
                    "definition_pos": self.cfg.get_position(definition_node_id)[0]
                })

            pending.extend(reversed(list(self.cfg.get_any_children(current))))

# Load the CFG of the part_1 test file
cfg_reader = CFGReader()
filename = '../tp4/part_1/test.php.cfg.json'
cfg = cfg_reader.read_cfg(filename)

# Visit currently loaded cfg; visit() prints the list of collected definitions
visitor = DefinitionVisitor(filename)
visitor.visit(cfg)