# Authors
Ikram Kohil, 2019115 \
Johnatan Gao, 2013298

# 1. Part 1: DataFlow extraction

For this first part, we need to implement two algorithms:
* One algorithm to retrieve all references of variables first, then matching them to their definitions
    * LiveVariableVisitor
* One algorithm to retrieve all definitions of variables first, then matching them to their references
    * DefinitionVisitor

***Note: We only focus on the following definition format: Variable = Expression ∧ Literal***

## 1.1 Utility functions

In [None]:
import os
from pathlib import Path
from code_analysis import CFGReader
from code_analysis import CFG

# Output directories for the Part 1 and Part 2 result files.
# The trailing "/" is part of each value.
part1_output_directory = "output/part_1/"
part2_output_directory = "output/part_2/"

# Utility functions taken from TP1
def get_json_files(extension, directory):
    """Recursively list the files under *directory* that match *extension*.

    Args:
        extension: Glob pattern handed to ``Path.rglob`` (e.g. ``'*.cfg.json'``).
        directory: Root directory of the search, as a string or ``Path``.

    Returns:
        A list with the path of every match, each converted to ``str``.
    """
    matches = []
    for entry in Path(directory).rglob(extension):
        matches.append(str(entry))
    return matches

def create_output_file(filename, directory):
    """Create a fresh output file and return it opened in append mode.

    The directory is created when missing, and any file left over from a
    previous run is deleted so the returned handle starts from an empty file.

    Args:
        filename: Name of the file to create inside *directory*.
        directory: Destination directory, with or without a trailing separator.

    Returns:
        The open file object; the caller is responsible for closing it.
    """
    os.makedirs(directory, exist_ok=True)

    # Build the path once and use it for both the clean-up and the open.
    # The previous code opened `directory + filename`, which only matched the
    # path being cleaned up when `directory` ended with a separator.
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        os.remove(file_path)

    # Open in "append" mode to avoid overwriting the whole file after each modification
    return open(file_path, "a")

def close_output_file(file):
    """Close *file* by calling its ``close()`` method."""
    file.close()

## 1.2 LiveVariableVisitor
To retrieve the references first, we decided to implement part of the Live Variables algorithm, since the **GEN** and **KILL** sets produced by this algorithm correspond respectively to references and definitions.

To do so, we visit the CFG from the exit node first, to guarantee finding the references first. We based our search on the "Variable" node type to find references. Then, by observing the CFG, we noticed that in the case of a simple definition, the immediate child of the variable node is the '=' BinOP node. So when reaching a variable node, if its immediate child is an '=' BinOP node, it is added to the KILL dictionary, and if not, it is added to the GEN dictionary. To keep track of each variable separately, we used the var_id, which is an identifier unique to each different variable, meaning that no matter the node_id in which the variable appears, it will always have the same var_id. This sorting is done in the `__get_gen_kill_live_vars()` function.

Once the cfg has been parsed from bottom to top, we used the var_id of each variable to match the references in **GEN** to their definitions in **KILL**. This matching is done in the `print_refs_defs_pairs()` function.

In [None]:
from typing import Dict, Set, Tuple


class LiveVariableVisitor:
    """Collect variable references (GEN) and definitions (KILL) from a CFG.

    The graph is walked backwards, from each exit node up through its
    parents. Every ``Variable`` node met on the way is classified:

    * it has a ``BinOP`` child whose image is ``=``: definition (KILL);
    * otherwise: reference (GEN).

    Both dictionaries are keyed by ``var_id`` and map to a
    ``(var_name, node_ids)`` tuple.
    """

    def __init__(self, filename):
        """Create a visitor for the CFG stored in *filename* (used for logging)."""
        self.cfg = None
        self.filename = filename

        # Nodes already processed, so that each node is handled only once
        self.visited_nodes = set()
        # References: key = var_id, value = (var_name, ids of referencing nodes)
        self.gen_live_vars: Dict[int, Tuple[str, Set[int]]] = dict()
        # Definitions: key = var_id, value = (var_name, ids of defining nodes)
        self.kill_live_vars: Dict[int, Tuple[str, Set[int]]] = dict()

    def visit(self, cfg: CFG):
        """Walk *cfg* backwards and fill ``gen_live_vars`` / ``kill_live_vars``.

        The main program is visited first, then every function returned by
        ``cfg.get_func_entry_nodes()``.
        """
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")

        # Visit main program from exit node (hence the +1, since the exit node
        # is generated right after the entry node). Starting from the exit node
        # ensures that we find references first, then definitions.
        self.__visit(self.cfg.get_root() + 1)

        # Visit all other procedures in file, again starting at their exit node
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node + 1)

    def __visit(self, node_id: int):
        """Depth-first walk from *node_id* towards its ancestors.

        An explicit stack is used instead of recursion so that a long chain of
        nodes cannot exceed the interpreter's recursion limit. Parents are
        pushed in reverse order, which keeps the visiting order identical to
        the recursive pre-order traversal.
        """
        pending = [node_id]
        while pending:
            current = pending.pop()
            # A node may be pushed several times before it is first processed
            if current in self.visited_nodes:
                continue
            self.visited_nodes.add(current)

            if self.cfg.get_type(current) == "Variable":
                self.__get_gen_kill_live_vars(current)

            for parent_id in reversed(list(self.cfg.get_any_parents(current))):
                if parent_id not in self.visited_nodes:
                    pending.append(parent_id)

    def __get_gen_kill_live_vars(self, node_id):
        """Record the Variable node *node_id* as a definition or a reference."""
        # A variable followed by a (BinOP =) child is being assigned: definition
        # (KILL). Any other variable occurrence is a reference (GEN).
        is_definition = any(
            self.cfg.get_type(child_id) == "BinOP" and self.cfg.get_image(child_id) == "="
            for child_id in self.cfg.get_children(node_id)
        )

        var_name = self.cfg.get_image(node_id)
        var_id = self.cfg.get_var_id(node_id)  # Unique identifier common to all occurrences of the variable

        target = self.kill_live_vars if is_definition else self.gen_live_vars
        # First occurrence creates the entry, later ones only add their node id
        target.setdefault(var_id, (var_name, set()))[1].add(node_id)

    def __get_undefined_refs(self):
        """Return the GEN entries whose ``var_id`` has no entry in KILL."""
        # If a variable is in GEN (refs) but not in KILL (defs), then it is undefined
        return {
            var_id: entry
            for var_id, entry in self.gen_live_vars.items()
            if var_id not in self.kill_live_vars
        }

    def print_refs_defs_pairs(self, output_file):
        """Write one line per variable that is both referenced and defined.

        Args:
            output_file: Object with a ``write(str)`` method.
        """
        # Using var_id (unique identifier) to pair references to their definitions
        for var_id, (var_name, ref_nodes) in self.gen_live_vars.items():
            # Variables without a definition are reported by print_undefined_refs()
            if var_id in self.kill_live_vars:
                def_nodes = self.kill_live_vars[var_id][1]
                output_file.write(f"Variable {var_name} was referenced in nodes: {ref_nodes} and defined in nodes: {def_nodes}\n")
        output_file.write("\n")

    def print_undefined_refs(self, output_file):
        """Write one line per variable that is referenced but never defined.

        Args:
            output_file: Object with a ``write(str)`` method.
        """
        for var_name, ref_nodes in self.__get_undefined_refs().values():
            output_file.write(f"Variable {var_name} was referenced in nodes: {ref_nodes} but was never defined.\n")
        output_file.write("\n")

## 1.3 DefinitionVisitor
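To retrieve the definitions first, we visit the CFG from the entry node and follow the children of each node. Every time we reach an '=' BinOP node, we take its first operand (returned by `get_op_hands()`); if that operand is a "Variable" node, it is a definition. The first definition found for a variable is stored in the **GEN** dictionary, and every later definition of the same variable (a redefinition) is stored in the **KILL** dictionary. This sorting is done in the `__get_gen_kills_definition()` function.

References are collected during the same traversal by the `__get_references()` function: a "Variable" node is a reference when its immediate child is not the '=' BinOP node. The `this` variable is ignored. Each variable is identified by the pair (name, var_id).

Once the CFG has been visited, the `print_defs_refs_pairs()` function matches the references to the definitions found in **GEN** and **KILL**.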

In [None]:
from typing import Any, Dict, Set


class DefinitionVisitor:
    """Collect variable definitions first, then match them to references.

    The graph is walked forwards, from each entry node down through its
    children. Three dictionaries are filled, all keyed by the pair
    ``(var_name, var_id)`` and mapping to a set of node ids:

    * ``gen``   - the first definition met for each variable;
    * ``kills`` - every later definition of an already defined variable;
    * ``refs``  - every reference to the variable.
    """

    def __init__(self, filename):
        """Create a visitor for the CFG stored in *filename* (used for logging)."""
        self.cfg = None
        self.filename = filename

        # Nodes already processed, so that each node is handled only once
        self.visited_nodes = set()
        # Contains all references to variables
        self.refs: Dict[Tuple[str, Any], Set[Any]] = dict()
        # Contains all defined variables (GEN)
        self.gen: Dict[Tuple[str, Any], Set[Any]] = dict()
        # Contains all the redefined variables (KILL)
        self.kills: Dict[Tuple[str, Any], Set[Any]] = dict()

    def visit(self, cfg: CFG):
        """Walk *cfg* forwards and fill ``gen``, ``kills`` and ``refs``.

        The main program is visited first, then every function returned by
        ``cfg.get_func_entry_nodes()``.
        """
        self.cfg = cfg
        print(f"Visiting CFG from file {self.filename}")
        # Visit main program
        self.__visit(self.cfg.get_root())

        # Visit all other procedures in file
        for entry_node in self.cfg.get_func_entry_nodes():
            self.__visit(entry_node)

    def __get_unreferenced_defs(self):
        """Return the GEN entries whose variable never appears in ``refs``.

        NOTE: only ``gen`` is consulted, so redefinitions stored in ``kills``
        are not part of the returned node sets.
        """
        return {key: nodes for key, nodes in self.gen.items() if key not in self.refs}

    def __visit(self, node_id: int):
        """Depth-first walk from *node_id* towards its descendants.

        An explicit stack is used instead of recursion so that a long chain of
        nodes cannot exceed the interpreter's recursion limit. Children are
        pushed in reverse order, which keeps the visiting order identical to
        the recursive pre-order traversal.
        """
        pending = [node_id]
        while pending:
            current = pending.pop()
            # A node may be pushed several times before it is first processed
            if current in self.visited_nodes:
                continue
            self.visited_nodes.add(current)

            self.__get_gen_kills_definition(current)
            self.__get_references(current)

            for child_id in reversed(list(self.cfg.get_any_children(current))):
                if child_id not in self.visited_nodes:
                    pending.append(child_id)

    def __get_gen_kills_definition(self, node_id: int):
        """Record the variable assigned by *node_id* when it is a (BinOP =) node."""
        if not self.__is_binOp_equal(node_id):
            return

        # The first operand of the assignment is the node being assigned to
        variable_node_id = self.cfg.get_op_hands(node_id)[0]
        if self.cfg.get_type(variable_node_id) != "Variable":
            return

        key = (self.cfg.get_image(variable_node_id), self.cfg.get_var_id(variable_node_id))
        if key not in self.gen:
            # First definition of this variable
            self.gen[key] = {variable_node_id}
        else:
            # The variable was already defined: this is a redefinition
            self.kills.setdefault(key, set()).add(variable_node_id)

    def __get_references(self, node_id: int):
        """Record *node_id* in ``refs`` when it is a Variable that is read."""
        if self.cfg.get_type(node_id) != "Variable":
            return
        if self.cfg.get_image(node_id) == "this":
            return
        # A variable followed by a (BinOP =) child is a definition, not a
        # reference. Checking every child (instead of indexing the first one)
        # also copes with a Variable node that has no child at all.
        if any(self.__is_binOp_equal(child_id) for child_id in self.cfg.get_children(node_id)):
            return

        key = (self.cfg.get_image(node_id), self.cfg.get_var_id(node_id))
        self.refs.setdefault(key, set()).add(node_id)

    def __is_binOp_equal(self, node_id: int):
        """Return True when *node_id* is a ``BinOP`` node whose image is ``=``."""
        return self.cfg.get_type(node_id) == "BinOP" and self.cfg.get_image(node_id) == "="

    def print_defs_refs_pairs(self, output_file):
        """Write one line per referenced variable with its definition nodes.

        The definition nodes are the union of the GEN and KILL entries of the
        variable; the set is empty when the variable was never defined.

        Args:
            output_file: Object with a ``write(str)`` method.
        """
        for key, refs_node_ids in self.refs.items():
            label = key[0]
            # Build a new set: updating the GEN set in place would silently
            # merge the redefinitions into ``self.gen``.
            defs_node_ids = self.gen.get(key, set()) | self.kills.get(key, set())
            output_file.write("Variable {} was referenced in nodes: {} and defined in nodes: {} \n".format(
                label,
                refs_node_ids,
                defs_node_ids
            ))
        output_file.write("\n")

    def print_unreferenced_definitions(self, output_file):
        """Write one line per variable that is defined but never referenced.

        Args:
            output_file: Object with a ``write(str)`` method.
        """
        for (label, _var_id), def_nodes in self.__get_unreferenced_defs().items():
            output_file.write("Variable {} was defined in nodes: {} but was never referenced. \n".format(
                label,
                def_nodes
            ))
        output_file.write("\n")

## 1.4 Output the results
The `get_refs_defs_pairs()` function calls both visitors and outputs the results in the `output/part_1/part_1_output_file.txt` file.

In [None]:
# Reader used below to load each *.cfg.json file (via read_cfg)
cfg_reader = CFGReader()

def get_refs_defs_pairs(directory):
    """Run both visitors on every CFG under *directory* and report the pairs.

    For each ``*.cfg.json`` file found recursively in *directory*, the
    reference/definition pairs computed by ``LiveVariableVisitor`` and by
    ``DefinitionVisitor`` are written to ``part_1_output_file.txt`` inside
    ``part1_output_directory``.

    Args:
        directory: Root directory searched for ``*.cfg.json`` files.
    """
    # The `with` block closes the output file even when loading or visiting a
    # CFG raises, so the handle is never leaked.
    with create_output_file("part_1_output_file.txt", part1_output_directory) as output_file:
        # Iterate once over the filenames of all the CFGs in the directory
        for filename in get_json_files('*.cfg.json', directory):
            print(filename)
            # Load cfg in memory
            cfg = cfg_reader.read_cfg(filename)

            # Visit currently loaded cfg
            live_var_visitor = LiveVariableVisitor(filename)
            live_var_visitor.visit(cfg)

            def_visitor = DefinitionVisitor(filename)
            def_visitor.visit(cfg)

            # Print output
            output_file.write(f"------------------------ File: {filename} ------------------------\n")

            output_file.write("------ Visitor: Live Variable \n")
            live_var_visitor.print_refs_defs_pairs(output_file)

            output_file.write("------ Visitor: Definition \n")
            def_visitor.print_defs_refs_pairs(output_file)

# Run the Part 1 analysis on every *.cfg.json file found under this directory
directory_to_analyze = "../tp4/part_1/"
get_refs_defs_pairs(directory_to_analyze)

# 2. Part 2: Using the data flow
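For this part, we needed to use our previous algorithms to detect two kinds of anomalies: variables that are referenced but never defined (found with the `LiveVariableVisitor`) and variables that are defined but never referenced (found with the `DefinitionVisitor`). The `get_undefined_and_unreferenced_vars()` function calls both visitors and outputs the results in the `output/part_2/part_2_output_file.txt` file.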

In [None]:
# Reader used below to load each *.cfg.json file (via read_cfg)
cfg_reader = CFGReader()

def get_undefined_and_unreferenced_vars(directory):
    """Run both visitors on every CFG under *directory* and report anomalies.

    For each ``*.cfg.json`` file found recursively in *directory*, the
    references without a definition (``LiveVariableVisitor``) and the
    definitions without a reference (``DefinitionVisitor``) are written to
    ``part_2_output_file.txt`` inside ``part2_output_directory``.

    Args:
        directory: Root directory searched for ``*.cfg.json`` files.
    """
    # The `with` block closes the output file even when loading or visiting a
    # CFG raises, so the handle is never leaked.
    with create_output_file("part_2_output_file.txt", part2_output_directory) as output_file:
        # Iterate once over the filenames of all the CFGs in the directory
        for filename in get_json_files('*.cfg.json', directory):
            print(filename)
            # Load cfg in memory
            cfg = cfg_reader.read_cfg(filename)

            # Visit currently loaded cfg
            live_var_visitor = LiveVariableVisitor(filename)
            live_var_visitor.visit(cfg)

            def_visitor = DefinitionVisitor(filename)
            def_visitor.visit(cfg)

            # Print output
            output_file.write(f"------------------------ File: {filename} ------------------------\n")

            output_file.write("------ Visitor: Live Variable \n")
            live_var_visitor.print_undefined_refs(output_file)

            output_file.write("------ Visitor: Definition \n")
            def_visitor.print_unreferenced_definitions(output_file)

# Run the Part 2 analysis on every *.cfg.json file found under this directory
directory_to_analyze = "../tp4/part_2/"
get_undefined_and_unreferenced_vars(directory_to_analyze)