diff --git a/.gitignore b/.gitignore index 61917ae..fcd1c7c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,6 @@ engine/repos/ venv/ web/credentials.py -**/*.json -*.err -*.log +clones_*.json +qa/*.err +qa/*.log diff --git a/clear_db.sh b/clear_db.sh index 0e6bee3..7a506ff 100755 --- a/clear_db.sh +++ b/clear_db.sh @@ -2,5 +2,5 @@ set -e -psql -f web/prepare_tables.pgsql code_duplication +psql -f web/prepare_tables.pgsql cyclone rm -rf engine/repos/ diff --git a/cli/__init__.py b/cli/__init__.py index e69de29..963aa22 100644 --- a/cli/__init__.py +++ b/cli/__init__.py @@ -0,0 +1 @@ +"""Package containing implementation of the application's CLI (command line interface).""" diff --git a/cli/__main__.py b/cli/__main__.py index e138e54..4200890 100644 --- a/cli/__main__.py +++ b/cli/__main__.py @@ -1,3 +1,9 @@ +""" +Intended entry point of the application's CLI. + +Please use the following command to run the CLI: `python3 -m cli` +""" + from cli.app import main if __name__ == "__main__": diff --git a/cli/app.py b/cli/app.py index dd41170..272011e 100644 --- a/cli/app.py +++ b/cli/app.py @@ -1,19 +1,18 @@ +"""Module containing the CLI's core logic.""" + import sys import os from datetime import datetime -from engine.preprocessing.args_handler import handle_args +from .args_handler import handle_args from engine.preprocessing.module_parser import get_modules_from_dir from engine.algorithms.algorithm_runner import run_two_repos, IODINE from engine.utils.benchmark import time_snap from fastlog import log -from engine.errors.UserInputError import UserInputError +from engine.errors.user_input import UserInputError def main(): - """ - Entry point of the application. 
- """ - + """Entry point of the application.""" try: # Parse command line arguments repos = handle_args(sys.argv) @@ -46,7 +45,8 @@ def main(): # Create output directory if it doesn't exist and print output output_path = os.getcwd() now = datetime.now() - output_filename = "clones_" + f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json" + output_filename = "clones_" + \ + f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json" os.makedirs(output_path, exist_ok=True) with open(os.path.join(output_path, output_filename), "w") as output_file: output_file.write(clones.json()) diff --git a/cli/args_handler.py b/cli/args_handler.py new file mode 100644 index 0000000..0b8f127 --- /dev/null +++ b/cli/args_handler.py @@ -0,0 +1,75 @@ +"""Module containing functions for handling command-line arguments supplied by the user.""" + +from os.path import isdir +from engine.errors.user_input import UserInputError +from engine.preprocessing.repoinfo import RepoInfo + + +_USAGE_TEXT = """\ +Usage: + python3 -m cli - Repository comparison mode + python3 -m cli - Single repository mode + +Valid repository path formats: + Short GitHub repository path - username/repository + Full remote repository path - https://github.com/username/repository + Absolute or relative local directory path - /home/user/directory""" + + +def repo_path_to_local_path(repo_path): + """ + Convert a repository path into a local file system path. + + This repository path is extended (compared to the repository path + used by RepoInfo) by adding support for existing local directories. + + The process typically includes checking for a local directory or + parsing a full or short repository URL and then cloning the repository. + + Arguments: + repo_path {string} -- Path of a remote repository or a local directory. + + Returns: + string -- Path of a local directory equivalent to the one + specified by the repository path. 
+ In case of local directories, the paths are equal. + + """ + if isdir(repo_path): + return repo_path + + info = RepoInfo.parse_repo_info(repo_path) + + if info and info.clone_or_pull(): + return info.dir + else: + return None + + +def handle_args(argv): + """ + Check the command line arguments and handles them. + + If there is any problem, an error message will be printed + and the script will exit with a non-zero exit code. + If everything goes right, tuple of local repository paths will be returned. + + Arguments: + argv -- List of command line arguments. + + Returns: + tuple[string] -- Tuple of local repository paths. + + """ + if len(argv) == 1 or (len(argv) == 2 and argv[1] in ['-h', '--help', '--usage']): + # Special case where the usage text is printed using the built-in + # print function instead of the logging library because + # the app exits right after the message is displayed. + print(_USAGE_TEXT) + raise UserInputError(None, 0) + + if len(argv) < 2 or len(argv) > 3: + raise UserInputError( + f"Invalid number of command line arguments: {len(argv) - 1}") + + return tuple(repo_path_to_local_path(a) for a in argv[1:]) diff --git a/engine/__init__.py b/engine/__init__.py index e69de29..45b35a3 100644 --- a/engine/__init__.py +++ b/engine/__init__.py @@ -0,0 +1 @@ +"""Package containing the entire code clone detection engine.""" diff --git a/engine/algorithms/__init__.py b/engine/algorithms/__init__.py index 79bf6e9..9e0a20a 100644 --- a/engine/algorithms/__init__.py +++ b/engine/algorithms/__init__.py @@ -1,3 +1,5 @@ +"""Package containing all implemented clone detection algorithms.""" + OXYGEN = "oxygen" CHLORINE = "chlorine" IODINE = "iodine" diff --git a/engine/algorithms/algorithm_runner.py b/engine/algorithms/algorithm_runner.py index f9f6de4..52520b2 100644 --- a/engine/algorithms/algorithm_runner.py +++ b/engine/algorithms/algorithm_runner.py @@ -1,13 +1,15 @@ +"""Module used for algorithm abstraction by providing a common interface.""" + from 
.oxygen.oxygen import oxygen from .chlorine.chlorine import chlorine_single_repo, chlorine_two_repos from .iodine.iodine import iodine -from ..errors.UserInputError import UserInputError +from engine.errors.user_input import UserInputError from . import OXYGEN, IODINE, CHLORINE def run_single_repo(modules, algorithm): """ - Runs the specified code clone detection algorithm on a single repository. + Run the specified code clone detection algorithm on a single repository. Arguments: modules {list[list[TreeNode]]} -- List of the repo's modules. @@ -18,8 +20,8 @@ def run_single_repo(modules, algorithm): Returns: DetectionResult -- Result of the code clone detection. - """ + """ if algorithm == OXYGEN: return oxygen(modules) elif algorithm == CHLORINE: @@ -30,7 +32,7 @@ def run_two_repos(modules1, modules2, algorithm): """ - Runs the specified code clone detection algorithm on two repositores. + Run the specified code clone detection algorithm on two repositories. Arguments: modules1 {list[list[TreeNode]]} -- List of first repo's modules. @@ -42,8 +44,8 @@ Returns: DetectionResult -- Result of the code clone detection. 
- """ + """ if algorithm == CHLORINE: return chlorine_two_repos(modules1, modules2) elif algorithm == IODINE: diff --git a/engine/algorithms/chlorine/__init__.py b/engine/algorithms/chlorine/__init__.py index e69de29..e3e636e 100644 --- a/engine/algorithms/chlorine/__init__.py +++ b/engine/algorithms/chlorine/__init__.py @@ -0,0 +1 @@ +"""Package containing the Chlorine algorithm and its helper functions.""" diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 355b193..b82187e 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -1,8 +1,10 @@ +"""Module containing implementation of the Chlorine algorithm.""" + from collections import defaultdict +from itertools import chain from ...utils.benchmark import time_snap -from ...utils.list_tools import flatten -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult # Minimum weight of a single node used in comparison. _MIN_NODE_WEIGHT = 50 @@ -24,6 +26,7 @@ def _get_skeleton_recursive(node): def _can_be_compared(node1, node2): """ First get rid of nodes with a weight below the specified threshold. + Checks if two nodes can be possible compared with each other. In order to be comparable, the nodes must have an equal value and they must have the exact same number of children. @@ -34,6 +37,7 @@ def _can_be_compared(node1, node2): Returns: bool -- True if nodes can be compared, False if they cannot. + """ return \ node1.weight >= _MIN_NODE_WEIGHT and \ @@ -44,8 +48,9 @@ def _can_be_compared(node1, node2): def _type1_compare(node1, node2): """ - Compares two nodes and returns the weight of their matching subtrees - and a skeleton string representing their common syntax tree skeleton. + Compare two nodes and return the weight of their matching subtree. 
+ + Also return a string representing their common syntax tree skeleton. Arguments: node1 {TreeNode} -- First node. @@ -54,8 +59,8 @@ def _type1_compare(node1, node2): Returns: int -- Weight of the matching subtrees. string -- Common skeleton of the two nodes. - """ + """ combined_weight = node1.weight + node2.weight if not _can_be_compared(node1, node2): @@ -80,8 +85,7 @@ def _type1_compare(node1, node2): def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): """ - Common logic shared by single-repo analysis and - two repository comparison mode. + Run common logic shared by single-repo analysis and 2-repo comparison mode. Arguments: n1 {TreeNode} -- First node. @@ -89,8 +93,8 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): ignore_set {set[TreeNode]} -- Set of nodes to ignore. match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. - """ + """ if not _can_be_compared(n1, n2): return @@ -109,13 +113,13 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): def _dict_to_result(match_dict, skeleton_weight_dict): """ - Compiles the detection result together from the input dictionaries. + Compile the detection result together from the input dictionaries. Arguments: match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. - """ + """ clones = [] for k, v in match_dict.items(): @@ -128,8 +132,9 @@ def _dict_to_result(match_dict, skeleton_weight_dict): def chlorine_single_repo(modules): """ - Finds all clones satisfying the settings at the top of this source file - in a single repository given its modules. + Find all clones in a single repository given its modules. + + Clones must satisfy the settings at the top of this source file. 
Detected code clones are printed on STDOUT, including the common skeleton, path to each clones (source file path, line number, column offset), size of each clone (number of nodes in its syntax tree) and their @@ -140,8 +145,8 @@ def chlorine_single_repo(modules): Returns: DetectionResult -- Result of the code clone detection. - """ + """ time_snap("Function started") nodes = [m[0] for m in modules] @@ -186,7 +191,8 @@ def chlorine_single_repo(modules): def chlorine_two_repos(modules1, modules2): """ - Finds code clones between two repositories given their module lists. + Find code clones between two repositories given their module lists. + Clones must satisfy rules defined at the top of this source file. Detected clones are printed on STDOUT. See `find_clones_in_repo(repo_url)` for details on output format. @@ -197,12 +203,12 @@ def chlorine_two_repos(modules1, modules2): Returns: DetectionResult -- Result of the code clone detection. - """ + """ time_snap("Function started") repo1_nodes = [m[0] for m in modules1] - repo2_nodes = flatten(modules2) + repo2_nodes = list(chain.from_iterable(modules2)) time_snap("Module lists optimized") diff --git a/engine/algorithms/iodine/__init__.py b/engine/algorithms/iodine/__init__.py index e69de29..24aca77 100644 --- a/engine/algorithms/iodine/__init__.py +++ b/engine/algorithms/iodine/__init__.py @@ -0,0 +1 @@ +"""Package containing the Iodine algorithm and its helper functions.""" diff --git a/engine/algorithms/iodine/anti_unification.py b/engine/algorithms/iodine/anti_unification.py index fcfd863..653cb8b 100644 --- a/engine/algorithms/iodine/anti_unification.py +++ b/engine/algorithms/iodine/anti_unification.py @@ -1,9 +1,9 @@ -from ...nodes.PatternNode import PatternNode +from ...nodes.pattern import PatternNode def anti_unify(list1, list2, index1, index2, worktable): """ - Creates a tree of PatternNodes from two lists + Create a tree of PatternNodes from two lists. 
Arguments: list1 {list of TreeNodes} -- first tree to be compared @@ -11,8 +11,8 @@ def anti_unify(list1, list2, index1, index2, worktable): index1 {int} -- index of current TreeNode to be compared from list1 index2 {int} -- index of current TreeNode to be compared from list2 worktable {2D boolean array} -- keeps track of which two nodes have been checked together - """ + """ # mark the pair as checked/True worktable[index1].set(True, index2) # determine if subtrees are the same (and lengths same) diff --git a/engine/algorithms/iodine/iodine.py b/engine/algorithms/iodine/iodine.py index 74bc852..bdc9caa 100644 --- a/engine/algorithms/iodine/iodine.py +++ b/engine/algorithms/iodine/iodine.py @@ -1,11 +1,13 @@ +"""Module containing the Iodine algorithm's interface.""" + from .pattern_collection import pattern_collection -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult def iodine(module_list_1, module_list_2): """ - Finds clones between the two modules by comparing all possible subtrees of + Find clones between the two modules by comparing all possible subtrees of their methods. Returns the results. Arguments: @@ -14,6 +16,7 @@ def iodine(module_list_1, module_list_2): Returns: DetectionResult -- Result of the code clone detection. + """ clusters = [] for module_tree_1 in module_list_1: diff --git a/engine/algorithms/iodine/pattern_clustering.py b/engine/algorithms/iodine/pattern_clustering.py index 42134b1..98106d7 100644 --- a/engine/algorithms/iodine/pattern_clustering.py +++ b/engine/algorithms/iodine/pattern_clustering.py @@ -1,13 +1,20 @@ def clustering(ps): """ - :param ps: a set of patterns - :return: a set of clustered-patterns + Perform pattern clustering and return clusters. 
+ + Arguments: + ps -- a set of patterns + + Returns: + a set of clustered-patterns + """ cs = [] # initialize the set for p in ps: # iterate through the patterns in the set of patterns merged = False for c in cs: # iterate through the clustered-patterns in the set of clustered-patterns - if p.skeleton_equals(c): # if the pattern and the clustered-pattern are the same shape + # if the pattern and the clustered-pattern are the same shape + if p.skeleton_equals(c): c.add_node(p) # merge labels of p into c merged = True break diff --git a/engine/algorithms/iodine/pattern_collection.py b/engine/algorithms/iodine/pattern_collection.py index 7d9e769..1e0fc1b 100644 --- a/engine/algorithms/iodine/pattern_collection.py +++ b/engine/algorithms/iodine/pattern_collection.py @@ -5,14 +5,16 @@ def pattern_collection(tree_list_1, tree_list_2): """ - Compares two Abstract Syntax Trees representing two methods. The trees are + Compare two Abstract Syntax Trees representing two methods. The trees are provided as lists to provides indexes for the nodes within the tree. 
Arguments: tree_list_1 {list[TreeNode]}: A TreeNode tree represented as a list tree_list_2 {list[TreeNode]}: A TreeNode tree represented as a list - Returns: list[list[PatternNode]]: The clustered patterns identified in the repositories + Returns: + list[list[PatternNode]]: The clustered patterns identified in the repositories + """ # Get the sizes of the trees size_tree_1 = len(tree_list_1) @@ -37,7 +39,8 @@ def pattern_collection(tree_list_1, tree_list_2): # if the root nodes of the subtrees are equal if tree_list_1[i] == tree_list_2[j]: # Add the results of anti-unify to the list of subtrees - pats[i].append(anti_unify(tree_list_1, tree_list_2, i, j, work_list)) + pats[i].append(anti_unify( + tree_list_1, tree_list_2, i, j, work_list)) # for every set of patterns (one per node in the first tree) for pattern_set in pats: # run the clustering function on the pattern set diff --git a/engine/algorithms/oxygen/__init__.py b/engine/algorithms/oxygen/__init__.py index e69de29..781d02f 100644 --- a/engine/algorithms/oxygen/__init__.py +++ b/engine/algorithms/oxygen/__init__.py @@ -0,0 +1 @@ +"""Package containing the Oxygen algorithm and its helper functions.""" diff --git a/engine/algorithms/oxygen/oxygen.py b/engine/algorithms/oxygen/oxygen.py index 74a21d9..163bf6c 100644 --- a/engine/algorithms/oxygen/oxygen.py +++ b/engine/algorithms/oxygen/oxygen.py @@ -1,18 +1,20 @@ -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +"""Module containing logic and interface of the Oxygen algorithm.""" +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult -def oxygen(modules, weight_limit=25): + +def oxygen(modules, weight_limit=15): """ - Very simple type 1 code duplication check based on AST.dump() function. + Run basic type 1 code duplication check based on AST.dump() function. Arguments: modules (list[list[TreeNode]): Modules in locally standardized format. 
Returns: DetectionResult -- Result of the code clone detection. - """ + """ # Dictionary of all the different shapes of node trees. # Key is a string representation of the tree. # Value is a list of all nodes with the exact same string representation. diff --git a/engine/errors/__init__.py b/engine/errors/__init__.py index e69de29..450858f 100644 --- a/engine/errors/__init__.py +++ b/engine/errors/__init__.py @@ -0,0 +1 @@ +"""Package containing all custom exceptions used by the application.""" diff --git a/engine/errors/UserInputError.py b/engine/errors/user_input.py similarity index 64% rename from engine/errors/UserInputError.py rename to engine/errors/user_input.py index 8954f33..ae6b4cb 100644 --- a/engine/errors/UserInputError.py +++ b/engine/errors/user_input.py @@ -1,6 +1,10 @@ +"""Module containing the `UserInputError` exception class.""" + + class UserInputError(Exception): """ Exception representing invalid user input such as command line arguments. + Alternatively, this can also represented a problem caused by invalid user input further down the line. Simply put, the problem can / must be fixed by modifying the user input. @@ -8,8 +12,17 @@ class UserInputError(Exception): Attributes: message {string} -- Error message to print. code {int} -- Exit code to use. + """ def __init__(self, message, code=1): + """ + Initialize a new user input error instance. + + Arguments: + message {string} -- Message to display. + code {int} -- Preferred exit code (only if application exits). 
+ + """ self.message = message self.code = code diff --git a/engine/nodes/__init__.py b/engine/nodes/__init__.py index e69de29..d219c1f 100644 --- a/engine/nodes/__init__.py +++ b/engine/nodes/__init__.py @@ -0,0 +1 @@ +"""Package containing classes for various AST node representations and their metadata.""" diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py new file mode 100644 index 0000000..4f6759c --- /dev/null +++ b/engine/nodes/nodeorigin.py @@ -0,0 +1,63 @@ +"""Module containing the `NodeOrigin` class used to store node origin info.""" + + +class NodeOrigin: + """ + Class representing the origin of an AST node. + + Attributes: + file {string} -- Source file from which the node originates. + line {int|None} -- Line number at which the node was found. + col_offset {int|None} -- Column offset within the line. + Number of characters on the same + line before the node's token. + + """ + + def __init__(self, file_path, line=None, col_offset=None): + """ + Initialize a new node origin instance. + + Arguments: + file_path {string} -- Path to the node's source file. + + Keyword Arguments: + line {int} -- Line number of node's origin. (default: {None}) + col_offset {int} -- Column offset of node. (default: {None}) + + Raises: + ValueError -- When file path is None or when only one of the two + source position specifiers is not None. 
+ + """ + if file_path is None: + raise ValueError( + "File path must always be set to a non-None value") + + if (line is None) != (col_offset is None): + raise ValueError( + "Either both line number and column offset must be set or neither") + + self.file = file_path + self.line = line + self.col_offset = col_offset + + def __str__(self): + """Convert the node origin into a human-readable string representation.""" + return self.file + (f" (L: {self.line} C: {self.col_offset})" + if self.line and self.col_offset else "") + + def __repr__(self): + """Return a string representation of the node origin.""" + return self.__str__() + + def __hash__(self): + """ + Get hash of the node origin. + + The `id` of the node origin is used right now, so two equivalent + node origins may not necessarily have the same hash. + That would be a problem normally, but it works fine in this project. + + """ + return hash(id(self)) diff --git a/engine/nodes/PatternNode.py b/engine/nodes/pattern.py similarity index 79% rename from engine/nodes/PatternNode.py rename to engine/nodes/pattern.py index 6c07519..441c211 100644 --- a/engine/nodes/PatternNode.py +++ b/engine/nodes/pattern.py @@ -1,3 +1,5 @@ +"""Module containing the `PatternNode` class.""" + _HOLE = "Hole" @@ -9,17 +11,19 @@ class PatternNode: nodes {list[TreeNode]} -- List of TreeNodes with the same skeleton. value {string} -- Common string representation of all the nodes. children {list[PatternNode]} -- List of node's direct children. + """ def __init__(self, node1, node2, value=None): """ - Creates a new PatternNode from two nodes and their common value. + Create a new PatternNode from two nodes and their common value. Arguments: node1 {TreeNode} -- First TreeNode sharing common skeleton. node2 {TreeNode} -- Second TreeNode sharing common skeleton. value {string} -- String representation common for all the nodes. None if the PatternNode represents a hole. 
+ """ self.nodes = [node1, node2] self.value = value or _HOLE @@ -27,33 +31,35 @@ def __init__(self, node1, node2, value=None): def add_node(self, node): """ - Appends the supplied nodes to this node's list of origin nodes. + Append the supplied nodes to this node's list of origin nodes. Arguments: node {TreeNode} -- Node to be added to the list of origin nodes. + """ self.nodes.append(node) def add_child(self, child): """ - Appends the supplied nodes to this node's list of child nodes. + Append the supplied nodes to this node's list of child nodes. Arguments: child {PatternNode} -- Node that is a child of this node. + """ self.children.append(child) def skeleton_equals(self, other): """ - Checks if this node's skeleton is equal to another node's. + Check if this node's skeleton is equal to another node's. Arguments: other {PatterNode} -- Another node to compare this one with. Returns: bool -- True if the nodes have an equal skeleton, False otherwise. - """ + """ if not isinstance(other, PatternNode) or other.value != self.value or \ len(other.children) != len(self.children): return False @@ -66,17 +72,20 @@ def skeleton_equals(self, other): def get_match_weight(self): """ - Calculates the weight of the matching skeleton of all origin nodes. + Calculate the weight of the matching skeleton of all origin nodes. Returns: int -- Weight of the matching skeleton. - """ + """ return 0 if self.value == _HOLE else \ (1 + sum([c.get_match_weight() for c in self.children])) def __str__(self): + """Convert the pattern node into a human-readable string.""" + # FIXME: This doesn't seem right. 
return f"{self.value}(', '.join{[n.origin for n in self.nodes]})" def __repr__(self): + """Return string representation of the pattern node.""" return self.__str__() diff --git a/engine/nodes/TreeNode.py b/engine/nodes/tree.py similarity index 80% rename from engine/nodes/TreeNode.py rename to engine/nodes/tree.py index 130c4c5..825da21 100644 --- a/engine/nodes/TreeNode.py +++ b/engine/nodes/tree.py @@ -1,4 +1,7 @@ +"""Module containing the `TreeNode` class.""" + import ast +from .nodeorigin import NodeOrigin _IGNORE_CLASSES = [ast.Load, ast.Store, ast.Del, ast.AugLoad, ast.AugStore, ast.Param] @@ -7,12 +10,13 @@ class TreeNode: """ Represents a single node of the Python code AST (Abstract Syntax Tree). + Every node is also a tree of its own, with the exception of leaf (childless) nodes. Attributes: node {AST} -- Original AST node generated by Python's built-in parser. - origin {string} -- Origin of the node (file path, line and column). + origin {NodeOrigin} -- Origin of the node (file path, line and column). children {list[TreeNode]} -- List of direct children of this node. weight {int} -- Total number of nodes in this node's tree. names {list[string]} -- All names / symbols used in this node's tree. @@ -20,17 +24,21 @@ class TreeNode: index {int} -- Index of this node (in an external flat list of nodes). parent_index {int} -- Index of parent node. None if this is root node. child_indices {list[int]} -- Indices of this node's direct children. + """ def __init__(self, node, origin_file): """ + Initialize a new tree node instance. + Arguments: node -- Single raw node produced by the Python AST parser. origin_file {string} -- Relative path to the source file. 
+ """ self.node = node - self.origin = origin_file + (f" (L:{node.lineno} C:{node.col_offset})" - if node._attributes else f" (ID:{id(node):x})") + self.origin = NodeOrigin(origin_file, node.lineno, node.col_offset) \ + if node._attributes else NodeOrigin(origin_file) # Check if this type of node can have docstring. can_have_docstring = node.__class__ in [ast.ClassDef, ast.FunctionDef] @@ -68,20 +76,21 @@ def __init__(self, node, origin_file): def dump(self): """ - Converts the node into a string using the built-in function. + Convert the node into a string using the built-in function. Returns: string -- String representation of the AST node. + """ return ast.dump(self.node) def get_all_children(self): """ - Recursively finds all children of the node - and collects them into a single list. + Find all children of the node recursively and collect them into a single list. Returns: list[TreeNode] -- List of all the recursively found children. + """ children = self.children.copy() @@ -92,7 +101,8 @@ def get_all_children(self): def __eq__(self, other): """ - Compares the node to another node recursively. + Compare the node to another node recursively. + This operator overload can be used for Type 1 clone detection. Arguments: @@ -100,6 +110,7 @@ def __eq__(self, other): Returns: bool -- True if the nodes are equivalent, False if they are not. + """ if not isinstance(other, TreeNode): return False @@ -117,13 +128,23 @@ def __eq__(self, other): return True def __ne__(self, other): + """Check if this node is not equal to another tree node.""" return not self.__eq__(other) def __str__(self): + """Convert the tree node into a human-readable string.""" return f"{self.origin} - {self.value} (W={self.weight})" def __repr__(self): + """Return string representation of this tree node.""" return self.__str__() def __hash__(self): + """ + Get the tree node's hash. 
+ + The origin's hash is used for the whole node, + so if two different nodes somehow have the same origin, + it will cause the nodes to be treated as equal by hash-based types. + """ return hash(self.origin) diff --git a/engine/preprocessing/__init__.py b/engine/preprocessing/__init__.py index e69de29..7301afe 100644 --- a/engine/preprocessing/__init__.py +++ b/engine/preprocessing/__init__.py @@ -0,0 +1 @@ +"""Package containing functions and classes for source code preprocessing.""" diff --git a/engine/preprocessing/args_handler.py b/engine/preprocessing/args_handler.py deleted file mode 100644 index 35dbd6a..0000000 --- a/engine/preprocessing/args_handler.py +++ /dev/null @@ -1,41 +0,0 @@ -from .repo_cloner import get_repo_or_dir -from ..errors.UserInputError import UserInputError - -_USAGE_TEXT = """\ -Usage: - python3 -m code_duplication - Repository comparison mode - python3 -m code_duplication - Single repository mode - -Valid repository path formats: - Short GitHub repository path - username/repository - Full remote repository path - https://github.com/username/repository - Absolute or relative local directory path - /home/user/directory - Short path of an already cloned repository - github/username/repository""" - - -def handle_args(argv): - """ - Checks the command line arguments and handles them. - If there is any problem, an error message will be printed - and the script will exit with a non-zero exit code. - If everything goes right, tuple of local repository paths will be returned. - - Arguments: - argv -- List of command line arguments. - - Returns: - tuple[string] -- Tuple of local repository paths. - """ - - if len(argv) == 1 or (len(argv) == 2 and argv[1] in ['-h', '--help', '--usage']): - # Special case where the usage text is printed using the built-in - # print function instead of the logging library because - # the app exits right after the message is displayed. 
- print(_USAGE_TEXT) - raise UserInputError(None, 0) - - if len(argv) < 2 or len(argv) > 3: - raise UserInputError( - f"Invalid number of command line arguments: {len(argv) - 1}") - - return tuple(get_repo_or_dir(a) for a in argv[1:]) diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index 3d0e707..0dd26cb 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -1,14 +1,16 @@ +"""Module containing code used for parsing of modules and nodes from Python code.""" + import ast from os import listdir, path -from os.path import isdir, isfile -from ..nodes.TreeNode import TreeNode +from os.path import isdir, isfile, relpath +from ..nodes.tree import TreeNode from collections import deque -from .repo_cloner import clone_root_dir, get_repo_info def _read_whole_file(file_path): """ Read a text file into a single string. + Assumes UTF-8 encoding. """ with open(file_path, "r", encoding="utf-8") as f: @@ -17,40 +19,41 @@ def _read_whole_file(file_path): def _read_ast_from_file(file_path): """ - Parses a module AST from the specified file. + Parse a module AST from the specified file. Arguments: file_path {string} -- Path of file to parse the AST from. Returns: AST parsed from the specified file. + """ return ast.parse(_read_whole_file(file_path)) -def _get_tree_node_from_file(file_path): +def _get_tree_node_from_file(file_path, repo_path): """ - Parses a TreeNode representing the module in the specified file. + Parse a TreeNode representing the module in the specified file. Arguments: file_path {string} -- Path of file to parse the TreeNode from. Returns: TreeNode -- TreeNode parsed from the specified file. 
+ """ - module_node = _read_ast_from_file(file_path) - file_rel_path = file_path.replace(clone_root_dir, "...") - return TreeNode(module_node, file_rel_path) + return TreeNode(_read_ast_from_file(file_path), + relpath(file_path, repo_path)) def _recursive_listdir_py(directory): """ - Returns relative paths of all *.py files in the specified directory. + Return relative paths of all *.py files in the specified directory. + If the provided argument is not a valid directory, an internal exception will be thrown by Python. That exception will most likely be NotImplementedError. """ - files = [] for item in listdir(directory): @@ -66,13 +69,14 @@ def _recursive_listdir_py(directory): def _flatten_module_nodes(module): """ - Converts a module TreeNode into a flat list of nodes in the module's AST. + Convert a module TreeNode into a flat list of nodes in the module's AST. Arguments: module {TreeNode} -- TreeNode representing a module root node. Returns: list[TreeNode] -- List of all the nodes in the module's AST. + """ module_nodes = [] node_queue = deque([module]) @@ -103,7 +107,8 @@ def _flatten_module_nodes(module): def get_modules_from_dir(directory): """ - Finds all *.py files in the specified directory recursively. + Find all *.py files in the specified directory recursively. + Every file is parsed as a module and converted into an AST. The parsed ASTs are converted into lists of all nodes in the ASTs. A list of all these lists is then constructed a returned. @@ -113,24 +118,7 @@ def get_modules_from_dir(directory): Returns: list[list[TreeNode]] -- List of lists of nodes from parsed modules. - """ - - return [_flatten_module_nodes(_get_tree_node_from_file(f)) - for f in _recursive_listdir_py(directory)] - - -def get_repo_modules_and_info(repo): - """ - Clones the repository or finds its directory and then finds - all modules inside of that directory and returns them. - - Arguments: - repo {string} -- Repository path. 
- Returns: - list[list[TreeNode]] -- List of lists of nodes from parsed modules. - ClonedRepo -- Information about the cloned repository. """ - - info = get_repo_info(repo) - return get_modules_from_dir(info.dir) if info else None, info + return [_flatten_module_nodes(_get_tree_node_from_file(f, directory)) + for f in _recursive_listdir_py(directory)] diff --git a/engine/preprocessing/repo_cloner.py b/engine/preprocessing/repo_cloner.py deleted file mode 100644 index eee35b7..0000000 --- a/engine/preprocessing/repo_cloner.py +++ /dev/null @@ -1,121 +0,0 @@ -import re -from os import path, makedirs -from os.path import isdir, dirname -from git import Repo, InvalidGitRepositoryError, GitCommandError -from engine import __file__ as base_path -from ..errors.UserInputError import UserInputError -from ..utils.config import config - -# Base directory for all cloned repositories is "[main module root directory]/repos/". -clone_root_dir = path.join(dirname(base_path), "repos") - - -class RepoInfo: # TODO: Add docstrings. - def __init__(self, url, server, user, name): - self.url = url - self.server = server - self.user = user - self.name = name - - self.dir = path.join(clone_root_dir, server, user, name) - self.hash = None - - -def _clone_repo(repo_url): - """ - Clones the specified repository into a special internal directory and - returns the directory path of the cloned repository. - - Arguments: - repo_url {string} -- URL of the repository to clone. - - Returns: - ClonedRepo -- Information about the cloned repository. - """ - - # Make sure the base clone dir exists. - makedirs(clone_root_dir, exist_ok=True) - - # NOTE: Only standard GitHub and GitLab are currently properly supported. - match = re.fullmatch( - r"^(?:https?://)?(?:[\w\-\.]*\.)?([\w\-]+)\.\w{1,10}/([\w\-]+)/([\w\-]+)(?:/?\.git)?/?$", repo_url) - - if not match: - return None - - info = RepoInfo(repo_url, match[1], match[2], match[3]) - - try: - # If repo dir already exists, pull it. 
- if isdir(info.dir): - repo = Repo(info.dir) - repo.remotes.origin.pull() - - # If the repo hasn't been cloned yet, clone it. - else: - repo = Repo.clone_from(info.url, info.dir) - - # Get HEAD's hash and store it in repo info. - info.hash = repo.head.object.hexsha - - except InvalidGitRepositoryError: - return None - - except GitCommandError: - return None - - return info - - -def _clone_github_short(short_path): # TODO: Add docstring. - if re.fullmatch(r"^[\w\-]+/[\w\-]+(?:\.git)?$", short_path): - return _clone_repo("https://github.com/" + short_path) - else: - return None - - -def get_repo_or_dir(repo): - """ - Attempts to process the given repository path in many different ways. - If all of them fail, an error message will be printed and - the script with exit with a non-zero exit code. - If one of them succeeds, local path of the repository will be returned. - - Arguments: - repo {string} -- Path to the repository or local directory. - - Returns: - string -- Local path to the repository's directory. - """ - - # TODO: This option should probably be removed in the future. - # It is more confusing than it is practical now. - - # Path of a previously cloned repository: "[server]/[user]/[repo name]" - repo_dir_by_name = path.join(clone_root_dir, repo) - if re.fullmatch(r"^[\w\-]+/[\w\-]+/[\w\-]+$", repo) and isdir(repo_dir_by_name): - return repo_dir_by_name - - # Shorthand for GitHub URLs: "[repository owner]/[repository name]" - repo_info = _clone_github_short(repo) - if repo_info: - return repo_info.dir - - # Local directory path - if isdir(repo): - if config.allow_local_access: - return repo - else: - raise UserInputError( - f"Access to local directory denied: \"{repo}\"") - - # Full remote repository URL - repo_info = _clone_repo(repo) - if repo_info: - return repo_info.dir - - raise UserInputError(f"Invalid repository path: \"{repo}\"") - - -def get_repo_info(repo): # TODO: Add docstring. 
- return _clone_github_short(repo) or _clone_repo(repo) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py new file mode 100644 index 0000000..c582b7c --- /dev/null +++ b/engine/preprocessing/repoinfo.py @@ -0,0 +1,131 @@ +"""Module containing the `RepoInfo` class.""" + +import re +from os.path import isdir, dirname, join as path_join +from git import Repo, InvalidGitRepositoryError, GitCommandError +from engine import __file__ as base_path +from urllib.parse import urlparse, urlunparse + +# Base directory for all cloned repositories is "[main module root directory]/repos/". +clone_root_dir = path_join(dirname(base_path), "repos") + + +class RepoInfo: + """ + Encapsulates all available information about a repository into a single object. + + Attributes: + url {string} -- Full remote source URL of the repository. + server {string} -- Name of the source server (e.g., "github.com"). + user {string} -- Username of the repository owner. + name {string} -- Name of the repository on the server. + dir {string} -- Path to the local clone of the repository. + hash {string} -- Hash of the last pulled commit. + + """ + + def __init__(self, url, server, user, name, local_dir, commit_hash=None): + """ + Initialize a new repository information object. + + Arguments: + url {string} -- Full remote source URL of the repository. + server {string} -- Name of the source server (e.g., "github.com"). + user {string} -- Username of the repository owner. + name {string} -- Name of the repository on the server. + local_dir {string} -- Path to the local clone of the repository. + commit_hash {string} -- Hash of the last pulled commit. + + """ + self.url = url + self.server = server + self.user = user + self.name = name + self.dir = local_dir + self.hash = commit_hash + + def clone_or_pull(self): + """Clone the repository or pull it if it has already been cloned.""" + try: + # If repo dir already exists, pull it. 
+ if isdir(self.dir): + repo = Repo(self.dir) + repo.remotes.origin.pull() + + # If the repo hasn't been cloned yet, clone it. + else: + repo = Repo.clone_from(self.url, self.dir) + + # Get HEAD's hash and store it in repo info. + self.hash = repo.head.object.hexsha + return True + + except InvalidGitRepositoryError: + return False + + except GitCommandError: + return False + + @staticmethod + def parse_repo_info(repo_path): + """ + Parse repository information from a repository path. + + There are two valid repository path formats: + - Full remote repository URL (supports both GitHub and GitLab). + "https://github.com/user/repo" + - Short GitHub repository URL (only works with GitHub). + "user/repo" + + """ + try: + parts = urlparse(repo_path) + except ValueError: + return None + + if parts.username or parts.password or parts.params or parts.query or \ + parts.fragment or parts.scheme not in {"https", "http", ""}: + return None + + path_match = re.fullmatch( + r"/*([\w\-\.]+)/*([\w\-\.]+?)(?:\.git)?/*", parts.path) + + if not path_match: + # If there is no scheme, try to prepend HTTPS + return None if parts.scheme else \ + RepoInfo.parse_repo_info("https://" + repo_path) + + repo_user = path_match[1] + repo_name = path_match[2] + + scheme = parts.scheme or "https" + server = parts.netloc or "github.com" + + server_regex = re.compile(r"^(?:www\.)?(git(?:hub|lab)\.com)$", + re.IGNORECASE) + + server_match = server_regex.fullmatch(server) + if parts.netloc and server_match: + scheme = "https" + server = server_match[1].lower() + + # Inserting ":@" before hostname prevents a username/password prompt. 
+ full_url = urlunparse((scheme, ":@" + server, + f"/{repo_user}/{repo_name}", "", "", "")) + + clone_dir = path_join(clone_root_dir, server, repo_user, repo_name) + + return RepoInfo(full_url, server, repo_user, repo_name, clone_dir) + + def __str__(self): + """Convert the most useful repo info into a human-readable string.""" + info_str = f"{self.url} -> {self.dir}" + + if self.hash: + info_str += f" (commit: {self.hash})" + + return info_str + + def __repr__(self): + """Return string representation of the repository information.""" + return self.__str__() diff --git a/engine/results/__init__.py b/engine/results/__init__.py index e69de29..b4d050a 100644 --- a/engine/results/__init__.py +++ b/engine/results/__init__.py @@ -0,0 +1 @@ +"""Package containing classes for storing clone detection results.""" diff --git a/engine/results/DetectedClone.py b/engine/results/detected_clone.py similarity index 83% rename from engine/results/DetectedClone.py rename to engine/results/detected_clone.py index 052d164..f3c1c47 100644 --- a/engine/results/DetectedClone.py +++ b/engine/results/detected_clone.py @@ -1,6 +1,9 @@ +"""Module containing the `DetectedClone` class for storing clone information.""" + + class DetectedClone: """ - Represents a single detected code clone. + Representation of a single detected code clone. Similarity coefficient is a floating-point number between 0 and 1, where 0 means the subtrees are completely different and 1 means @@ -12,26 +15,27 @@ class DetectedClone: origins {dict[string: float]} -- Origins and similarity coefficients. Origins are used for keys. Similarity coefficients are values. + """ def __init__(self, value, match_weight, nodes): """ - Initializes a new detected clone - given its values and origin nodes. + Initialize a new detected clone given its values and origin nodes. Arguments: value {string} -- String representation common to all the nodes. match_weight {int} -- Weight of the matching subtree skeleton. 
nodes {list[TreeNode]} -- List of origin nodes. - """ + """ self.value = value self.match_weight = match_weight self.origins = {n.origin: match_weight / n.weight for n in nodes} def dict(self): """ - Converts the detected clone into its dictionary representation. + Convert the detected clone into its dictionary representation. + This is necessary for later conversion to JSON, because there is no easy way to tell the JSON encoder how to encode instances of user-defined classes. @@ -39,6 +43,6 @@ def dict(self): Returns: dict -- Dictionary representation of the detected clone, including all of its attributes. - """ + """ return self.__dict__ diff --git a/engine/results/DetectionResult.py b/engine/results/detection_result.py similarity index 79% rename from engine/results/DetectionResult.py rename to engine/results/detection_result.py index d1e286f..ead0aee 100644 --- a/engine/results/DetectionResult.py +++ b/engine/results/detection_result.py @@ -1,9 +1,11 @@ +"""Module containing the `DetectionResult` class for final results of detection.""" + from json import dumps as json_dumps class DetectionResult: """ - Represents the final result of a detection query. + Representation of the final result of a detection query. For now it only contains a list of detected clones, but more information may be added in the future. @@ -11,28 +13,32 @@ class DetectionResult: Attributes: clones {list[DetectedClone]} -- List of detected code clones ordered by their significance (see __init__). + """ def __init__(self, clones): """ - Initializes a new detection result given the list of detected clones. + Initialize a new detection result given the list of detected clones. + The list of code clones will be copied and the copy will be sorted by the clones' weight of their matching subtrees. The original list of clones will not be modified in any way. Arguments: clones {list[Detectedlone]} -- List of detected code clones. 
- """ + """ self.clones = clones.copy() self.clones.sort(reverse=True, key=lambda c: c.match_weight) def json(self): """ - Converts the detection result into a JSON. + Convert the detection result into a JSON. + This includes information about all detected code clones. Returns: string -- JSON representation of the detection result. + """ return json_dumps([c.dict() for c in self.clones]) diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py index e69de29..8c02ab6 100644 --- a/engine/utils/__init__.py +++ b/engine/utils/__init__.py @@ -0,0 +1 @@ +"""Package containing helper functions that do not fit into any existing category.""" diff --git a/engine/utils/benchmark.py b/engine/utils/benchmark.py index 0e136ad..3b636dc 100644 --- a/engine/utils/benchmark.py +++ b/engine/utils/benchmark.py @@ -1,3 +1,5 @@ +"""Module containing helper functions used for benchmarking.""" + from time import time from fastlog import log @@ -6,11 +8,13 @@ def time_snap(text=None): """ - Prints the time since the last call to this function in seconds. + Print the time since the last call to this function in seconds. + It is possible to supply a message to print along with the time. Arguments: text {str} (optional) -- Message to print with the time. + """ global _last_time current_time = time() diff --git a/engine/utils/config.py b/engine/utils/config.py deleted file mode 100644 index 04b51ba..0000000 --- a/engine/utils/config.py +++ /dev/null @@ -1,8 +0,0 @@ -class _Config: - def __init__(self): - # Boolean value that allows or denies access to local file system. - self.allow_local_access = True - - -# Global app configuration -config = _Config() diff --git a/engine/utils/list_tools.py b/engine/utils/list_tools.py deleted file mode 100644 index 8c36320..0000000 --- a/engine/utils/list_tools.py +++ /dev/null @@ -1,16 +0,0 @@ -def flatten(list_of_lists): - """ - Flattens a list of list into a single flat list. 
- - Arguments: - list_of_lists {list[list[T]]} -- List of lists to flatten. - - Returns: - list[T] -- Flat list generated by flattening the source list of lists. - """ - flat = [] - - for l in list_of_lists: - flat.extend(l) - - return flat diff --git a/engine/utils/printing.py b/engine/utils/printing.py index d511afd..35cdd69 100644 --- a/engine/utils/printing.py +++ b/engine/utils/printing.py @@ -1,9 +1,13 @@ +"""Module containing methods for pretty-printing node trees.""" + + def print_node_list(node_list): """ - Prints a list of TreeNodes for debugging + Print a list of TreeNodes for debugging. Arguments: node_list (list[TreeNode]): a list of tree nodes + """ for node in node_list: if node.parent_index is None: @@ -12,13 +16,14 @@ def print_node_list(node_list): def print_node(node, indent, level, node_list): """ - Prints a TreeNode for debugging + Print a TreeNode for debugging. Arguments: node (TreeNode): node to print indent (str): space to print before node level (int): depth of node within the tree (0 for root) node_list (list[TreeNode]): list of TreeNodes to reference children of TreeNode + """ print(indent, "(", level, ")", node) for index in node.child_indices: diff --git a/profiler.bat b/profiler.bat index 36a7e7d..957ec5a 100644 --- a/profiler.bat +++ b/profiler.bat @@ -1 +1 @@ -python -m cProfile -o profiling.dat -m code_duplication > output.txt +python -m cProfile -o profiling.dat -m cli > output.txt diff --git a/requirements.txt b/requirements.txt index 1931992..48546b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,12 @@ -# GitPython - Package used for cloning Git(Hub) repositories +# Interface for working with Git repositories gitpython -# Bit String - Package used for "work list" array in Lee's algorithm +# "Work list" binary table in Lee's algorithm (Iodine) bitstring # Fastlog uses the "curses" package, which is part of stdlib on Linux, but not on Windows windows-curses; platform_system == 'Windows' -# Used for better-looking 
logging +# Prettier, uniform log messages fastlog -# Front-end +# Web UI back-end flask +# Web UI database +easy-postgres diff --git a/test/__init__.py b/test/__init__.py index e69de29..81fb5b6 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -0,0 +1 @@ +"""Package containing tests for all parts of this application (engine, interfaces).""" diff --git a/web/__init__.py b/web/__init__.py index e69de29..f022c52 100644 --- a/web/__init__.py +++ b/web/__init__.py @@ -0,0 +1 @@ +"""Package containing implementation of the web interface.""" diff --git a/web/__main__.py b/web/__main__.py index 6475d15..860be11 100644 --- a/web/__main__.py +++ b/web/__main__.py @@ -1,3 +1,5 @@ +"""Entry point of the web interface application.""" + from .app import app if __name__ == "__main__": diff --git a/web/app.py b/web/app.py index 4241e21..d03eb52 100644 --- a/web/app.py +++ b/web/app.py @@ -1,149 +1,166 @@ +"""Module containing the core of the web UI application.""" + import os.path from threading import Thread +from traceback import format_exc from flask import Flask, request from fastlog import log -from psycopg2 import connect, Error as PG_Error -from engine.preprocessing.module_parser import get_repo_modules_and_info -from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN #, CHLORINE, IODINE -from engine.utils.config import config -from engine.errors.UserInputError import UserInputError -from .credentials import conn_str - -# Disable access to local file system -config.allow_local_access = False +from psycopg2 import Error as PG_Error +from easy_postgres import Connection as pg_conn +from engine.preprocessing.repoinfo import RepoInfo +from engine.nodes.nodeorigin import NodeOrigin +from engine.preprocessing.module_parser import get_modules_from_dir +from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN +from engine.errors.user_input import UserInputError +from .credentials import db_url app = Flask(__name__) +# Clean up the repository table 
+with pg_conn(db_url) as conn: + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'err_analysis') WHERE status = (SELECT id FROM states WHERE name = 'queue');""") + def _read_html(file_name): file_path = os.path.join(os.path.dirname(__file__), file_name + ".html") with open(file_path, "r", encoding="utf-8") as f: return f.read() - + _INDEX_HTML = _read_html("index") _MESSAGE_HTML = _read_html("message") _RESULTS_HTML = _read_html("results") -def _analyze_repo(repo): - try: - conn = connect(conn_str) - cur = conn.cursor() +def _postgres_err(ex): + log.error(f"PostgreSQL: {ex}\n{format_exc()}") - modules, repo_info = get_repo_modules_and_info(repo) - if not modules or not repo_info: - log.error("Unable to get the repository information") - return +def _pg_error_handler(ex, conn, repo_id): + _postgres_err(ex) - cur.execute("""SELECT COUNT(*) FROM repos WHERE url = %s OR dir = %s OR ("server" = %s AND "user" = %s AND "name" = %s);""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) + if conn and repo_id is not None: + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'err_analysis') WHERE id = %s;""", + repo_id) - count = cur.fetchone()[0] - if count: - return +def _analyze_repo(repo_info, repo_id, algorithm=OXYGEN): + log.info(f"Analyzing repository: {repo_info}") - cur.execute("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) + try: + conn = pg_conn(db_url) - repo_id = cur.fetchone()[0] + if repo_info.clone_or_pull(): + log.success( + f"Repository has been successfully cloned: {repo_info}") - cur.execute( - """INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", (repo_id, repo_info.hash)) + else: + log.warning(f"Unable to clone repository: {repo_info}") - commit_id = cur.fetchone()[0] + conn.run("""UPDATE repos SET status = 
(SELECT id FROM states WHERE name = 'err_clone') WHERE id = %s;""", + repo_id) - conn.commit() + return + + modules = get_modules_from_dir(repo_info.dir) + + if not modules: + log.warning("Repository contains no Python module") + return - result = run_single_repo(modules, OXYGEN) + result = run_single_repo(modules, algorithm) - for c in result.clones: - cur.execute("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", - (commit_id, c.value, c.match_weight)) + # Insert repository analysis into database all at once + with conn.transaction(): + commit_id = conn.one("""INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", + repo_id, repo_info.hash) - cluster_id = cur.fetchone()[0] + for c in result.clones: + cluster_id = conn.one("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", + commit_id, c.value, c.match_weight) - for o, s in c.origins.items(): - cur.execute( - """INSERT INTO clones (cluster_id, origin, similarity) VALUES (%s, %s, %s);""", (cluster_id, o, s)) + for o, s in c.origins.items(): + conn.run("""INSERT INTO origins (cluster_id, file, line, col_offset, similarity) VALUES (%s, %s, %s, %s, %s);""", + cluster_id, o.file, o.line, o.col_offset, s) - cur.execute( - """UPDATE commits SET finished = TRUE WHERE id = %s;""", (commit_id,)) + log.success(f"Repository has been successfully analyzed: {repo_info}") - conn.commit() + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'done') WHERE id = %s;""", + repo_id) except PG_Error as ex: - log.error("PostgreSQL: " + str(ex)) + _pg_error_handler(ex, conn, repo_id) finally: - if conn: - cur.close() - conn.close() + conn.close() -def _get_repo_analysis(repo): # TODO: Add docstring. 
- try: - conn = connect(conn_str) - conn.autocommit = True - - cur = conn.cursor() +def _find_repo_results(conn, repo_id): + commit_id = conn.one("""SELECT id FROM commits WHERE repo_id = %s ORDER BY analyzed_at DESC LIMIT 1;""", + repo_id) - cur.execute( - """SELECT id FROM repos WHERE "url" = %s OR "name" = %s;""", (repo, repo)) + if commit_id is None: + return "No commit has been analyzed yet for this repository" - repos = cur.fetchall() + clusters = conn.all_dict("""SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", + commit_id) - if repos: - repo_id = repos[0][0] + for c in clusters: + c.origins = [(NodeOrigin(o.file, o.line, o.col_offset), o.similarity) for o in + conn.all_dict("""SELECT file, line, col_offset, similarity FROM origins WHERE cluster_id = %s;""", + c.id)] - cur.execute( - """SELECT id FROM commits WHERE finished AND repo_id = %s;""", (repo_id,)) + return clusters - commits = cur.fetchall() - if commits: - commit_id = commits[0][0] +def _get_repo_analysis(repo_path): + # Strip leading and trailing whitespace from the path and parse repo info. 
+ repo_info = RepoInfo.parse_repo_info(repo_path.strip()) - cur.execute( - """SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", (commit_id,)) + if not repo_info: + return "Invalid Git repository path format" - clusters = cur.fetchall() + try: + conn = pg_conn(db_url) - output = [] + repo_id = conn.one("""INSERT INTO repos ("url", "server", "user", "name", "dir", "status") """ + + """VALUES (%s, %s, %s, %s, %s, (SELECT id FROM states WHERE name = 'queue')) """ + + """ON CONFLICT DO NOTHING RETURNING id;""", + repo_info.url, repo_info.server, repo_info.user, repo_info.name, repo_info.dir) - for c in clusters: - cur.execute( - """SELECT origin, similarity FROM clones WHERE cluster_id = %s;""", (c[0],)) + if repo_id is not None: + Thread(target=_analyze_repo, args=(repo_info, repo_id)).start() + return "The repository has been added to the queue" - clones = cur.fetchall() + repo = conn.one_dict("""SELECT repos.id, states.name AS "status_name", states.description AS "status_desc" """ + + """FROM repos JOIN states ON (repos.status = states.id) """ + + """WHERE repos.url = %s OR (repos.server = %s AND repos.user = %s AND repos.name = %s) OR repos.dir = %s;""", + repo_info.url, repo_info.server, repo_info.user, repo_info.name, repo_info.dir) - output.append((c, clones)) + # Theoretically, this should never happend, but it's better to check anyways. 
+ if repo is None: + return "Database error" - return output + elif repo.status_name in {"queue", "err_clone", "err_analysis"}: + return repo.status_desc - else: - return "Enqueued" + elif repo.status_name == "done": + return _find_repo_results(conn, repo.id) else: - thread = Thread(target=_analyze_repo, args=(repo,)) - thread.start() - # _analyze_repo(repo) - return "Added to queue" + return "Unexpected repository status" except PG_Error as ex: - log.error("PostgreSQL: " + str(ex)) - return None + _pg_error_handler(ex, conn, repo_id) + return "Database error" finally: - if conn: - cur.close() - conn.close() + conn.close() @app.route("/") -def hello(): +def web_index(): + """Homepage of the web interface.""" content = "" repo = request.args.get("repo") @@ -152,17 +169,17 @@ def hello(): result = _get_repo_analysis(repo) if isinstance(result, str): - content = _MESSAGE_HTML.replace("#MSG#", "Result: " + result) + content = _MESSAGE_HTML.replace("#MSG#", result) elif result: - clones = "
    " + "".join([("
  1. " + c[0][1] + f" - Weight: {c[0][2]}" + "
      " + - "".join(["
    • " + o[0] + f" - Similarity: {o[1] * 100:g} %" + "
    • " for o in c[1]]) + - "
  2. ") for c in result]) + "
" + clones = "
    " + "".join([(f"
  1. {c.value} - Weight: {c.weight}
      " + + "".join([f"
    • {o[0]} - Similarity: {o[1] * 100:g} %
    • " for o in c.origins]) + + "

  2. ") for c in result]) + "
" content = _RESULTS_HTML.replace("#CLONES#", clones) else: content = _MESSAGE_HTML.replace( - "#MSG#", "

No code clones detected. Congratulations!

") + "#MSG#", "No code clones detected. Congratulations!") except UserInputError as ex: content = _MESSAGE_HTML.replace( diff --git a/web/message.html b/web/message.html index 063a983..acc3da9 100644 --- a/web/message.html +++ b/web/message.html @@ -1,3 +1,6 @@ -
- #MSG# + +
+

+ #MSG# +

diff --git a/web/prepare_tables.pgsql b/web/prepare_tables.pgsql index b87b158..0cc6220 100644 --- a/web/prepare_tables.pgsql +++ b/web/prepare_tables.pgsql @@ -1,15 +1,32 @@ -DROP TABLE IF EXISTS clones; +DROP TABLE IF EXISTS origins; DROP TABLE IF EXISTS clusters; DROP TABLE IF EXISTS commits; DROP TABLE IF EXISTS repos; +DROP INDEX IF EXISTS states_name_index; +DROP TABLE IF EXISTS states; + +CREATE TABLE states ( + id SERIAL PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + description TEXT +); + +INSERT INTO states (name, description) VALUES + ('queue', 'The repository is already in the queue'), + ('err_clone', 'Error: Unable to clone the repository'), + ('err_analysis', 'Error: Repository analysis failed'), + ('done', 'The repository has been successfully analyzed'); + +CREATE INDEX states_name_index ON states (name); CREATE TABLE repos ( id SERIAL PRIMARY KEY, url TEXT UNIQUE NOT NULL, - dir TEXT UNIQUE NOT NULL, "server" TEXT NOT NULL, "user" TEXT NOT NULL, "name" TEXT NOT NULL, + dir TEXT UNIQUE NOT NULL, + status INTEGER REFERENCES states(id) NOT NULL, UNIQUE("server", "user", "name") ); @@ -17,8 +34,7 @@ CREATE TABLE commits ( id SERIAL PRIMARY KEY, repo_id INTEGER REFERENCES repos(id) NOT NULL, hash TEXT NOT NULL, - finished BOOLEAN NOT NULL DEFAULT FALSE, - date TIMESTAMP NOT NULL DEFAULT NOW(), + analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL, UNIQUE(repo_id, hash) ); @@ -29,9 +45,12 @@ CREATE TABLE clusters ( weight INTEGER NOT NULL ); -CREATE TABLE clones ( +CREATE TABLE origins ( id SERIAL PRIMARY KEY, cluster_id INTEGER REFERENCES clusters(id) NOT NULL, - origin TEXT NOT NULL, - similarity FLOAT NOT NULL + file TEXT NOT NULL, + line INTEGER, + col_offset INTEGER, -- column offset (number of characters on the same line before the token) + similarity FLOAT NOT NULL, + UNIQUE(cluster_id, file, line, col_offset) ); diff --git a/web/results.html b/web/results.html index c7ebcd1..4381fd1 100644 --- a/web/results.html +++ b/web/results.html @@ 
-1,4 +1,4 @@ -
+
Detected clones
#CLONES#