Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
2eb0e68
Add postgres to requirements.txt; Close calebdehaan#92
natiiix Jul 30, 2019
d9ba102
Visual tweak: fit longer origin strings on screen
natiiix Jul 30, 2019
2732b7a
Decrease subtree weight threshold in Oxygen
natiiix Jul 30, 2019
d5caad9
Switch from raw psycopg2 to postgres package / lib
natiiix Jul 30, 2019
f7a3564
Switch from postgres.py to easy-postgres
natiiix Jul 31, 2019
8797e97
Switch postgres for easy-postgres in requirements
natiiix Jul 31, 2019
e493937
Remove print, add warning on duplicate repo
natiiix Jul 31, 2019
a6efd86
Restructure database to include more info
natiiix Jul 31, 2019
3c719ad
Add "valid" column to repos table
natiiix Aug 1, 2019
7707e3e
Move RepoInfo to its own module
natiiix Aug 1, 2019
4368c7f
Implement RepoInfo parser (from repo path)
natiiix Aug 1, 2019
7d60da8
Implement method to clone / pull repo via RepoInfo
natiiix Aug 1, 2019
025484d
Modify repo path regex to ignore ".git" at the end
natiiix Aug 1, 2019
3f2ecfb
Switch from valid bool to status table
natiiix Aug 1, 2019
2bd998b
Remove old repo cloner module
natiiix Aug 1, 2019
04cfaf0
Add new path handler module for path-related logic
natiiix Aug 1, 2019
a91185e
Add check for empty URL params
natiiix Aug 1, 2019
3afb5cf
Remove config import and set up from web app
natiiix Aug 1, 2019
ba7a75a
Remove global config module altogether
natiiix Aug 1, 2019
10c2c8f
Remove unused imports; Add missing class reference
natiiix Aug 1, 2019
ad46862
Switch to new repo path parsing functions
natiiix Aug 1, 2019
15551f3
Replace old repo cloning code reference with new
natiiix Aug 1, 2019
278178a
Move args_handler and path_handler to CLI module
natiiix Aug 1, 2019
5d24f2b
Update code_duplication mentions to new names
natiiix Aug 1, 2019
8e26f7c
Add docstrings to results modules
natiiix Aug 1, 2019
e5607e8
Add docstrings to utils modules
natiiix Aug 1, 2019
63570f7
Fix docstyle in CLI
natiiix Aug 1, 2019
21288fe
Add docstring to web app
natiiix Aug 1, 2019
cbac8c5
Fix docstyle in algorithms module
natiiix Aug 1, 2019
7748fca
Add docstring to UserInputError
natiiix Aug 1, 2019
6215a1b
Fix docstyle in node classes
natiiix Aug 1, 2019
93f55ba
Fix docstyle in preprocessing module
natiiix Aug 1, 2019
25b3b34
Fix docstyle in result classes
natiiix Aug 1, 2019
921908d
Fix docstyle in utils module
natiiix Aug 1, 2019
0a4a6bc
Rename "/" web app route from hello()
natiiix Aug 2, 2019
02a8593
Add reminder to fix PatternNode to-str conversion
natiiix Aug 2, 2019
1c3c7d2
Move results towards center of screen
natiiix Aug 2, 2019
02f990d
Make all messages larger
natiiix Aug 2, 2019
2f56dfa
Split invalid state into errors; Impr. commits tab
natiiix Aug 2, 2019
8751ce2
Implement NodeOrigin instead of using plain string
natiiix Aug 2, 2019
e1c25a8
Make relative origin file paths less verbose
natiiix Aug 2, 2019
10b8958
Add to-str conversion to RepoInfo
natiiix Aug 2, 2019
e2acb66
Switch from raw string to NodeOrigin in TreeNode
natiiix Aug 2, 2019
5ac6754
Rework web app to work with new database layout
natiiix Aug 2, 2019
a6cbb90
Remove node ID from NodeOrigin; get it at runtime
natiiix Aug 2, 2019
616ce8e
Change all module names to lower-case / snake_case
natiiix Aug 5, 2019
2b1297d
Update imports to new lower-case module names
natiiix Aug 5, 2019
8289052
Add missing trailing newlines to docstrings
natiiix Aug 5, 2019
5800695
Add docstrings to NodeOrigin
natiiix Aug 5, 2019
8fe30e9
Add package docstrings to algorithms
natiiix Aug 5, 2019
338dfce
Add missing package docstrings to engine
natiiix Aug 5, 2019
a0ce2a2
Add missing docstrings to engine/nodes/
natiiix Aug 5, 2019
fda89b7
Add ctor docstring to UserInputError
natiiix Aug 5, 2019
85ca144
Add docstring to utils package
natiiix Aug 5, 2019
4bc5822
Partially fix docstrings in Iodine package
natiiix Aug 5, 2019
7f9ce2d
Fix docstrings in Chlorine
natiiix Aug 5, 2019
1a03724
Fix docstrings in Oxygen
natiiix Aug 5, 2019
1865a13
Fix docstrings in nodes package
natiiix Aug 5, 2019
fc8288d
Make .json, .err and .log rules more specific
natiiix Aug 5, 2019
83ce041
Add missing periods (.) to printing.py docstrings
natiiix Aug 5, 2019
862edb5
Add missing docstrings to RepoInfo
natiiix Aug 5, 2019
65f0165
Add missing docstrings to CLI package
natiiix Aug 5, 2019
f29c721
Fix docstrings in web/ and test/ packages
natiiix Aug 5, 2019
2face4d
Fix linter errors and warnings
natiiix Aug 5, 2019
01f599f
Make UserInputError import absolute
natiiix Aug 5, 2019
c1e2654
Add GitHub/GitLab URL sanitization to RepoInfo
natiiix Aug 5, 2019
892fe59
Strip whitespace from repo URL in web app
natiiix Aug 5, 2019
62bf540
Change "representation" class docstring wording
natiiix Aug 6, 2019
b900817
Delete iter_tools module #95
natiiix Aug 6, 2019
1495ef8
Replace flatten(); Close calebdehaan#95
natiiix Aug 6, 2019
c2b90f0
Cast result of chain.from_iterable to list
natiiix Aug 6, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
engine/repos/
venv/
web/credentials.py
**/*.json
*.err
*.log
clones_*.json
qa/*.err
qa/*.log
2 changes: 1 addition & 1 deletion clear_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

set -e

psql -f web/prepare_tables.pgsql code_duplication
psql -f web/prepare_tables.pgsql cyclone
rm -rf engine/repos/
1 change: 1 addition & 0 deletions cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package containing implementation of the application's CLI (command line interface)."""
6 changes: 6 additions & 0 deletions cli/__main__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
"""
Intended entry point of the application's CLI.

Please use the following command to run the CLI: `python3 -m cli`
"""

from cli.app import main

if __name__ == "__main__":
Expand Down
14 changes: 7 additions & 7 deletions cli/app.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
"""Module containing the CLI's core logic."""

import sys
import os
from datetime import datetime
from engine.preprocessing.args_handler import handle_args
from .args_handler import handle_args
from engine.preprocessing.module_parser import get_modules_from_dir
from engine.algorithms.algorithm_runner import run_two_repos, IODINE
from engine.utils.benchmark import time_snap
from fastlog import log
from engine.errors.UserInputError import UserInputError
from engine.errors.user_input import UserInputError


def main():
"""
Entry point of the application.
"""

"""Entry point of the application."""
try:
# Parse command line arguments
repos = handle_args(sys.argv)
Expand Down Expand Up @@ -46,7 +45,8 @@ def main():
# Create output directory if it doesn't exist and print output
output_path = os.getcwd()
now = datetime.now()
output_filename = "clones_" + f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json"
output_filename = "clones_" + \
f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json"
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, output_filename), "w") as output_file:
output_file.write(clones.json())
Expand Down
75 changes: 75 additions & 0 deletions cli/args_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Module containing functions for handling command-line arguments supplied by the user."""

from os.path import isdir
from engine.errors.user_input import UserInputError
from engine.preprocessing.repoinfo import RepoInfo


_USAGE_TEXT = """\
Usage:
python3 -m cli <first repository> <second repository> - Repository comparison mode
python3 -m cli <repository> - Single repository mode

Valid repository path formats:
Short GitHub repository path - username/repository
Full remote repository path - https://github.com/username/repository
Absolute or relative local directory path - /home/user/directory"""


def repo_path_to_local_path(repo_path):
    """
    Resolve a repository path to a directory on the local file system.

    Besides the repository path formats understood by RepoInfo,
    an already existing local directory is accepted and used as-is.

    Any other value is handed to `RepoInfo.parse_repo_info` and the
    resulting repository is then cloned or pulled via `clone_or_pull`.

    Arguments:
        repo_path {string} -- Path of a remote repository or a local directory.

    Returns:
        string -- Path of the local directory that corresponds to
            the repository path. For an existing local directory
            this is the very same path that was passed in.
            None is returned if the path cannot be parsed or
            if the clone / pull does not succeed.

    """
    # Existing local directories need no parsing or cloning.
    if isdir(repo_path):
        return repo_path

    info = RepoInfo.parse_repo_info(repo_path)

    # The path could not be parsed into repository info.
    if not info:
        return None

    # The repository could not be cloned or pulled.
    if not info.clone_or_pull():
        return None

    return info.dir


def handle_args(argv):
"""
Check the command line arguments and handle them.

If there is any problem, an error message will be printed
and the script will exit with a non-zero exit code.
If everything goes right, tuple of local repository paths will be returned.

Arguments:
argv -- List of command line arguments.

Returns:
tuple[string] -- Tuple of local repository paths.

"""
if len(argv) == 1 or (len(argv) == 2 and argv[1] in ['-h', '--help', '--usage']):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now you might start to accept that the proposed argparse module might be a better approach :)

https://docs.python.org/3.5/library/argparse.html

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it most certainly is, especially once we begin to allow the user to specify options for the algorithms. I plan on switching to it as soon as I have some tests for the core/engine.

# Special case where the usage text is printed using the built-in
# print function instead of the logging library because
# the app exits right after the message is displayed.
print(_USAGE_TEXT)
raise UserInputError(None, 0)

if len(argv) < 2 or len(argv) > 3:
raise UserInputError(
f"Invalid number of command line arguments: {len(argv) - 1}")

return tuple(repo_path_to_local_path(a) for a in argv[1:])
1 change: 1 addition & 0 deletions engine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package containing the entire code clone detection engine."""
2 changes: 2 additions & 0 deletions engine/algorithms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Package containing all implemented clone detection algorithms."""

OXYGEN = "oxygen"
CHLORINE = "chlorine"
IODINE = "iodine"
12 changes: 7 additions & 5 deletions engine/algorithms/algorithm_runner.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Module used for algorithm abstraction by providing a common interface."""

from .oxygen.oxygen import oxygen
from .chlorine.chlorine import chlorine_single_repo, chlorine_two_repos
from .iodine.iodine import iodine
from ..errors.UserInputError import UserInputError
from engine.errors.user_input import UserInputError
from . import OXYGEN, IODINE, CHLORINE


def run_single_repo(modules, algorithm):
"""
Runs the specified code clone detection algorithm on a single repository.
Run the specified code clone detection algorithm on a single repository.

Arguments:
modules {list[list[TreeNode]]} -- List of the repo's modules.
Expand All @@ -18,8 +20,8 @@ def run_single_repo(modules, algorithm):

Returns:
DetectionResult -- Result of the code clone detection.
"""

"""
if algorithm == OXYGEN:
return oxygen(modules)
elif algorithm == CHLORINE:
Expand All @@ -30,7 +32,7 @@ def run_single_repo(modules, algorithm):

def run_two_repos(modules1, modules2, algorithm):
"""
Runs the specified code clone detection algorithm on two repositores.
Run the specified code clone detection algorithm on two repositories.

Arguments:
modules1 {list[list[TreeNode]]} -- List of first repo's modules.
Expand All @@ -42,8 +44,8 @@ def run_two_repos(modules1, modules2, algorithm):

Returns:
DetectionResult -- Result of the code clone detection.
"""

"""
if algorithm == CHLORINE:
return chlorine_two_repos(modules1, modules2)
elif algorithm == IODINE:
Expand Down
1 change: 1 addition & 0 deletions engine/algorithms/chlorine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package containing the Chlorine algorithm and its helper functions."""
40 changes: 23 additions & 17 deletions engine/algorithms/chlorine/chlorine.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Module containing implementation of the Chlorine algorithm."""

from collections import defaultdict
from itertools import chain
from ...utils.benchmark import time_snap
from ...utils.list_tools import flatten
from ...results.DetectedClone import DetectedClone
from ...results.DetectionResult import DetectionResult
from ...results.detected_clone import DetectedClone
from ...results.detection_result import DetectionResult

# Minimum weight of a single node used in comparison.
_MIN_NODE_WEIGHT = 50
Expand All @@ -24,6 +26,7 @@ def _get_skeleton_recursive(node):
def _can_be_compared(node1, node2):
"""
First get rid of nodes with a weight below the specified threshold.

Checks if two nodes can possibly be compared with each other.
In order to be comparable, the nodes must have an equal value
and they must have the exact same number of children.
Expand All @@ -34,6 +37,7 @@ def _can_be_compared(node1, node2):

Returns:
bool -- True if nodes can be compared, False if they cannot.

"""
return \
node1.weight >= _MIN_NODE_WEIGHT and \
Expand All @@ -44,8 +48,9 @@ def _can_be_compared(node1, node2):

def _type1_compare(node1, node2):
"""
Compares two nodes and returns the weight of their matching subtrees
and a skeleton string representing their common syntax tree skeleton.
Compare two nodes and return the weight of their matching subtree.

Also return a string representing their common syntax tree skeleton.

Arguments:
node1 {TreeNode} -- First node.
Expand All @@ -54,8 +59,8 @@ def _type1_compare(node1, node2):
Returns:
int -- Weight of the matching subtrees.
string -- Common skeleton of the two nodes.
"""

"""
combined_weight = node1.weight + node2.weight

if not _can_be_compared(node1, node2):
Expand All @@ -80,17 +85,16 @@ def _type1_compare(node1, node2):

def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict):
"""
Common logic shared by single-repo analysis and
two repository comparison mode.
Run common logic shared by single-repo analysis and 2-repo comparison mode.

Arguments:
n1 {TreeNode} -- First node.
n2 {TreeNode} -- Second node.
ignore_set {set[TreeNode]} -- Set of nodes to ignore.
match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches.
skeleton_weight_dict {dict[string: int]} -- Skeleton weights.
"""

"""
if not _can_be_compared(n1, n2):
return

Expand All @@ -109,13 +113,13 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict):

def _dict_to_result(match_dict, skeleton_weight_dict):
"""
Compiles the detection result together from the input dictionaries.
Compile the detection result together from the input dictionaries.

Arguments:
match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches.
skeleton_weight_dict {dict[string: int]} -- Skeleton weights.
"""

"""
clones = []

for k, v in match_dict.items():
Expand All @@ -128,8 +132,9 @@ def _dict_to_result(match_dict, skeleton_weight_dict):

def chlorine_single_repo(modules):
"""
Finds all clones satisfying the settings at the top of this source file
in a single repository given its modules.
Find all clones in a single repository given its modules.

Clones must satisfy the settings at the top of this source file.
Detected code clones are printed on STDOUT, including the common skeleton,
path to each clone (source file path, line number, column offset),
size of each clone (number of nodes in its syntax tree) and their
Expand All @@ -140,8 +145,8 @@ def chlorine_single_repo(modules):

Returns:
DetectionResult -- Result of the code clone detection.
"""

"""
time_snap("Function started")

nodes = [m[0] for m in modules]
Expand Down Expand Up @@ -186,7 +191,8 @@ def chlorine_single_repo(modules):

def chlorine_two_repos(modules1, modules2):
"""
Finds code clones between two repositories given their module lists.
Find code clones between two repositories given their module lists.

Clones must satisfy rules defined at the top of this source file.
Detected clones are printed on STDOUT.
See `find_clones_in_repo(repo_url)` for details on output format.
Expand All @@ -197,12 +203,12 @@ def chlorine_two_repos(modules1, modules2):

Returns:
DetectionResult -- Result of the code clone detection.
"""

"""
time_snap("Function started")

repo1_nodes = [m[0] for m in modules1]
repo2_nodes = flatten(modules2)
repo2_nodes = list(chain.from_iterable(modules2))

time_snap("Module lists optimized")

Expand Down
1 change: 1 addition & 0 deletions engine/algorithms/iodine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Package containing the Iodine algorithm and its helper functions."""
6 changes: 3 additions & 3 deletions engine/algorithms/iodine/anti_unification.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from ...nodes.PatternNode import PatternNode
from ...nodes.pattern import PatternNode


def anti_unify(list1, list2, index1, index2, worktable):
"""
Creates a tree of PatternNodes from two lists
Create a tree of PatternNodes from two lists.

Arguments:
list1 {list of TreeNodes} -- first tree to be compared
list2 {list of TreeNodes} -- second tree to be compared
index1 {int} -- index of current TreeNode to be compared from list1
index2 {int} -- index of current TreeNode to be compared from list2
worktable {2D boolean array} -- keeps track of which two nodes have been checked together
"""

"""
# mark the pair as checked/True
worktable[index1].set(True, index2)
# determine if subtrees are the same (and lengths same)
Expand Down
9 changes: 6 additions & 3 deletions engine/algorithms/iodine/iodine.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Module containing the Iodine algorithm's interface."""

from .pattern_collection import pattern_collection
from ...results.DetectedClone import DetectedClone
from ...results.DetectionResult import DetectionResult
from ...results.detected_clone import DetectedClone
from ...results.detection_result import DetectionResult


def iodine(module_list_1, module_list_2):
"""
Finds clones between the two modules by comparing all possible subtrees of
Find clones between the two modules by comparing all possible subtrees of
their methods. Returns the results.

Arguments:
Expand All @@ -14,6 +16,7 @@ def iodine(module_list_1, module_list_2):

Returns:
DetectionResult -- Result of the code clone detection.

"""
clusters = []
for module_tree_1 in module_list_1:
Expand Down
Loading