From 2eb0e68b4307b3f57ca0ff9c16debf8d1b3444e7 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 30 Jul 2019 13:23:54 +0200 Subject: [PATCH 01/71] Add postgres to requirements.txt; Close calebdehaan#92 --- requirements.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1931992..5b1bcb0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,12 @@ -# GitPython - Package used for cloning Git(Hub) repositories +# Interface for working with Git repositories gitpython -# Bit String - Package used for "work list" array in Lee's algorithm +# "Work list" binary table in Lee's algorithm (Iodine) bitstring # Fastlog uses the "curses" package, which is part of stdlib on Linux, but not on Windows windows-curses; platform_system == 'Windows' -# Used for better-looking logging +# Prettier, uniform log messages fastlog -# Front-end +# Web UI back-end flask +# Web UI database +postgres From d9ba102ae53b71fc459cc64532208110d4576916 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 30 Jul 2019 16:14:42 +0200 Subject: [PATCH 02/71] Visual tweak: fit longer origin strings on screen --- web/results.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/results.html b/web/results.html index c7ebcd1..54bdfbd 100644 --- a/web/results.html +++ b/web/results.html @@ -1,4 +1,4 @@ -
+
Detected clones
#CLONES#
From 2732b7a36a75d2739979272fe6d69808f81c1558 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 30 Jul 2019 16:15:17 +0200 Subject: [PATCH 03/71] Decrease subtree weight threshold in Oxygen --- engine/algorithms/oxygen/oxygen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/algorithms/oxygen/oxygen.py b/engine/algorithms/oxygen/oxygen.py index 74a21d9..36d9681 100644 --- a/engine/algorithms/oxygen/oxygen.py +++ b/engine/algorithms/oxygen/oxygen.py @@ -2,7 +2,7 @@ from ...results.DetectionResult import DetectionResult -def oxygen(modules, weight_limit=25): +def oxygen(modules, weight_limit=15): """ Very simple type 1 code duplication check based on AST.dump() function. From d5caad9f893d172ed398d4d455c5d8d86e725065 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 30 Jul 2019 16:15:42 +0200 Subject: [PATCH 04/71] Switch from raw psycopg2 to postgres package / lib --- web/app.py | 98 ++++++++++++++++++------------------------------------ 1 file changed, 33 insertions(+), 65 deletions(-) diff --git a/web/app.py b/web/app.py index 4241e21..c7c1a33 100644 --- a/web/app.py +++ b/web/app.py @@ -2,12 +2,13 @@ from threading import Thread from flask import Flask, request from fastlog import log -from psycopg2 import connect, Error as PG_Error +from psycopg2 import Error as PG_Error +from postgres import Postgres from engine.preprocessing.module_parser import get_repo_modules_and_info -from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN #, CHLORINE, IODINE +from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN from engine.utils.config import config from engine.errors.UserInputError import UserInputError -from .credentials import conn_str +from .credentials import db_url # Disable access to local file system config.allow_local_access = False @@ -20,7 +21,7 @@ def _read_html(file_name): with open(file_path, "r", encoding="utf-8") as f: return f.read() - + _INDEX_HTML = _read_html("index") _MESSAGE_HTML = 
_read_html("message") _RESULTS_HTML = _read_html("results") @@ -28,8 +29,7 @@ def _read_html(file_name): def _analyze_repo(repo): try: - conn = connect(conn_str) - cur = conn.cursor() + db = Postgres(db_url) modules, repo_info = get_repo_modules_and_info(repo) @@ -37,87 +37,60 @@ def _analyze_repo(repo): log.error("Unable to get the repository information") return - cur.execute("""SELECT COUNT(*) FROM repos WHERE url = %s OR dir = %s OR ("server" = %s AND "user" = %s AND "name" = %s);""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) - - count = cur.fetchone()[0] + count = db.one("""SELECT COUNT(*) FROM repos WHERE url = %s OR dir = %s OR ("server" = %s AND "user" = %s AND "name" = %s);""", + (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) if count: return - cur.execute("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) - - repo_id = cur.fetchone()[0] - - cur.execute( - """INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", (repo_id, repo_info.hash)) + repo_id = db.one("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", + (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) - commit_id = cur.fetchone()[0] - - conn.commit() + commit_id = db.one("""INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", + (repo_id, repo_info.hash)) result = run_single_repo(modules, OXYGEN) for c in result.clones: - cur.execute("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", - (commit_id, c.value, c.match_weight)) - - cluster_id = cur.fetchone()[0] + cluster_id = db.one("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", + (commit_id, c.value, c.match_weight)) for o, s in 
c.origins.items(): - cur.execute( - """INSERT INTO clones (cluster_id, origin, similarity) VALUES (%s, %s, %s);""", (cluster_id, o, s)) - - cur.execute( - """UPDATE commits SET finished = TRUE WHERE id = %s;""", (commit_id,)) + db.run("""INSERT INTO clones (cluster_id, origin, similarity) VALUES (%s, %s, %s);""", + (cluster_id, o, s)) - conn.commit() + db.run("""UPDATE commits SET finished = TRUE WHERE id = %s;""", + (commit_id,)) except PG_Error as ex: log.error("PostgreSQL: " + str(ex)) - finally: - if conn: - cur.close() - conn.close() - def _get_repo_analysis(repo): # TODO: Add docstring. try: - conn = connect(conn_str) - conn.autocommit = True - - cur = conn.cursor() + db = Postgres(db_url) - cur.execute( - """SELECT id FROM repos WHERE "url" = %s OR "name" = %s;""", (repo, repo)) - - repos = cur.fetchall() + repos = db.all("""SELECT id FROM repos WHERE "url" = %(repo)s OR "name" = %(repo)s;""", + {"repo": repo}) if repos: - repo_id = repos[0][0] - - cur.execute( - """SELECT id FROM commits WHERE finished AND repo_id = %s;""", (repo_id,)) + repo_id = repos[0] - commits = cur.fetchall() + commits = db.all("""SELECT id FROM commits WHERE finished AND repo_id = %s;""", + (repo_id,)) if commits: - commit_id = commits[0][0] + commit_id = commits[0] - cur.execute( - """SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", (commit_id,)) - - clusters = cur.fetchall() + clusters = db.all("""SELECT * FROM clusters WHERE commit_id = %s;""", + (commit_id,)) output = [] for c in clusters: - cur.execute( - """SELECT origin, similarity FROM clones WHERE cluster_id = %s;""", (c[0],)) - - clones = cur.fetchall() + print(c, c.__class__) + clones = db.all("""SELECT * FROM clones WHERE cluster_id = %s;""", + (c.id,)) output.append((c, clones)) @@ -136,11 +109,6 @@ def _get_repo_analysis(repo): # TODO: Add docstring. 
log.error("PostgreSQL: " + str(ex)) return None - finally: - if conn: - cur.close() - conn.close() - @app.route("/") def hello(): @@ -154,9 +122,9 @@ def hello(): if isinstance(result, str): content = _MESSAGE_HTML.replace("#MSG#", "Result: " + result) elif result: - clones = "
    " + "".join([("
  1. " + c[0][1] + f" - Weight: {c[0][2]}" + "
      " + - "".join(["
    • " + o[0] + f" - Similarity: {o[1] * 100:g} %" + "
    • " for o in c[1]]) + - "
  2. ") for c in result]) + "
" + clones = "
    " + "".join([("
  1. " + c[0].value + f" - Weight: {c[0].weight}" + "
      " + + "".join(["
    • " + o.origin + f" - Similarity: {o.similarity * 100:g} %" + "
    • " for o in c[1]]) + + "

  2. ") for c in result]) + "
" content = _RESULTS_HTML.replace("#CLONES#", clones) From f7a356414d4023286d00bae48fe660faad61cba9 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Wed, 31 Jul 2019 11:06:26 +0200 Subject: [PATCH 05/71] Switch from postgres.py to easy-postgres --- web/app.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/web/app.py b/web/app.py index c7c1a33..9e75afd 100644 --- a/web/app.py +++ b/web/app.py @@ -3,7 +3,7 @@ from flask import Flask, request from fastlog import log from psycopg2 import Error as PG_Error -from postgres import Postgres +from easy_postgres import Connection as pg_conn from engine.preprocessing.module_parser import get_repo_modules_and_info from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN from engine.utils.config import config @@ -29,7 +29,7 @@ def _read_html(file_name): def _analyze_repo(repo): try: - db = Postgres(db_url) + db = pg_conn(db_url) modules, repo_info = get_repo_modules_and_info(repo) @@ -38,29 +38,29 @@ def _analyze_repo(repo): return count = db.one("""SELECT COUNT(*) FROM repos WHERE url = %s OR dir = %s OR ("server" = %s AND "user" = %s AND "name" = %s);""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) + repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) if count: return repo_id = db.one("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", - (repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name)) + repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) commit_id = db.one("""INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", - (repo_id, repo_info.hash)) + repo_id, repo_info.hash) result = run_single_repo(modules, OXYGEN) for c in result.clones: cluster_id = db.one("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", - (commit_id, 
c.value, c.match_weight)) + commit_id, c.value, c.match_weight) for o, s in c.origins.items(): - db.run("""INSERT INTO clones (cluster_id, origin, similarity) VALUES (%s, %s, %s);""", - (cluster_id, o, s)) + db.one("""INSERT INTO clones (cluster_id, origin, similarity) VALUES (%s, %s, %s);""", + cluster_id, o, s) - db.run("""UPDATE commits SET finished = TRUE WHERE id = %s;""", - (commit_id,)) + db.one("""UPDATE commits SET finished = TRUE WHERE id = %s;""", + commit_id) except PG_Error as ex: log.error("PostgreSQL: " + str(ex)) @@ -68,29 +68,29 @@ def _analyze_repo(repo): def _get_repo_analysis(repo): # TODO: Add docstring. try: - db = Postgres(db_url) + db = pg_conn(db_url) repos = db.all("""SELECT id FROM repos WHERE "url" = %(repo)s OR "name" = %(repo)s;""", - {"repo": repo}) + repo=repo) if repos: repo_id = repos[0] commits = db.all("""SELECT id FROM commits WHERE finished AND repo_id = %s;""", - (repo_id,)) + repo_id) if commits: commit_id = commits[0] - clusters = db.all("""SELECT * FROM clusters WHERE commit_id = %s;""", - (commit_id,)) + clusters = db.all_dict("""SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", + commit_id) output = [] for c in clusters: print(c, c.__class__) - clones = db.all("""SELECT * FROM clones WHERE cluster_id = %s;""", - (c.id,)) + clones = db.all_dict("""SELECT origin, similarity FROM clones WHERE cluster_id = %s;""", + c.id) output.append((c, clones)) @@ -102,7 +102,6 @@ def _get_repo_analysis(repo): # TODO: Add docstring. 
else: thread = Thread(target=_analyze_repo, args=(repo,)) thread.start() - # _analyze_repo(repo) return "Added to queue" except PG_Error as ex: From 8797e97984c073f933985a82fe98a89d6b82cd02 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Wed, 31 Jul 2019 11:07:50 +0200 Subject: [PATCH 06/71] Switch postgres for easy-postgres in requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5b1bcb0..48546b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ fastlog # Web UI back-end flask # Web UI database -postgres +easy-postgres From e4939376111ddf0276c78716ce6b678f95393a01 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Wed, 31 Jul 2019 11:48:43 +0200 Subject: [PATCH 07/71] Remove print, add warning on duplicate repo --- web/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/app.py b/web/app.py index 9e75afd..5001aa1 100644 --- a/web/app.py +++ b/web/app.py @@ -41,6 +41,7 @@ def _analyze_repo(repo): repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) if count: + log.warning("Repository already present in database:", repo) return repo_id = db.one("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", @@ -88,7 +89,6 @@ def _get_repo_analysis(repo): # TODO: Add docstring. 
output = [] for c in clusters: - print(c, c.__class__) clones = db.all_dict("""SELECT origin, similarity FROM clones WHERE cluster_id = %s;""", c.id) From a6efd86cc86c2cdebf3e1669fbe725b58bcb8743 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Wed, 31 Jul 2019 13:59:13 +0200 Subject: [PATCH 08/71] Restructure database to include more info --- web/prepare_tables.pgsql | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/web/prepare_tables.pgsql b/web/prepare_tables.pgsql index b87b158..6e641fe 100644 --- a/web/prepare_tables.pgsql +++ b/web/prepare_tables.pgsql @@ -1,4 +1,4 @@ -DROP TABLE IF EXISTS clones; +DROP TABLE IF EXISTS origins; DROP TABLE IF EXISTS clusters; DROP TABLE IF EXISTS commits; DROP TABLE IF EXISTS repos; @@ -17,8 +17,8 @@ CREATE TABLE commits ( id SERIAL PRIMARY KEY, repo_id INTEGER REFERENCES repos(id) NOT NULL, hash TEXT NOT NULL, - finished BOOLEAN NOT NULL DEFAULT FALSE, - date TIMESTAMP NOT NULL DEFAULT NOW(), + finished BOOLEAN DEFAULT FALSE NOT NULL, + cloned_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL, UNIQUE(repo_id, hash) ); @@ -29,9 +29,12 @@ CREATE TABLE clusters ( weight INTEGER NOT NULL ); -CREATE TABLE clones ( +CREATE TABLE origins ( id SERIAL PRIMARY KEY, cluster_id INTEGER REFERENCES clusters(id) NOT NULL, - origin TEXT NOT NULL, - similarity FLOAT NOT NULL + file TEXT NOT NULL, + line INTEGER, + offset INTEGER, -- column offset (number of characters on the same line before the token) + similarity FLOAT NOT NULL, + UNIQUE(cluster_id, file, line, offset) ); From 3c719ad0bca9393185a77275c0758175cde449c3 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 10:10:55 +0200 Subject: [PATCH 09/71] Add "valid" column to repos table --- web/prepare_tables.pgsql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web/prepare_tables.pgsql b/web/prepare_tables.pgsql index 6e641fe..8ecc4b7 100644 --- a/web/prepare_tables.pgsql +++ b/web/prepare_tables.pgsql @@ -6,10 +6,11 @@ 
DROP TABLE IF EXISTS repos; CREATE TABLE repos ( id SERIAL PRIMARY KEY, url TEXT UNIQUE NOT NULL, - dir TEXT UNIQUE NOT NULL, "server" TEXT NOT NULL, "user" TEXT NOT NULL, "name" TEXT NOT NULL, + dir TEXT UNIQUE NOT NULL, + valid BOOLEAN, -- NULL = validation in progress; FALSE = invalid repo; TRUE = valid and available UNIQUE("server", "user", "name") ); From 7707e3eeba185f0eb694a8b20f7ee4fc5d541acc Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 10:12:12 +0200 Subject: [PATCH 10/71] Move RepoInfo to its own module --- engine/preprocessing/repoinfo.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 engine/preprocessing/repoinfo.py diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py new file mode 100644 index 0000000..1a47ab6 --- /dev/null +++ b/engine/preprocessing/repoinfo.py @@ -0,0 +1,20 @@ +import re +from os import path, makedirs +from os.path import isdir, dirname +from git import Repo, InvalidGitRepositoryError, GitCommandError +from engine import __file__ as base_path +from ..errors.UserInputError import UserInputError +from urllib.parse import urlparse, urlunparse + +# Base directory for all cloned repositories is "[main module root directory]/repos/". 
+clone_root_dir = path.join(dirname(base_path), "repos") + + +class RepoInfo: + def __init__(self, url, server, user, name, local_dir, commit_hash=None): + self.url = url + self.server = server + self.user = user + self.name = name + self.dir = local_dir + self.hash = commit_hash From 4368c7fe366dbd0ab25fbe17ed621dc9e591d945 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 10:12:46 +0200 Subject: [PATCH 11/71] Implement RepoInfo parser (from repo path) --- engine/preprocessing/repoinfo.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 1a47ab6..e2ce95d 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -18,3 +18,33 @@ def __init__(self, url, server, user, name, local_dir, commit_hash=None): self.name = name self.dir = local_dir self.hash = commit_hash + + @staticmethod + def parse_repo_info(repo_path): + try: + parts = urlparse(repo_path) + except ValueError: + return None + + if parts.username or parts.password or parts.query or parts.fragment \ + or parts.scheme not in {"https", "http", ""}: + return None + + path_match = re.fullmatch(r"/*([\w\-\.]+)/*([\w\-\.]+)/*", parts.path) + + if not path_match: + return None if parts.scheme else parse_repo_info("https://" + repo_path) + + repo_user = path_match[1] + repo_name = path_match[2] + + scheme = parts.scheme or "https" + server = parts.netloc or "github.com" + + # Inserting ":@" before hostname prevents username/password prompt + full_url = urlunparse((scheme, ":@" + server, + f"/{repo_user}/{repo_name}", "", "", "")) + + clone_dir = path.join(clone_root_dir, server, repo_user, repo_name) + + return RepoInfo(full_url, server, repo_user, repo_name, clone_dir) From 7d60da81ad5f508466b4826a133b6a8e6f5cbc79 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 10:14:04 +0200 Subject: [PATCH 12/71] Implement method to clone / pull repo via RepoInfo 
--- engine/preprocessing/repoinfo.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index e2ce95d..f932e7b 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -19,6 +19,27 @@ def __init__(self, url, server, user, name, local_dir, commit_hash=None): self.dir = local_dir self.hash = commit_hash + def clone_or_pull(self): + try: + # If repo dir already exists, pull it. + if isdir(self.dir): + repo = Repo(self.dir) + repo.remotes.origin.pull() + + # If the repo hasn't been cloned yet, clone it. + else: + repo = Repo.clone_from(self.url, self.dir) + + # Get HEAD's hash and store it in repo info. + self.hash = repo.head.object.hexsha + return True + + except InvalidGitRepositoryError: + return False + + except GitCommandError: + return False + @staticmethod def parse_repo_info(repo_path): try: From 025484d43a06af999cf96a12ce8a8628bbc9df1d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 10:17:23 +0200 Subject: [PATCH 13/71] Modify repo path regex to ignore ".git" at the end --- engine/preprocessing/repoinfo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index f932e7b..fe2ec0e 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -51,7 +51,8 @@ def parse_repo_info(repo_path): or parts.scheme not in {"https", "http", ""}: return None - path_match = re.fullmatch(r"/*([\w\-\.]+)/*([\w\-\.]+)/*", parts.path) + path_match = re.fullmatch( + r"/*([\w\-\.]+)/*([\w\-\.]+?)(?:\.git)?/*", parts.path) if not path_match: return None if parts.scheme else parse_repo_info("https://" + repo_path) From 3f2ecfb31374b7e8c6cf5325d3348f05a28ee340 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 11:13:35 +0200 Subject: [PATCH 14/71] Switch from valid bool to status table --- web/prepare_tables.pgsql | 21 
++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/web/prepare_tables.pgsql b/web/prepare_tables.pgsql index 8ecc4b7..67a8ecd 100644 --- a/web/prepare_tables.pgsql +++ b/web/prepare_tables.pgsql @@ -2,6 +2,21 @@ DROP TABLE IF EXISTS origins; DROP TABLE IF EXISTS clusters; DROP TABLE IF EXISTS commits; DROP TABLE IF EXISTS repos; +DROP INDEX IF EXISTS states_name_index; +DROP TABLE IF EXISTS states; + +CREATE TABLE states ( + id SERIAL PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + description TEXT +); + +INSERT INTO states (name, description) VALUES + ('queue', 'The repository has been added to the queue.'), + ('invalid', 'This is not a valid repository.'), + ('done', 'The repository has been successfully analyzed.'); + +CREATE INDEX states_name_index ON states (name); CREATE TABLE repos ( id SERIAL PRIMARY KEY, @@ -10,7 +25,7 @@ CREATE TABLE repos ( "user" TEXT NOT NULL, "name" TEXT NOT NULL, dir TEXT UNIQUE NOT NULL, - valid BOOLEAN, -- NULL = validation in progress; FALSE = invalid repo; TRUE = valid and available + status INTEGER REFERENCES states(id) NOT NULL, UNIQUE("server", "user", "name") ); @@ -35,7 +50,7 @@ CREATE TABLE origins ( cluster_id INTEGER REFERENCES clusters(id) NOT NULL, file TEXT NOT NULL, line INTEGER, - offset INTEGER, -- column offset (number of characters on the same line before the token) + col_offset INTEGER, -- column offset (number of characters on the same line before the token) similarity FLOAT NOT NULL, - UNIQUE(cluster_id, file, line, offset) + UNIQUE(cluster_id, file, line, col_offset) ); From 2bd998b518fa8e7280fd092bc7a5b1e36fb8b827 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 11:25:38 +0200 Subject: [PATCH 15/71] Remove old repo cloner module --- engine/preprocessing/repo_cloner.py | 121 ---------------------------- 1 file changed, 121 deletions(-) delete mode 100644 engine/preprocessing/repo_cloner.py diff --git a/engine/preprocessing/repo_cloner.py 
b/engine/preprocessing/repo_cloner.py deleted file mode 100644 index eee35b7..0000000 --- a/engine/preprocessing/repo_cloner.py +++ /dev/null @@ -1,121 +0,0 @@ -import re -from os import path, makedirs -from os.path import isdir, dirname -from git import Repo, InvalidGitRepositoryError, GitCommandError -from engine import __file__ as base_path -from ..errors.UserInputError import UserInputError -from ..utils.config import config - -# Base directory for all cloned repositories is "[main module root directory]/repos/". -clone_root_dir = path.join(dirname(base_path), "repos") - - -class RepoInfo: # TODO: Add docstrings. - def __init__(self, url, server, user, name): - self.url = url - self.server = server - self.user = user - self.name = name - - self.dir = path.join(clone_root_dir, server, user, name) - self.hash = None - - -def _clone_repo(repo_url): - """ - Clones the specified repository into a special internal directory and - returns the directory path of the cloned repository. - - Arguments: - repo_url {string} -- URL of the repository to clone. - - Returns: - ClonedRepo -- Information about the cloned repository. - """ - - # Make sure the base clone dir exists. - makedirs(clone_root_dir, exist_ok=True) - - # NOTE: Only standard GitHub and GitLab are currently properly supported. - match = re.fullmatch( - r"^(?:https?://)?(?:[\w\-\.]*\.)?([\w\-]+)\.\w{1,10}/([\w\-]+)/([\w\-]+)(?:/?\.git)?/?$", repo_url) - - if not match: - return None - - info = RepoInfo(repo_url, match[1], match[2], match[3]) - - try: - # If repo dir already exists, pull it. - if isdir(info.dir): - repo = Repo(info.dir) - repo.remotes.origin.pull() - - # If the repo hasn't been cloned yet, clone it. - else: - repo = Repo.clone_from(info.url, info.dir) - - # Get HEAD's hash and store it in repo info. 
- info.hash = repo.head.object.hexsha - - except InvalidGitRepositoryError: - return None - - except GitCommandError: - return None - - return info - - -def _clone_github_short(short_path): # TODO: Add docstring. - if re.fullmatch(r"^[\w\-]+/[\w\-]+(?:\.git)?$", short_path): - return _clone_repo("https://github.com/" + short_path) - else: - return None - - -def get_repo_or_dir(repo): - """ - Attempts to process the given repository path in many different ways. - If all of them fail, an error message will be printed and - the script with exit with a non-zero exit code. - If one of them succeeds, local path of the repository will be returned. - - Arguments: - repo {string} -- Path to the repository or local directory. - - Returns: - string -- Local path to the repository's directory. - """ - - # TODO: This option should probably be removed in the future. - # It is more confusing than it is practical now. - - # Path of a previously cloned repository: "[server]/[user]/[repo name]" - repo_dir_by_name = path.join(clone_root_dir, repo) - if re.fullmatch(r"^[\w\-]+/[\w\-]+/[\w\-]+$", repo) and isdir(repo_dir_by_name): - return repo_dir_by_name - - # Shorthand for GitHub URLs: "[repository owner]/[repository name]" - repo_info = _clone_github_short(repo) - if repo_info: - return repo_info.dir - - # Local directory path - if isdir(repo): - if config.allow_local_access: - return repo - else: - raise UserInputError( - f"Access to local directory denied: \"{repo}\"") - - # Full remote repository URL - repo_info = _clone_repo(repo) - if repo_info: - return repo_info.dir - - raise UserInputError(f"Invalid repository path: \"{repo}\"") - - -def get_repo_info(repo): # TODO: Add docstring. 
- return _clone_github_short(repo) or _clone_repo(repo) From 04cfaf07286d507a661e5d5e47b7235396237f17 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 11:26:06 +0200 Subject: [PATCH 16/71] Add new path handler module for path-related logic --- engine/preprocessing/path_handler.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 engine/preprocessing/path_handler.py diff --git a/engine/preprocessing/path_handler.py b/engine/preprocessing/path_handler.py new file mode 100644 index 0000000..283e3e0 --- /dev/null +++ b/engine/preprocessing/path_handler.py @@ -0,0 +1,14 @@ +from os.path import isdir +from .repoinfo import RepoInfo + + +def repo_path_to_local_path(repo_path): + if isdir(repo_path): + return repo_path + + info = RepoInfo.parse_repo_info(repo_path) + + if info and info.clone_or_pull(): + return info.dir + else: + return None From a91185e4f000b816c18b94dff007ae8e7741448f Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 11:59:22 +0200 Subject: [PATCH 17/71] Add check for empty URL params --- engine/preprocessing/repoinfo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index fe2ec0e..5cca77d 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -47,8 +47,8 @@ def parse_repo_info(repo_path): except ValueError: return None - if parts.username or parts.password or parts.query or parts.fragment \ - or parts.scheme not in {"https", "http", ""}: + if parts.username or parts.password or parts.params or parts.query or \ + parts.fragment or parts.scheme not in {"https", "http", ""}: return None path_match = re.fullmatch( From 3afb5cffd1ca7a4c12a5c4748db74cf14d03ea49 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 12:05:41 +0200 Subject: [PATCH 18/71] Remove config import and set up from web app --- web/app.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/web/app.py 
b/web/app.py index 5001aa1..e54a2e7 100644 --- a/web/app.py +++ b/web/app.py @@ -6,13 +6,9 @@ from easy_postgres import Connection as pg_conn from engine.preprocessing.module_parser import get_repo_modules_and_info from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN -from engine.utils.config import config from engine.errors.UserInputError import UserInputError from .credentials import db_url -# Disable access to local file system -config.allow_local_access = False - app = Flask(__name__) From ba7a75a7142beb28b1ab0078c9de30b4e91f9ecc Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 12:12:29 +0200 Subject: [PATCH 19/71] Remove global config module altogether --- engine/utils/config.py | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 engine/utils/config.py diff --git a/engine/utils/config.py b/engine/utils/config.py deleted file mode 100644 index 04b51ba..0000000 --- a/engine/utils/config.py +++ /dev/null @@ -1,8 +0,0 @@ -class _Config: - def __init__(self): - # Boolean value that allows or denies access to local file system. 
- self.allow_local_access = True - - -# Global app configuration -config = _Config() From 10c2c8f878b92d82ca0e9173f06b815a733ea4ce Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 12:43:20 +0200 Subject: [PATCH 20/71] Remove unused imports; Add missing class reference --- engine/preprocessing/repoinfo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 5cca77d..614ceff 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -1,13 +1,11 @@ import re -from os import path, makedirs -from os.path import isdir, dirname +from os.path import isdir, dirname, join as path_join from git import Repo, InvalidGitRepositoryError, GitCommandError from engine import __file__ as base_path -from ..errors.UserInputError import UserInputError from urllib.parse import urlparse, urlunparse # Base directory for all cloned repositories is "[main module root directory]/repos/". 
-clone_root_dir = path.join(dirname(base_path), "repos") +clone_root_dir = path_join(dirname(base_path), "repos") class RepoInfo: @@ -55,7 +53,9 @@ def parse_repo_info(repo_path): r"/*([\w\-\.]+)/*([\w\-\.]+?)(?:\.git)?/*", parts.path) if not path_match: - return None if parts.scheme else parse_repo_info("https://" + repo_path) + # If there is no scheme, try to prepend HTTPS + return None if parts.scheme else \ + RepoInfo.parse_repo_info("https://" + repo_path) repo_user = path_match[1] repo_name = path_match[2] @@ -67,6 +67,6 @@ def parse_repo_info(repo_path): full_url = urlunparse((scheme, ":@" + server, f"/{repo_user}/{repo_name}", "", "", "")) - clone_dir = path.join(clone_root_dir, server, repo_user, repo_name) + clone_dir = path_join(clone_root_dir, server, repo_user, repo_name) return RepoInfo(full_url, server, repo_user, repo_name, clone_dir) From ad46862ea0ab64a8a95877f8bc81284cf4533342 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 12:43:44 +0200 Subject: [PATCH 21/71] Switch to new repo path parsing functions --- engine/preprocessing/args_handler.py | 7 +++---- engine/preprocessing/module_parser.py | 19 +------------------ 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/engine/preprocessing/args_handler.py b/engine/preprocessing/args_handler.py index 35dbd6a..a9a314e 100644 --- a/engine/preprocessing/args_handler.py +++ b/engine/preprocessing/args_handler.py @@ -1,4 +1,4 @@ -from .repo_cloner import get_repo_or_dir +from .path_handler import repo_path_to_local_path from ..errors.UserInputError import UserInputError _USAGE_TEXT = """\ @@ -9,8 +9,7 @@ Valid repository path formats: Short GitHub repository path - username/repository Full remote repository path - https://github.com/username/repository - Absolute or relative local directory path - /home/user/directory - Short path of an already cloned repository - github/username/repository""" + Absolute or relative local directory path - /home/user/directory""" def 
handle_args(argv): @@ -38,4 +37,4 @@ def handle_args(argv): raise UserInputError( f"Invalid number of command line arguments: {len(argv) - 1}") - return tuple(get_repo_or_dir(a) for a in argv[1:]) + return tuple(repo_path_to_local_path(a) for a in argv[1:]) diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index 3d0e707..7d879a0 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -3,7 +3,7 @@ from os.path import isdir, isfile from ..nodes.TreeNode import TreeNode from collections import deque -from .repo_cloner import clone_root_dir, get_repo_info +from .repoinfo import clone_root_dir def _read_whole_file(file_path): @@ -117,20 +117,3 @@ def get_modules_from_dir(directory): return [_flatten_module_nodes(_get_tree_node_from_file(f)) for f in _recursive_listdir_py(directory)] - - -def get_repo_modules_and_info(repo): - """ - Clones the repository or finds its directory and then finds - all modules inside of that directory and returns them. - - Arguments: - repo {string} -- Repository path. - - Returns: - list[list[TreeNode]] -- List of lists of nodes from parsed modules. - ClonedRepo -- Information about the cloned repository. 
- """ - - info = get_repo_info(repo) - return get_modules_from_dir(info.dir) if info else None, info From 15551f33ecf3b5d7f43f61763f89c386795003ac Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 12:50:02 +0200 Subject: [PATCH 22/71] Replace old repo cloning code reference with new --- web/app.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/web/app.py b/web/app.py index e54a2e7..f863813 100644 --- a/web/app.py +++ b/web/app.py @@ -4,7 +4,8 @@ from fastlog import log from psycopg2 import Error as PG_Error from easy_postgres import Connection as pg_conn -from engine.preprocessing.module_parser import get_repo_modules_and_info +from engine.preprocessing.repoinfo import RepoInfo +from engine.preprocessing.module_parser import get_modules_from_dir from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN from engine.errors.UserInputError import UserInputError from .credentials import db_url @@ -23,11 +24,17 @@ def _read_html(file_name): _RESULTS_HTML = _read_html("results") -def _analyze_repo(repo): +def _analyze_repo(repo_path): try: db = pg_conn(db_url) - modules, repo_info = get_repo_modules_and_info(repo) + repo_info = RepoInfo.parse_repo_info(repo_path) + + if not repo_info.clone_or_pull(): + log.error("Unable to clone repository:", repo_path) + return + + modules = get_modules_from_dir(repo_info.dir) if not modules or not repo_info: log.error("Unable to get the repository information") @@ -37,7 +44,7 @@ def _analyze_repo(repo): repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) if count: - log.warning("Repository already present in database:", repo) + log.warning("Repository already present in database:", repo_path) return repo_id = db.one("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", From 278178a55761a3167bff4cc989e05ab8a3e0ccb2 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 13:40:23 +0200 
Subject: [PATCH 23/71] Move args_handler and path_handler to CLI module --- cli/app.py | 2 +- {engine/preprocessing => cli}/args_handler.py | 18 ++++++++++++++++-- engine/preprocessing/path_handler.py | 14 -------------- 3 files changed, 17 insertions(+), 17 deletions(-) rename {engine/preprocessing => cli}/args_handler.py (80%) delete mode 100644 engine/preprocessing/path_handler.py diff --git a/cli/app.py b/cli/app.py index dd41170..a306eba 100644 --- a/cli/app.py +++ b/cli/app.py @@ -1,7 +1,7 @@ import sys import os from datetime import datetime -from engine.preprocessing.args_handler import handle_args +from .args_handler import handle_args from engine.preprocessing.module_parser import get_modules_from_dir from engine.algorithms.algorithm_runner import run_two_repos, IODINE from engine.utils.benchmark import time_snap diff --git a/engine/preprocessing/args_handler.py b/cli/args_handler.py similarity index 80% rename from engine/preprocessing/args_handler.py rename to cli/args_handler.py index a9a314e..ca910ec 100644 --- a/engine/preprocessing/args_handler.py +++ b/cli/args_handler.py @@ -1,5 +1,7 @@ -from .path_handler import repo_path_to_local_path -from ..errors.UserInputError import UserInputError +from os.path import isdir +from engine.errors.UserInputError import UserInputError +from engine.preprocessing.repoinfo import RepoInfo + _USAGE_TEXT = """\ Usage: @@ -12,6 +14,18 @@ Absolute or relative local directory path - /home/user/directory""" +def repo_path_to_local_path(repo_path): + if isdir(repo_path): + return repo_path + + info = RepoInfo.parse_repo_info(repo_path) + + if info and info.clone_or_pull(): + return info.dir + else: + return None + + def handle_args(argv): """ Checks the command line arguments and handles them. 
diff --git a/engine/preprocessing/path_handler.py b/engine/preprocessing/path_handler.py deleted file mode 100644 index 283e3e0..0000000 --- a/engine/preprocessing/path_handler.py +++ /dev/null @@ -1,14 +0,0 @@ -from os.path import isdir -from .repoinfo import RepoInfo - - -def repo_path_to_local_path(repo_path): - if isdir(repo_path): - return repo_path - - info = RepoInfo.parse_repo_info(repo_path) - - if info and info.clone_or_pull(): - return info.dir - else: - return None From 5d24f2bcdb6c02d2685540070ba082c3521c965d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 13:41:51 +0200 Subject: [PATCH 24/71] Update code_duplication mentions to new names --- clear_db.sh | 2 +- cli/args_handler.py | 4 ++-- profiler.bat | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clear_db.sh b/clear_db.sh index 0e6bee3..7a506ff 100755 --- a/clear_db.sh +++ b/clear_db.sh @@ -2,5 +2,5 @@ set -e -psql -f web/prepare_tables.pgsql code_duplication +psql -f web/prepare_tables.pgsql cyclone rm -rf engine/repos/ diff --git a/cli/args_handler.py b/cli/args_handler.py index ca910ec..5fde950 100644 --- a/cli/args_handler.py +++ b/cli/args_handler.py @@ -5,8 +5,8 @@ _USAGE_TEXT = """\ Usage: - python3 -m code_duplication - Repository comparison mode - python3 -m code_duplication - Single repository mode + python3 -m cli - Repository comparison mode + python3 -m cli - Single repository mode Valid repository path formats: Short GitHub repository path - username/repository diff --git a/profiler.bat b/profiler.bat index 36a7e7d..957ec5a 100644 --- a/profiler.bat +++ b/profiler.bat @@ -1 +1 @@ -python -m cProfile -o profiling.dat -m code_duplication > output.txt +python -m cProfile -o profiling.dat -m cli > output.txt From 8e26f7c1b376238999c20c0b928fb118ec7b646a Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 13:52:27 +0200 Subject: [PATCH 25/71] Add docstrings to results modules --- engine/results/DetectedClone.py | 9 +++++++++ 
engine/results/DetectionResult.py | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/engine/results/DetectedClone.py b/engine/results/DetectedClone.py index 052d164..0f72320 100644 --- a/engine/results/DetectedClone.py +++ b/engine/results/DetectedClone.py @@ -1,3 +1,9 @@ +""" +Module containing the `DetectedClone` class for +storing information about a single detected clone. +""" + + class DetectedClone: """ Represents a single detected code clone. @@ -12,6 +18,7 @@ class DetectedClone: origins {dict[string: float]} -- Origins and similarity coefficients. Origins are used for keys. Similarity coefficients are values. + """ def __init__(self, value, match_weight, nodes): @@ -23,6 +30,7 @@ def __init__(self, value, match_weight, nodes): value {string} -- String representation common to all the nodes. match_weight {int} -- Weight of the matching subtree skeleton. nodes {list[TreeNode]} -- List of origin nodes. + """ self.value = value @@ -39,6 +47,7 @@ def dict(self): Returns: dict -- Dictionary representation of the detected clone, including all of its attributes. + """ return self.__dict__ diff --git a/engine/results/DetectionResult.py b/engine/results/DetectionResult.py index d1e286f..3f27edc 100644 --- a/engine/results/DetectionResult.py +++ b/engine/results/DetectionResult.py @@ -1,3 +1,8 @@ +""" +Module containing the `DetectionResult` class for storing +final results about a code clone detection run. +""" + from json import dumps as json_dumps @@ -11,6 +16,7 @@ class DetectionResult: Attributes: clones {list[DetectedClone]} -- List of detected code clones ordered by their significance (see __init__). + """ def __init__(self, clones): @@ -22,6 +28,7 @@ def __init__(self, clones): Arguments: clones {list[Detectedlone]} -- List of detected code clones. + """ self.clones = clones.copy() @@ -34,5 +41,6 @@ def json(self): Returns: string -- JSON representation of the detection result. 
+ """ return json_dumps([c.dict() for c in self.clones]) From e5607e887d7d88eecbf8b5e2e4ef11f9cc08cd76 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 13:52:36 +0200 Subject: [PATCH 26/71] Add docstrings to utils modules --- engine/utils/benchmark.py | 4 ++++ engine/utils/list_tools.py | 7 +++++++ engine/utils/printing.py | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/engine/utils/benchmark.py b/engine/utils/benchmark.py index 0e136ad..62b143e 100644 --- a/engine/utils/benchmark.py +++ b/engine/utils/benchmark.py @@ -1,3 +1,7 @@ +""" +Module containing helper functions used for benchmarking. +""" + from time import time from fastlog import log diff --git a/engine/utils/list_tools.py b/engine/utils/list_tools.py index 8c36320..8d9b265 100644 --- a/engine/utils/list_tools.py +++ b/engine/utils/list_tools.py @@ -1,3 +1,8 @@ +""" +Module containing helper functions for list manipulation. +""" + + def flatten(list_of_lists): """ Flattens a list of list into a single flat list. @@ -7,7 +12,9 @@ def flatten(list_of_lists): Returns: list[T] -- Flat list generated by flattening the source list of lists. + """ + flat = [] for l in list_of_lists: diff --git a/engine/utils/printing.py b/engine/utils/printing.py index d511afd..197d034 100644 --- a/engine/utils/printing.py +++ b/engine/utils/printing.py @@ -1,9 +1,15 @@ +""" +Module containing methods for pretty-printing node trees. 
+""" + + def print_node_list(node_list): """ Prints a list of TreeNodes for debugging Arguments: node_list (list[TreeNode]): a list of tree nodes + """ for node in node_list: if node.parent_index is None: @@ -19,7 +25,9 @@ def print_node(node, indent, level, node_list): indent (str): space to print before node level (int): depth of node within the tree (0 for root) node_list (list[TreeNode]): list of TreeNodes to reference children of TreeNode + """ + print(indent, "(", level, ")", node) for index in node.child_indices: for node in node_list: From 63570f7ee37d48e788465e05e557c4fbbfdded04 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:51:33 +0200 Subject: [PATCH 27/71] Fix docstyle in CLI --- cli/app.py | 5 +---- cli/args_handler.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cli/app.py b/cli/app.py index a306eba..96371c3 100644 --- a/cli/app.py +++ b/cli/app.py @@ -10,10 +10,7 @@ def main(): - """ - Entry point of the application. - """ - + """Entry point of the application.""" try: # Parse command line arguments repos = handle_args(sys.argv) diff --git a/cli/args_handler.py b/cli/args_handler.py index 5fde950..bb16f4a 100644 --- a/cli/args_handler.py +++ b/cli/args_handler.py @@ -28,7 +28,8 @@ def repo_path_to_local_path(repo_path): def handle_args(argv): """ - Checks the command line arguments and handles them. + Check the command line arguments and handles them. + If there is any problem, an error message will be printed and the script will exit with a non-zero exit code. If everything goes right, tuple of local repository paths will be returned. @@ -38,8 +39,8 @@ def handle_args(argv): Returns: tuple[string] -- Tuple of local repository paths. 
- """ + """ if len(argv) == 1 or (len(argv) == 2 and argv[1] in ['-h', '--help', '--usage']): # Special case where the usage text is printed using the built-in # print function instead of the logging library because From 21288feb5f0c1f533de1caf5cc1683e0cb947802 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:51:47 +0200 Subject: [PATCH 28/71] Add docstring to web app --- web/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/app.py b/web/app.py index f863813..627a1f2 100644 --- a/web/app.py +++ b/web/app.py @@ -1,3 +1,5 @@ +"""Module containing the core of the web UI application.""" + import os.path from threading import Thread from flask import Flask, request From cbac8c5d4f6afe300234b2895c07637707ed262c Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:52:11 +0200 Subject: [PATCH 29/71] Fix docstyle in algorithms module --- engine/algorithms/algorithm_runner.py | 10 ++++++---- engine/algorithms/chlorine/chlorine.py | 18 +++++++++--------- engine/algorithms/iodine/anti_unification.py | 3 +-- engine/algorithms/iodine/iodine.py | 4 +++- engine/algorithms/iodine/pattern_clustering.py | 1 + engine/algorithms/iodine/pattern_collection.py | 2 +- engine/algorithms/oxygen/oxygen.py | 2 +- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/engine/algorithms/algorithm_runner.py b/engine/algorithms/algorithm_runner.py index f9f6de4..4968387 100644 --- a/engine/algorithms/algorithm_runner.py +++ b/engine/algorithms/algorithm_runner.py @@ -1,3 +1,5 @@ +"""Module used for algorithm abstraction by providing a common interface.""" + from .oxygen.oxygen import oxygen from .chlorine.chlorine import chlorine_single_repo, chlorine_two_repos from .iodine.iodine import iodine @@ -7,7 +9,7 @@ def run_single_repo(modules, algorithm): """ - Runs the specified code clone detection algorithm on a single repository. + Run the specified code clone detection algorithm on a single repository. 
Arguments: modules {list[list[TreeNode]]} -- List of the repo's modules. @@ -18,8 +20,8 @@ def run_single_repo(modules, algorithm): Returns: DetectionResult -- Result of the code clone detection. - """ + """ if algorithm == OXYGEN: return oxygen(modules) elif algorithm == CHLORINE: @@ -30,7 +32,7 @@ def run_single_repo(modules, algorithm): def run_two_repos(modules1, modules2, algorithm): """ - Runs the specified code clone detection algorithm on two repositores. + Run the specified code clone detection algorithm on two repositores. Arguments: modules1 {list[list[TreeNode]]} -- List of first repo's modules. @@ -42,8 +44,8 @@ def run_two_repos(modules1, modules2, algorithm): Returns: DetectionResult -- Result of the code clone detection. - """ + """ if algorithm == CHLORINE: return chlorine_two_repos(modules1, modules2) elif algorithm == IODINE: diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 355b193..eaf14a5 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -24,6 +24,7 @@ def _get_skeleton_recursive(node): def _can_be_compared(node1, node2): """ First get rid of nodes with a weight below the specified threshold. + Checks if two nodes can be possible compared with each other. In order to be comparable, the nodes must have an equal value and they must have the exact same number of children. @@ -34,6 +35,7 @@ def _can_be_compared(node1, node2): Returns: bool -- True if nodes can be compared, False if they cannot. + """ return \ node1.weight >= _MIN_NODE_WEIGHT and \ @@ -44,7 +46,7 @@ def _can_be_compared(node1, node2): def _type1_compare(node1, node2): """ - Compares two nodes and returns the weight of their matching subtrees + Compare two nodes and returns the weight of their matching subtrees and a skeleton string representing their common syntax tree skeleton. Arguments: @@ -55,7 +57,6 @@ def _type1_compare(node1, node2): int -- Weight of the matching subtrees. 
string -- Common skeleton of the two nodes. """ - combined_weight = node1.weight + node2.weight if not _can_be_compared(node1, node2): @@ -90,7 +91,6 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. """ - if not _can_be_compared(n1, n2): return @@ -109,13 +109,12 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): def _dict_to_result(match_dict, skeleton_weight_dict): """ - Compiles the detection result together from the input dictionaries. + Compile the detection result together from the input dictionaries. Arguments: match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. """ - clones = [] for k, v in match_dict.items(): @@ -128,7 +127,7 @@ def _dict_to_result(match_dict, skeleton_weight_dict): def chlorine_single_repo(modules): """ - Finds all clones satisfying the settings at the top of this source file + Find all clones satisfying the settings at the top of this source file in a single repository given its modules. Detected code clones are printed on STDOUT, including the common skeleton, path to each clones (source file path, line number, column offset), @@ -140,8 +139,8 @@ def chlorine_single_repo(modules): Returns: DetectionResult -- Result of the code clone detection. - """ + """ time_snap("Function started") nodes = [m[0] for m in modules] @@ -186,7 +185,8 @@ def chlorine_single_repo(modules): def chlorine_two_repos(modules1, modules2): """ - Finds code clones between two repositories given their module lists. + Find code clones between two repositories given their module lists. + Clones must satisfy rules defined at the top of this source file. Detected clones are printed on STDOUT. See `find_clones_in_repo(repo_url)` for details on output format. 
@@ -197,8 +197,8 @@ def chlorine_two_repos(modules1, modules2): Returns: DetectionResult -- Result of the code clone detection. - """ + """ time_snap("Function started") repo1_nodes = [m[0] for m in modules1] diff --git a/engine/algorithms/iodine/anti_unification.py b/engine/algorithms/iodine/anti_unification.py index fcfd863..958a5c0 100644 --- a/engine/algorithms/iodine/anti_unification.py +++ b/engine/algorithms/iodine/anti_unification.py @@ -3,7 +3,7 @@ def anti_unify(list1, list2, index1, index2, worktable): """ - Creates a tree of PatternNodes from two lists + Create a tree of PatternNodes from two lists Arguments: list1 {list of TreeNodes} -- first tree to be compared @@ -12,7 +12,6 @@ def anti_unify(list1, list2, index1, index2, worktable): index2 {int} -- index of current TreeNode to be compared from list2 worktable {2D boolean array} -- keeps track of which two nodes have been checked together """ - # mark the pair as checked/True worktable[index1].set(True, index2) # determine if subtrees are the same (and lengths same) diff --git a/engine/algorithms/iodine/iodine.py b/engine/algorithms/iodine/iodine.py index 74bc852..24bbe4d 100644 --- a/engine/algorithms/iodine/iodine.py +++ b/engine/algorithms/iodine/iodine.py @@ -5,7 +5,7 @@ def iodine(module_list_1, module_list_2): """ - Finds clones between the two modules by comparing all possible subtrees of + Find clones between the two modules by comparing all possible subtrees of their methods. Returns the results. Arguments: @@ -14,7 +14,9 @@ def iodine(module_list_1, module_list_2): Returns: DetectionResult -- Result of the code clone detection. 
+ """ + clusters = [] for module_tree_1 in module_list_1: for module_tree_2 in module_list_2: diff --git a/engine/algorithms/iodine/pattern_clustering.py b/engine/algorithms/iodine/pattern_clustering.py index 42134b1..9360fe8 100644 --- a/engine/algorithms/iodine/pattern_clustering.py +++ b/engine/algorithms/iodine/pattern_clustering.py @@ -3,6 +3,7 @@ def clustering(ps): :param ps: a set of patterns :return: a set of clustered-patterns """ + cs = [] # initialize the set for p in ps: # iterate through the patterns in the set of patterns merged = False diff --git a/engine/algorithms/iodine/pattern_collection.py b/engine/algorithms/iodine/pattern_collection.py index 7d9e769..68b094d 100644 --- a/engine/algorithms/iodine/pattern_collection.py +++ b/engine/algorithms/iodine/pattern_collection.py @@ -5,7 +5,7 @@ def pattern_collection(tree_list_1, tree_list_2): """ - Compares two Abstract Syntax Trees representing two methods. The trees are + Compare two Abstract Syntax Trees representing two methods. The trees are provided as lists to provides indexes for the nodes within the tree. Arguments: diff --git a/engine/algorithms/oxygen/oxygen.py b/engine/algorithms/oxygen/oxygen.py index 36d9681..3c5c470 100644 --- a/engine/algorithms/oxygen/oxygen.py +++ b/engine/algorithms/oxygen/oxygen.py @@ -11,8 +11,8 @@ def oxygen(modules, weight_limit=15): Returns: DetectionResult -- Result of the code clone detection. - """ + """ # Dictionary of all the different shapes of node trees. # Key is a string representation of the tree. # Value is a list of all nodes with the exact same string representation. 
From 7748fca2e91fe8558b4531ef41c8a9891bd6388d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:52:27 +0200 Subject: [PATCH 30/71] Add docstring to UserInputError --- engine/errors/UserInputError.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/engine/errors/UserInputError.py b/engine/errors/UserInputError.py index 8954f33..6df12d5 100644 --- a/engine/errors/UserInputError.py +++ b/engine/errors/UserInputError.py @@ -1,6 +1,10 @@ +"""Module containing the `UserInputError` exception class.""" + + class UserInputError(Exception): """ Exception representing invalid user input such as command line arguments. + Alternatively, this can also represented a problem caused by invalid user input further down the line. Simply put, the problem can / must be fixed by modifying the user input. From 6215a1b79ec4da985eb02716886bcd9429365de8 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:52:40 +0200 Subject: [PATCH 31/71] Fix docstyle in node classes --- engine/nodes/PatternNode.py | 18 +++++++++++------- engine/nodes/TreeNode.py | 15 +++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/engine/nodes/PatternNode.py b/engine/nodes/PatternNode.py index 6c07519..a475bb6 100644 --- a/engine/nodes/PatternNode.py +++ b/engine/nodes/PatternNode.py @@ -1,3 +1,5 @@ +"""Module containing the `PatternNode` class.""" + _HOLE = "Hole" @@ -13,7 +15,7 @@ class PatternNode: def __init__(self, node1, node2, value=None): """ - Creates a new PatternNode from two nodes and their common value. + Create a new PatternNode from two nodes and their common value. Arguments: node1 {TreeNode} -- First TreeNode sharing common skeleton. @@ -27,33 +29,35 @@ def __init__(self, node1, node2, value=None): def add_node(self, node): """ - Appends the supplied nodes to this node's list of origin nodes. + Append the supplied nodes to this node's list of origin nodes. Arguments: node {TreeNode} -- Node to be added to the list of origin nodes. 
+ """ self.nodes.append(node) def add_child(self, child): """ - Appends the supplied nodes to this node's list of child nodes. + Append the supplied nodes to this node's list of child nodes. Arguments: child {PatternNode} -- Node that is a child of this node. + """ self.children.append(child) def skeleton_equals(self, other): """ - Checks if this node's skeleton is equal to another node's. + Check if this node's skeleton is equal to another node's. Arguments: other {PatterNode} -- Another node to compare this one with. Returns: bool -- True if the nodes have an equal skeleton, False otherwise. - """ + """ if not isinstance(other, PatternNode) or other.value != self.value or \ len(other.children) != len(self.children): return False @@ -66,12 +70,12 @@ def skeleton_equals(self, other): def get_match_weight(self): """ - Calculates the weight of the matching skeleton of all origin nodes. + Calculate the weight of the matching skeleton of all origin nodes. Returns: int -- Weight of the matching skeleton. - """ + """ return 0 if self.value == _HOLE else \ (1 + sum([c.get_match_weight() for c in self.children])) diff --git a/engine/nodes/TreeNode.py b/engine/nodes/TreeNode.py index 130c4c5..ef23bb2 100644 --- a/engine/nodes/TreeNode.py +++ b/engine/nodes/TreeNode.py @@ -1,3 +1,5 @@ +"""Module containing the `TreeNode` class.""" + import ast _IGNORE_CLASSES = [ast.Load, ast.Store, ast.Del, @@ -6,7 +8,8 @@ class TreeNode: """ - Represents a single node of the Python code AST (Abstract Syntax Tree). + Represent a single node of the Python code AST (Abstract Syntax Tree). + Every node is also a tree of its own, with the exception of leaf (childless) nodes. @@ -24,7 +27,7 @@ class TreeNode: def __init__(self, node, origin_file): """ - Arguments: + Argument: node -- Single raw node produced by the Python AST parser. origin_file {string} -- Relative path to the source file. 
""" @@ -68,10 +71,11 @@ def __init__(self, node, origin_file): def dump(self): """ - Converts the node into a string using the built-in function. + Convert the node into a string using the built-in function. Returns: string -- String representation of the AST node. + """ return ast.dump(self.node) @@ -82,6 +86,7 @@ def get_all_children(self): Returns: list[TreeNode] -- List of all the recursively found children. + """ children = self.children.copy() @@ -92,7 +97,8 @@ def get_all_children(self): def __eq__(self, other): """ - Compares the node to another node recursively. + Compare the node to another node recursively. + This operator overload can be used for Type 1 clone detection. Arguments: @@ -100,6 +106,7 @@ def __eq__(self, other): Returns: bool -- True if the nodes are equivalent, False if they are not. + """ if not isinstance(other, TreeNode): return False From 93f55ba2c411b4fa9b5f4b1b9b8e245859534b52 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:52:56 +0200 Subject: [PATCH 32/71] Fix docstyle in preprocessing module --- engine/preprocessing/module_parser.py | 21 ++++++++++++++------- engine/preprocessing/repoinfo.py | 5 +++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index 7d879a0..fe9d993 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -1,3 +1,5 @@ +"""Module containing code used for parsing of modules and nodes from Python code.""" + import ast from os import listdir, path from os.path import isdir, isfile @@ -9,6 +11,7 @@ def _read_whole_file(file_path): """ Read a text file into a single string. + Assumes UTF-8 encoding. """ with open(file_path, "r", encoding="utf-8") as f: @@ -17,26 +20,28 @@ def _read_whole_file(file_path): def _read_ast_from_file(file_path): """ - Parses a module AST from the specified file. + Parse a module AST from the specified file. 
Arguments: file_path {string} -- Path of file to parse the AST from. Returns: AST parsed from the specified file. + """ return ast.parse(_read_whole_file(file_path)) def _get_tree_node_from_file(file_path): """ - Parses a TreeNode representing the module in the specified file. + Parse a TreeNode representing the module in the specified file. Arguments: file_path {string} -- Path of file to parse the TreeNode from. Returns: TreeNode -- TreeNode parsed from the specified file. + """ module_node = _read_ast_from_file(file_path) file_rel_path = file_path.replace(clone_root_dir, "...") @@ -45,12 +50,12 @@ def _get_tree_node_from_file(file_path): def _recursive_listdir_py(directory): """ - Returns relative paths of all *.py files in the specified directory. + Return relative paths of all *.py files in the specified directory. + If the provided argument is not a valid directory, an internal exception will be thrown by Python. That exception will most likely be NotImplementedError. """ - files = [] for item in listdir(directory): @@ -66,13 +71,14 @@ def _recursive_listdir_py(directory): def _flatten_module_nodes(module): """ - Converts a module TreeNode into a flat list of nodes in the module's AST. + Convert a module TreeNode into a flat list of nodes in the module's AST. Arguments: module {TreeNode} -- TreeNode representing a module root node. Returns: list[TreeNode] -- List of all the nodes in the module's AST. + """ module_nodes = [] node_queue = deque([module]) @@ -103,7 +109,8 @@ def _flatten_module_nodes(module): def get_modules_from_dir(directory): """ - Finds all *.py files in the specified directory recursively. + Find all *.py files in the specified directory recursively. + Every file is parsed as a module and converted into an AST. The parsed ASTs are converted into lists of all nodes in the ASTs. A list of all these lists is then constructed a returned. 
@@ -113,7 +120,7 @@ def get_modules_from_dir(directory): Returns: list[list[TreeNode]] -- List of lists of nodes from parsed modules. - """ + """ return [_flatten_module_nodes(_get_tree_node_from_file(f)) for f in _recursive_listdir_py(directory)] diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 614ceff..02e5464 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -1,3 +1,8 @@ +""" +Module containing the `RepoInfo` class, which is used to encapsulate +all available information about a repository into a single object. +""" + import re from os.path import isdir, dirname, join as path_join from git import Repo, InvalidGitRepositoryError, GitCommandError From 25b3b349e5d0126f877a700e81cb1db48856921d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:53:08 +0200 Subject: [PATCH 33/71] Fix docstyle in result classes --- engine/results/DetectedClone.py | 15 +++++---------- engine/results/DetectionResult.py | 14 ++++++-------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/engine/results/DetectedClone.py b/engine/results/DetectedClone.py index 0f72320..6279469 100644 --- a/engine/results/DetectedClone.py +++ b/engine/results/DetectedClone.py @@ -1,12 +1,9 @@ -""" -Module containing the `DetectedClone` class for -storing information about a single detected clone. -""" +"""Module containing the `DetectedClone` class for storing clone information.""" class DetectedClone: """ - Represents a single detected code clone. + Represent a single detected code clone. Similarity coefficient is a floating-point number between 0 and 1, where 0 means the subtrees are completely different and 1 means @@ -23,8 +20,7 @@ class DetectedClone: def __init__(self, value, match_weight, nodes): """ - Initializes a new detected clone - given its values and origin nodes. + Initialize a new detected clone given its values and origin nodes. 
Arguments: value {string} -- String representation common to all the nodes. @@ -32,14 +28,14 @@ def __init__(self, value, match_weight, nodes): nodes {list[TreeNode]} -- List of origin nodes. """ - self.value = value self.match_weight = match_weight self.origins = {n.origin: match_weight / n.weight for n in nodes} def dict(self): """ - Converts the detected clone into its dictionary representation. + Convert the detected clone into its dictionary representation. + This is necessary for later conversion to JSON, because there is no easy way to tell the JSON encoder how to encode instances of user-defined classes. @@ -49,5 +45,4 @@ def dict(self): including all of its attributes. """ - return self.__dict__ diff --git a/engine/results/DetectionResult.py b/engine/results/DetectionResult.py index 3f27edc..4778aa5 100644 --- a/engine/results/DetectionResult.py +++ b/engine/results/DetectionResult.py @@ -1,14 +1,11 @@ -""" -Module containing the `DetectionResult` class for storing -final results about a code clone detection run. -""" +"""Module containing the `DetectionResult` class for final results of detection.""" from json import dumps as json_dumps class DetectionResult: """ - Represents the final result of a detection query. + Represent the final result of a detection query. For now it only contains a list of detected clones, but more information may be added in the future. @@ -21,7 +18,8 @@ class DetectionResult: def __init__(self, clones): """ - Initializes a new detection result given the list of detected clones. + Initialize a new detection result given the list of detected clones. + The list of code clones will be copied and the copy will be sorted by the clones' weight of their matching subtrees. The original list of clones will not be modified in any way. @@ -30,13 +28,13 @@ def __init__(self, clones): clones {list[Detectedlone]} -- List of detected code clones. 
""" - self.clones = clones.copy() self.clones.sort(reverse=True, key=lambda c: c.match_weight) def json(self): """ - Converts the detection result into a JSON. + Convert the detection result into a JSON. + This includes information about all detected code clones. Returns: From 921908da9daa6da4b65dd60bbf626e024f73d5d8 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Thu, 1 Aug 2019 14:53:20 +0200 Subject: [PATCH 34/71] Fix docstyle in utils module --- engine/utils/benchmark.py | 8 ++++---- engine/utils/list_tools.py | 7 ++----- engine/utils/printing.py | 9 +++------ 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/engine/utils/benchmark.py b/engine/utils/benchmark.py index 62b143e..3b636dc 100644 --- a/engine/utils/benchmark.py +++ b/engine/utils/benchmark.py @@ -1,6 +1,4 @@ -""" -Module containing helper functions used for benchmarking. -""" +"""Module containing helper functions used for benchmarking.""" from time import time from fastlog import log @@ -10,11 +8,13 @@ def time_snap(text=None): """ - Prints the time since the last call to this function in seconds. + Print the time since the last call to this function in seconds. + It is possible to supply a message to print along with the time. Arguments: text {str} (optional) -- Message to print with the time. + """ global _last_time current_time = time() diff --git a/engine/utils/list_tools.py b/engine/utils/list_tools.py index 8d9b265..daea93b 100644 --- a/engine/utils/list_tools.py +++ b/engine/utils/list_tools.py @@ -1,11 +1,9 @@ -""" -Module containing helper functions for list manipulation. -""" +"""Module containing helper functions for list manipulation.""" def flatten(list_of_lists): """ - Flattens a list of list into a single flat list. + Flatten a list of list into a single flat list. Arguments: list_of_lists {list[list[T]]} -- List of lists to flatten. @@ -14,7 +12,6 @@ def flatten(list_of_lists): list[T] -- Flat list generated by flattening the source list of lists. 
""" - flat = [] for l in list_of_lists: diff --git a/engine/utils/printing.py b/engine/utils/printing.py index 197d034..eeab86e 100644 --- a/engine/utils/printing.py +++ b/engine/utils/printing.py @@ -1,11 +1,9 @@ -""" -Module containing methods for pretty-printing node trees. -""" +"""Module containing methods for pretty-printing node trees.""" def print_node_list(node_list): """ - Prints a list of TreeNodes for debugging + Print a list of TreeNodes for debugging Arguments: node_list (list[TreeNode]): a list of tree nodes @@ -18,7 +16,7 @@ def print_node_list(node_list): def print_node(node, indent, level, node_list): """ - Prints a TreeNode for debugging + Print a TreeNode for debugging Arguments: node (TreeNode): node to print @@ -27,7 +25,6 @@ def print_node(node, indent, level, node_list): node_list (list[TreeNode]): list of TreeNodes to reference children of TreeNode """ - print(indent, "(", level, ")", node) for index in node.child_indices: for node in node_list: From 0a4a6bcd1eb5c4f6026d5afa88a1ab089b812f18 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 10:23:23 +0200 Subject: [PATCH 35/71] Rename "/" web app route from hello() Somehow I have never noticed I left the function name unchanged. --- web/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/app.py b/web/app.py index 627a1f2..5b86446 100644 --- a/web/app.py +++ b/web/app.py @@ -115,7 +115,7 @@ def _get_repo_analysis(repo): # TODO: Add docstring. 
@app.route("/") -def hello(): +def web_index(): content = "" repo = request.args.get("repo") From 02a85934a777af12ee81165c27fa6702e70b91a6 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 14:03:42 +0200 Subject: [PATCH 36/71] Add reminder to fix PatterNode to-str conversion --- engine/nodes/PatternNode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/nodes/PatternNode.py b/engine/nodes/PatternNode.py index a475bb6..3a70e5f 100644 --- a/engine/nodes/PatternNode.py +++ b/engine/nodes/PatternNode.py @@ -80,6 +80,7 @@ def get_match_weight(self): (1 + sum([c.get_match_weight() for c in self.children])) def __str__(self): + # FIXME: This doesn't seem right. return f"{self.value}(', '.join{[n.origin for n in self.nodes]})" def __repr__(self): From 1c3c7d2327f0dad0c53747089e24d5bb54ae639a Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 16:39:23 +0200 Subject: [PATCH 37/71] Move results towards center of screen --- web/results.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/results.html b/web/results.html index 54bdfbd..4381fd1 100644 --- a/web/results.html +++ b/web/results.html @@ -1,4 +1,4 @@ -
+
Detected clones
#CLONES#
From 02f990db78de59d370fdc09ae52ccbfcf05eaa7e Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 16:39:42 +0200 Subject: [PATCH 38/71] Make all messages larger --- web/message.html | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/web/message.html b/web/message.html index 063a983..acc3da9 100644 --- a/web/message.html +++ b/web/message.html @@ -1,3 +1,6 @@ -
- #MSG# + +
+

+ #MSG# +

From 2f56dfad6d102facfe3bf8118dbbf9b7d7e2187d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:05:07 +0200 Subject: [PATCH 39/71] Split invalid state into errors; Impr. commits tab --- web/prepare_tables.pgsql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/web/prepare_tables.pgsql b/web/prepare_tables.pgsql index 67a8ecd..0cc6220 100644 --- a/web/prepare_tables.pgsql +++ b/web/prepare_tables.pgsql @@ -12,9 +12,10 @@ CREATE TABLE states ( ); INSERT INTO states (name, description) VALUES - ('queue', 'The repository has been added to the queue.'), - ('invalid', 'This is not a valid repository.'), - ('done', 'The repository has been successfully analyzed.'); + ('queue', 'The repository is already in the queue'), + ('err_clone', 'Error: Unable to clone the repository'), + ('err_analysis', 'Error: Repository analysis failed'), + ('done', 'The repository has been successfully analyzed'); CREATE INDEX states_name_index ON states (name); @@ -33,8 +34,7 @@ CREATE TABLE commits ( id SERIAL PRIMARY KEY, repo_id INTEGER REFERENCES repos(id) NOT NULL, hash TEXT NOT NULL, - finished BOOLEAN DEFAULT FALSE NOT NULL, - cloned_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL, + analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL, UNIQUE(repo_id, hash) ); From 8751ce2949c3522f347a1ec7484e4fd2dfa7127b Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:06:01 +0200 Subject: [PATCH 40/71] Implement NodeOrigin instead of using plain string --- engine/nodes/nodeorigin.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 engine/nodes/nodeorigin.py diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py new file mode 100644 index 0000000..2764ec6 --- /dev/null +++ b/engine/nodes/nodeorigin.py @@ -0,0 +1,23 @@ +class NodeOrigin: + def __init__(self, file_path, line=None, col_offset=None, node_id=None): + if file_path is None or \ + (node_id is None and (line is None or 
col_offset is None)) or \ + (node_id is not None and (line is not None or col_offset is not None)): + + raise ValueError( + "File path and either ID or both line and column offset must be set to a non-None value") + + self.file = file_path + self.line = line + self.col_offset = col_offset + self.id = node_id + + def __str__(self): + return self.file + (f" (ID: {self.id:x})" if self.id else + f" (L: {self.line} C: {self.col_offset})") + + def __repr__(self): + return self.__str__() + + def __hash__(self): + return hash(self.__str__()) From e1c25a8ba1fea9ff848979f3dee5deefcaf1c978 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:06:32 +0200 Subject: [PATCH 41/71] Make relative origin file paths less verbose --- engine/preprocessing/module_parser.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index fe9d993..318a09d 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -2,7 +2,7 @@ import ast from os import listdir, path -from os.path import isdir, isfile +from os.path import isdir, isfile, relpath from ..nodes.TreeNode import TreeNode from collections import deque from .repoinfo import clone_root_dir @@ -32,7 +32,7 @@ def _read_ast_from_file(file_path): return ast.parse(_read_whole_file(file_path)) -def _get_tree_node_from_file(file_path): +def _get_tree_node_from_file(file_path, repo_path): """ Parse a TreeNode representing the module in the specified file. @@ -43,9 +43,8 @@ def _get_tree_node_from_file(file_path): TreeNode -- TreeNode parsed from the specified file. 
""" - module_node = _read_ast_from_file(file_path) - file_rel_path = file_path.replace(clone_root_dir, "...") - return TreeNode(module_node, file_rel_path) + return TreeNode(_read_ast_from_file(file_path), + relpath(file_path, repo_path)) def _recursive_listdir_py(directory): @@ -122,5 +121,5 @@ def get_modules_from_dir(directory): list[list[TreeNode]] -- List of lists of nodes from parsed modules. """ - return [_flatten_module_nodes(_get_tree_node_from_file(f)) + return [_flatten_module_nodes(_get_tree_node_from_file(f, directory)) for f in _recursive_listdir_py(directory)] From 10b8958d5a72f672713b90cf689200c6ce384c88 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:06:56 +0200 Subject: [PATCH 42/71] Add to-str conversion to RepoInfo --- engine/preprocessing/repoinfo.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 02e5464..3962046 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -75,3 +75,11 @@ def parse_repo_info(repo_path): clone_dir = path_join(clone_root_dir, server, repo_user, repo_name) return RepoInfo(full_url, server, repo_user, repo_name, clone_dir) + + def __str__(self): + info_str = f"{self.url} -> {self.dir}" + + if self.hash: + info_str += f" (commit: {self.hash})" + + return info_str From e2acb668a1ca8339897334c2d6877502892a0b9e Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:07:29 +0200 Subject: [PATCH 43/71] Switch from raw string to NodeOrigin in TreeNode --- engine/nodes/TreeNode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/engine/nodes/TreeNode.py b/engine/nodes/TreeNode.py index ef23bb2..45b47f9 100644 --- a/engine/nodes/TreeNode.py +++ b/engine/nodes/TreeNode.py @@ -1,6 +1,7 @@ """Module containing the `TreeNode` class.""" import ast +from .nodeorigin import NodeOrigin _IGNORE_CLASSES = [ast.Load, ast.Store, ast.Del, ast.AugLoad, ast.AugStore, 
ast.Param] @@ -15,7 +16,7 @@ class TreeNode: Attributes: node {AST} -- Original AST node generated by Python's built-in parser. - origin {string} -- Origin of the node (file path, line and column). + origin {NodeOrigin} -- Origin of the node (file path, line and column). children {list[TreeNode]} -- List of direct children of this node. weight {int} -- Total number of nodes in this node's tree. names {list[string]} -- All names / symbols used in this node's tree. @@ -32,8 +33,8 @@ def __init__(self, node, origin_file): origin_file {string} -- Relative path to the source file. """ self.node = node - self.origin = origin_file + (f" (L:{node.lineno} C:{node.col_offset})" - if node._attributes else f" (ID:{id(node):x})") + self.origin = NodeOrigin(origin_file, node.lineno, node.col_offset) if \ + node._attributes else NodeOrigin(origin_file, node_id=id(node)) # Check if this type of node can have docstring. can_have_docstring = node.__class__ in [ast.ClassDef, ast.FunctionDef] From 5ac6754eee9bcfc880593eb686b6f8a8dcf6ec81 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:26:29 +0200 Subject: [PATCH 44/71] Rework web app to work with new database layout Added some basic error checks and made it more verbose in the process. 
--- web/app.py | 157 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 57 deletions(-) diff --git a/web/app.py b/web/app.py index 5b86446..f398b5c 100644 --- a/web/app.py +++ b/web/app.py @@ -2,11 +2,13 @@ import os.path from threading import Thread +from traceback import format_exc from flask import Flask, request from fastlog import log from psycopg2 import Error as PG_Error from easy_postgres import Connection as pg_conn from engine.preprocessing.repoinfo import RepoInfo +from engine.nodes.nodeorigin import NodeOrigin from engine.preprocessing.module_parser import get_modules_from_dir from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN from engine.errors.UserInputError import UserInputError @@ -14,6 +16,10 @@ app = Flask(__name__) +# Clean up the repository table +with pg_conn(db_url) as conn: + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'err_analysis') WHERE status = (SELECT id FROM states WHERE name = 'queue');""") + def _read_html(file_name): file_path = os.path.join(os.path.dirname(__file__), file_name + ".html") @@ -26,92 +32,129 @@ def _read_html(file_name): _RESULTS_HTML = _read_html("results") -def _analyze_repo(repo_path): +def _postgres_err(ex): + log.error(f"PostgreSQL: {ex}\n{format_exc()}") + + +def _pg_error_handler(ex, conn, repo_id): + _postgres_err(ex) + + if conn and repo_id is not None: + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'err_analysis') WHERE id = %s;""", + repo_id) + + +def _analyze_repo(repo_info, repo_id, algorithm=OXYGEN): + log.info(f"Analyzing repository: {repo_info}") + try: - db = pg_conn(db_url) + conn = pg_conn(db_url) - repo_info = RepoInfo.parse_repo_info(repo_path) + if repo_info.clone_or_pull(): + log.success( + f"Repository has been successfully cloned: {repo_info}") - if not repo_info.clone_or_pull(): - log.error("Unable to clone repository:", repo_path) - return + else: + log.warning(f"Unable to 
clone repository: {repo_info}") - modules = get_modules_from_dir(repo_info.dir) + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'err_clone') WHERE id = %s;""", + repo_id) - if not modules or not repo_info: - log.error("Unable to get the repository information") return - count = db.one("""SELECT COUNT(*) FROM repos WHERE url = %s OR dir = %s OR ("server" = %s AND "user" = %s AND "name" = %s);""", - repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) + modules = get_modules_from_dir(repo_info.dir) - if count: - log.warning("Repository already present in database:", repo_path) + if not modules: + log.warning("Repository contains no Python module") return - repo_id = db.one("""INSERT INTO repos ("url", "dir", "server", "user", "name") VALUES (%s, %s, %s, %s, %s) RETURNING id;""", - repo_info.url, repo_info.dir, repo_info.server, repo_info.user, repo_info.name) + result = run_single_repo(modules, algorithm) - commit_id = db.one("""INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", - repo_id, repo_info.hash) + # Insert repository analysis into database all at once + with conn.transaction(): + commit_id = conn.one("""INSERT INTO commits (repo_id, hash) VALUES (%s, %s) RETURNING id;""", + repo_id, repo_info.hash) - result = run_single_repo(modules, OXYGEN) + for c in result.clones: + cluster_id = conn.one("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", + commit_id, c.value, c.match_weight) - for c in result.clones: - cluster_id = db.one("""INSERT INTO clusters (commit_id, "value", weight) VALUES (%s, %s, %s) RETURNING id;""", - commit_id, c.value, c.match_weight) + for o, s in c.origins.items(): + conn.run("""INSERT INTO origins (cluster_id, file, line, col_offset, similarity) VALUES (%s, %s, %s, %s, %s);""", + cluster_id, o.file, o.line, o.col_offset, s) - for o, s in c.origins.items(): - db.one("""INSERT INTO clones (cluster_id, origin, similarity) VALUES 
(%s, %s, %s);""", - cluster_id, o, s) + log.success(f"Repository has been successfully analyzed: {repo_info}") - db.one("""UPDATE commits SET finished = TRUE WHERE id = %s;""", - commit_id) + conn.run("""UPDATE repos SET status = (SELECT id FROM states WHERE name = 'done') WHERE id = %s;""", + repo_id) except PG_Error as ex: - log.error("PostgreSQL: " + str(ex)) + _pg_error_handler(ex, conn, repo_id) + finally: + conn.close() -def _get_repo_analysis(repo): # TODO: Add docstring. - try: - db = pg_conn(db_url) - repos = db.all("""SELECT id FROM repos WHERE "url" = %(repo)s OR "name" = %(repo)s;""", - repo=repo) +def _find_repo_results(conn, repo_id): + commit_id = conn.one("""SELECT id FROM commits WHERE repo_id = %s ORDER BY analyzed_at DESC LIMIT 1;""", + repo_id) - if repos: - repo_id = repos[0] + if commit_id is None: + return "No commit has been analyzed yet for this repository" - commits = db.all("""SELECT id FROM commits WHERE finished AND repo_id = %s;""", - repo_id) + clusters = conn.all_dict("""SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", + commit_id) - if commits: - commit_id = commits[0] + for c in clusters: + c.origins = [(NodeOrigin(o.file, o.line, o.col_offset), o.similarity) for o in + conn.all_dict("""SELECT file, line, col_offset, similarity FROM origins WHERE cluster_id = %s;""", + c.id)] - clusters = db.all_dict("""SELECT id, "value", weight FROM clusters WHERE commit_id = %s;""", - commit_id) + return clusters - output = [] - for c in clusters: - clones = db.all_dict("""SELECT origin, similarity FROM clones WHERE cluster_id = %s;""", - c.id) +def _get_repo_analysis(repo_path): + repo_info = RepoInfo.parse_repo_info(repo_path) - output.append((c, clones)) + if not repo_info: + return "Invalid Git repository path format" - return output + try: + conn = pg_conn(db_url) - else: - return "Enqueued" + repo_id = conn.one("""INSERT INTO repos ("url", "server", "user", "name", "dir", "status") """ + + """VALUES (%s, %s, %s, %s, %s, 
(SELECT id FROM states WHERE name = 'queue')) """ + + """ON CONFLICT DO NOTHING RETURNING id;""", + repo_info.url, repo_info.server, repo_info.user, repo_info.name, repo_info.dir) + + if repo_id is not None: + Thread(target=_analyze_repo, args=(repo_info, repo_id)).start() + return "The repository has been added to the queue" + + repo = conn.one_dict("""SELECT repos.id, states.name AS "status_name", states.description AS "status_desc" """ + + """FROM repos JOIN states ON (repos.status = states.id) """ + + """WHERE repos.url = %s OR (repos.server = %s AND repos.user = %s AND repos.name = %s) OR repos.dir = %s;""", + repo_info.url, repo_info.server, repo_info.user, repo_info.name, repo_info.dir) + + # Theoretically, this should never happend, but it's better to check anyways. + if repo is None: + return "Database error" + + elif repo.status_name in {"queue", "err_clone", "err_analysis"}: + return repo.status_desc + + elif repo.status_name == "done": + return _find_repo_results(conn, repo.id) else: - thread = Thread(target=_analyze_repo, args=(repo,)) - thread.start() - return "Added to queue" + return "Unexpected repository status" except PG_Error as ex: - log.error("PostgreSQL: " + str(ex)) - return None + _pg_error_handler(ex, conn, repo_id) + return "Database error" + + finally: + conn.close() @app.route("/") @@ -124,17 +167,17 @@ def web_index(): result = _get_repo_analysis(repo) if isinstance(result, str): - content = _MESSAGE_HTML.replace("#MSG#", "Result: " + result) + content = _MESSAGE_HTML.replace("#MSG#", result) elif result: - clones = "
    " + "".join([("
  1. " + c[0].value + f" - Weight: {c[0].weight}" + "
      " + - "".join(["
    • " + o.origin + f" - Similarity: {o.similarity * 100:g} %" + "
    • " for o in c[1]]) + + clones = "
        " + "".join([(f"
      1. {c.value} - Weight: {c.weight}
          " + + "".join([f"
        • {o[0]} - Similarity: {o[1] * 100:g} %
        • " for o in c.origins]) + "

      2. ") for c in result]) + "
      " content = _RESULTS_HTML.replace("#CLONES#", clones) else: content = _MESSAGE_HTML.replace( - "#MSG#", "

      No code clones detected. Congratulations!

      ") + "#MSG#", "No code clones detected. Congratulations!") except UserInputError as ex: content = _MESSAGE_HTML.replace( From a6cbb90589b4fb6b3984c8ad2f7dcf283290ecf0 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Fri, 2 Aug 2019 17:42:43 +0200 Subject: [PATCH 45/71] Remove node ID from NodeOrigin; get it at runtime --- engine/nodes/TreeNode.py | 4 ++-- engine/nodes/nodeorigin.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/engine/nodes/TreeNode.py b/engine/nodes/TreeNode.py index 45b47f9..43be67d 100644 --- a/engine/nodes/TreeNode.py +++ b/engine/nodes/TreeNode.py @@ -33,8 +33,8 @@ def __init__(self, node, origin_file): origin_file {string} -- Relative path to the source file. """ self.node = node - self.origin = NodeOrigin(origin_file, node.lineno, node.col_offset) if \ - node._attributes else NodeOrigin(origin_file, node_id=id(node)) + self.origin = NodeOrigin(origin_file, node.lineno, node.col_offset) \ + if node._attributes else NodeOrigin(origin_file) # Check if this type of node can have docstring. 
can_have_docstring = node.__class__ in [ast.ClassDef, ast.FunctionDef] diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py index 2764ec6..aadc804 100644 --- a/engine/nodes/nodeorigin.py +++ b/engine/nodes/nodeorigin.py @@ -1,23 +1,23 @@ class NodeOrigin: - def __init__(self, file_path, line=None, col_offset=None, node_id=None): - if file_path is None or \ - (node_id is None and (line is None or col_offset is None)) or \ - (node_id is not None and (line is not None or col_offset is not None)): + def __init__(self, file_path, line=None, col_offset=None): + if file_path is None: + raise ValueError( + "File path must always be set to a non-None value") + if line is None != col_offset is None: raise ValueError( - "File path and either ID or both line and column offset must be set to a non-None value") + "Either both line number and column offset must be set or neither") self.file = file_path self.line = line self.col_offset = col_offset - self.id = node_id def __str__(self): - return self.file + (f" (ID: {self.id:x})" if self.id else - f" (L: {self.line} C: {self.col_offset})") + return self.file + (f" (L: {self.line} C: {self.col_offset})" + if self.line and self.col_offset else "") def __repr__(self): return self.__str__() def __hash__(self): - return hash(self.__str__()) + return hash(id(self)) From 616ce8eba05115ffd1ac8c1cec5220a6384cb65f Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 08:54:31 +0200 Subject: [PATCH 46/71] Change all module names to lower-case / snake_case --- engine/errors/{UserInputError.py => user_input.py} | 0 engine/nodes/{PatternNode.py => pattern.py} | 0 engine/nodes/{TreeNode.py => tree.py} | 0 engine/results/{DetectedClone.py => detected_clone.py} | 0 engine/results/{DetectionResult.py => detection_result.py} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename engine/errors/{UserInputError.py => user_input.py} (100%) rename engine/nodes/{PatternNode.py => pattern.py} (100%) rename 
engine/nodes/{TreeNode.py => tree.py} (100%) rename engine/results/{DetectedClone.py => detected_clone.py} (100%) rename engine/results/{DetectionResult.py => detection_result.py} (100%) diff --git a/engine/errors/UserInputError.py b/engine/errors/user_input.py similarity index 100% rename from engine/errors/UserInputError.py rename to engine/errors/user_input.py diff --git a/engine/nodes/PatternNode.py b/engine/nodes/pattern.py similarity index 100% rename from engine/nodes/PatternNode.py rename to engine/nodes/pattern.py diff --git a/engine/nodes/TreeNode.py b/engine/nodes/tree.py similarity index 100% rename from engine/nodes/TreeNode.py rename to engine/nodes/tree.py diff --git a/engine/results/DetectedClone.py b/engine/results/detected_clone.py similarity index 100% rename from engine/results/DetectedClone.py rename to engine/results/detected_clone.py diff --git a/engine/results/DetectionResult.py b/engine/results/detection_result.py similarity index 100% rename from engine/results/DetectionResult.py rename to engine/results/detection_result.py From 2b1297dfcf42e92016edffc0af6db321b49ee4c0 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 08:58:00 +0200 Subject: [PATCH 47/71] Update imports to new lower-case module names --- cli/app.py | 2 +- cli/args_handler.py | 2 +- engine/algorithms/algorithm_runner.py | 2 +- engine/algorithms/chlorine/chlorine.py | 4 ++-- engine/algorithms/iodine/anti_unification.py | 2 +- engine/algorithms/iodine/iodine.py | 6 +++--- engine/algorithms/oxygen/oxygen.py | 4 ++-- engine/preprocessing/module_parser.py | 2 +- web/app.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cli/app.py b/cli/app.py index 96371c3..a6210cd 100644 --- a/cli/app.py +++ b/cli/app.py @@ -6,7 +6,7 @@ from engine.algorithms.algorithm_runner import run_two_repos, IODINE from engine.utils.benchmark import time_snap from fastlog import log -from engine.errors.UserInputError import UserInputError +from 
engine.errors.user_input import UserInputError def main(): diff --git a/cli/args_handler.py b/cli/args_handler.py index bb16f4a..4113a87 100644 --- a/cli/args_handler.py +++ b/cli/args_handler.py @@ -1,5 +1,5 @@ from os.path import isdir -from engine.errors.UserInputError import UserInputError +from engine.errors.user_input import UserInputError from engine.preprocessing.repoinfo import RepoInfo diff --git a/engine/algorithms/algorithm_runner.py b/engine/algorithms/algorithm_runner.py index 4968387..a785c22 100644 --- a/engine/algorithms/algorithm_runner.py +++ b/engine/algorithms/algorithm_runner.py @@ -3,7 +3,7 @@ from .oxygen.oxygen import oxygen from .chlorine.chlorine import chlorine_single_repo, chlorine_two_repos from .iodine.iodine import iodine -from ..errors.UserInputError import UserInputError +from ..errors.user_input import UserInputError from . import OXYGEN, IODINE, CHLORINE diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index eaf14a5..8dd13e7 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -1,8 +1,8 @@ from collections import defaultdict from ...utils.benchmark import time_snap from ...utils.list_tools import flatten -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult # Minimum weight of a single node used in comparison. 
_MIN_NODE_WEIGHT = 50 diff --git a/engine/algorithms/iodine/anti_unification.py b/engine/algorithms/iodine/anti_unification.py index 958a5c0..8ec3917 100644 --- a/engine/algorithms/iodine/anti_unification.py +++ b/engine/algorithms/iodine/anti_unification.py @@ -1,4 +1,4 @@ -from ...nodes.PatternNode import PatternNode +from ...nodes.pattern import PatternNode def anti_unify(list1, list2, index1, index2, worktable): diff --git a/engine/algorithms/iodine/iodine.py b/engine/algorithms/iodine/iodine.py index 24bbe4d..023ffe7 100644 --- a/engine/algorithms/iodine/iodine.py +++ b/engine/algorithms/iodine/iodine.py @@ -1,6 +1,6 @@ from .pattern_collection import pattern_collection -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult def iodine(module_list_1, module_list_2): @@ -16,7 +16,7 @@ def iodine(module_list_1, module_list_2): DetectionResult -- Result of the code clone detection. 
""" - + clusters = [] for module_tree_1 in module_list_1: for module_tree_2 in module_list_2: diff --git a/engine/algorithms/oxygen/oxygen.py b/engine/algorithms/oxygen/oxygen.py index 3c5c470..e360a24 100644 --- a/engine/algorithms/oxygen/oxygen.py +++ b/engine/algorithms/oxygen/oxygen.py @@ -1,5 +1,5 @@ -from ...results.DetectedClone import DetectedClone -from ...results.DetectionResult import DetectionResult +from ...results.detected_clone import DetectedClone +from ...results.detection_result import DetectionResult def oxygen(modules, weight_limit=15): diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index 318a09d..8e250ff 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -3,7 +3,7 @@ import ast from os import listdir, path from os.path import isdir, isfile, relpath -from ..nodes.TreeNode import TreeNode +from ..nodes.tree import TreeNode from collections import deque from .repoinfo import clone_root_dir diff --git a/web/app.py b/web/app.py index f398b5c..76dc84f 100644 --- a/web/app.py +++ b/web/app.py @@ -11,7 +11,7 @@ from engine.nodes.nodeorigin import NodeOrigin from engine.preprocessing.module_parser import get_modules_from_dir from engine.algorithms.algorithm_runner import run_single_repo, OXYGEN -from engine.errors.UserInputError import UserInputError +from engine.errors.user_input import UserInputError from .credentials import db_url app = Flask(__name__) From 82890523e981e4dba92419337f3fcea1e059c9ce Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 09:33:20 +0200 Subject: [PATCH 48/71] Add missing trailing newlines to docstrings --- engine/algorithms/chlorine/chlorine.py | 3 +++ engine/algorithms/iodine/anti_unification.py | 1 + engine/algorithms/iodine/pattern_collection.py | 4 +++- engine/errors/user_input.py | 1 + engine/nodes/pattern.py | 2 ++ engine/nodes/tree.py | 2 ++ 6 files changed, 12 insertions(+), 1 deletion(-) diff --git 
a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 8dd13e7..19d81c0 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -56,6 +56,7 @@ def _type1_compare(node1, node2): Returns: int -- Weight of the matching subtrees. string -- Common skeleton of the two nodes. + """ combined_weight = node1.weight + node2.weight @@ -90,6 +91,7 @@ def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): ignore_set {set[TreeNode]} -- Set of nodes to ignore. match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. + """ if not _can_be_compared(n1, n2): return @@ -114,6 +116,7 @@ def _dict_to_result(match_dict, skeleton_weight_dict): Arguments: match_dict {dict[string: set[TreeNode]]} -- Origin nodes of matches. skeleton_weight_dict {dict[string: int]} -- Skeleton weights. + """ clones = [] diff --git a/engine/algorithms/iodine/anti_unification.py b/engine/algorithms/iodine/anti_unification.py index 8ec3917..a0c6621 100644 --- a/engine/algorithms/iodine/anti_unification.py +++ b/engine/algorithms/iodine/anti_unification.py @@ -11,6 +11,7 @@ def anti_unify(list1, list2, index1, index2, worktable): index1 {int} -- index of current TreeNode to be compared from list1 index2 {int} -- index of current TreeNode to be compared from list2 worktable {2D boolean array} -- keeps track of which two nodes have been checked together + """ # mark the pair as checked/True worktable[index1].set(True, index2) diff --git a/engine/algorithms/iodine/pattern_collection.py b/engine/algorithms/iodine/pattern_collection.py index 68b094d..4ffdc47 100644 --- a/engine/algorithms/iodine/pattern_collection.py +++ b/engine/algorithms/iodine/pattern_collection.py @@ -13,6 +13,7 @@ def pattern_collection(tree_list_1, tree_list_2): tree_list_2 {list[TreeNode]}: A TreeNode tree represented as a list Returns: list[list[PatternNode]]: The 
clustered patterns identified in the repositories + """ # Get the sizes of the trees size_tree_1 = len(tree_list_1) @@ -37,7 +38,8 @@ def pattern_collection(tree_list_1, tree_list_2): # if the root nodes of the subtrees are equal if tree_list_1[i] == tree_list_2[j]: # Add the results of anti-unify to the list of subtrees - pats[i].append(anti_unify(tree_list_1, tree_list_2, i, j, work_list)) + pats[i].append(anti_unify( + tree_list_1, tree_list_2, i, j, work_list)) # for every set of patterns (one per node in the first tree) for pattern_set in pats: # run the clustering function on the pattern set diff --git a/engine/errors/user_input.py b/engine/errors/user_input.py index 6df12d5..d947d8c 100644 --- a/engine/errors/user_input.py +++ b/engine/errors/user_input.py @@ -12,6 +12,7 @@ class UserInputError(Exception): Attributes: message {string} -- Error message to print. code {int} -- Exit code to use. + """ def __init__(self, message, code=1): diff --git a/engine/nodes/pattern.py b/engine/nodes/pattern.py index 3a70e5f..b7096cd 100644 --- a/engine/nodes/pattern.py +++ b/engine/nodes/pattern.py @@ -11,6 +11,7 @@ class PatternNode: nodes {list[TreeNode]} -- List of TreeNodes with the same skeleton. value {string} -- Common string representation of all the nodes. children {list[PatternNode]} -- List of node's direct children. + """ def __init__(self, node1, node2, value=None): @@ -22,6 +23,7 @@ def __init__(self, node1, node2, value=None): node2 {TreeNode} -- Second TreeNode sharing common skeleton. value {string} -- String representation common for all the nodes. None if the PatternNode represents a hole. + """ self.nodes = [node1, node2] self.value = value or _HOLE diff --git a/engine/nodes/tree.py b/engine/nodes/tree.py index 43be67d..9fdca67 100644 --- a/engine/nodes/tree.py +++ b/engine/nodes/tree.py @@ -24,6 +24,7 @@ class TreeNode: index {int} -- Index of this node (in an external flat list of nodes). parent_index {int} -- Index of parent node. 
None if this is root node. child_indices {list[int]} -- Indices of this node's direct children. + """ def __init__(self, node, origin_file): @@ -31,6 +32,7 @@ def __init__(self, node, origin_file): Argument: node -- Single raw node produced by the Python AST parser. origin_file {string} -- Relative path to the source file. + """ self.node = node self.origin = NodeOrigin(origin_file, node.lineno, node.col_offset) \ From 5800695ab22b7c9b7325f94e90e1ad383d27278e Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 09:38:36 +0200 Subject: [PATCH 49/71] Add docstrings to NodeOrigin --- engine/nodes/nodeorigin.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py index aadc804..8f70418 100644 --- a/engine/nodes/nodeorigin.py +++ b/engine/nodes/nodeorigin.py @@ -1,5 +1,32 @@ class NodeOrigin: + """ + Class representing the origin of an AST node. + + Attributes: + file {string} -- Source file from which the node originates. + line {int|None} -- Line number at which the node was found. + col_offset {int|None} -- Column offset within the line. + Number of characters on the same + line before the node's token. + + """ + def __init__(self, file_path, line=None, col_offset=None): + """ + Initialize a new node origin instance. + + Arguments: + file_path {string} -- Path to the node's source file. + + Keyword Arguments: + line {int} -- Line number of node's origin. (default: {None}) + col_offset {int} -- Column offset of node. (default: {None}) + + Raises: + ValueError -- When file path is None or when only one of the two + source position specifiers is not None. 
+ + """ if file_path is None: raise ValueError( "File path must always be set to a non-None value") @@ -13,11 +40,21 @@ def __init__(self, file_path, line=None, col_offset=None): self.col_offset = col_offset def __str__(self): + """Convert the node origin into a human-readable string representation.""" return self.file + (f" (L: {self.line} C: {self.col_offset})" if self.line and self.col_offset else "") def __repr__(self): + """Return a string representation of the node origin.""" return self.__str__() def __hash__(self): + """ + Get hash of the node origin. + + The `id` of the node origin is used right now, so two equivalent + node origins may not necessarily have the same hash. + That would be a problem normally, but it works fine in this project. + + """ return hash(id(self)) From 8fe30e93ba4abd909aceba42a197fb695a0f02ff Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 09:55:00 +0200 Subject: [PATCH 50/71] Add package docstrings to algorithms --- engine/algorithms/__init__.py | 2 ++ engine/algorithms/chlorine/__init__.py | 1 + engine/algorithms/iodine/__init__.py | 1 + engine/algorithms/oxygen/__init__.py | 1 + 4 files changed, 5 insertions(+) diff --git a/engine/algorithms/__init__.py b/engine/algorithms/__init__.py index 79bf6e9..9e0a20a 100644 --- a/engine/algorithms/__init__.py +++ b/engine/algorithms/__init__.py @@ -1,3 +1,5 @@ +"""Package containing all implemented clone detection algorithms.""" + OXYGEN = "oxygen" CHLORINE = "chlorine" IODINE = "iodine" diff --git a/engine/algorithms/chlorine/__init__.py b/engine/algorithms/chlorine/__init__.py index e69de29..e3e636e 100644 --- a/engine/algorithms/chlorine/__init__.py +++ b/engine/algorithms/chlorine/__init__.py @@ -0,0 +1 @@ +"""Package containing the Chlorine algorithm and its helper functions.""" diff --git a/engine/algorithms/iodine/__init__.py b/engine/algorithms/iodine/__init__.py index e69de29..24aca77 100644 --- a/engine/algorithms/iodine/__init__.py +++ 
b/engine/algorithms/iodine/__init__.py @@ -0,0 +1 @@ +"""Package containing the Iodine algorithm and its helper functions.""" diff --git a/engine/algorithms/oxygen/__init__.py b/engine/algorithms/oxygen/__init__.py index e69de29..781d02f 100644 --- a/engine/algorithms/oxygen/__init__.py +++ b/engine/algorithms/oxygen/__init__.py @@ -0,0 +1 @@ +"""Package containing the Oxygen algorithm and its helper functions.""" From 338dfce9ff7ef28fe76d8ad6face287ef1618939 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:01:18 +0200 Subject: [PATCH 51/71] Add missing package docstrings to engine With the exception of engine/nodes/. --- engine/__init__.py | 1 + engine/errors/__init__.py | 1 + engine/preprocessing/__init__.py | 1 + engine/results/__init__.py | 1 + 4 files changed, 4 insertions(+) diff --git a/engine/__init__.py b/engine/__init__.py index e69de29..45b35a3 100644 --- a/engine/__init__.py +++ b/engine/__init__.py @@ -0,0 +1 @@ +"""Package containing the entire code clone detection engine.""" diff --git a/engine/errors/__init__.py b/engine/errors/__init__.py index e69de29..450858f 100644 --- a/engine/errors/__init__.py +++ b/engine/errors/__init__.py @@ -0,0 +1 @@ +"""Package containing all custom exceptions used by the application.""" diff --git a/engine/preprocessing/__init__.py b/engine/preprocessing/__init__.py index e69de29..7301afe 100644 --- a/engine/preprocessing/__init__.py +++ b/engine/preprocessing/__init__.py @@ -0,0 +1 @@ +"""Package containing functions and classes for source code preprocessing.""" diff --git a/engine/results/__init__.py b/engine/results/__init__.py index e69de29..b4d050a 100644 --- a/engine/results/__init__.py +++ b/engine/results/__init__.py @@ -0,0 +1 @@ +"""Package containing classes for storing clone detection results.""" From a0ce2a21769d2219d4b53fe073d2eff827342976 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:03:15 +0200 Subject: [PATCH 52/71] Add missing docstrings to engine/nodes/ --- 
engine/nodes/__init__.py | 1 + engine/nodes/nodeorigin.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/engine/nodes/__init__.py b/engine/nodes/__init__.py index e69de29..d219c1f 100644 --- a/engine/nodes/__init__.py +++ b/engine/nodes/__init__.py @@ -0,0 +1 @@ +"""Package containing classes for various AST node representations and their metadata.""" diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py index 8f70418..288182b 100644 --- a/engine/nodes/nodeorigin.py +++ b/engine/nodes/nodeorigin.py @@ -1,3 +1,6 @@ +"""Module containing the `NodeOrigin` class used to store node origin info.""" + + class NodeOrigin: """ Class representing the origin of an AST node. @@ -51,10 +54,10 @@ def __repr__(self): def __hash__(self): """ Get hash of the node origin. - + The `id` of the node origin is used right now, so two equivalent node origins may not necessarily have the same hash. That would be a problem normally, but it works fine in this project. - + """ return hash(id(self)) From fda89b7cf390eeff7212ee43d85cfe904528aae6 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:07:31 +0200 Subject: [PATCH 53/71] Add ctor docstring to UserInputError --- engine/errors/user_input.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/engine/errors/user_input.py b/engine/errors/user_input.py index d947d8c..0ad1cf0 100644 --- a/engine/errors/user_input.py +++ b/engine/errors/user_input.py @@ -16,5 +16,13 @@ class UserInputError(Exception): """ def __init__(self, message, code=1): + """ + Initialize a new user input error instance. + + Arguments: + message {string} -- Message to display. + code {int} -- Preferred exit code (only if application exits). 
+ + """ self.message = message self.code = code From 85ca144175703ca394628922241ab65a649e3257 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:08:52 +0200 Subject: [PATCH 54/71] Add docstring to utils package --- engine/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py index e69de29..0b34439 100644 --- a/engine/utils/__init__.py +++ b/engine/utils/__init__.py @@ -0,0 +1 @@ +"""Package containing helper functions that do not fit into any existing category.""" \ No newline at end of file From 4bc5822c18bf3f9d817d0e836e31101dd11925a3 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:15:23 +0200 Subject: [PATCH 55/71] Partially fix docstrings in Iodine package --- engine/algorithms/iodine/anti_unification.py | 2 +- engine/algorithms/iodine/iodine.py | 3 ++- engine/algorithms/iodine/pattern_clustering.py | 13 +++++++++---- engine/algorithms/iodine/pattern_collection.py | 3 ++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/engine/algorithms/iodine/anti_unification.py b/engine/algorithms/iodine/anti_unification.py index a0c6621..653cb8b 100644 --- a/engine/algorithms/iodine/anti_unification.py +++ b/engine/algorithms/iodine/anti_unification.py @@ -3,7 +3,7 @@ def anti_unify(list1, list2, index1, index2, worktable): """ - Create a tree of PatternNodes from two lists + Create a tree of PatternNodes from two lists. 
Arguments: list1 {list of TreeNodes} -- first tree to be compared diff --git a/engine/algorithms/iodine/iodine.py b/engine/algorithms/iodine/iodine.py index 023ffe7..bdc9caa 100644 --- a/engine/algorithms/iodine/iodine.py +++ b/engine/algorithms/iodine/iodine.py @@ -1,3 +1,5 @@ +"""Module containing the Iodine algorithm's interface.""" + from .pattern_collection import pattern_collection from ...results.detected_clone import DetectedClone from ...results.detection_result import DetectionResult @@ -16,7 +18,6 @@ def iodine(module_list_1, module_list_2): DetectionResult -- Result of the code clone detection. """ - clusters = [] for module_tree_1 in module_list_1: for module_tree_2 in module_list_2: diff --git a/engine/algorithms/iodine/pattern_clustering.py b/engine/algorithms/iodine/pattern_clustering.py index 9360fe8..29aff24 100644 --- a/engine/algorithms/iodine/pattern_clustering.py +++ b/engine/algorithms/iodine/pattern_clustering.py @@ -1,9 +1,14 @@ def clustering(ps): """ - :param ps: a set of patterns - :return: a set of clustered-patterns - """ - + Perform pattern clustering and return clusters. 
+ + Arguments: + ps -- a set of patterns + + Returns: + a set of clustered-patterns + + """ cs = [] # initialize the set for p in ps: # iterate through the patterns in the set of patterns merged = False diff --git a/engine/algorithms/iodine/pattern_collection.py b/engine/algorithms/iodine/pattern_collection.py index 4ffdc47..1e0fc1b 100644 --- a/engine/algorithms/iodine/pattern_collection.py +++ b/engine/algorithms/iodine/pattern_collection.py @@ -12,7 +12,8 @@ def pattern_collection(tree_list_1, tree_list_2): tree_list_1 {list[TreeNode]}: A TreeNode tree represented as a list tree_list_2 {list[TreeNode]}: A TreeNode tree represented as a list - Returns: list[list[PatternNode]]: The clustered patterns identified in the repositories + Returns: + list[list[PatternNode]]: The clustered patterns identified in the repositories """ # Get the sizes of the trees From 7f9ce2db82d14a2716c8a5dad8948621a0a3d7df Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:20:59 +0200 Subject: [PATCH 56/71] Fix docstrings in Chlorine --- engine/algorithms/chlorine/chlorine.py | 15 +++++++++------ engine/errors/user_input.py | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 19d81c0..61a86a0 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -1,3 +1,5 @@ +"""Module containing implementation of the Chlorine algorithm.""" + from collections import defaultdict from ...utils.benchmark import time_snap from ...utils.list_tools import flatten @@ -46,8 +48,9 @@ def _can_be_compared(node1, node2): def _type1_compare(node1, node2): """ - Compare two nodes and returns the weight of their matching subtrees - and a skeleton string representing their common syntax tree skeleton. + Compare two nodes and return the weight of their matching subtree. + + Also return a string representing their common syntax tree skeleton. 
Arguments: node1 {TreeNode} -- First node. @@ -82,8 +85,7 @@ def _type1_compare(node1, node2): def _compare_internal(n1, n2, ignore_set, match_dict, skeleton_weight_dict): """ - Common logic shared by single-repo analysis and - two repository comparison mode. + Run common logic shared by single-repo analysis and 2-repo comparison mode. Arguments: n1 {TreeNode} -- First node. @@ -130,8 +132,9 @@ def _dict_to_result(match_dict, skeleton_weight_dict): def chlorine_single_repo(modules): """ - Find all clones satisfying the settings at the top of this source file - in a single repository given its modules. + Find all clones in a single repository given its modules. + + Clones must satisfy the settings at the top of this source file. Detected code clones are printed on STDOUT, including the common skeleton, path to each clones (source file path, line number, column offset), size of each clone (number of nodes in its syntax tree) and their diff --git a/engine/errors/user_input.py b/engine/errors/user_input.py index 0ad1cf0..ae6b4cb 100644 --- a/engine/errors/user_input.py +++ b/engine/errors/user_input.py @@ -18,11 +18,11 @@ class UserInputError(Exception): def __init__(self, message, code=1): """ Initialize a new user input error instance. - + Arguments: message {string} -- Message to display. code {int} -- Preferred exit code (only if application exits). 
- + """ self.message = message self.code = code From 1a037240e387eafcd6a248198cad3c3cde7a4c70 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:23:46 +0200 Subject: [PATCH 57/71] Fix docstrings in Oxygen --- engine/algorithms/oxygen/oxygen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/engine/algorithms/oxygen/oxygen.py b/engine/algorithms/oxygen/oxygen.py index e360a24..163bf6c 100644 --- a/engine/algorithms/oxygen/oxygen.py +++ b/engine/algorithms/oxygen/oxygen.py @@ -1,10 +1,12 @@ +"""Module containing logic and interface of the Oxygen algorithm.""" + from ...results.detected_clone import DetectedClone from ...results.detection_result import DetectionResult def oxygen(modules, weight_limit=15): """ - Very simple type 1 code duplication check based on AST.dump() function. + Run basic type 1 code duplication check based on AST.dump() function. Arguments: modules (list[list[TreeNode]): Modules in locally standardized format. From 1865a13ad69937b568accf77077d2437e388243f Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 10:31:54 +0200 Subject: [PATCH 58/71] Fix docstrings in nodes package --- engine/nodes/pattern.py | 2 ++ engine/nodes/tree.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/engine/nodes/pattern.py b/engine/nodes/pattern.py index b7096cd..441c211 100644 --- a/engine/nodes/pattern.py +++ b/engine/nodes/pattern.py @@ -82,8 +82,10 @@ def get_match_weight(self): (1 + sum([c.get_match_weight() for c in self.children])) def __str__(self): + """Convert the pattern node into a human-readable string.""" # FIXME: This doesn't seem right. 
return f"{self.value}(', '.join{[n.origin for n in self.nodes]})" def __repr__(self): + """Return string representation of the pattern node.""" return self.__str__() diff --git a/engine/nodes/tree.py b/engine/nodes/tree.py index 9fdca67..825da21 100644 --- a/engine/nodes/tree.py +++ b/engine/nodes/tree.py @@ -9,7 +9,7 @@ class TreeNode: """ - Represent a single node of the Python code AST (Abstract Syntax Tree). + Represents a single node of the Python code AST (Abstract Syntax Tree). Every node is also a tree of its own, with the exception of leaf (childless) nodes. @@ -29,7 +29,9 @@ class TreeNode: def __init__(self, node, origin_file): """ - Argument: + Initialize a new tree node instance. + + Arguments: node -- Single raw node produced by the Python AST parser. origin_file {string} -- Relative path to the source file. @@ -84,8 +86,7 @@ def dump(self): def get_all_children(self): """ - Recursively finds all children of the node - and collects them into a single list. + Find all children of the node recursively and collect them into a single list. Returns: list[TreeNode] -- List of all the recursively found children. @@ -127,13 +128,23 @@ def __eq__(self, other): return True def __ne__(self, other): + """Check if this node is not equal to another tree node.""" return not self.__eq__(other) def __str__(self): + """Convert the tree node into a human-readable string.""" return f"{self.origin} - {self.value} (W={self.weight})" def __repr__(self): + """Return string representation of this tree node.""" return self.__str__() def __hash__(self): + """ + Get the tree node's hash. + + The origin's hash is used for the whole node, + so if two different nodes somehow have the same origin, + it will cause the nodes to be treated as equal by hash-based types. 
+ """ return hash(self.origin) From fc8288d5ad6fab3baa76adbad8553b6d086e559e Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 11:00:03 +0200 Subject: [PATCH 59/71] Make .json, .err and .log rules more specific --- .gitignore | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 61917ae..fcd1c7c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,6 @@ engine/repos/ venv/ web/credentials.py -**/*.json -*.err -*.log +clones_*.json +qa/*.err +qa/*.log From 83ce041c9b7444a3585c0b719b8450683a2c3c90 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 11:02:33 +0200 Subject: [PATCH 60/71] Add missing periods (.) to printing.py docstrings --- engine/utils/printing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/utils/printing.py b/engine/utils/printing.py index eeab86e..35cdd69 100644 --- a/engine/utils/printing.py +++ b/engine/utils/printing.py @@ -3,7 +3,7 @@ def print_node_list(node_list): """ - Print a list of TreeNodes for debugging + Print a list of TreeNodes for debugging. Arguments: node_list (list[TreeNode]): a list of tree nodes @@ -16,7 +16,7 @@ def print_node_list(node_list): def print_node(node, indent, level, node_list): """ - Print a TreeNode for debugging + Print a TreeNode for debugging. 
Arguments: node (TreeNode): node to print From 862edb524ac24b683616d0924a9203ac150eadfe Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 11:14:53 +0200 Subject: [PATCH 61/71] Add missing docstrings to RepoInfo --- engine/preprocessing/repoinfo.py | 46 +++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 3962046..47bf8d4 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -1,7 +1,4 @@ -""" -Module containing the `RepoInfo` class, which is used to encapsulate -all available information about a repository into a single object. -""" +"""Module containing the `RepoInfo` class.""" import re from os.path import isdir, dirname, join as path_join @@ -14,7 +11,32 @@ class RepoInfo: + """ + Encapsulates all available information about a repository into a single object. + + Attributes: + url {string} -- Full remote source URL of the repository. + server {string} -- Name of the source server (e.g., "github.com"). + user {string} -- Username of the repository owner. + name {string} -- Name of the repository on the server. + dir {string} -- Path to the local clone of the repository. + hash {string} -- Hash of the last pulled commit. + + """ + def __init__(self, url, server, user, name, local_dir, commit_hash=None): + """ + Initialize a new repository information object. + + Arguments: + url {string} -- Full remote source URL of the repository. + server {string} -- Name of the source server (e.g., "github.com"). + user {string} -- Username of the repository owner. + name {string} -- Name of the repository on the server. + local_dir {string} -- Path to the local clone of the repository. + commit_hash {string} -- Hash of the last pulled commit. 
+ + """ self.url = url self.server = server self.user = user @@ -23,6 +45,7 @@ def __init__(self, url, server, user, name, local_dir, commit_hash=None): self.hash = commit_hash def clone_or_pull(self): + """Clone the repository or pull it if it has already been cloned.""" try: # If repo dir already exists, pull it. if isdir(self.dir): @@ -45,6 +68,16 @@ def clone_or_pull(self): @staticmethod def parse_repo_info(repo_path): + """ + Parse repository information from a repository path. + + There are two valid repository path formats: + - Full remote repository URL (supports both GitHub and GitLab). + "https://github.com/user/repo" + - Short GitHub repository URL (only works with GitHub). + "user/repo" + + """ try: parts = urlparse(repo_path) except ValueError: @@ -77,9 +110,14 @@ def parse_repo_info(repo_path): return RepoInfo(full_url, server, repo_user, repo_name, clone_dir) def __str__(self): + """Convert the most useful repo info into a human-readable string.""" info_str = f"{self.url} -> {self.dir}" if self.hash: info_str += f" (commit: {self.hash})" return info_str + + def __repr__(self): + """Return string representation of the repository information.""" + return self.__str__() From 65f0165d5c59a6eaab53f1ff003bcc97d43a0405 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 11:28:02 +0200 Subject: [PATCH 62/71] Add missing docstrings to CLI package --- cli/__init__.py | 1 + cli/__main__.py | 7 +++++++ cli/app.py | 5 ++++- cli/args_handler.py | 20 ++++++++++++++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/cli/__init__.py b/cli/__init__.py index e69de29..963aa22 100644 --- a/cli/__init__.py +++ b/cli/__init__.py @@ -0,0 +1 @@ +"""Package containing implementation of the application's CLI (command line interface).""" diff --git a/cli/__main__.py b/cli/__main__.py index e138e54..2884c3e 100644 --- a/cli/__main__.py +++ b/cli/__main__.py @@ -1,3 +1,10 @@ +""" +Intended entry point of the application's CLI. 
+ +Please use the following command to run the CLI: `python3 -m cli` +""" + +from . import __main__ from cli.app import main if __name__ == "__main__": diff --git a/cli/app.py b/cli/app.py index a6210cd..272011e 100644 --- a/cli/app.py +++ b/cli/app.py @@ -1,3 +1,5 @@ +"""Module containing the CLI's core logic.""" + import sys import os from datetime import datetime @@ -43,7 +45,8 @@ def main(): # Create output directory if it doesn't exist and print output output_path = os.getcwd() now = datetime.now() - output_filename = "clones_" + f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json" + output_filename = "clones_" + \ + f"{now.year}-{now.month}-{now.day}_{now.hour}-{now.minute}-{now.second}" + ".json" os.makedirs(output_path, exist_ok=True) with open(os.path.join(output_path, output_filename), "w") as output_file: output_file.write(clones.json()) diff --git a/cli/args_handler.py b/cli/args_handler.py index 4113a87..0b8f127 100644 --- a/cli/args_handler.py +++ b/cli/args_handler.py @@ -1,3 +1,5 @@ +"""Module containing functions for handling command-line arguments supplied by the user.""" + from os.path import isdir from engine.errors.user_input import UserInputError from engine.preprocessing.repoinfo import RepoInfo @@ -15,6 +17,24 @@ def repo_path_to_local_path(repo_path): + """ + Convert a repository path into a local file system path. + + This repository path is extended (compared to the repository path + used by RepoInfo) by adding support for existing local directories. + + The process typically includes checking for a local directory or + parsing a full or short repository URL and then cloning the repository. + + Arguments: + repo_path {string} -- Path of a remote repository or a local directory. + + Returns: + string -- Path of a local directory equivalent to the one + specified by the repository path. + In case of local directories, the paths are equal. 
+ + """ if isdir(repo_path): return repo_path From f29c7213d806854b7e8c5143510fa363fe35f358 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 15:12:20 +0200 Subject: [PATCH 63/71] Fix docstrings in web/ and test/ packages --- test/__init__.py | 1 + web/__init__.py | 1 + web/__main__.py | 2 ++ web/app.py | 1 + 4 files changed, 5 insertions(+) diff --git a/test/__init__.py b/test/__init__.py index e69de29..81fb5b6 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -0,0 +1 @@ +"""Package containing tests for all parts of this application (engine, interfaces).""" diff --git a/web/__init__.py b/web/__init__.py index e69de29..f022c52 100644 --- a/web/__init__.py +++ b/web/__init__.py @@ -0,0 +1 @@ +"""Package containing implementation of the web interface.""" diff --git a/web/__main__.py b/web/__main__.py index 6475d15..860be11 100644 --- a/web/__main__.py +++ b/web/__main__.py @@ -1,3 +1,5 @@ +"""Entry point of the web interface application.""" + from .app import app if __name__ == "__main__": diff --git a/web/app.py b/web/app.py index 76dc84f..ebb85d7 100644 --- a/web/app.py +++ b/web/app.py @@ -159,6 +159,7 @@ def _get_repo_analysis(repo_path): @app.route("/") def web_index(): + """Homepage of the web interface.""" content = "" repo = request.args.get("repo") From 2face4daf649aa48eb4ec654158d82e2227e46db Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 16:33:22 +0200 Subject: [PATCH 64/71] Fix linter errors and warnings With the exception of E501. It's difficult to have 79 character lines while keeping docstring title lines on a single line. 
--- cli/__main__.py | 1 - engine/algorithms/chlorine/chlorine.py | 4 ++-- engine/algorithms/iodine/pattern_clustering.py | 5 +++-- engine/nodes/nodeorigin.py | 2 +- engine/preprocessing/module_parser.py | 1 - engine/utils/__init__.py | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cli/__main__.py b/cli/__main__.py index 2884c3e..4200890 100644 --- a/cli/__main__.py +++ b/cli/__main__.py @@ -4,7 +4,6 @@ Please use the following command to run the CLI: `python3 -m cli` """ -from . import __main__ from cli.app import main if __name__ == "__main__": diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 61a86a0..6f1a752 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -49,7 +49,7 @@ def _can_be_compared(node1, node2): def _type1_compare(node1, node2): """ Compare two nodes and return the weight of their matching subtree. - + Also return a string representing their common syntax tree skeleton. Arguments: @@ -133,7 +133,7 @@ def _dict_to_result(match_dict, skeleton_weight_dict): def chlorine_single_repo(modules): """ Find all clones in a single repository given its modules. - + Clones must satisfy the settings at the top of this source file. 
Detected code clones are printed on STDOUT, including the common skeleton, path to each clones (source file path, line number, column offset), diff --git a/engine/algorithms/iodine/pattern_clustering.py b/engine/algorithms/iodine/pattern_clustering.py index 29aff24..98106d7 100644 --- a/engine/algorithms/iodine/pattern_clustering.py +++ b/engine/algorithms/iodine/pattern_clustering.py @@ -8,12 +8,13 @@ def clustering(ps): Returns: a set of clustered-patterns - """ + """ cs = [] # initialize the set for p in ps: # iterate through the patterns in the set of patterns merged = False for c in cs: # iterate through the clustered-patterns in the set of clustered-patterns - if p.skeleton_equals(c): # if the pattern and the clustered-pattern are the same shape + # if the pattern and the clustered-pattern are the same shape + if p.skeleton_equals(c): c.add_node(p) # merge labels of p into c merged = True break diff --git a/engine/nodes/nodeorigin.py b/engine/nodes/nodeorigin.py index 288182b..4f6759c 100644 --- a/engine/nodes/nodeorigin.py +++ b/engine/nodes/nodeorigin.py @@ -34,7 +34,7 @@ def __init__(self, file_path, line=None, col_offset=None): raise ValueError( "File path must always be set to a non-None value") - if line is None != col_offset is None: + if (line is None) != (col_offset is None): raise ValueError( "Either both line number and column offset must be set or neither") diff --git a/engine/preprocessing/module_parser.py b/engine/preprocessing/module_parser.py index 8e250ff..0dd26cb 100644 --- a/engine/preprocessing/module_parser.py +++ b/engine/preprocessing/module_parser.py @@ -5,7 +5,6 @@ from os.path import isdir, isfile, relpath from ..nodes.tree import TreeNode from collections import deque -from .repoinfo import clone_root_dir def _read_whole_file(file_path): diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py index 0b34439..8c02ab6 100644 --- a/engine/utils/__init__.py +++ b/engine/utils/__init__.py @@ -1 +1 @@ -"""Package containing helper 
functions that do not fit into any existing category.""" \ No newline at end of file +"""Package containing helper functions that do not fit into any existing category.""" From 01f599fb4af4745748b12e0628200a63d4695f21 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 17:45:41 +0200 Subject: [PATCH 65/71] Make UserInputError import absolute --- engine/algorithms/algorithm_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/algorithms/algorithm_runner.py b/engine/algorithms/algorithm_runner.py index a785c22..52520b2 100644 --- a/engine/algorithms/algorithm_runner.py +++ b/engine/algorithms/algorithm_runner.py @@ -3,7 +3,7 @@ from .oxygen.oxygen import oxygen from .chlorine.chlorine import chlorine_single_repo, chlorine_two_repos from .iodine.iodine import iodine -from ..errors.user_input import UserInputError +from engine.errors.user_input import UserInputError from . import OXYGEN, IODINE, CHLORINE From c1e265411de9dd87b26b67ccee41ba943184e692 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 17:46:57 +0200 Subject: [PATCH 66/71] Add GitHub/GitLab URL sanitization to RepoInfo --- engine/preprocessing/repoinfo.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/engine/preprocessing/repoinfo.py b/engine/preprocessing/repoinfo.py index 47bf8d4..c582b7c 100644 --- a/engine/preprocessing/repoinfo.py +++ b/engine/preprocessing/repoinfo.py @@ -101,7 +101,15 @@ def parse_repo_info(repo_path): scheme = parts.scheme or "https" server = parts.netloc or "github.com" - # Inserting ":@" before hostname prevents username/password prompt + server_regex = re.compile(r"^(?:www\.)?(git(?:hub|lab)\.com)$", + re.IGNORECASE) + + server_match = server_regex.fullmatch(server) + if parts.netloc and server_match: + scheme = "https" + server = server_match[1].lower() + + # Inserting ":@" before hostname prevents a username/password prompt. 
full_url = urlunparse((scheme, ":@" + server, f"/{repo_user}/{repo_name}", "", "", "")) From 892fe59fe80016351143973a99f520b5bd66942d Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Mon, 5 Aug 2019 17:49:07 +0200 Subject: [PATCH 67/71] Strip whitespace from repo URL in web app --- web/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web/app.py b/web/app.py index ebb85d7..d03eb52 100644 --- a/web/app.py +++ b/web/app.py @@ -114,7 +114,8 @@ def _find_repo_results(conn, repo_id): def _get_repo_analysis(repo_path): - repo_info = RepoInfo.parse_repo_info(repo_path) + # Strip leading and trailing whitespace from the path and parse repo info. + repo_info = RepoInfo.parse_repo_info(repo_path.strip()) if not repo_info: return "Invalid Git repository path format" From 62bf540a3ef0b0feea7e281d6936f5cdaee638a9 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 6 Aug 2019 11:21:41 +0200 Subject: [PATCH 68/71] Change "representation" class docstring wording --- engine/results/detected_clone.py | 2 +- engine/results/detection_result.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/results/detected_clone.py b/engine/results/detected_clone.py index 6279469..f3c1c47 100644 --- a/engine/results/detected_clone.py +++ b/engine/results/detected_clone.py @@ -3,7 +3,7 @@ class DetectedClone: """ - Represent a single detected code clone. + Representation of a single detected code clone. Similarity coefficient is a floating-point number between 0 and 1, where 0 means the subtrees are completely different and 1 means diff --git a/engine/results/detection_result.py b/engine/results/detection_result.py index 4778aa5..ead0aee 100644 --- a/engine/results/detection_result.py +++ b/engine/results/detection_result.py @@ -5,7 +5,7 @@ class DetectionResult: """ - Represent the final result of a detection query. + Representation of the final result of a detection query. 
For now it only contains a list of detected clones, but more information may be added in the future. From b900817a3c1aa5ca174f135ffd9c71521bab5d9e Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 6 Aug 2019 11:21:56 +0200 Subject: [PATCH 69/71] Delete iter_tools module #95 --- engine/utils/list_tools.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 engine/utils/list_tools.py diff --git a/engine/utils/list_tools.py b/engine/utils/list_tools.py deleted file mode 100644 index daea93b..0000000 --- a/engine/utils/list_tools.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Module containing helper functions for list manipulation.""" - - -def flatten(list_of_lists): - """ - Flatten a list of list into a single flat list. - - Arguments: - list_of_lists {list[list[T]]} -- List of lists to flatten. - - Returns: - list[T] -- Flat list generated by flattening the source list of lists. - - """ - flat = [] - - for l in list_of_lists: - flat.extend(l) - - return flat From 1495ef8e425ba706409367cfa006a4d08a4fbdd1 Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 6 Aug 2019 11:25:08 +0200 Subject: [PATCH 70/71] Replace flatten(); Close calebdehaan#95 --- engine/algorithms/chlorine/chlorine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 6f1a752..7dd29f4 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -1,8 +1,8 @@ """Module containing implementation of the Chlorine algorithm.""" from collections import defaultdict +from itertools import chain from ...utils.benchmark import time_snap -from ...utils.list_tools import flatten from ...results.detected_clone import DetectedClone from ...results.detection_result import DetectionResult @@ -208,7 +208,7 @@ def chlorine_two_repos(modules1, modules2): time_snap("Function started") repo1_nodes = [m[0] for m in modules1] - repo2_nodes = flatten(modules2) +
repo2_nodes = chain.from_iterable(modules2) time_snap("Module lists optimized") From c2b90f0ade44665145e0739235d21b9bda5b4d8b Mon Sep 17 00:00:00 2001 From: Ivo Meixner Date: Tue, 6 Aug 2019 12:10:46 +0200 Subject: [PATCH 71/71] Cast result of chain.from_iterable to list The chain type doesn't allow many operations and it's not possible to repeatedly iterate over it. --- engine/algorithms/chlorine/chlorine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/algorithms/chlorine/chlorine.py b/engine/algorithms/chlorine/chlorine.py index 7dd29f4..b82187e 100644 --- a/engine/algorithms/chlorine/chlorine.py +++ b/engine/algorithms/chlorine/chlorine.py @@ -208,7 +208,7 @@ def chlorine_two_repos(modules1, modules2): time_snap("Function started") repo1_nodes = [m[0] for m in modules1] - repo2_nodes = chain.from_iterable(modules2) + repo2_nodes = list(chain.from_iterable(modules2)) time_snap("Module lists optimized")