-
Notifications
You must be signed in to change notification settings - Fork 20
/
pasta_analyse.py
291 lines (231 loc) · 11 KB
/
pasta_analyse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""
PaStA - Patch Stack Analysis
Copyright (c) OTH Regensburg, 2016-2020
Author:
Ralf Ramsauer <ralf.ramsauer@oth-regensburg.de>
This work is licensed under the terms of the GNU GPL, version 2. See
the COPYING file in the top-level directory.
"""
import os
import re
import sys
from functools import partial
from logging import getLogger
from multiprocessing import cpu_count, Pool
from time import sleep
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from pypasta import *
log = getLogger(__name__[-15:])
_repo = None
def _evaluate_patch_list_wrapper(thresholds, args):
    """Pool worker: evaluate one (original, candidate) commit-hash pair.

    Reads the repository from the module-global _repo, which the parent
    process assigns before spawning the pool, so the Repository object
    does not have to be pickled for every task.
    """
    global _repo
    original_hashes, candidate_hashes = args
    return evaluate_commit_list(_repo, thresholds, False,
                                EvaluationType.PatchStack,
                                original_hashes, candidate_hashes,
                                parallelise=False)
def find_cherries(repo, commit_hashes, dest_list):
    """
    find_cherries() takes a list of commit hashes, a list of potential
    candidates and the type of the evaluation (PatchStack / Upstream) and tries
    to detect if one commit is the cherry pick of another.
    Cherry picks can happen everywhere: picks across patch stacks, or picks from
    upstream. We have to distinguish between those types.
    :param repo: Repository
    :param commit_hashes: list of commit-hashes
    :param dest_list: list of potential cherry-pick hashes
    :return: EvaluationResult containing all detected cherry picks
    """
    log.info('Auto-detecting cherry-picks')
    cherries = EvaluationResult()

    # Lines that typically announce a cherry-pick in a commit message.
    pick_hint = re.compile(
        r'(.*pick.*)|(.*upstream.*commit.*)|(.*commit.*upstream.*)',
        re.IGNORECASE)
    # A (possibly abbreviated) SHA1: 5 to 40 hex digits.
    sha1_pattern = re.compile(r'\b([0-9a-fA-F]{5,40})\b')

    for origin in commit_hashes:
        for message_line in repo[origin].message:
            if not pick_hint.match(message_line):
                continue

            found = sha1_pattern.search(message_line)
            if found is None:
                continue

            picked = found.group(1)
            if picked not in dest_list:
                log.info('Found cherry-pick %s <-> %s but it is not a '
                         'valid reference in this context'
                         % (origin, picked))
                continue

            # Cherry-picks are trivially equivalent: perfect similarity.
            entry = (picked, SimRating(1.0, 1.0, 1.0))
            if origin in cherries:
                cherries[origin].append(entry)
            else:
                cherries[origin] = [entry]

    log.info(' ↪ done. Found %d cherry-picks' % len(cherries))
    return cherries
def remove_from_cluster(message, cluster, ids):
    """Remove unreachable elements from the cluster and compact it.

    Warns loudly and sleeps five seconds so the user has a chance to
    hit Ctrl-C before the (destructive) removal starts.

    :param message: element kind for the warning text (e.g. 'MESSAGES')
    :param cluster: cluster to prune
    :param ids: iterable of element ids to drop
    """
    # Lazy %-style args: logging formats only if the record is emitted.
    log.warning('PATCH-GROUPS CONTAINS %d %s THAT ARE NOT '
                'REACHABLE BY THE CURRENT CONFIGURATION', len(ids), message)
    log.warning('Those messages will be removed from the result')
    log.warning('Waiting 5 seconds before starting. Press Ctrl-C to '
                'abort.')
    sleep(5)

    for stale_id in ids:
        cluster.remove_element(stale_id)
    cluster.optimize()
def analyse(config, argv):
    """Entry point of the 'analyse' sub-command.

    Parses thresholds and the analysis mode from argv, then compares
    patches against each other ('succ': successive stack versions,
    'rep': representative system) or against upstream ('upstream'),
    and writes the resulting EvaluationResult to disk.

    Fixes over the previous revision: `is None` instead of `== None`,
    and locals renamed so they no longer shadow the builtins `hash`
    and `type`.

    :param config: PaStA Config instance
    :param argv: remaining command line arguments
    :return: -1 if the mode is unavailable in the current configuration,
             None otherwise
    """
    parser = argparse.ArgumentParser(prog='analyse',
                                     description='Analyse patch stacks')

    # thresholds
    parser.add_argument('-th', dest='thres_heading', metavar='threshold',
                        default=config.thresholds.heading, type=float,
                        help='Minimum diff hunk section heading similarity '
                             '(default: %(default)s)')
    parser.add_argument('-tf', dest='thres_filename', metavar='threshold',
                        default=config.thresholds.filename, type=float,
                        help='Minimum filename similarity '
                             '(default: %(default)s)')
    parser.add_argument('-dlr', dest='thres_diff_lines', metavar='threshold',
                        type=float, default=config.thresholds.diff_lines_ratio,
                        help='Diff lines ratio threshold (default: %(default)s)')
    parser.add_argument('-adi', dest='thres_adi', metavar='days', type=int,
                        default=config.thresholds.author_date_interval,
                        help='Author date interval (default: %(default)s)')
    parser.add_argument('-cpu', dest='cpu_factor', metavar='cpu', type=float,
                        default=1.0, help='CPU factor for parallelisation '
                                          '(default: %(default)s)')

    # choose analysis mode
    parser.add_argument('mode', default='succ',
                        choices=['succ', 'rep', 'upstream'],
                        help='rep: '
                             'compare representatives of the stack - '
                             'succ: '
                             'compare successive versions of the stacks - '
                             'upstream: '
                             'compare representatives against upstream - '
                             '(default: %(default)s)')

    args = parser.parse_args(argv)

    # Propagate any command-line threshold overrides back into the config.
    config.thresholds.heading = args.thres_heading
    config.thresholds.filename = args.thres_filename
    config.thresholds.diff_lines_ratio = args.thres_diff_lines
    config.thresholds.author_date_interval = args.thres_adi

    repo = config.repo
    mbox = config.mode == Config.Mode.MBOX
    mode = args.mode

    if mbox and mode == 'succ':
        log.error('Analysis mode succ is not available in mailbox mode!')
        return -1

    f_cluster, cluster = config.load_cluster(must_exist=False)

    def fill_result(hashes, tag):
        # Insert all hashes into the cluster; tag marks them as upstream.
        # Renamed loop variable: 'hash' shadowed the builtin.
        for commit_hash in hashes:
            cluster.insert_element(commit_hash)
            if tag:
                cluster.mark_upstream(commit_hash, True)

        # intermediate persistence
        cluster.to_file(f_cluster)

    if mbox:
        log.info('Regarding mails in time window %s--%s' %
                 (format_date_ymd(config.mbox_mindate),
                  format_date_ymd(config.mbox_maxdate)))

        # load mbox ccache very early, because we need it in any case if it
        # exists.
        config.load_ccache_mbox()

        # NOTE(review): the block below must stay inside the 'rep' branch —
        # 'victims' is only defined here, and the 'upstream' mode works off
        # the previously persisted cluster instead.
        if mode == 'rep':
            victims = repo.mbox.get_ids(config.mbox_time_window)

            # we have to temporarily cache those commits to filter out invalid
            # emails. Commit cache is already loaded, so evict everything
            # except victims and then cache all victims.
            repo.cache_evict_except(victims)
            repo.cache_commits(victims)

            # we might have loaded invalid emails, so reload the victim list
            # once more. This time, include all patches from the pre-existing
            # (partial) result, and check if all patches are reachable
            victims |= cluster.get_downstream()

            # in case of an mbox analysis, we will definitely need all
            # untagged commit hashes as we need to determine the
            # representative system for both modes, rep and upstream.
            available = repo.cache_commits(victims)
            unreachable = victims - available
            if unreachable:
                remove_from_cluster('MESSAGES', cluster, unreachable)
                victims = available

            log.info('Cached %d relevant mails' % len(available))
            fill_result(victims, False)

    cherries = EvaluationResult()

    if mode == 'succ':
        victims = config.psd.commits_on_stacks
        fill_result(victims, False)
        num_cpus = int(cpu_count() * args.cpu_factor)

        psd = config.psd
        # Publish the repository through the module-global so pool workers
        # inherit it via fork instead of pickling it per task.
        global _repo
        _repo = repo

        config.load_ccache_stack()

        evaluation_list = []
        for patch_stack in psd:
            successor = psd.get_successor(patch_stack)
            # was '== None'; identity comparison is the correct idiom
            if successor is None:
                break

            log.info('Queueing %s <-> %s' % (patch_stack.stack_version,
                                             successor.stack_version))
            evaluation_list.append((patch_stack.commit_hashes,
                                    successor.commit_hashes))

        # cache missing commits
        repo.cache_commits(psd.commits_on_stacks)

        cherries = find_cherries(repo,
                                 psd.commits_on_stacks, psd.commits_on_stacks)

        f = partial(_evaluate_patch_list_wrapper, config.thresholds)
        log.info('Starting evaluation.')
        # close()/join() (not a with-block): Pool.__exit__ would terminate().
        pool = Pool(num_cpus, maxtasksperchild=1)
        results = pool.map(f, evaluation_list, chunksize=5)
        pool.close()
        pool.join()
        log.info(' ↪ done.')
        _repo = None

        evaluation_result = EvaluationResult(False, EvaluationType.PatchStack)
        for result in results:
            evaluation_result.merge(result)

    else:  # mode is rep or upstream
        # iterate over similar patch list and get latest commit of patches
        log.info('Determining patch stack representative system')

        # Get the complete representative system
        # The lambda compares two patches of an equivalence class and chooses
        # the one with the later release version
        if mbox:
            representatives = cluster.get_representative_system(
                lambda x, y:
                repo.get_commit(x).author.date >
                repo.get_commit(y).author.date)
        else:
            representatives = cluster.get_representative_system(
                lambda x, y: config.psd.is_stack_version_greater(
                    config.psd.get_stack_of_commit(x),
                    config.psd.get_stack_of_commit(y)))
        log.info(' ↪ done')

        if mode == 'upstream':
            candidates = set(config.upstream_hashes)

            # Drop upstream hashes that are no longer reachable with the
            # current configuration before evaluating.
            unreachable = cluster.get_upstream() - candidates
            if unreachable:
                remove_from_cluster('COMMITS', cluster, unreachable)

            fill_result(candidates, True)

            config.load_ccache_upstream()

            # cache missing commits
            repo.cache_commits(representatives | candidates)
            repo.cache_evict_except(representatives | candidates)

            cherries = find_cherries(repo, representatives, candidates)
            # renamed from 'type': shadowed the builtin
            eval_type = EvaluationType.Upstream
        elif mode == 'rep':
            repo.cache_commits(representatives)
            candidates = representatives

            # In mbox mode there is no patch-stack definition to pick
            # cherry-pick candidates from.
            if not mbox:
                cherries = find_cherries(repo, representatives,
                                         config.psd.commits_on_stacks)

            eval_type = EvaluationType.PatchStack

        log.info('Starting evaluation')
        evaluation_result = evaluate_commit_list(repo, config.thresholds,
                                                 mbox, eval_type,
                                                 representatives, candidates,
                                                 parallelise=True, verbose=True,
                                                 cpu_factor=args.cpu_factor)
        log.info(' ↪ done.')

    evaluation_result.merge(cherries)
    evaluation_result.to_file(config.f_evaluation_result)