Skip to content

Commit

Permalink
mk: Implement checksum script in awk.
Browse files Browse the repository at this point in the history
The runtime of the existing shell script is quadratic in the number of
distfiles that need to be handled.  Ordinarily this is not a problem,
but for some packages, for example those that use Go modules, it can
quickly become a serious bottleneck.

As an example, rewriting the script in awk reduces the runtime of "bmake
checksum" in www/grafana (during an update to 7.1.3, with over 600
distfiles) from:

  real    8m12.049s
  user    7m32.178s
  sys     0m37.830s

down to just:

  real    0m3.936s
  user    0m3.015s
  sys     0m0.759s

I've tried to preserve the output of the shell script, and support all
of its idiosyncrasies, for example "IGNORE" support.
  • Loading branch information
Jonathan Perkin committed Aug 21, 2020
1 parent afb5c01 commit a040b89
Show file tree
Hide file tree
Showing 2 changed files with 271 additions and 4 deletions.
267 changes: 267 additions & 0 deletions mk/checksum/checksum.awk
@@ -0,0 +1,267 @@
#!/usr/bin/awk -f
#
# $NetBSD$
#
###########################################################################
#
# NAME
# checksum.awk -- checksum files
#
# SYNOPSIS
# checksum.awk [options] distinfo [file ...]
#
# DESCRIPTION
# checksum will verify the checksums in the distinfo file for each
# of the files specified.
#
# The checksum utility exits with one of the following values:
#
# 0 All of the file checksums verify.
#
# 1 At least one of the file checksums did not match.
#
# 2 At least one of the files is missing any checksum.
#
# >2 An error occurred.
#
# OPTIONS
# -a algorithm Only verify checksums for the specified algorithm.
#
# -p The specified files are patches, so strip out any
# lines containing NetBSD RCS ID tags before
# computing the checksums for verification.
#
# -s suffix Strip the specified suffix from the file names
# when searching for the checksum.
#
#
# BUGS
# The control flow of this program is not optimal (the distinfo file is
# parsed twice), as it was deemed important to retain output compatibility
# with the previous shell script implementation.
#

#
# Main program.  Everything happens in BEGIN: parse the command line,
# compute the checksums of the requested files with digest(1), and compare
# them against the values recorded in the distinfo file.
#
# Environment:
#	DIGEST	digest(1) command to run (default "digest").
#	SED	sed(1) command to run (default "sed").
#
# Exit status:
#	0	all checksums verified
#	1	a checksum did not match
#	2	at least one file has no recorded checksum
#	128	usage error, or distinfo missing/unreadable
#
BEGIN {
	DIGEST = ENVIRON["DIGEST"] ? ENVIRON["DIGEST"] : "digest"
	SED = ENVIRON["SED"] ? ENVIRON["SED"] : "sed"

	# Retain output compatible with previous "checksum" shell script
	self = "checksum"

	a_flag = ""
	distinfo = ""
	exitcode = 0
	patch = 0
	suffix = ""

	for (arg = 1; arg < ARGC; arg++) {
		opt = ARGV[arg]
		if (opt == "-a") {
			a_flag = ARGV[++arg]
		} else if (opt == "-p") {
			patch = 1
		} else if (opt == "-s") {
			suffix = ARGV[++arg]
		} else if (opt == "--") {
			arg++
			break
		} else if (match(opt, /^-.*/) != 0) {
			opt = substr(opt, RSTART + 1, RLENGTH)
			print self ": unknown option -- " opt > "/dev/stderr"
			usage()
			# Usage errors must not be confused with exit code 1,
			# which is reserved for "checksum mismatch".
			exit 128
		} else {
			break
		}
	}

	#
	# After option parsing "arg" indexes the distinfo argument.  It is
	# missing when arg has reached ARGC (it can never exceed it by more
	# than the argument consumed by a trailing -a/-s).
	#
	if (arg >= ARGC) {
		print self ": missing distinfo" > "/dev/stderr"
		usage()
		exit 128
	}

	distinfo = ARGV[arg++]
	# Quote the path: an empty or whitespace-containing name would
	# otherwise turn into the one-argument form of test(1), which
	# always succeeds.
	cmd = "test -f " shquote(distinfo)
	if (system(cmd) != 0) {
		print self ": " distinfo " not found" > "/dev/stderr"
		usage()
		exit 128
	}

	#
	# Initialise list of files to check, passed on the command line.  In
	# order to keep things simple, distfiles[] is also used when operating
	# in patch mode (-p).
	#
	while (arg < ARGC) {
		distfile = ARGV[arg++]
		# The suffix is a literal string, not a regular expression.
		sfile = strip_suffix(distfile, suffix)
		if (patch) {
			gsub(/.*\//, "", sfile)
		}

		#
		# Have we seen this file in distinfo?  Used later to verify
		# that all checksums have been recorded.
		#
		seen[sfile] = 0

		#
		# Store the filename to be checked in the distinfo file.  The
		# -s flag allows temporary download files to be tested instead,
		# where the suffix will be stripped to match distinfo.
		#
		distfiles[sfile] = distfile

		#
		# Reverse mapping, from the name handed to digest(1) back to
		# the name used in distinfo.  digest(1) echoes the name it was
		# given, which still carries the -s suffix.
		#
		distnames[distfile] = sfile
	}

	#
	# Parse the distinfo file for checksums that must be verified.  We're
	# only interested in lines of the format:
	#
	#	algorithm (distfile) = checksum
	#
	# getline returns -1 on a read error, so compare against 0 explicitly
	# or an unreadable file would loop forever.
	#
	while ((rc = (getline < distinfo)) > 0) {
		if (NF != 4) {
			continue
		}
		# Skip comments, the RCS id line and "Size" entries.
		if ($0 ~ /^([#$]|Size)/) {
			continue
		}

		algorithm = $1
		distfile = substr($2, 2, (length($2) - 2))	# strip ()
		checksum = $4

		# Skip IGNORE lines (likely legacy at this point).
		if (checksum == "IGNORE") {
			continue
		}

		# If -a is set then skip non-matching algorithms.
		if (a_flag && tolower(algorithm) != tolower(a_flag)) {
			continue
		}

		# Skip if file not in distfiles.
		if (!(distfile in distfiles)) {
			continue
		}

		#
		# Handle patch files inline.  As they need to be modified (by
		# removing the $NetBSD$) they are parsed individually by
		# digest(1), and so we calculate the checksums now rather than
		# saving for later processing to simplify things.
		#
		if (patch) {
			patchfile = distfiles[distfile]
			cmd = SED " -e '/[$]NetBSD.*/d' " shquote(patchfile) \
			    " | " DIGEST " " shquote(algorithm)
			while ((cmd | getline) > 0) {
				checksums[algorithm, distfile] = $1
			}
			close(cmd)
			continue
		}

		#
		# If not a patch file, then we're handling a distfile, where we
		# want to build a list of input files to digest(1) so they can
		# all be calculated in one go.
		#
		distsums[algorithm] = distsums[algorithm] " " \
		    shquote(distfiles[distfile])
	}
	close(distinfo)
	if (rc < 0) {
		print self ": cannot read " distinfo > "/dev/stderr"
		exit 128
	}

	#
	# We now have a list of distfiles to be checked for each algorithm,
	# pass them all to a single digest(1) command and parse the checksums
	# to be compared against distinfo.
	#
	for (algorithm in distsums) {
		cmd = DIGEST " " shquote(algorithm) " " distsums[algorithm]
		while ((cmd | getline) > 0) {
			# Should be unnecessary, but just in case.  If we want
			# to be really paranoid then test that $1 == algorithm.
			if (NF != 4) {
				continue
			}
			distfile = substr($2, 2, length($2) - 2)
			# Translate the on-disk name back to its distinfo name
			# so that the lookup in the verification pass matches.
			if (distfile in distnames) {
				distfile = distnames[distfile]
			}
			checksums[$1, distfile] = $4
		}
		close(cmd)
	}

	#
	# Now that we have computed all the necessary checksums for all of the
	# files listed on the command line, go back through distinfo and verify
	# that they all match.
	#
	while ((getline < distinfo) > 0) {
		if (NF != 4) {
			continue
		}
		if ($0 ~ /^([#$]|Size)/) {
			continue
		}

		algorithm = $1
		distfile = substr($2, 2, (length($2) - 2))	# strip ()
		checksum = $4

		# If -a is set then skip non-matching algorithms.
		if (a_flag && tolower(algorithm) != tolower(a_flag)) {
			continue
		}

		# Skip if file not in distfiles.
		if (!(distfile in distfiles)) {
			continue
		}

		# This is likely very legacy at this point.
		if (checksum == "IGNORE") {
			print self ": Ignoring checksum for " distfile
			continue
		}

		if (checksums[algorithm,distfile] == checksum) {
			print "=> Checksum " algorithm " OK for " distfile
			seen[distfile] = 1
		} else {
			print self ": Checksum " algorithm " mismatch for " distfile >"/dev/stderr"
			exit 1
		}
	}
	close(distinfo)

	#
	# Check that all distfiles supplied on the command line have at least
	# one matching checksum.
	#
	for (distfile in distfiles) {
		if (seen[distfile] == 0) {
			if (a_flag) {
				print self ": No " a_flag \
				    " checksum recorded for " distfile \
				    > "/dev/stderr"
			} else {
				print self ": No checksum recorded for " \
				    distfile > "/dev/stderr"
			}
			exitcode = 2
		}
	}

	exit(exitcode)
}

#
# Return "name" with the literal string "sfx" removed from its end.  The
# name is returned unchanged when sfx is empty, does not match, or would
# leave an empty name.
#
function strip_suffix(name, sfx,    keep) {
	keep = length(name) - length(sfx)
	if (sfx != "" && keep > 0 && substr(name, keep + 1) == sfx) {
		return substr(name, 1, keep)
	}
	return name
}

#
# Return "str" quoted as a single sh(1) word: wrapped in single quotes,
# with every embedded single quote rewritten as '\''.
#
function shquote(str,    n, i, parts, out) {
	n = split(str, parts, "'")
	out = "'" parts[1]
	for (i = 2; i <= n; i++) {
		out = out "'\\''" parts[i]
	}
	return out "'"
}

#
# Write the one-line command synopsis to standard error.  Reads the
# global "self" for the program name.
#
function usage() {
	printf("usage: %s -- [-a algorithm] [-p] [-s suffix] distinfo [file ...]\n", self) > "/dev/stderr"
}
8 changes: 4 additions & 4 deletions mk/checksum/checksum.mk
Expand Up @@ -27,10 +27,10 @@ _PATCH_DIGEST_ALGORITHMS?= SHA1
_COOKIE.checksum= ${_COOKIE.extract}

_CHECKSUM_CMD= \
${PKGSRC_SETENV} DIGEST=${TOOLS_DIGEST:Q} CAT=${TOOLS_CAT:Q} \
ECHO=${TOOLS_ECHO:Q} SED=${TOOLS_CMDLINE_SED:Q} \
TEST=${TOOLS_TEST:Q} \
${SH} ${PKGSRCDIR}/mk/checksum/checksum \
${PKGSRC_SETENV} \
DIGEST=${TOOLS_DIGEST:Q} SED=${TOOLS_CMDLINE_SED:Q} \
${AWK} -f ${PKGSRCDIR}/mk/checksum/checksum.awk --


.if defined(NO_CHECKSUM) || empty(_CKSUMFILES)
checksum checksum-phase:
Expand Down

0 comments on commit a040b89

Please sign in to comment.