This repository has been archived by the owner on Jun 1, 2021. It is now read-only.
/
prepare-zone-file-domain-lists.sh
executable file
·124 lines (88 loc) · 2.34 KB
/
prepare-zone-file-domain-lists.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Abort as soon as a command fails.
set -e

# Positional arguments: $1 is the zone suffix, $2 is stored as the input path.
zoneSuffix="$1"
input="$2"

# UTC timestamp without colons, e.g. 2021-06-01T123456Z.
timestamp=$(date -u +%Y-%m-%dT%H%M%SZ)

# Output files are named "<timestamp>/zone.<suffix>.<label>.<timestamp>.txt".
outputDir="${timestamp}"
outputPrefix="${outputDir}/zone.${zoneSuffix}."
outputSuffix=".${timestamp}.txt"

# Upper-cased copy of the zone suffix.
ZONESUFFIX="$(echo "$zoneSuffix" | tr '[:lower:]' '[:upper:]')"

# Scratch directory named after this script. The template has no directory
# part, so it is created relative to the current directory. Removed on exit.
TEMPORARY=$(mktemp -d "$(basename "${BASH_SOURCE}").XXXXXXXX")
trap 'rm -rf "$TEMPORARY"' EXIT
# Locate a shuffle tool: prefer `shuf`, fall back to `gshuf`.
# `|| true` keeps the assignment from tripping `set -e` when neither exists,
# so the explicit check below can print a useful message instead.
shuffler="$(command -v shuf || command -v gshuf || true)"
if [[ -z "$shuffler" ]]; then
  echo "shuf/gshuf is required" 1>&2
  exit 1
fi
# Adapted from https://github.com/EtiennePerot/parcimonie.sh/blob/master/parcimonie.sh
# Probe whether `sed -r` works (GNU sed). If it does not, fall back to `-E`
# (Mac OS X sed). The resulting command prefix is used by sedExtRegexp.
case "$(printf 'abc\n' | sed -r 's/abc/def/' 2>/dev/null || true)" in
  def)
    # GNU Linux sed
    sedExec=(sed -r)
    ;;
  *)
    # Mac OS X sed
    sedExec=(sed -E)
    ;;
esac
# Run sed with extended regular expressions enabled.
# Arguments: passed through unchanged after the command prefix in `sedExec`.
sedExtRegexp() {
  local sedCommand=("${sedExec[@]}" "$@")
  "${sedCommand[@]}"
}
# Print the output path for a label: "$outputPrefix" + label + "$outputSuffix".
# Arguments: $1 - label placed between the prefix and the suffix.
filename() {
  local label="$1"
  echo "${outputPrefix}${label}${outputSuffix}"
}
# Save stdin to the output file for a label (overwriting it).
# Arguments: $1 - label, resolved to a path by filename().
write() {
  local target
  target="$(filename "$1")"
  cat > "$target"
}
# Copy stdin to stdout while also saving it to the output file for a label.
# Arguments: $1 - label, resolved to a path by filename().
T() {
  local target
  target="$(filename "$1")"
  tee "$target"
}
# Print the contents of the output file for a label.
# NOTE: this function shadows the shell's `read` builtin for the rest of the script.
# Arguments: $1 - label, resolved to a path by filename().
read() {
  local origin
  origin="$(filename "$1")"
  cat "$origin"
}
# Print the scratch path for a label: the filename() path placed under "$TEMPORARY".
# Arguments: $1 - label, resolved to a relative path by filename().
tempFilename() {
  local relativePath
  relativePath="$(filename "$1")"
  echo "${TEMPORARY}/${relativePath}"
}
# Save stdin to the scratch file for a label (overwriting it).
# Arguments: $1 - label, resolved to a path by tempFilename().
writeTemp() {
  local target
  target="$(tempFilename "$1")"
  # The scratch path contains a sub-directory that nothing else creates,
  # so make sure it exists before redirecting into it.
  mkdir -p "${target%/*}"
  cat > "$target"
}
# Copy stdin to stdout while also saving it to the scratch file for a label.
# Arguments: $1 - label, resolved to a path by tempFilename().
Ttemp() {
  local target
  # Use the scratch path (not filename()), matching writeTemp/readTemp.
  target="$(tempFilename "$1")"
  # The scratch path contains a sub-directory that nothing else creates,
  # so make sure it exists before writing into it.
  mkdir -p "${target%/*}"
  tee "$target"
}
# Print the contents of the scratch file for a label.
# Arguments: $1 - label, resolved to a path by tempFilename().
readTemp() {
  local origin
  origin="$(tempFilename "$1")"
  cat "$origin"
}
# Keep only the first 10000 lines of the input.
# Arguments: optional files passed on to `head`; reads stdin when none are given.
top10k() {
  local -r lineLimit=10000
  head -n "$lineLimit" "$@"
}
# Run the shuffle tool stored in "$shuffler".
# Arguments: passed through unchanged to the tool.
shuffle() {
  local tool="$shuffler"
  "$tool" "$@"
}
# Extract one fully-qualified, lower-cased domain per NS record from zone-file text.
#
# The sed script, in order:
#   1. drops everything from line 1 through the first line starting with "<ZONESUFFIX>.";
#   2. drops every remaining line that is not "<name> NS ...";
#   3. keeps only the first space-separated field (the name);
#   4. lower-cases A-Z using the POSIX `y` command. The GNU-only `\L` replacement
#      escape is avoided so the result is also correct with BSD/macOS sed;
#   5. appends ".<zoneSuffix>" to the name.
#
# Globals:   ZONESUFFIX (read), zoneSuffix (read)
# Arguments: optional input files passed on to sed; reads stdin when none are given.
# Outputs:   one domain per matching line on stdout (duplicates are not removed here).
extractDomains() {
  sedExtRegexp \
    -e "1,/^${ZONESUFFIX}\./ d" \
    -e '/^[^ ]+ NS / ! d' \
    -e 's/^([^ ]+) .*$/\1/' \
    -e 'y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/' \
    -e "s/\$/.${zoneSuffix}/" \
    "$@"
}
# Print a unified diff between the output files of two labels.
# Arguments: $1 - label of the "old" side, $2 - label of the "new" side.
# Returns:   the exit status of `diff`.
differ() {
  local before after
  before="$(filename "$1")"
  after="$(filename "$2")"
  diff -u "$before" "$after"
}
# Collapse adjacent duplicate lines from stdin.
unique() {
  # The input data is already grouped by domain, so plain `uniq` is used
  # instead of `sort -u`.
  uniq
}
# Create the output directory that the filename() paths live in
# ("$outputDir" holds the same value as "$timestamp").
mkdir "$outputDir"
# DEBUGGING
# Used to test the difference between `uniq` and `sort -u` on this input data.
# `uniq` is a lot faster, and the input data is grouped by domain after filtering, so `sort -u` isn't necessary.
# echo extract
# time extractDomains "$input" | write "t"
# echo uniq
# time read "t" | uniq | write "u"
# echo sort -u
# time read "t" | sort -u | write "su"
# echo uniq sort
# time read "u" | sort | write "us"
# echo differ
# time differ "su" "us" | write "d"
# echo done diffing
# echo extract+unique
# time extractDomains "$input" | unique | write "unique"
# echo shuffle+top10k
# time read "unique" | shuffle | top10k | write "random.10000"
# echo extract+unique+shuffle+top10k
# Main pipeline: extract domains, drop adjacent duplicates, save the full list
# as "unique", then shuffle it and save the first 10000 entries as "random.10000".
# The zone data is read from the file in "$input" when one was given on the
# command line; otherwise it is read from stdin as before.
if [[ -n "$input" ]]; then
  extractDomains "$input"
else
  extractDomains
fi | unique | T "unique" | shuffle | top10k | write "random.10000"