Skip to content
This repository has been archived by the owner on Jun 1, 2021. It is now read-only.

Commit

Permalink
Rename effective tld to public suffix, move before classification, a…
Browse files Browse the repository at this point in the history
…dd same primary domain as a classification for internal domains
  • Loading branch information
joelpurra committed Sep 26, 2014
1 parent 2f8922f commit 95ab2f1
Show file tree
Hide file tree
Showing 14 changed files with 51 additions and 28 deletions.
1 change: 1 addition & 0 deletions src/aggregate/analysis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def mangleShared:
"is-same-domain": .classification.isSameDomain,
"is-subdomain": .classification.isSubdomain,
"is-superdomain": .classification.isSuperdomain,
"is-same-primary-domain": .classification.isSamePrimaryDomain,
"is-internal-domain": .classification.isInternalDomain,
"is-external-domain": .classification.isExternalDomain,
"is-successful-request": .classification.isSuccessful,
Expand Down
6 changes: 4 additions & 2 deletions src/aggregate/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def mangleUrl:
.domain | {
value: .value,
"public-suffixes": (if ."public-suffixes" then (."public-suffixes" | map(.idn | fallbackString)) else null end),
"primary-domain": (if ."private-prefixes" then (."private-prefixes"[-1:][0] | fallbackString) else null end),
"primary-domain",
}
)
};
Expand Down Expand Up @@ -126,7 +126,7 @@ def distinctMangleDomain(domain):
domain as $domain
| .value |= setKeyCounterObjectCount($domain.value | fallbackString; 1)
| ."public-suffixes" |= setArrayToKeyCounterObject(($domain."public-suffixes" // []) | map(.idn | fallbackString); 1)
| ."primary-domain" |= setKeyCounterObjectCount(($domain."private-prefixes" // []) | .[-1:][0] | fallbackString; 1);
| ."primary-domain" |= setKeyCounterObjectCount($domain."primary-domain" | fallbackString; 1);
def distinctMangleUrl(url):
. as $aggregatedUrl
Expand Down Expand Up @@ -166,6 +166,7 @@ def distinctMangle:
isSameDomain: (.[0].classification.isSameDomain // false),
isSubdomain: (.[0].classification.isSubdomain // false),
isSuperdomain: (.[0].classification.isSuperdomain // false),
isSamePrimaryDomain: (.[0].classification.isSamePrimaryDomain // false),
isInternalDomain: (.[0].classification.isInternalDomain // false),
isExternalDomain: (.[0].classification.isExternalDomain // false),
isSuccessful: (.[0].classification.isSuccessful // false),
Expand Down Expand Up @@ -196,6 +197,7 @@ def distinctMangle:
.classification.isSameDomain = (.classification.isSameDomain and $request.classification.isSameDomain)
| .classification.isSubdomain = (.classification.isSubdomain and $request.classification.isSubdomain)
| .classification.isSuperdomain = (.classification.isSuperdomain and $request.classification.isSuperdomain)
| .classification.isSamePrimaryDomain = (.classification.isSamePrimaryDomain and $request.classification.isSamePrimaryDomain)
| .classification.isInternalDomain = (.classification.isInternalDomain and $request.classification.isInternalDomain)
| .classification.isExternalDomain = (.classification.isExternalDomain and $request.classification.isExternalDomain)
| .classification.isSuccessful = (.classification.isSuccessful and $request.classification.isSuccessful)
Expand Down
5 changes: 4 additions & 1 deletion src/aggregate/prepare2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def base:
isSameDomain: 0,
isSubdomain: 0,
isSuperdomain: 0,
isSamePrimaryDomain: 0,
isInternalDomain: 0,
isExternalDomain: 0,
isSuccessful: 0,
Expand Down Expand Up @@ -177,6 +178,7 @@ def mangleClassification(request):
| .classification.isSameDomain += ($request.classification.isSameDomain | boolToInt)
| .classification.isSubdomain += ($request.classification.isSubdomain | boolToInt)
| .classification.isSuperdomain += ($request.classification.isSuperdomain | boolToInt)
| .classification.isSamePrimaryDomain += ($request.classification.isSamePrimaryDomain | boolToInt)
| .classification.isInternalDomain += ($request.classification.isInternalDomain | boolToInt)
| .classification.isExternalDomain += ($request.classification.isExternalDomain | boolToInt)
| .classification.isSuccessful += ($request.classification.isSuccessful | boolToInt)
Expand Down Expand Up @@ -235,6 +237,7 @@ def distinctBase:
isSameDomain: 0,
isSubdomain: 0,
isSuperdomain: 0,
isSamePrimaryDomain: 0,
isInternalDomain: 0,
isExternalDomain: 0,
isSuccessful: 0,
Expand Down Expand Up @@ -278,7 +281,7 @@ def distinctMangleClassification(request):
request as $request
| .classification.isSameDomain += ($request.classification.isSameDomain | boolToInt)
| .classification.isSubdomain += ($request.classification.isSubdomain | boolToInt)
| .classification.isSuperdomain += ($request.classification.isSuperdomain | boolToInt)
| .classification.isSamePrimaryDomain += ($request.classification.isSamePrimaryDomain | boolToInt)
| .classification.isInternalDomain += ($request.classification.isInternalDomain | boolToInt)
| .classification.isExternalDomain += ($request.classification.isExternalDomain | boolToInt)
| .classification.isSuccessful += ($request.classification.isSuccessful | boolToInt)
Expand Down
8 changes: 7 additions & 1 deletion src/classification/basic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def isSuperdomain(domain):
| $domain
| isSubdomain($original);
def isSamePrimaryDomain(originDomain):
	# Compare the "primary-domain" of the input domain object with the
	# "primary-domain" of originDomain.
	# Input: a domain object; originDomain: a filter producing a domain object.
	# Output: true only when both objects carry the same non-null primary domain.
	# A missing/null primary domain never counts as a match; without this guard
	# null == null would be true and two unrelated domains lacking a primary
	# domain would be reported as sharing one.
	originDomain as $originDomain
	| ."primary-domain" as $primaryDomain
	| $primaryDomain != null
		and $primaryDomain == $originDomain."primary-domain";
def isSecure:
# True when the input is exactly the string "https"; any other input yields false.
. == "https";
Expand All @@ -28,12 +32,14 @@ def classifyUrl(origin):
| (if $hasDomainValue then (.domain.value | isSameDomain($origin.domain.value)) else false end) as $isSameDomain
| (if $hasDomainValue then (.domain.value | isSubdomain($origin.domain.value)) else false end) as $isSubdomain
| (if $hasDomainValue then (.domain.value | isSuperdomain($origin.domain.value)) else false end) as $isSuperdomain
| ($isSameDomain or $isSubdomain or $isSuperdomain) as $isInternalDomain
| (if $hasDomainValue then (.domain | isSamePrimaryDomain($origin.domain)) else false end) as $isSamePrimaryDomain
| ($isSameDomain or $isSubdomain or $isSuperdomain or $isSamePrimaryDomain) as $isInternalDomain
| (if (.scheme and .scheme.valid and .scheme.value) then (.scheme.value | isSecure) else false end) as $isSecure
| {
isSameDomain: $isSameDomain,
isSubdomain: $isSubdomain,
isSuperdomain: $isSuperdomain,
isSamePrimaryDomain: $isSamePrimaryDomain,
isInternalDomain: $isInternalDomain,
isExternalDomain: ($isInternalDomain | not),
isSecure: $isSecure,
Expand Down
4 changes: 2 additions & 2 deletions src/classification/disconnect/prepare-service-list.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
set -e

read -d '' classifyExpandedParts <<-'EOF' || true
read -d '' prepareServiceList <<-'EOF' || true
def toNullOrSingleValueOrArray:
if length == 0 then
# Replace an empty array with null.
Expand Down Expand Up @@ -71,4 +71,4 @@ def transformRawDisconnect:
| transformRawDisconnect
EOF

cat | jq "$classifyExpandedParts"
cat | jq "$prepareServiceList"
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env bash
set -e

effectiveTldFile="$1"
preparedPublicSuffixFile="$1"

read -d '' classifyExpandedParts <<-'EOF' || true
read -d '' addPublicSuffixes <<-'EOF' || true
def deleteNullKey(key):
# Delete a property if it is null.
key as $key
Expand Down Expand Up @@ -32,13 +32,13 @@ def deleteEmptyArrayKey(key):
)
);
def matchEffectiveTld:
# Match the domain to all possible rules/groups/public-suffixes in the effective tld list.
def matchPublicSuffix:
# Match the domain to all possible rules/groups/public-suffixes in the public suffix list.
map(
# has($subdomain) is more effective than $effectiveTld[.] // empty
# has($subdomain) is more effective than $publicSuffixLookup[.] // empty
. as $subdomain
| if $effectiveTld | has($subdomain) then
$effectiveTld[$subdomain]
| if $publicSuffixLookup | has($subdomain) then
$publicSuffixLookup[$subdomain]
else
empty
end
Expand All @@ -49,11 +49,16 @@ def getPrivatePrefix(publicSuffixes):
| ($publicSuffixes | length) as $publicSuffixesLength
| .[0:(length - $publicSuffixesLength)];
def getPrimaryDomain:
# Take the last element of the input array: slice off the final element, then unwrap it.
# An empty input array yields null.
.[-1:][0];
def mangle:
if . and .domain and .domain.components and (.domain.components | type) == "array" and (.domain.components | length) > 0 then
(.domain.components | matchEffectiveTld) as $currentSuffixes
(.domain.components | matchPublicSuffix) as $currentSuffixes
| (.domain.components | getPrivatePrefix($currentSuffixes)) as $currentPrefixes
| .domain."public-suffixes" = $currentSuffixes
| .domain."private-prefixes" = (.domain.components | getPrivatePrefix($currentSuffixes))
| .domain."private-prefixes" = $currentPrefixes
| .domain."primary-domain" = ($currentPrefixes | getPrimaryDomain)
else
.
end;
Expand All @@ -63,4 +68,4 @@ def mangle:
| .requestedUrls[].referer |= mangle
EOF

cat | jq "$classifyExpandedParts" --argfile "effectiveTld" "$effectiveTldFile"
cat | jq "$addPublicSuffixes" --argfile "publicSuffixLookup" "$preparedPublicSuffixFile"
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ read -d '' getNonFailedClassificationDomainScope <<-'EOF' || true
"is-same-domain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-same-domain",
"is-subdomain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-subdomain",
"is-superdomain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-superdomain",
"is-same-primary-domain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-same-primary-domain",
"is-internal-domain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-internal-domain",
"is-external-domain": .successfulOrigin.unfilteredUrls.requestedUrlsDistinct.coverage.classification."is-external-domain",
}
Expand All @@ -23,6 +24,7 @@ read -d '' mapData <<-'EOF' || true
"is-same-domain",
"is-subdomain",
"is-superdomain",
"is-same-primary-domain",
"is-internal-domain",
"is-external-domain",
"is-mixed-domain": (1 - ."is-internal-domain" - ."is-external-domain"),
Expand All @@ -37,9 +39,10 @@ map(
"03--Same domain": ."is-same-domain",
"04--Subdomain": ."is-subdomain",
"05--Superdomain": ."is-superdomain",
"06--Internal domain": ."is-internal-domain",
"07--External domain": ."is-external-domain",
"08--Mixed": ."is-mixed-domain",
"06--Same primary": ."is-same-primary-domain",
"07--Internal domain": ."is-internal-domain",
"08--External domain": ."is-external-domain",
"09--Mixed": ."is-mixed-domain",
}
)
EOF
Expand Down
2 changes: 1 addition & 1 deletion src/one-shot/aggregate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -e

# Prepare aggregates base
<"domains.parts.expanded.classified.disconnect.effective-tld.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../aggregate/prepare.sh" > "aggregates.base.json"
<"domains.parts.expanded.public-suffix.classified.disconnect.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../aggregate/prepare.sh" > "aggregates.base.json"

# Aggregates
<"aggregates.base.json" "${BASH_SOURCE%/*}/../aggregate/all.sh" > "aggregates.json"
Expand Down
12 changes: 6 additions & 6 deletions src/one-shot/data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ domainroot=$(cd -- "$domainroot"; echo "$PWD")
# Expand parts by splitting them up to parts
<"domains.parts.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../extract/request/expand-parts.sh" > "domains.parts.expanded.json"

# Add basic classifications
<"domains.parts.expanded.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/basic.sh" > "domains.parts.expanded.classified.json"
# Add public suffix domain grouping
<"domains.parts.expanded.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/public-suffix/add.sh" "prepared.public-suffix.json" > "domains.parts.expanded.public-suffix.json"

# Add disconnect's block matching
<"domains.parts.expanded.classified.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/disconnect/add.sh" "prepared.disconnect.services.json" > "domains.parts.expanded.classified.disconnect.json"
# Add basic classifications
<"domains.parts.expanded.public-suffix.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/basic.sh" > "domains.parts.expanded.public-suffix.classified.json"

# Add effective tld domain grouping
<"domains.parts.expanded.classified.disconnect.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/effective-tld/add.sh" "prepared.effective-tld.json" > "domains.parts.expanded.classified.disconnect.effective-tld.json"
# Add disconnect's block matching
<"domains.parts.expanded.public-suffix.classified.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../classification/disconnect/add.sh" "prepared.disconnect.services.json" > "domains.parts.expanded.public-suffix.classified.disconnect.json"
2 changes: 1 addition & 1 deletion src/one-shot/preparations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ download "https://services.disconnect.me/disconnect-plaintext.json"


download "https://publicsuffix.org/list/effective_tld_names.dat"
<"effective_tld_names.dat" "${BASH_SOURCE%/*}/../classification/effective-tld/prepare-list.sh" > "prepared.effective-tld.json"
<"effective_tld_names.dat" "${BASH_SOURCE%/*}/../classification/public-suffix/prepare-list.sh" > "prepared.public-suffix.json"

2 changes: 1 addition & 1 deletion src/one-shot/questions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -e
<"google-gtm-ga-dc.json" "${BASH_SOURCE%/*}/../questions/google-gtm-ga-dc.aggregate.sh" > "google-gtm-ga-dc.aggregate.json"

# Origin redirects
<"domains.parts.expanded.classified.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../questions/origin-redirects.sh" > "origin-redirects.json"
<"domains.parts.expanded.public-suffix.classified.json" "${BASH_SOURCE%/*}/../util/parallel-chunks.sh" "${BASH_SOURCE%/*}/../questions/origin-redirects.sh" > "origin-redirects.json"
# TODO: parallelize question aggregation?
<"origin-redirects.json" "${BASH_SOURCE%/*}/../questions/origin-redirects.aggregate.sh" > "origin-redirects.aggregate.json"

Expand Down
1 change: 1 addition & 0 deletions src/questions/origin-redirects.aggregate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def countsBase:
isSameDomain: 0,
isSubdomain: 0,
isSuperdomain: 0,
isSamePrimaryDomain: 0,
isInternalDomain: 0,
isExternalDomain: 0,
isSecure: 0,
Expand Down
2 changes: 2 additions & 0 deletions src/questions/origin-redirects.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ select(
isSameDomain: 0,
isSubdomain: 0,
isSuperdomain: 0,
isSamePrimaryDomain: 0,
isInternalDomain: 0,
isExternalDomain: 0,
isSecure: 0,
Expand All @@ -183,6 +184,7 @@ select(
| .counts.isSameDomain = redirectClassificationCount(.isSameDomain)
| .counts.isSubdomain = redirectClassificationCount(.isSubdomain)
| .counts.isSuperdomain = redirectClassificationCount(.isSuperdomain)
| .counts.isSamePrimaryDomain = redirectClassificationCount(.isSamePrimaryDomain)
| .counts.isInternalDomain = redirectClassificationCount(.isInternalDomain)
| .counts.isExternalDomain = redirectClassificationCount(.isExternalDomain)
| .counts.isSecure = redirectClassificationCount(.isSecure)
Expand Down

0 comments on commit 95ab2f1

Please sign in to comment.