# Init script

Initializes the scan analysis: selects the eligible scans, runs the analysis for new scan groups and updates the website data.

In [None]:
import os
import re
import json
import datetime
import subprocess
import shutil
import sys

In [None]:
# Version of this init script; stored as 'init-version' in the metadata of every scan group
VERSION = 2
# Folders that contain the scanner output, one sub-folder per scan
SRC_V4 = '/mnt/turbodiesel/ssl/ssl-goscannerv4-zonefiles'
SRC_V6 = '/mnt/turbodiesel/ssl/ssl-goscannerv6-zonefiles'
# JSON file caching the eligibility verdict of every scan folder that was already checked
SCAN_ELIGIBILITY_FILE = '/mnt/turbodiesel/hsts/analysis/scan-eligibility.json'
# Destination of finished analysis results, one sub-folder per scan group
ANALYSIS_RESULTS = '/mnt/turbodiesel/hsts/results'
# Directory passed to analysis.py via '-t'; its contents are deleted around every analysis run
ANALYSIS_TMP_FILES = '/root/analysis'
# Local clone of the website repository that receives the chart data
GIT_REPO = '/root/hstsadoption.github.io'
# Names of the scan files inside a scan folder; they may also exist with EXT_COMPRESSION appended
HOSTS_FILE = 'hosts.csv'
HTTP_FILE = 'http.csv'
EXT_COMPRESSION = '.xz'

# Minimum size (as reported by os.path.getsize) a scan file must have to be eligible
MIN_SIZE = 200
# Maximum count per error message that count_occurences.sh may report for an eligible scan
MAX_ERRORS = 10
# Maximum time between the scans of one group; also how long an incomplete group is held back
MAX_SCAN_DISTANCE = datetime.timedelta(days=5)
# Raw strings keep '\d' a regex escape instead of an invalid Python string escape
scanFolderRe = re.compile(r'^ssl-goscannerv(?:4|6)-zonefiles-(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})-(?P<hour>\d{2})(?P<minute>\d{2})-output$')
resultFolderRe = re.compile(r'^hsts-analysis-scan-(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})-(?P<hour>\d{2})(?P<minute>\d{2})-results$')

# Fixed command line arguments for analysis.py
# NOTE(review): the meaning of '-c' and '-d' is defined by analysis.py - confirm there
ANALYSIS_ARGS = ['-c', '6000000', '-d', '-t', ANALYSIS_TMP_FILES]
# Files copied from every data set folder of a finished analysis
RESULT_FILES = ['meta.json', 'results.json']

In [None]:
def listRelevantFolders(srcDir, ipVersion):
    """Collect the scan folders located directly inside srcDir.

    An entry is kept when it is a directory and its name matches
    scanFolderRe. Every kept entry is described by a dict with the keys
    'name' (folder name), 'path' (folder joined with srcDir), 'date'
    (datetime built from the date and time in the folder name) and
    'ipVersion' (the value passed in by the caller).
    """

    found = []
    for entry in os.listdir(srcDir):
        entryPath = os.path.join(srcDir, entry)

        # Only directories are considered; the name is matched afterwards
        nameMatch = os.path.isdir(entryPath) and scanFolderRe.match(entry)
        if not nameMatch:
            continue

        # The named groups are exactly year, month, day, hour and minute
        stamp = {part: int(text) for part, text in nameMatch.groupdict().items()}
        found.append({
            'name': entry,
            'path': entryPath,
            'date': datetime.datetime(**stamp),
            'ipVersion': ipVersion
        })

    return found

In [None]:
def checkEligibility(scanMeta):
    """Decide whether the files of a scan are usable for the analysis.

    The hosts file and the http file must each exist, either plain or with
    EXT_COMPRESSION appended (the plain file wins), and must be at least
    MIN_SIZE in size. In addition count_occurences.sh must not report more
    than MAX_ERRORS occurrences of the known scanner error messages for the
    hosts file.

    Returns a tuple (eligible, reason); reason is None for an eligible scan.
    For an eligible scan the path of the http file is stored in
    scanMeta['http-file'].

    Raises subprocess.CalledProcessError if count_occurences.sh fails and
    ValueError if its output is not an integer.
    """

    print("Checking eligibility of %s" % scanMeta['path'])

    # Locate hosts and http file (in this order) and check their size
    located = []
    for fileName in (HOSTS_FILE, HTTP_FILE):
        plainPath = os.path.join(scanMeta['path'], fileName)
        for candidate in (plainPath, plainPath + EXT_COMPRESSION):
            if os.path.isfile(candidate):
                break
        else:
            # Neither the plain nor the compressed file exists
            return (False, "%s: missing" % fileName)

        if os.path.getsize(candidate) < MIN_SIZE:
            return (False, "%s: file size < %d" % (fileName, MIN_SIZE))
        located.append(candidate)
    hostsFile, httpFile = located

    # Count the known scanner errors in the hosts file
    for message in ("too many open files", "address already in use"):
        finished = subprocess.run(
            ['./count_occurences.sh', message, hostsFile],
            check=True,
            stdout=subprocess.PIPE
        )
        if int(finished.stdout.decode('utf-8')) > MAX_ERRORS:
            return (False, "%s: error '%s' > %d" % (HOSTS_FILE, message, MAX_ERRORS))

    # Scan is eligible
    scanMeta['http-file'] = httpFile
    return (True, None)

In [None]:
def filterEligibleScans(scanMeta, scanSetId):
    """Return the scans of scanMeta that are eligible for the analysis.

    scanMeta is a list of scan dicts (each with at least a 'name' key) and
    scanSetId the key of the scan set inside the module level
    scanEligibility cache. Scans missing from the cache are checked with
    checkEligibility; the verdict (plus the reason for a rejection) is
    added to the cache, which is saved right after every new check.
    """

    # A cache loaded from disk may not contain this scan set yet
    known = scanEligibility.setdefault(scanSetId, {})

    eligibleScans = []
    for sMeta in scanMeta:
        name = sMeta['name']
        if name not in known:
            # Scan has never been seen before
            eligible, reason = checkEligibility(sMeta)

            # Set eligibility
            eligibility = {
                'eligible': eligible
            }
            if reason:
                eligibility['reason'] = reason
            known[name] = eligibility

            # Save eligibility
            saveEligibility()

        # Verdict is available now, either cached or freshly computed
        if known[name]['eligible']:
            eligibleScans.append(sMeta)

    return eligibleScans

In [None]:
def saveEligibility():
    """Write the module level scanEligibility cache to SCAN_ELIGIBILITY_FILE as JSON."""

    print("Saving eligibility data")
    with open(SCAN_ELIGIBILITY_FILE, 'w') as eligibilityFile:
        eligibilityFile.write(json.dumps(scanEligibility, indent=2, sort_keys=True))

In [None]:
def saveAnalysisResults(group):
    """Copy the results of a finished analysis into the results folder.

    Creates ANALYSIS_RESULTS/<group id>, writes the group metadata to
    'group-meta.json' and copies the analysis log as well as RESULT_FILES
    of every data set (one 'ipv<N>' per scan of the group plus 'merged')
    from ANALYSIS_TMP_FILES.

    A missing log file is only reported. All result files are attempted
    before an OSError is raised if at least one of them could not be copied.
    """

    print("Copying analysis results")

    # Create results dir
    resultsPath = os.path.join(ANALYSIS_RESULTS, group['id'])
    os.makedirs(resultsPath, exist_ok=True)

    # Save group meta; default=str serializes the datetime objects of the scans
    with open(os.path.join(resultsPath, 'group-meta.json'), 'w') as file:
        json.dump(group, file, indent=2, default=str, sort_keys=True)

    # Copy log (best effort). Only file system errors are tolerated so that
    # KeyboardInterrupt and programming errors are not swallowed.
    try:
        shutil.copy(os.path.join(ANALYSIS_TMP_FILES, 'hsts-analysis.log'), resultsPath)
    except OSError:
        print("Failed to copy log file!")

    copyFailed = False
    dataSets = ["ipv%d" % scanMeta['ipVersion'] for scanMeta in group['scans']] + ['merged']
    for dataSet in dataSets:
        dsResultDir = os.path.join(resultsPath, dataSet)
        dsAnalysisDir = os.path.join(ANALYSIS_TMP_FILES, dataSet)

        os.makedirs(dsResultDir, exist_ok=True)

        for f in RESULT_FILES:
            try:
                shutil.copy(os.path.join(dsAnalysisDir, f), dsResultDir)
            except OSError:
                print("Failed to copy %s from %s to %s" %(f, dsAnalysisDir, dsResultDir))
                copyFailed = True

    if copyFailed:
        raise OSError("Failed to copy analysis results")

In [None]:
def cleanAnalysisData():
    """Delete every file and folder directly inside ANALYSIS_TMP_FILES."""

    print("Cleaning temporary analysis files")
    for entry in os.listdir(ANALYSIS_TMP_FILES):
        target = os.path.join(ANALYSIS_TMP_FILES, entry)
        if os.path.isfile(target):
            os.remove(target)
            continue
        if os.path.isdir(target):
            # Folders are removed including their contents
            shutil.rmtree(target)

## Scan files

In [None]:
# Discover the scan folders of the IPv4 and the IPv6 scanner
print("Loading scan meta data")
scanMeta4, scanMeta6 = [
    listRelevantFolders(srcDir, ipVersion)
    for srcDir, ipVersion in ((SRC_V4, 4), (SRC_V6, 6))
]

In [None]:
# Load the cached eligibility verdicts of previous runs
print("Loading scan eligibility data")

# Start with an empty entry per scan set and merge the stored data into it,
# so both scan sets exist even if the file on disk does not contain them
scanEligibility = {SRC_V4: {}, SRC_V6: {}}

if os.path.isfile(SCAN_ELIGIBILITY_FILE):
    with open(SCAN_ELIGIBILITY_FILE, 'r') as isf:
        scanEligibility.update(json.load(isf))

In [None]:
# Keep only the scans that passed the eligibility check
print("Filtering eligible scans")
eligibleScans4 = filterEligibleScans(scanMeta4, SRC_V4)
eligibleScans6 = filterEligibleScans(scanMeta6, SRC_V6)

# The unfiltered lists are not needed anymore
del scanMeta4, scanMeta6

# Persist the eligibility data once more after both scan sets are processed
saveEligibility()

In [None]:
# Group IPv4 and IPv6 scans
print("Grouping scans")

# Merge scan meta data of both IP versions and sort it by scan date
eligibleScans = sorted(eligibleScans4 + eligibleScans6, key=lambda scanMeta: scanMeta['date'])

del eligibleScans4
del eligibleScans6

# Group scans: walk through the scans in chronological order and add a scan
# to the current group as long as it fits, otherwise start a new group
scanGroups = []
curGroup = None
for scanMeta in eligibleScans:
    if curGroup is not None:
        groupVersions = [s['ipVersion'] for s in curGroup['scans']]
        closeToGroup = any(scanMeta['date'] - s['date'] < MAX_SCAN_DISTANCE for s in curGroup['scans'])

        # The scan fits if no other scan of the same ip version is in the current group
        # and the time distance to any scan of the group is less than MAX_SCAN_DISTANCE
        if scanMeta['ipVersion'] not in groupVersions and closeToGroup:
            curGroup['scans'].append(scanMeta)
            continue

        # Finish group
        scanGroups.append(curGroup)

    # Start new group; the id is derived from the date of its first scan
    curGroup = {
        'scans': [scanMeta],
        'id': "hsts-analysis-scan-%s-results" % scanMeta['date'].strftime('%Y-%m-%d-%H%M'),
        'init-version': VERSION
    }

# Add last group. Without any eligible scan there is no group at all and
# appending None would break the following steps.
if curGroup is not None:
    scanGroups.append(curGroup)
curGroup = None

del eligibleScans

In [None]:
# Select the scan groups that still have to be analyzed
print("Checking analysis results for scan groups")

now = datetime.datetime.now()
analysisGroups = []
for group in scanGroups:
    # Do not analyze groups that might get more scans in the near future:
    # the group is incomplete and at least one of its scans is less than
    # MAX_SCAN_DISTANCE in the past
    incomplete = len(group['scans']) < 2
    if incomplete and any(now - s['date'] < MAX_SCAN_DISTANCE for s in group['scans']):
        print("Skipping group '%s' for now as more scans might be added in the future" % group['id'])
        continue

    # Groups with an existing results folder have been analyzed before
    # TODO Check script version
    resultsPath = os.path.join(ANALYSIS_RESULTS, group['id'])
    if not os.path.isdir(resultsPath):
        analysisGroups.append(group)

del scanGroups

In [None]:
# Run the analysis for every group without results
print("Analyzing new groups (%d)" % len(analysisGroups))

# Start with an empty temporary directory
if analysisGroups:
    cleanAnalysisData()

for group in analysisGroups:
    print("Analyzing group: %s (%d scans)" % (group['id'], len(group['scans'])))

    # Scans whose eligibility was cached do not carry the http file path yet
    for scanMeta in group['scans']:
        if 'http-file' in scanMeta:
            continue

        # Prefer the plain file over the compressed one
        plainFile = os.path.join(scanMeta['path'], HTTP_FILE)
        for httpFile in (plainFile, plainFile + EXT_COMPRESSION):
            if os.path.isfile(httpFile):
                scanMeta['http-file'] = httpFile
                break
        else:
            raise FileNotFoundError("Could not find http file in %s" % scanMeta['path'])

    # One '<http file>:ipv<N>:<N>' argument per scan of the group
    dataSetArgs = [
        "%s:ipv%d:%d" % (scanMeta['http-file'], scanMeta['ipVersion'], scanMeta['ipVersion'])
        for scanMeta in group['scans']
    ]
    args = ['python3', 'analysis.py'] + ANALYSIS_ARGS + dataSetArgs

    try:
        subprocess.run(args, check=True, stdout=subprocess.DEVNULL)
    except subprocess.CalledProcessError as error:
        # Stop at the first failing analysis; remaining groups are not processed in this run
        print("Failed to analyze %s" % group['id'])
        print(error)
        break

    print("Analysis done")

    # Keep the results, then empty the temporary directory for the next group
    saveAnalysisResults(group)
    cleanAnalysisData()

In [None]:
# Without any group to analyze the website data cannot have changed
if not analysisGroups:
    print("Nothing analyzed, skipping website update")
    sys.exit(0)

---

# Update website

In [None]:
# Bring the local clone of the website repository up to date
print("Updating local repository")
pullCommand = ['git', '-C', GIT_REPO, 'pull']
subprocess.run(pullCommand)

In [None]:
# Collect results
print("Collecting results")
MAX_AGE_KEYS = ['other', 'year', 'hy-y', 'half-year', 'w-hy', 'week', 'day', 'test', 'off']
chromium = {'ipv4': [], 'ipv6': [], 'merged': []}

# Load chromium data: the 'chromium' series is not produced by the analysis,
# it is taken over from the existing chart files of the website repository
for k in chromium:
    with open(os.path.join(GIT_REPO, 'data', 'configuration', 'absolute', k + '.json')) as f:
        configuration = json.load(f)
    chromium[k] = configuration.get('chromium', [])


def _emptySeries(*names):
    """Return a chart dict that maps every series name to its own empty list."""
    return {name: [] for name in names}


# Results data: one chart per key, every chart maps series names to lists of
# data points. Each data set keeps the chromium series loaded from its own file.
results = {
    'adoption-absolute-ipv4': _emptySeries('domains', 'hsts', 'includeSubDomains', 'preload'),
    'adoption-absolute-ipv6': _emptySeries('domains', 'hsts', 'includeSubDomains', 'preload'),
    'adoption-absolute-merged': _emptySeries('domains', 'hsts', 'includeSubDomains', 'preload'),
    'adoption-relative-ipv4': _emptySeries('hsts', 'includeSubDomains', 'preload'),
    'adoption-relative-ipv6': _emptySeries('hsts', 'includeSubDomains', 'preload'),
    'adoption-relative-merged': _emptySeries('hsts', 'includeSubDomains', 'preload'),
    'max-age-ipv4': _emptySeries(*MAX_AGE_KEYS),
    'max-age-ipv6': _emptySeries(*MAX_AGE_KEYS),
    'max-age-merged': _emptySeries(*MAX_AGE_KEYS),
    'configuration-absolute-ipv4': dict(
        _emptySeries('includeSubDomains', 'preload'), chromium=chromium['ipv4']),
    'configuration-absolute-ipv6': dict(
        _emptySeries('includeSubDomains', 'preload'), chromium=chromium['ipv6']),
    'configuration-absolute-merged': dict(
        _emptySeries('includeSubDomains', 'preload'), chromium=chromium['merged']),
    'configuration-relative-ipv4': _emptySeries('includeSubDomains', 'preload'),
    'configuration-relative-ipv6': _emptySeries('includeSubDomains', 'preload'),
    'configuration-relative-merged': _emptySeries('includeSubDomains', 'preload'),
    'inconsistencies-absolute-ipv4': _emptySeries('total', 'configuration', 'existence'),
    'inconsistencies-absolute-ipv6': _emptySeries('total', 'configuration', 'existence'),
    'inconsistencies-relative-ipv4': _emptySeries('total', 'configuration', 'existence'),
    'inconsistencies-relative-ipv6': _emptySeries('total', 'configuration', 'existence'),
    'inconsistency-v4v6-absolute': _emptySeries('inconsistent'),
    'inconsistency-v4v6-relative': _emptySeries('inconsistent'),
    'parse-errors-absolute-ipv4': _emptySeries('parseError'),
    'parse-errors-absolute-ipv6': _emptySeries('parseError'),
    'parse-errors-absolute-merged': _emptySeries('parseError'),
    'parse-errors-relative-ipv4': _emptySeries('parseError'),
    'parse-errors-relative-ipv6': _emptySeries('parseError'),
    'parse-errors-relative-merged': _emptySeries('parseError'),
    'parse-errors-most-common': _emptySeries('ipv4', 'ipv6')
}

In [None]:
def _loadDataSetResults(resultsDir, dataSet):
    """Return the parsed results.json of a data set or None if the file does not exist."""
    resultsFile = os.path.join(ANALYSIS_RESULTS, resultsDir, dataSet, 'results.json')
    if not os.path.isfile(resultsFile):
        return None
    with open(resultsFile, 'r') as rf:
        return json.load(rf)


def _collectDataSet(charts, dataSet, data, timestamp):
    """Append the data points of one data set ('ipv4', 'ipv6' or 'merged') to the charts.

    charts is the chart dict to fill, data the parsed results.json of the
    data set and timestamp the x value of all data points. A missing value
    is represented by -1 and produces no data point; relative values are
    percentages and are only added if their base value is greater than 0.
    """

    def add(chart, series, value):
        charts[chart][series].append([timestamp, value])

    # Scanned domains
    domains = data.get('domains', -1)
    if domains > 0:
        add('adoption-absolute-' + dataSet, 'domains', domains)

    # Adoption
    sts = data.get('sts', -1)
    if sts >= 0:
        add('adoption-absolute-' + dataSet, 'hsts', sts)
        if domains > 0:
            add('adoption-relative-' + dataSet, 'hsts', sts / domains * 100)

    # Configuration: shown relative to the HSTS domains (configuration charts)
    # and relative to all scanned domains (adoption charts)
    for series, resultKey in (('includeSubDomains', 'sts-includeSubDomains'), ('preload', 'sts-preload')):
        count = data.get(resultKey, -1)
        if count >= 0:
            add('configuration-absolute-' + dataSet, series, count)
            add('adoption-absolute-' + dataSet, series, count)
            if sts > 0:
                add('configuration-relative-' + dataSet, series, count / sts * 100)
            if domains > 0:
                add('adoption-relative-' + dataSet, series, count / domains * 100)

    if dataSet == 'merged':
        # Inconsistency v4v6
        inconsv4v6 = data.get('v4v6inconsistent-domains', -1)
        if inconsv4v6 >= 0:
            add('inconsistency-v4v6-absolute', 'inconsistent', inconsv4v6)
            if sts > 0:
                add('inconsistency-v4v6-relative', 'inconsistent', inconsv4v6 / sts * 100)
    else:
        # Inconsistencies
        incons = data.get('inconsistency', {})
        for series in ('total', 'configuration', 'existence'):
            count = incons.get(series, {}).get('domains', -1)
            if count >= 0:
                add('inconsistencies-absolute-' + dataSet, series, count)
                if sts > 0:
                    add('inconsistencies-relative-' + dataSet, series, count / sts * 100)

    # Parse errors
    parseError = data.get('parse-error', -1)
    if parseError >= 0:
        add('parse-errors-absolute-' + dataSet, 'parseError', parseError)
        if domains > 0:
            add('parse-errors-relative-' + dataSet, 'parseError', parseError / domains * 100)

    # Max age: only added if a value is available for every max age class
    ma = data.get('sts-max-age', {})
    maValues = [ma.get(k, -1) for k in MAX_AGE_KEYS]
    if all(v >= 0 for v in maValues):
        for maKey, maValue in zip(MAX_AGE_KEYS, maValues):
            add('max-age-' + dataSet, maKey, maValue)


# Walk through all result folders and collect the data points of every data set
resultDirs = []
for resultsDir in os.listdir(ANALYSIS_RESULTS):
    match = resultFolderRe.match(resultsDir)
    if not match:
        continue

    scanDate = datetime.datetime(
        int(match.group('year')),
        int(match.group('month')),
        int(match.group('day')),
        hour=int(match.group('hour')),
        minute=int(match.group('minute'))
    )
    resultDirs.append((scanDate, resultsDir))
    # Timestamp of the data points in milliseconds
    scanTimestamp = int(scanDate.timestamp()) * 1000

    for dataSet in ('ipv4', 'ipv6', 'merged'):
        dataSetResults = _loadDataSetResults(resultsDir, dataSet)
        # Missing or empty results do not contribute any data point
        if dataSetResults:
            _collectDataSet(results, dataSet, dataSetResults, scanTimestamp)

In [None]:
# Order the data points of every series by their first element (the timestamp)
print("Sorting results")
for chart in results.values():
    for dataPoints in chart.values():
        dataPoints.sort(key=lambda point: point[0])

In [None]:
# Latest parse errors: the ten most frequent parse errors of the newest result folder
if len(resultDirs) > 0:
    resultDirs.sort(key=lambda x: x[0], reverse=True)
    _, latestDir = resultDirs[0]

    for ds in ['ipv4', 'ipv6']:
        meta = {}
        try:
            with open(os.path.join(ANALYSIS_RESULTS, latestDir, ds, 'meta.json')) as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # Best effort: a missing or unreadable meta file (the latest group may
            # lack this data set) results in an empty list. Invalid JSON raises
            # json.JSONDecodeError, which is a ValueError.
            pass
        errors = meta.get('httpParseErrors', [])
        errors.sort(key=lambda x: x['occurences'], reverse=True)
        results['parse-errors-most-common'][ds] = errors[:10]

In [None]:
# Write every chart to its data file inside the website repository
print("Updating chart data")


def _chartFiles(chart, dataSets):
    """Map '<chart>-<mode>-<data set>' to [chart, mode, '<data set>.json'] for both modes."""
    return {
        "%s-%s-%s" % (chart, mode, dataSet): [chart, mode, dataSet + '.json']
        for mode in ('absolute', 'relative')
        for dataSet in dataSets
    }


# Chart key -> path components of the chart file below GIT_REPO/data
allDataSets = ('ipv4', 'ipv6', 'merged')
keyToFile = {}
keyToFile.update(_chartFiles('adoption', allDataSets))
keyToFile.update({'max-age-' + dataSet: ['max-age', dataSet + '.json'] for dataSet in allDataSets})
keyToFile.update(_chartFiles('configuration', allDataSets))
keyToFile.update(_chartFiles('inconsistencies', ('ipv4', 'ipv6')))
keyToFile.update({
    'inconsistency-v4v6-' + mode: ['inconsistency-v4v6', mode + '.json']
    for mode in ('absolute', 'relative')
})
keyToFile.update(_chartFiles('parse-errors', allDataSets))
keyToFile['parse-errors-most-common'] = ['parse-errors', 'most-common.json']

for key, pathParts in keyToFile.items():
    filePath = os.path.join(GIT_REPO, 'data', *pathParts)

    # Replace the file contents with the freshly collected chart
    with open(filePath, 'w') as resultFile:
        json.dump(results[key], resultFile, indent="\t", sort_keys=True)

    # Stage the file
    subprocess.run(['git', '-C', GIT_REPO, 'add', filePath])

In [None]:
# Commit the staged chart files and publish them
print("Deploying changes")
for gitArgs in (['commit', '-m', "automated update"], ['push']):
    subprocess.run(['git', '-C', GIT_REPO] + gitArgs)