# HSTS Analysis

In [None]:
import pandas as pd
import numpy as np
import sys
import subprocess
import pickle
from pathlib import Path
import json
import os
import re
import gc
import collections
import time
from datetime import datetime
import psutil
import getopt
import logging

# Report the interpreter and pandas versions this run uses
print("Versions: Python {}, Pandas {}".format(sys.version, pd.__version__))

In [None]:
# Version of this script; buildDataSet stores both parts in the meta data of a data set
SCRIPT_VERSION_MAJOR = 2
SCRIPT_VERSION_MINOR = 20180709
# Switches the logger to DEBUG level (also set by the -d/--debug option)
DEBUG = False
# NOTE(review): not referenced in the visible part of the file; presumably
# toggles removal of temporary files - confirm before relying on it
CLEANUP = False

# Base name of the scan file; the extensions below are appended to it
HOSTS_FILE = 'http.csv'
# Per data set file holding progress information (chunks, hashes, ...)
META_FILE = 'meta.json'
# Per data set file holding the analysis results
RESULTS_FILE = 'results.json'

# Root of the temporary directories (one sub directory per data set id) and of the log file
TMP_DATA_ROOT = '../data/tmp'
# Default data sets; replaced by the positional command line arguments if any are given
DATA_SETS = [
    {
        'id': 'ipv4',
        'ipVersion': 4,
        'src': '../data/ipv4/' + HOSTS_FILE
    }, {
        'id': 'ipv6',
        'ipVersion': 6,
        'src': '../data/ipv6/' + HOSTS_FILE
    }
]
# Number of source lines per chunk (readChunk may add a few lines to keep a server_name together)
CHUNK_SIZE = 8000000

# File name extensions appended by buildFileName, in this order
EXT_PREPARED = '.cut_sort'
EXT_CHUNK = '.chunk-'
EXT_PICKLE = '.pickle'
EXT_PARSED = '.parsed'
EXT_INCONS_MARKED = '.incons-marked'
EXT_REDUCED = '.reduced'

# Extensions for the pickles that hold inconsistent entries
# NOTE(review): only EXT_INCONSISTENT and EXT_INCONS_EXISTENCE are used in the
# visible part of the file; the other two are presumably used further down
EXT_INCONSISTENT = '.inconsistent'
EXT_INCONS_EXISTENCE = '.exist'
EXT_INCONS_CONFIGURATION = '.config'
EXT_INCONS_V4V6 = '.v4v6'

# Named max-age values in seconds.
# NOTE(review): presumably upper bounds of the buckets used to aggregate
# max-age values; the consumer is not visible in this part of the file
_SECONDS_PER_DAY = 60 * 60 * 24
maxAgeAggregation = {
    'off': 0,
    'test': 60,
    'day': _SECONDS_PER_DAY,
    'week': _SECONDS_PER_DAY * 7,
    'w-hy': _SECONDS_PER_DAY * 180,
    'half-year': _SECONDS_PER_DAY * 190,
    'hy-y': _SECONDS_PER_DAY * 360,
    'year': _SECONDS_PER_DAY * 370,
    'other': float('Inf'),
}

In [None]:
# Handle on the running process; printMemory samples its memory consumption
proc = psutil.Process(os.getpid())
# Resident set size (in bytes) of the previous sample and the largest one seen so far
memInfo = dict(last=0, max=0)

In [None]:
# Do not execute this when in Jupyter Notebook!
def parseCliArguments():
    """Parses the command line arguments and overrides the module defaults.

    Options may set CHUNK_SIZE (-c), DEBUG (-d) and TMP_DATA_ROOT (-t).
    Every positional argument describes one data set to analyze and has the
    form ``<inFile>:<id>:<ipVersion>``; if at least one is given they replace
    DATA_SETS. The source is split from the right, so the path of the input
    file may itself contain colons.

    The option -r/--data-root is accepted by the parser for backward
    compatibility, but no handling for it exists here, so its value is ignored.

    Exits the process with status 2 on unknown options, with status 1 on an
    invalid option value or a malformed source, and with status 0 after
    printing the help message.
    """

    global DATA_SETS
    global CHUNK_SIZE
    global DEBUG
    global TMP_DATA_ROOT

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'dhc:r:t:', ['debug', 'help', 'chunk-size=', 'data-root=',
                                                               'tmp-root='])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    # Options are handled before the sources so that --help always works
    for opt, arg in optlist:
        if opt in ('-c', '--chunk-size'):
            try:
                # isdigit() keeps signs, blanks and decimals out; int() can
                # still reject digit-like characters such as superscripts
                chunkSize = int(arg) if arg.isdigit() else -1
            except ValueError:
                chunkSize = -1
            if chunkSize <= 0:
                print("Chunk size has to be a positive integer, '%s' given" % arg)
                sys.exit(1)
            CHUNK_SIZE = chunkSize
        elif opt in ('-d', '--debug'):
            DEBUG = True
        elif opt in ('-h', '--help'):
            print("Usage: python3 %s [OPTIONS] source(s)" % sys.argv[0])
            print("")
            print("Source:")
            print("<inFile>:<id>:<ipVersion>")
            print("")
            print("Options:")
            print("-c <size>, --chunk-size=<size>  Set the chunk size to <size>")
            print("-d, --debug                     Enables debug logging")
            print("-h, --help                      Displays this help message")
            print("-t <dir>, --tmp-root=<dir>      Sets the temporary directory to <dir>")
            sys.exit()
        elif opt in ('-t', '--tmp-root'):
            if not os.path.isdir(arg):
                print("Temporary data root has to be a directory, '%s' given" % arg)
                sys.exit(1)
            TMP_DATA_ROOT = arg

    if len(args) > 0:
        parsedSets = []
        for spec in args:
            # Split from the right: id and ipVersion never contain a colon,
            # the path of the input file may (e.g. 'C:/scans/http.csv')
            parts = spec.rsplit(':', 2)
            try:
                src, dsId, version = parts
                parsedSets.append({'id': dsId, 'src': src, 'ipVersion': int(version)})
            except ValueError:
                # Either not three parts or a non-numeric ipVersion
                print("Source has to be given as <inFile>:<id>:<ipVersion>, '%s' given" % spec)
                sys.exit(1)
        DATA_SETS = parsedSets

# Apply the command line overrides to the module level defaults (script use only)
parseCliArguments()

In [None]:
# Number of lines readChunk fetches per read while looking for further rows of
# the last server_name of a chunk. It has to be at least 1: reading 0 rows
# yields an empty frame, which readChunk treats as the end of the file.
ADDITIONAL_LINES_BULK = max(1, int(CHUNK_SIZE * 0.0001))
# NOTE(review): not used in the visible part of the file; presumably the chunk
# size of the later merge step - confirm
MERGE_CHUNK_SIZE = CHUNK_SIZE

In [None]:
# Setup logger: messages go to the console (root handler) and to a log file
logFormat = "%(asctime)s [%(levelname)s] at %(funcName)s:%(lineno)d: %(message)s"
logging.basicConfig(format=logFormat)
logger = logging.getLogger('hsts-analysis')

# The log file lives in the temporary data root, which nothing has created yet
# at this point when the default location is used for the first time
os.makedirs(TMP_DATA_ROOT, exist_ok=True)
fileHandler = logging.FileHandler(os.path.join(TMP_DATA_ROOT, 'hsts-analysis.log'), mode='w')
fileHandler.setFormatter(logging.Formatter(fmt=logFormat))

logger.addHandler(fileHandler)
logger.setLevel("DEBUG" if DEBUG else "INFO")

### Function definitions

In [None]:
def cutAndSort(dataSet):
    """Runs ./cut_sort.sh to turn the source file of the data set into its prepared file.

    The script is called with the source file and the prepared file as its
    two arguments. According to the original description it keeps only the
    relevant columns and sorts the rows by hostname (the script itself is not
    part of this file). Raises subprocess.CalledProcessError if it fails.
    """

    command = [
        './cut_sort.sh',
        buildFileName(dataSet, tmp=False),
        buildFileName(dataSet, prepared=True),
    ]
    subprocess.run(command, check=True)

In [None]:
def sha512sum(file):
    """Generates the SHA512 sum for the given file.

    The digest is computed with hashlib instead of the external ``sha512sum``
    program, so it works where that program is not installed and for file
    names that the program would read as options. The result is the same
    lower case hex string, so hashes stored by earlier runs stay comparable.

    Returns the hex digest as a string. Raises OSError (e.g.
    FileNotFoundError) if the file cannot be read.
    """

    import hashlib

    digest = hashlib.sha512()
    with open(file, 'rb') as handle:
        # Read in blocks of 1 MiB so that large files are never held in memory
        for block in iter(lambda: handle.read(1024 * 1024), b''):
            digest.update(block)
    sha512 = digest.hexdigest()
    logger.debug("SHA512 of file '%s': %s" % (file, sha512))
    return sha512

In [None]:
def saveResults(*dataSets):
    """Writes the 'results' entry of every given data set to RESULTS_FILE in its directory."""

    for dataSet in dataSets:
        target = os.path.join(dataSet['dir'], RESULTS_FILE)
        with open(target, 'w') as handle:
            json.dump(dataSet['results'], handle, indent=2, sort_keys=True)

    logger.debug("Saved results")

In [None]:
def saveMeta(*dataSets):
    """Writes the 'meta' entry of every given data set to META_FILE in its directory."""

    for dataSet in dataSets:
        target = os.path.join(dataSet['dir'], META_FILE)
        with open(target, 'w') as handle:
            json.dump(dataSet['meta'], handle, indent=2, sort_keys=True)

    logger.debug("Saved meta data")

In [None]:
def printMemory():
    """Logs the current memory consumption, the change since the last call and the peak."""

    def toMb(numBytes):
        # Decimal megabytes
        return numBytes / 1000 / 1000

    rss = proc.memory_info().rss
    # Track the largest value seen so far
    memInfo['max'] = max(memInfo['max'], rss)

    delta = rss - memInfo['last']
    logger.debug("[MEMORY] Current: %.2fMB (%+.3fMB), Max: %.2fMB" % (
        toMb(rss), toMb(delta), toMb(memInfo['max'])))
    memInfo['last'] = rss

In [None]:
def isValidFile(fileName, fileHash = None):
    """Checks that the file exists, is a regular file and is not empty.

    If a hash is given, the SHA512 sum of the file has to match it as well.
    Returns True if all checks pass, False otherwise.
    """

    path = Path(fileName)
    usable = path.exists() and path.is_file() and os.path.getsize(fileName) > 0
    if not usable:
        logger.debug("File '%s' does not exist!" % fileName)
        return False

    # The hash is only computed when there is something to compare it to
    if fileHash:
        if fileHash != sha512sum(fileName):
            logger.debug("Hashes don't match for file '%s'!" % fileName)
            return False

    return True

In [None]:
def filterChunk(df):
    """Reduces a chunk to its valid rows and to the columns host, server_name and headers.

    Rows lacking host, server_name or http_code are dropped in place (the
    passed frame is modified), then rows with a negative http_code are
    removed. As soon as the frame is empty it is returned as it is, without
    the remaining steps.
    """

    if df.empty:
        return df

    # Drop invalid rows
    df.dropna(subset=['host', 'server_name', 'http_code'], how='any', inplace=True)
    logger.debug("Dropped invalid entries (%d remaining)" % len(df.index))
    if df.empty:
        return df

    # Drop unsuccessful connections
    successful = df['http_code'] >= 0
    df = df[successful]
    logger.debug("Dropped unsuccessful entries (%d remaining)" % len(df.index))
    if df.empty:
        return df

    return df[['host', 'server_name', 'headers']]

In [None]:
def readChunk(ds, nr, start, count = CHUNK_SIZE):
    """Reads a chunk from the prepared file, filters the relevant information and saves it as pickle.

    After the ``count`` lines starting at line ``start`` have been read,
    further lines are read in bulks of ADDITIONAL_LINES_BULK until a relevant
    line with a different server_name than the last one of the chunk shows up,
    so that all lines of one server_name end up in the same chunk.

    ds    -- the data set to read from
    nr    -- number of the chunk, used for the name of the pickle; pass None
             or NaN to only count lines without saving anything
    start -- number of lines to skip at the beginning of the prepared file
    count -- number of lines to read (defaults to the CHUNK_SIZE that was in
             effect when this function was defined)

    Returns the number of read and relevant lines.
    """

    file = buildFileName(ds, prepared=True)
    logger.debug("Reading chunk %s from %s (lines %d - %d)" % (nr, file, start, start + count))

    # Read csv
    colNames = ['host', 'server_name', 'http_code', 'headers']
    colTypes = {col: str for col in colNames}
    colTypes['http_code'] = float # Cannot be int as NaN != int
    csvOptions = dict(encoding='ISO-8859-1', names=colNames, dtype=colTypes)
    df = pd.read_csv(file, skiprows=start, nrows=count, **csvOptions)
    linesRead = len(df.index)
    printMemory()

    logger.debug("Read %d lines" % linesRead)

    df = filterChunk(df)
    linesRelevant = len(df.index)

    if linesRelevant > 0:
        # Read additional lines to have all lines of one server_name in one chunk
        lastServerName = df.iloc[-1]['server_name']
        logger.debug("Last server_name: %s" % lastServerName)
        additionalLinesRead = 0
        additionalLinesRelevant = 0

        # Reading lines in bulk is faster than reading every line individually
        additionalLinesFound = True
        while additionalLinesFound:
            offset = start + count + additionalLinesRead
            logger.debug("Looking for additional lines for last host in lines %d to %d" % (
                    offset, offset + ADDITIONAL_LINES_BULK
                ))
            linesDf = pd.read_csv(file, skiprows=offset, nrows=ADDITIONAL_LINES_BULK, **csvOptions)
            printMemory()

            rawLinesInSubChunk = len(linesDf.index)
            if rawLinesInSubChunk <= 0:
                # No more lines to read
                logger.debug("EOF reached")
                break

            linesDf = filterChunk(linesDf)

            # Number of leading relevant rows that still belong to the last server_name
            matched = 0
            # Number of raw lines of this sub-chunk that become part of the chunk
            linesReadInSubChunk = 0
            for lineNr, lineServerName in zip(linesDf.index, linesDf['server_name']):
                # The index label is the line number inside the raw sub-chunk
                lineNrInSubChunk = int(lineNr)
                logger.debug("Next server name: %s (@%d)" % (lineServerName, lineNrInSubChunk))
                if lineServerName != lastServerName:
                    additionalLinesFound = False
                    break
                matched += 1
                # Line nrs start at 0
                linesReadInSubChunk = lineNrInSubChunk + 1
            else:
                # No other server_name was found, so the whole raw sub-chunk
                # (including irrelevant lines at its end) belongs to this
                # chunk. Counting it guarantees progress: a sub-chunk without
                # any relevant line would otherwise be read over and over.
                linesReadInSubChunk = rawLinesInSubChunk

            if matched > 0:
                # One concatenation per sub-chunk instead of one append per row
                df = pd.concat([df, linesDf.iloc[:matched]], ignore_index=True)
                additionalLinesRelevant += matched

            additionalLinesRead += linesReadInSubChunk

            del linesDf
            gc.collect()

            printMemory()

        logger.debug("Read %d additional lines and found %d relevant ones" %
                     (additionalLinesRead, additionalLinesRelevant))

        linesRead += additionalLinesRead
        linesRelevant += additionalLinesRelevant

    logger.debug("Read %d lines from csv" % linesRead)
    printMemory()

    # NaN (used by callers that only want to count lines) differs from itself
    hasChunkNr = nr is not None and nr == nr
    if linesRead > 0 and hasChunkNr:
        # Save loaded chunk
        df.to_pickle(buildFileName(ds, chunk=nr))
        logger.debug("Saved chunk")

    # Free memory
    del df
    gc.collect()

    printMemory()

    return linesRead, linesRelevant

In [None]:
def buildFileName(ds, prepared = False, chunk = np.nan, status = None, tmp = True):
    """Builds the name of a file that belongs to the data set.

    With tmp=False and none of prepared, chunk and status given, the source
    file of the data set is returned. Otherwise the name is HOSTS_FILE inside
    the directory of the data set followed by the extensions for the
    requested stage:

    prepared -- the cut and sorted file (implied when a chunk is given)
    chunk    -- number of the chunk; NaN means "no chunk"
    status   -- 'parsed', 'marked' or 'reduced'; each later status also adds
                the extensions of the earlier ones
    """

    # NaN is the only value that differs from itself
    isChunk = chunk == chunk

    if not (tmp or prepared or isChunk or status):
        return ds.get('src')

    parts = [os.path.join(ds.get('dir'), HOSTS_FILE)]

    if prepared or isChunk:
        parts.append(EXT_PREPARED)
    if isChunk:
        parts.append(EXT_CHUNK + str(chunk))
    if status in ('parsed', 'marked', 'reduced'):
        parts.append(EXT_PARSED)
    if status in ('marked', 'reduced'):
        parts.append(EXT_INCONS_MARKED)
    if status == 'reduced':
        parts.append(EXT_REDUCED)
    if isChunk:
        parts.append(EXT_PICKLE)

    return ''.join(parts)

In [None]:
def loadChunk(ds, nr, status = None):
    """Loads the pickle of chunk ``nr`` of the data set in the given status.

    Returns the stored DataFrame, or an empty DataFrame if the pickle does
    not exist.
    """

    try:
        frame = pd.read_pickle(buildFileName(ds, chunk=nr, status=status))
    except FileNotFoundError:
        frame = pd.DataFrame()
    return frame

In [None]:
def buildDataSet(ds):
    """Prepares the data set dictionary for the analysis.

    Adds 'dir' (temporary directory, created if missing), 'id_normal', 'meta'
    and 'results' to ``ds``. Meta data and results are loaded from their JSON
    files in the directory if these exist; otherwise fresh ones are created
    that describe this script version and the source file.
    """

    logger.info("Building data set %s from source file '%s'" % (ds['id'], ds['src']))

    # Store temporary files in here
    tmpDir = os.path.join(TMP_DATA_ROOT, ds['id'])
    ds['dir'] = tmpDir
    # Id that can be used in pandas queries
    ds['id_normal'] = re.sub(r'^\d+|[^a-zA-Z_0-9]', '', ds['id'].replace('-', '_'))

    if not os.path.exists(tmpDir):
        os.makedirs(tmpDir)

    try:
        with open(os.path.join(tmpDir, META_FILE), 'r') as metaFile:
            ds['meta'] = json.load(metaFile)
    except FileNotFoundError:
        # No meta data exist; -1 marks a source file that is missing or empty
        srcFileSize = os.path.getsize(ds['src']) if isValidFile(ds['src']) else -1

        version = {'major': SCRIPT_VERSION_MAJOR, 'minor': SCRIPT_VERSION_MINOR}
        source = {
            'file': {'name': ds['src'], 'size': srcFileSize},
            'date': str(datetime.now())
        }
        ds['meta'] = {'script': {'version': version, 'source': source}}

    try:
        with open(os.path.join(tmpDir, RESULTS_FILE), 'r') as resultsFile:
            ds['results'] = json.load(resultsFile)
    except FileNotFoundError:
        # No results exist; they share the script information of the meta data
        ds['results'] = {'script': ds['meta']['script']}

In [None]:
def onStepStart(name):
    """Logs a banner with the name of the analysis step that starts, then the memory consumption."""

    for bannerLine in ('', '#', '# ' + name, '#'):
        logger.info(bannerLine)
    printMemory()

---

## Define data sets to analyze

In [None]:
def defineDataSets():
    """Sets the global ``dataSets`` to DATA_SETS, builds every data set and logs a summary of it.

    Values that are not known yet (number of chunks, total lines, parse
    errors) are logged as NaN.
    """

    global dataSets

    onStepStart('build data sets')
    # Define data folders to analyze
    dataSets = DATA_SETS

    # Read meta data
    for ds in dataSets:
        buildDataSet(ds)
        meta = ds['meta']

        # np.nan instead of the alias np.NaN: the alias was removed in NumPy 2.0
        logger.info("Dataset: %s\nversion: %d (%d), chunks: %.0f, totalLines: %.0f, parseErrors: %.0f, results: %s" % (
            ds['id'],
            meta['script']['version']['major'],
            meta['script']['version']['minor'],
            len(meta['chunks']) if 'chunks' in meta else np.nan,
            meta['totalLines'] if 'totalLines' in meta else np.nan,
            len(meta['httpParseErrors']) if 'httpParseErrors' in meta else np.nan,
            str('results' in ds and len(ds['results']) > 0)
        ))
    printMemory()
defineDataSets()

In [None]:
# Data set without a source file and IP version.
# NOTE(review): presumably receives the merge of the data sets above; the
# merge step is not visible in this part of the file
dsMerge = dict(src='', ipVersion=np.nan, id='merged')
buildDataSet(dsMerge)
printMemory()

## Data preparation

* Deletes any columns that are not required
* Sorts scans by server_name

In [None]:
def dataPreparation():
    """Creates the prepared (cut and sorted) file of every data set that lacks a valid one.

    A prepared file is valid if it exists, is not empty and matches the hash
    stored in the meta data. The meta data is saved after every data set.
    """

    onStepStart('data preparation')
    # Prepare data by selecting relevant columns and sort by hostname
    for dataSet in dataSets:
        logger.info("Preparing %s" % dataSet['id'])
        meta = dataSet['meta']

        # Cut and sort
        preparedFile = buildFileName(dataSet, prepared=True)
        upToDate = isValidFile(preparedFile, meta.get('cutSortHash'))
        if not upToDate:
            logger.info("Cutting and sorting...")
            cutAndSort(dataSet)
            meta['cutSortHash'] = sha512sum(preparedFile)

        # Save meta data to save progress in this step
        saveMeta(dataSet)
        printMemory()

dataPreparation()

## Chunking

* Splits data into smaller chunks making sure that all scans for the same server_name are in the same chunk

In [None]:
def chunking():
    """Splits the prepared file of every data set into pickled chunks.

    If the meta data already describes chunks, they are verified first:
    every chunk needs its start and count, has to start where the previous
    one ended and needs a valid pickle (an invalid pickle is read again).
    Afterwards it is checked that no lines follow the last chunk. If any of
    these checks fails, the chunk files from the failing chunk on are deleted
    and the data is split again from there; without chunk information all
    data is split. Meta data and results are saved for every data set.
    """

    onStepStart('chunking')
    printMemory()
    # Matches the files of a chunk and captures its number. The fixed part of
    # the name is escaped, as it contains dots that should only match dots.
    chunkFilePattern = re.compile('^' + re.escape(HOSTS_FILE + EXT_PREPARED + EXT_CHUNK) + r'(\d+).*$')
    # Split data into chunks
    for ds in dataSets:
        logger.debug("Splitting data in %s" % ds['id'])

        requiresSplitting = False
        splitStart = 0
        if 'chunks' in ds['meta']:
            # Data has been split before
            lastChunkEnd = 0
            for nr, chunk in enumerate(ds['meta']['chunks']):
                # Check chunk info
                if not ('start' in chunk and 'count' in chunk):
                    logger.warning("Missing info for chunk %d @ %s!" % (nr, ds['id']))
                    requiresSplitting = True
                    break

                # Check consistency
                if lastChunkEnd != chunk['start']:
                    logger.warning("Inconsistency found at chunk %d @ %s! Expected to start at line %d, but started at line %d" %
                                   (nr, ds['id'], lastChunkEnd, chunk['start']))
                    requiresSplitting = True
                    splitStart = nr
                    break
                lastChunkEnd = chunk['start'] + chunk['count']

                # Check pickle
                if isValidFile(buildFileName(ds, chunk=nr), chunk.get('hash')):
                    logger.info("Chunk %d @ %s is valid" % (nr, ds['id']))
                    continue

                # Re-read chunk
                logger.info("Chunk %d @ %s is invalid, reading again" % (nr, ds['id']))
                read, relevant = readChunk(ds, nr, chunk['start'], chunk['count'])
                if read != chunk['count']:
                    logger.warning("Read %d lines, expected to read %d lines!" % (read, chunk['count']))
                    requiresSplitting = True
                    splitStart = nr
                    break

                chunk['hash'] = sha512sum(buildFileName(ds, chunk=nr))
                chunk['lines'] = relevant

            # Check that all lines have been read
            if not requiresSplitting:
                logger.debug("Looking for additional lines")
                # NaN as chunk number: only count lines
                read, relevant = readChunk(ds, np.nan, lastChunkEnd, 1)
                if read > 0:
                    logger.warning("Found lines that were not in chunks!")
                    requiresSplitting = True
                    # Split only additional data
                    splitStart = len(ds['meta']['chunks'])

            if requiresSplitting:
                # Data has to be re-split, delete old files
                for f in os.listdir(ds['dir']):
                    match = chunkFilePattern.search(f)
                    if match and int(match.group(1)) >= splitStart:
                        os.remove(os.path.join(ds['dir'], f))
        else:
            requiresSplitting = True
        printMemory()

        # Split chunks
        if requiresSplitting:
            nr = 0
            lastChunkEnd = 0

            # Set starting values
            if splitStart > 0:
                nr = splitStart
                prevChunk = ds['meta']['chunks'][splitStart - 1]
                lastChunkEnd = prevChunk['start'] + prevChunk['count']
                ds['meta']['chunks'] = ds['meta']['chunks'][:splitStart]
            else:
                ds['meta']['chunks'] = []
            printMemory()

            while True:
                logger.info("Building chunk %d @ %s" % (nr, ds['id']))
                printMemory()
                chunk = {
                    'start': lastChunkEnd,
                    'count': CHUNK_SIZE
                }

                read, relevant = readChunk(ds, nr, chunk['start'], chunk['count'])
                logger.debug("Read %d lines out of which %d lines are relevant" % (read, relevant))
                if read == 0:
                    # The previous chunk ended exactly at the end of the file.
                    # readChunk saves no pickle for zero lines, so there is
                    # neither a file to hash nor a chunk to record.
                    break
                chunk['lines'] = relevant
                chunk['count'] = read
                chunk['hash'] = sha512sum(buildFileName(ds, chunk=nr))

                ds['meta']['chunks'].append(chunk)
                nr += 1
                lastChunkEnd = chunk['start'] + chunk['count']

                saveMeta(ds)

                if read < CHUNK_SIZE:
                    # Last chunk has been read
                    break

            # Recompute total lines
            totalLines = 0
            noHttpLines = 0
            for chunk in ds['meta']['chunks']:
                totalLines += chunk['lines']
                noHttpLines += chunk['count'] - chunk['lines']
            ds['meta']['totalLines'] = totalLines
            ds['results']['totalLines'] = totalLines
            ds['results']['noHttpLines'] = noHttpLines

        # Save meta data to save progress in this step
        saveMeta(ds)
        saveResults(ds)
        printMemory()

chunking()

---

In [None]:
# Matches a max-age directive whose value is a number, either quoted (group 1)
# or unquoted (group 2). Raw string, as \s and \d are no valid string escapes.
stsMaxAgePattern = re.compile(r'^\s*max-age\s*=\s*(?:"(\d+)"|(\d+))\s*$')
# Columns that extractHeaders adds to every chunk, with their default values
colsToCreate = {
    'http-header-parse-error': False,
    'http-header--sts': False,
    'http-header--sts-max-age': np.nan,
    'http-header--sts-includeSubDomains': False,
    'http-header--sts-preload': False
}

In [None]:
def parseHttpHeaders(headers, errList):
    """Parses the http headers and extracts the Strict-Transport-Security configuration.

    headers -- all headers of a response as one string, one header per line;
               a float NaN stands for "no headers"
    errList -- list that a message is appended to for every parse error

    Returns a dictionary that may contain the keys 'http-header-parse-error',
    'http-header--sts', 'http-header--sts-max-age',
    'http-header--sts-includeSubDomains' and 'http-header--sts-preload', or
    NaN if there are no headers or nothing of interest was found.
    'http-header--sts' is only True for a header that has a max-age directive
    and does not repeat any directive.
    """

    # Skip empty headers; isinstance also covers float subclasses such as
    # numpy.float64, which would otherwise be parsed as the text 'nan'
    if isinstance(headers, float) and np.isnan(headers):
        return np.nan

    headers = str(headers)
    result = {}

    for header in headers.splitlines():
        try:
            hName, hValue = header.split(':', 1)
        except ValueError:
            errList.append("Missing header value: '{}'".format(header))
            result['http-header-parse-error'] = True
            continue
        hNameLower = hName.lower()

        if hNameLower != 'strict-transport-security':
            # ignore other headers
            continue

        if 'http-header--sts' in result:
            # Process only first header (RFC 6797 section 8.1)
            errList.append("Sts header set more than once")
            result['http-header-parse-error'] = True
            continue

        result['http-header--sts'] = True
        # Analyze HSTS header
        for directive in hValue.split(';'):
            directive = directive.strip().lower()

            if directive == '':
                # ignore
                pass
            elif directive == 'includesubdomains':
                if 'http-header--sts-includeSubDomains' in result:
                    # Directive must only exist once (RFC 6797 section 6.1)
                    errList.append("Sts directive includeSubDomains set more than once")
                    result['http-header-parse-error'] = True
                    result['http-header--sts'] = False
                    continue
                result['http-header--sts-includeSubDomains'] = True
            elif directive == 'preload':
                if 'http-header--sts-preload' in result:
                    # Directive must only exist once (RFC 6797 section 6.1)
                    errList.append("Sts directive preload set more than once")
                    result['http-header-parse-error'] = True
                    result['http-header--sts'] = False
                    continue
                result['http-header--sts-preload'] = True
            else:
                maResult = stsMaxAgePattern.match(directive)
                if maResult:
                    if 'http-header--sts-max-age' in result:
                        # Directive must only exist once (RFC 6797 section 6.1)
                        errList.append("Sts directive max-age set more than once")
                        result['http-header-parse-error'] = True
                        result['http-header--sts'] = False
                        continue
                    # Group 1 holds a quoted value, group 2 an unquoted one
                    result['http-header--sts-max-age'] = int(maResult.group(1) or maResult.group(2))
                else:
                    # ignore other elements (RFC 6797 section 6.1)
                    errList.append("Unknown directive: '{}' in header '{}'".format(directive, header))
                    # This is a parse error, but the header is still valid
                    result['http-header-parse-error'] = True

        # Check existence of required columns
        if 'http-header--sts-max-age' not in result:
            result['http-header--sts'] = False

    if len(result) > 0:
        return result
    else:
        return np.nan

In [None]:
def dict2Cols(df):
    """Copies the entries of the 'http_header_dict' value of a row into the row itself.

    Rows whose 'http_header_dict' is not a dictionary are left untouched.
    The row is modified in place and returned.
    """

    headerDict = df['http_header_dict']
    if type(headerDict) != dict:
        return df

    for column in headerDict:
        df[column] = headerDict[column]
    return df

## Extract headers

* Analyzes the headers column and extracts sts headers and their configuration

In [None]:
def extractHeaders():
    """Parses the headers of every chunk and saves the chunk with the extracted columns.

    For every chunk without a valid 'parsed' pickle, the raw headers are
    parsed with parseHttpHeaders, the columns listed in colsToCreate are
    added, the raw headers are dropped and the result is sorted by
    server_name and pickled. The parse errors of the chunks processed in this
    run are counted per data set and stored in its results and meta data.
    """

    onStepStart('extract headers')
    # Parse http headers
    for ds in dataSets:
        logger.info("Parsing headers in %s" % ds['id'])
        printMemory()
        cntParseErrors = collections.Counter()
        for nr, chunk in enumerate(ds['meta']['chunks']):
            logger.info("Processing chunk %d @ %s" % (nr, ds['id']))
            printMemory()
            chunkFile = buildFileName(ds, chunk=nr, status='parsed')

            if isValidFile(chunkFile, chunk.get('parsed', {}).get('hash', None)):
                logger.info("Found valid chunk")
                continue

            df = loadChunk(ds, nr)
            logger.debug("Loaded chunk (%d lines)" % len(df.index))
            printMemory()

            # Split into a set which has header values and a set which doesn't
            dfWithHeaders = df[df['headers'].notna()]
            dfWithoutHeaders = df[df['headers'].isna()]
            del df
            logger.debug("Split chunk into set with headers (%d lines) and without headers (%d lines)" %
                         (len(dfWithHeaders.index), len(dfWithoutHeaders.index)))

            # Parse headers into dict
            parseErrors = []
            dfWithHeaders = dfWithHeaders.assign(
                http_header_dict = dfWithHeaders['headers']
                    .apply(parseHttpHeaders, errList = parseErrors))
            logger.debug("Parsed headers in %d lines" % len(dfWithHeaders.index))
            printMemory()

            # Add parse errors
            logger.debug("Found %d parse errors" % len(parseErrors))
            cntParseErrors.update(parseErrors)
            del parseErrors
            gc.collect()
            printMemory()

            # Create required columns
            for col in colsToCreate.keys():
                dfWithHeaders[col] = colsToCreate[col]
                dfWithoutHeaders[col] = colsToCreate[col]
            logger.debug("Created required columns")
            printMemory()

            # Extract data from dictionary to columns
            dfWithHeaders = dfWithHeaders.apply(dict2Cols, axis=1)
            logger.debug("Extracted data from dictionary")
            printMemory()

            # Drop dictionary column (axis by keyword: pandas 2 no longer
            # accepts it as positional argument)
            dfWithHeaders.drop(['http_header_dict'], axis=1, inplace=True)
            logger.debug("Dropped dictionaries")
            printMemory()

            # Merge both sets back together (DataFrame.append was removed in
            # pandas 2; pd.concat does the same in old and new versions)
            df = pd.concat([dfWithHeaders, dfWithoutHeaders])
            del dfWithHeaders
            del dfWithoutHeaders
            logger.debug("Merged analyzed headers and empty headers")
            printMemory()

            # Drop original headers column
            df.drop(['headers'], axis=1, inplace=True)
            logger.debug("Dropped raw headers")

            # Sort by server_name (required for merge)
            df.sort_values('server_name', axis=0, inplace=True)
            logger.debug("Sorted values")

            df.to_pickle(chunkFile)
            chunk['parsed'] = {
                'hash': sha512sum(chunkFile),
                'lines': len(df.index)
            }
            logger.debug("Saved analyzed chunk")

            # Free memory
            del df
            gc.collect()
            printMemory()

            saveMeta(ds)

        # Analyze parse errors
        ds['results']['parseErrors'] = sum(cntParseErrors.values())
        logger.info("Found %d parse errors." % ds['results']['parseErrors'])
        saveResults(ds)

        for error, occurrences in cntParseErrors.most_common(20):
            logger.info("%d occurrences of: %s" % (occurrences, error))

        ds['meta']['httpParseErrors'] = [{'occurences': occ, 'error': err} for err, occ in cntParseErrors.most_common()]
        saveMeta(ds)

        # Remove any data that is not required any more
        del cntParseErrors
        gc.collect()
        printMemory()

extractHeaders()

## Inconsistency analysis

Analyzes inconsistencies between different IPs of the same domain in a single data set.

### Existence inconsistency

* Analyzes inconsistency in existence of the hsts header

In [None]:
def inconsistencyExistence():
    """Finds the domains whose hosts disagree on the existence of the sts header.

    For every parsed chunk all entries of the server_names that occur both
    with and without 'http-header--sts' are collected. Their number is stored
    per chunk in the meta data; the number of entries and domains of the
    whole data set is stored in the meta data and the results. The collected
    entries are pickled and printed as a table.
    """

    onStepStart('existence inconsistency')
    for ds in dataSets:
        logger.info("Processing data in %s" % ds['id'])

        ds['meta']['results'] = ds['meta'].get('results', {})
        ds['meta']['results']['chunks'] = ds['meta']['results'].get('chunks', [])

        allInconsistent = pd.DataFrame()
        for nr, chunk in enumerate(ds['meta']['chunks']):
            logger.debug("Processing chunk %d @ %s" % (nr, ds['id']))
            printMemory()

            df = loadChunk(ds, nr, status='parsed')
            printMemory()

            # Drop entries where header existence is consistent per domain
            withoutConsistent = df.drop_duplicates(subset=['server_name', 'http-header--sts'], keep='first')

            # Remaining duplicates are inconsistent
            inconsistent = withoutConsistent[withoutConsistent.duplicated(subset='server_name', keep=False)]

            # Add all entries for inconsistent domains
            inconsistent = df[df['server_name'].isin(inconsistent['server_name'])]

            del withoutConsistent
            del df
            printMemory()

            # Add length to meta results
            resultsChunk = {}
            if len(ds['meta']['results']['chunks']) <= nr:
                ds['meta']['results']['chunks'].append(resultsChunk)
            else:
                resultsChunk = ds['meta']['results']['chunks'][nr]

            if 'inconsistency' not in resultsChunk:
                resultsChunk['inconsistency'] = {}

            resultsChunk['inconsistency']['existence'] = len(inconsistent.index)

            logger.info("Found %d inconsistent entries" % ds['meta']['results']['chunks'][nr]['inconsistency']['existence'])

            # Merge onto all inconsistent for data set (DataFrame.append was
            # removed in pandas 2; pd.concat works in old and new versions)
            if allInconsistent.empty:
                allInconsistent = inconsistent
            else:
                allInconsistent = pd.concat([allInconsistent, inconsistent])

            # Free memory
            del inconsistent
            gc.collect()

            saveMeta(ds)
            printMemory()

        ds['meta']['results']['inconsistency'] = ds['meta']['results'].get('inconsistency', {})
        ds['meta']['results']['inconsistency']['existence'] = {
            'lines': len(allInconsistent.index),
            'domains': len(allInconsistent.drop_duplicates(subset=['server_name'], keep='first').index)
        }
        print("Found %d inconsistent entries (of %d domains) in data set" %
              (ds['meta']['results']['inconsistency']['existence']['lines'],
               ds['meta']['results']['inconsistency']['existence']['domains']))
        saveMeta(ds)

        # Save inconsistent data
        allInconsistent.to_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_EXISTENCE + EXT_PICKLE)

        cols = ['server_name', 'host', 'http-header--sts']
        print()
        print("server_name                              | host                                     | sts")
        print(("-" * 41) + "|" + ("-" * 42) + "|" + ("-" * 7))
        # Column selection plus .values replaces as_matrix, which was removed in pandas 1.0
        for entry in allInconsistent[cols].values:
            print("{:<40} | {:<40} | {:>5}".format(*entry))
        print()

        ds['results']['inconsistency'] = ds['results'].get('inconsistency', {})
        ds['results']['inconsistency']['existence'] = ds['meta']['results']['inconsistency']['existence']
        saveResults(ds)
        printMemory()

inconsistencyExistence()

### Configuration inconsistency

* Analyzes inconsistency in the configuration of the hsts header

In [None]:
def inconsistenceConfiguration():
    """Find domains whose hosts send differing sts header configurations.

    For every data set each parsed chunk is restricted to the rows that send
    an sts header. A domain (``server_name``) counts as inconsistent when its
    rows do not all share the same combination of max-age, includeSubDomains
    and preload. All rows of such domains are collected; their number is
    stored per chunk and per data set in the meta data, the per data set
    summary is copied to the results, the rows are pickled and printed as a
    table.

    The comparison is done chunk by chunk, so rows of one domain that are
    spread over two chunks are not compared with each other.
    """
    onStepStart('configuration inconsistency')

    configCols = ['http-header--sts-max-age', 'http-header--sts-includeSubDomains',
                  'http-header--sts-preload']
    tableCols = ['server_name', 'host'] + configCols

    for ds in dataSets:
        logger.info("Processing data in %s", ds['id'])
        metaResults = ds['meta'].setdefault('results', {})
        chunkResults = metaResults.setdefault('chunks', [])

        allInconsistent = pd.DataFrame()
        for nr, _ in enumerate(ds['meta']['chunks']):
            logger.info("Processing chunk %d @ %s", nr, ds['id'])
            printMemory()

            df = loadChunk(ds, nr, status='parsed')
            printMemory()

            # Drop entries without sts header
            df = df[df['http-header--sts']]

            # Keep one row per distinct (domain, configuration) combination
            distinctConfigs = df.drop_duplicates(subset=['server_name'] + configCols, keep='first')

            # A domain that still occurs more than once has at least two
            # different configurations
            inconsistentNames = distinctConfigs.loc[
                distinctConfigs.duplicated(subset='server_name', keep=False), 'server_name']

            # Select all entries of the inconsistent domains
            inconsistent = df[df['server_name'].isin(inconsistentNames)]

            del distinctConfigs
            del inconsistentNames
            del df
            printMemory()

            # Add length to meta results (chunks are visited in order, so a
            # missing entry is always the next one to append)
            if len(chunkResults) <= nr:
                chunkResults.append({})
            resultsChunk = chunkResults[nr]
            resultsChunk.setdefault('inconsistency', {})['configuration'] = len(inconsistent.index)

            logger.info("Found %d inconsistent entries", resultsChunk['inconsistency']['configuration'])

            # Merge onto all inconsistent for data set. pd.concat is used
            # because DataFrame.append no longer exists in current pandas.
            if allInconsistent.empty:
                allInconsistent = inconsistent
            else:
                allInconsistent = pd.concat([allInconsistent, inconsistent])

            # Free memory
            del inconsistent
            gc.collect()

            saveMeta(ds)
            printMemory()

        # Without any chunk the frame has no columns at all; treat that as
        # "nothing found" instead of failing on the missing column.
        hasNames = 'server_name' in allInconsistent.columns
        if hasNames:
            domainCount = len(allInconsistent.drop_duplicates(subset=['server_name'], keep='first').index)
        else:
            domainCount = 0

        summary = {
            'lines': len(allInconsistent.index),
            'domains': domainCount
        }
        metaResults.setdefault('inconsistency', {})['configuration'] = summary
        print("Found %d inconsistent entries (of %d domains) in data set" %
              (summary['lines'], summary['domains']))
        saveMeta(ds)

        # Save inconsistent data
        allInconsistent.to_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_CONFIGURATION + EXT_PICKLE)

        print()
        print("server_name                              | host                                     | max-age    | inclSubs | preload")
        print(("-" * 41) + "|" + ("-" * 42) + "|" + ("-" * 12) + "|" + ("-" * 10) + "|" + ("-" * 8))
        if hasNames:
            # df[cols].values replaces the removed DataFrame.as_matrix(cols)
            for entry in allInconsistent[tableCols].values:
                print("{:<40} | {:<40} | {:>10} | {:>8} | {:>7}".format(*entry))
        print()

        ds['results'].setdefault('inconsistency', {})['configuration'] = summary
        saveResults(ds)
        printMemory()

inconsistenceConfiguration()

In [None]:
def inconsistencyMerge():
    """Combine the existence and configuration inconsistencies per data set.

    Reads the two pickled inconsistency frames of every data set, stacks
    them, keeps one row per (host, server_name) pair and stores the number of
    remaining rows and of distinct domains as ``inconsistency.total`` in both
    the meta data and the results.
    """
    for ds in dataSets:
        inconsExist = pd.read_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_EXISTENCE + EXT_PICKLE)
        inconsConfig = pd.read_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_CONFIGURATION + EXT_PICKLE)

        # pd.concat instead of DataFrame.append, which no longer exists in
        # current pandas
        inconsistent = pd.concat([inconsExist, inconsConfig], ignore_index=True)
        # A row that is inconsistent in both respects must only count once
        inconsistent = inconsistent.drop_duplicates(subset=['host', 'server_name'], keep='first')

        total = {
            'lines': len(inconsistent.index),
            'domains': len(inconsistent.drop_duplicates(subset=['server_name'], keep='first').index)
        }

        metaResults = ds['meta'].setdefault('results', {})
        metaResults.setdefault('inconsistency', {})['total'] = total
        saveMeta(ds)

        ds['results'].setdefault('inconsistency', {})['total'] = total
        saveResults(ds)

inconsistencyMerge()

## IPv4 - IPv6 Inconsistency

* Looks for inconsistencies between IPv4 and IPv6 scans

In [None]:
def prepareChunkIPv4v6(df):
    """Prepares the given chunk for the IPv4/IPv6 inconsistency analysis

    The chunk is condensed to one row per ``server_name`` holding the set of
    all sts configurations seen for that domain. A configuration is the tuple
    (sts, max-age, includeSubDomains, preload).

    Note that ``df`` is modified in place (missing max-age values are
    replaced and an ``sts_config`` column is added) to avoid copying the
    whole chunk.

    :param df: parsed chunk with the columns ``server_name``,
        ``http-header--sts``, ``http-header--sts-max-age``,
        ``http-header--sts-includeSubDomains`` and ``http-header--sts-preload``
    :return: DataFrame with the columns ``server_name`` and ``sts_config``
        (a set of tuples), sorted by ``server_name``
    """

    # Required for comparison of sets: NaN never equals NaN, so two otherwise
    # identical configurations without max-age would be reported as different
    df.fillna(value={'http-header--sts-max-age': -1}, inplace=True)

    # Build sts configuration (tuple)
    df['sts_config'] = list(zip(df['http-header--sts'], df['http-header--sts-max-age'],
                                df['http-header--sts-includeSubDomains'], df['http-header--sts-preload']))
    logger.debug("Built sts configuration")
    printMemory()

    # Drop not required columns
    dfConfig = df[['server_name', 'sts_config']]
    del df
    gc.collect()
    logger.debug("Dropped not required columns")
    printMemory()

    # Split: only domains that occur more than once need the (slow) grouping
    needsGrouping = dfConfig.duplicated('server_name', keep=False)
    dfGroup = dfConfig[needsGrouping]
    dfNoGroup = dfConfig[~needsGrouping]
    del dfConfig
    del needsGrouping
    gc.collect()
    logger.debug("Split into those that need grouping (%d) and those that don't (%d)",
                 len(dfGroup), len(dfNoGroup))
    printMemory()

    # Group by server_name
    dfGroup = dfGroup.groupby(by='server_name', sort=False).aggregate(lambda x: set(x))
    dfGroup = dfGroup.reset_index()
    logger.debug("Grouped by domain")
    printMemory()

    # Make non-grouped set. assign() returns a new frame, so no column is
    # written into a slice of another frame.
    dfNoGroup = dfNoGroup.assign(sts_config=dfNoGroup['sts_config'].apply(lambda x: {x}))
    logger.debug("Built sets")
    printMemory()

    # Merge grouped and not grouped (pd.concat instead of DataFrame.append,
    # which no longer exists in current pandas)
    df = pd.concat([dfNoGroup, dfGroup])
    df.sort_values('server_name', inplace=True)
    logger.debug("Merged grouped and not grouped")
    printMemory()

    # Free memory
    del dfNoGroup
    del dfGroup
    gc.collect()
    printMemory()

    return df

In [None]:
def inconsistencyIPv4v6():
    """Find domains whose sts configuration differs between the data sets.

    The parsed chunks of all data sets are walked in parallel. In every round
    the smallest last ``server_name`` of the currently loaded (prepared and
    therefore sorted) chunks is the split point: from each data set all rows
    up to and including that name are cut off, outer-joined on
    ``server_name`` and compared. A domain is inconsistent when it has a
    configuration set in at least two data sets and two of those sets differ.
    The rest of each chunk is kept for the next round. The walk stops as soon
    as fewer than two data sets have data left.

    The inconsistent domains are pickled and their number is written to the
    results of ``dsMerge``; ``-1`` is stored when none were found.

    NOTE(review): this assumes the chunks of a data set follow each other in
    ascending ``server_name`` order - confirm against the chunking step.
    """
    onStepStart('ipv4/ipv6 inconsistency')
    # Iterate over all relevant combinations of chunks
    dsToCompare = []
    for ds in dataSets:
        ds['incons'] = {
            'chunk': 0,
            'df': pd.DataFrame()
        }
        dsToCompare.append(ds)

    # Compare configurations
    inconsistent = pd.DataFrame()
    # At least 2 entries required
    while len(dsToCompare) > 1:
        logger.info("Comparing: " + str([(ds['id_normal'], ds['incons']['chunk']) for ds in dsToCompare]))
        printMemory()

        # Load and prepare chunks
        for ds in dsToCompare:
            if ds['incons']['df'].empty:
                # Load next chunk
                df = loadChunk(ds, ds['incons']['chunk'], status='parsed')
                logger.debug("Loaded chunk %d from %s (%d lines)", ds['incons']['chunk'], ds['id'], len(df.index))
                printMemory()

                ds['incons']['df'] = prepareChunkIPv4v6(df)
                # Release the raw chunk, only the prepared frame is needed
                del df
        logger.info("Loaded and prepared chunks")
        printMemory()

        # Select first chunk end
        splitName = min(ds['incons']['df'].iloc[-1]['server_name']
                        for ds in dsToCompare if not ds['incons']['df'].empty)

        # Select merge part from chunks up to splitName
        mergeParts = []
        for ds in dsToCompare:
            df = ds['incons']['df']
            # side='right' yields the position behind the last name that is
            # <= splitName, whether or not splitName itself occurs in this
            # data set. (The default side plus one would drag along the next
            # larger name whenever splitName is missing.)
            splitIndex = int(np.searchsorted(df['server_name'].values, splitName, side='right'))
            # Append data up to splitIndex to mergeParts
            mergeParts.append((ds['id_normal'], df[:splitIndex]))
            # Remove merged part from chunk
            ds['incons']['df'] = df[splitIndex:]
        printMemory()

        # Merge parts from all chunks. Every part is joined, even an empty
        # one, so that each data set contributes its configuration column.
        merged = None
        for dsIdNormal, part in mergeParts:
            logger.debug("Merging %s to %d entries", dsIdNormal, 0 if merged is None else len(merged.index))
            part = part.rename(columns={'sts_config': 'sts_config_' + dsIdNormal})
            if merged is None:
                merged = part
            else:
                merged = merged.merge(part, on='server_name', how='outer',
                                      suffixes=(None, '_' + dsIdNormal), copy=False)
        logger.debug("Merged chunks (%d entries)", len(merged.index))
        printMemory()

        del mergeParts
        gc.collect()
        printMemory()

        configCols = ['sts_config_' + ds['id_normal'] for ds in dsToCompare]

        # Drop entries that only appear in a single data set
        merged.dropna(axis=0, thresh=2, subset=configCols, inplace=True)
        logger.debug("Dropped entries that only appear in a single data set (%d entries remaining)", len(merged.index))
        printMemory()

        # Filter inconsistent configurations
        inconsistencyQuery = []
        for i in range(len(configCols)):
            for j in range(i + 1, len(configCols)):
                pairCheck = [
                    configCols[i] + " == " + configCols[i], # i is not nan
                    configCols[j] + " == " + configCols[j], # j is not nan
                    configCols[i] + " != " + configCols[j], # if neither is nan then they have to be unequal to be inconsistent
                ]
                inconsistencyQuery.append("(" + " and ".join(pairCheck) + ")")
        inconsistencyQuery = " or ".join(inconsistencyQuery)
        logger.debug("Inonsistency query: %s", inconsistencyQuery)
        mergedIncons = merged.query(inconsistencyQuery)
        del merged
        gc.collect()
        logger.debug("Filtered inconsistent configurations (%d entries remaining)", len(mergedIncons.index))
        printMemory()

        # Append inconsistent (pd.concat instead of DataFrame.append, which
        # no longer exists in current pandas)
        inconsistent = pd.concat([mergedIncons, inconsistent], ignore_index=True)
        del mergedIncons
        logger.debug("Appended inconsistent domains")
        printMemory()

        # Determine next chunks
        dsToRemove = []
        for dsNr, ds in enumerate(dsToCompare):
            if ds['incons']['df'].empty:
                # End of chunk was reached
                if ds['incons']['chunk'] >= len(ds['meta']['chunks']) - 1:
                    # Current chunk is last chunk, remove from data sets to compare
                    del ds['incons']
                    dsToRemove.append(dsNr)
                else:
                    # Select next chunk
                    ds['incons']['chunk'] += 1

        # Remove data sets marked for removal (highest index first so the
        # remaining indices stay valid)
        dsToRemove.sort(reverse=True)
        for nr in dsToRemove:
            del dsToCompare[nr]

        # Free memory
        del dsToRemove
        gc.collect()
        printMemory()

    # Delete temporary data
    for ds in dataSets:
        if 'incons' in ds:
            del ds['incons']
    gc.collect()
    printMemory()

    if not inconsistent.empty:
        # Analyze inconsistencies
        inconsistent.drop_duplicates(subset=['server_name'], keep='first', inplace=True)
        dsMerge['results']['v4v6inconsistent-domains'] = len(inconsistent.index)
        inconsistent.to_pickle(buildFileName(dsMerge) + EXT_INCONSISTENT + EXT_INCONS_V4V6 + EXT_PICKLE)
    else:
        # NOTE(review): -1 rather than 0 is stored when nothing was found;
        # kept as is because consumers of the results file may rely on it.
        dsMerge['results']['v4v6inconsistent-domains'] = -1
    saveResults(dsMerge)

    # Free memory
    del inconsistent
    gc.collect()
    printMemory()

    logger.info("Found %d inconsistent domains", dsMerge['results']['v4v6inconsistent-domains'])

inconsistencyIPv4v6()

## Mark inconsistent

* Marks any lines that are inconsistent in the main data

In [None]:
def markInconsistent():
    """Flag the rows of all parsed chunks that belong to inconsistent domains.

    For every data set the pickled existence and configuration inconsistency
    frames are read. Each parsed chunk then gets the boolean columns
    ``inconsistent-existence`` and ``inconsistent-configuration`` telling
    whether the row's ``server_name`` occurs in the respective frame, and is
    written back with status ``marked``.
    """
    onStepStart('mark inconsistent')
    for ds in dataSets:
        logger.info("Processing data in %s", ds['id'])

        # Load list of all inconsistent server_names
        existenceFrame = pd.read_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_EXISTENCE + EXT_PICKLE)
        configFrame = pd.read_pickle(buildFileName(ds) + EXT_INCONSISTENT + EXT_INCONS_CONFIGURATION + EXT_PICKLE)
        printMemory()

        # Only one entry per server_name required
        existenceFrame = existenceFrame.drop_duplicates(subset=['server_name'], keep='first')
        configFrame = configFrame.drop_duplicates(subset=['server_name'], keep='first')
        printMemory()

        # Marker column -> names of the domains it flags
        markers = (
            ('inconsistent-existence', existenceFrame['server_name']),
            ('inconsistent-configuration', configFrame['server_name']),
        )

        for nr, _ in enumerate(ds['meta']['chunks']):
            logger.info("Processing chunk %d @ %s", nr, ds['id'])

            chunkData = loadChunk(ds, nr, status='parsed')
            logger.debug("Loaded chunk (%d lines)", len(chunkData.index))
            printMemory()

            for column, names in markers:
                chunkData[column] = chunkData['server_name'].isin(names)

            # Save chunk
            chunkData.to_pickle(buildFileName(ds, chunk=nr, status='marked'))

            # Free memory
            del chunkData
            gc.collect()
            printMemory()

        # Free memory
        del markers
        del existenceFrame
        del configFrame
        gc.collect()
        printMemory()

markInconsistent()

## Reduce

* Reduce the data per data set such that only one entry per domain exists.

In [None]:
def reduce():
    """Condense every marked chunk to one row per consistent domain.

    Rows flagged as inconsistent (existence or configuration) are removed,
    the remaining rows are counted per ``server_name`` and only the first row
    of each domain is kept, extended by that ``count``. The reduced chunk is
    pickled and its hash and line count are recorded in the chunk meta data.
    A reduced chunk file that is still valid according to the recorded hash
    is reused. The number of reduced lines of a data set is stored as its
    ``domains`` result.
    """
    onStepStart('reduce')
    for ds in dataSets:
        logger.info("Processing data in %s", ds['id'])

        totalReduced = 0
        for nr, chunk in enumerate(ds['meta']['chunks']):
            logger.info("Processing chunk %d @ %s", nr, ds['id'])
            printMemory()
            targetFile = buildFileName(ds, chunk=nr, status='reduced')

            # Skip the work when the chunk has been reduced before
            knownHash = chunk.get('reduced', {}).get('hash', None)
            if isValidFile(targetFile, knownHash):
                logger.info("Found valid chunk")
                totalReduced += chunk['reduced']['lines']
                continue

            data = loadChunk(ds, nr, status='marked')
            logger.debug("Loaded chunk (%d lines)", len(data.index))
            printMemory()

            # Remove inconsistent
            flagged = data['inconsistent-existence'] | data['inconsistent-configuration']
            data = data[~flagged]
            logger.debug("Dropped inconsistent entries")
            printMemory()

            # Count entries per domain
            perDomain = data.groupby(by=['server_name'], sort=False).size().reset_index(name='count')

            # Drop duplicate entries
            data = data.drop_duplicates(subset=['server_name'], keep='first')
            logger.info("Reduced chunk (%d lines remaining)", len(data.index))
            printMemory()

            # Add counts for entries
            data = data.merge(perDomain, on='server_name', how='left')
            del perDomain

            # Save reduced chunk
            data.to_pickle(targetFile)
            chunk['reduced'] = {
                'hash': sha512sum(targetFile),
                'lines': len(data.index)
            }
            logger.debug("Saved reduced chunk")
            printMemory()

            totalReduced += chunk['reduced']['lines']
            saveMeta(ds)

            # Free memory
            del data
            gc.collect()
            printMemory()

        ds['results']['domains'] = totalReduced
        saveResults(ds)

reduce()

## Merge

Merges all data sets and makes sure that in the merged set only one entry per domain exists.

* Selects current chunks out of data sets
* Compares chunk ends to figure out which chunk ends earliest
* Splits other chunks at that point and merges up to that point
* Re-use other part of chunks for next round

In [None]:
def _saveMergedChunk(merged):
    """Store the first MERGE_CHUNK_SIZE rows of merged as next dsMerge chunk.

    The chunk is pickled, its hash and line count are appended to the meta
    data of ``dsMerge`` and the meta data is saved.

    :param merged: DataFrame with the rows that still have to be written
    :return: the rows of ``merged`` that were not written
    """
    chunkNr = len(dsMerge['meta']['chunks'])
    chunkFile = buildFileName(dsMerge, chunk=chunkNr, status='reduced')

    mergedChunk = merged[:MERGE_CHUNK_SIZE]
    mergedChunk.to_pickle(chunkFile)
    remaining = merged[MERGE_CHUNK_SIZE:]

    chunkMeta = {
        'hash': sha512sum(chunkFile),
        'lines': len(mergedChunk.index)
    }
    dsMerge['meta']['totalLines'] += chunkMeta['lines']
    dsMerge['meta']['chunks'].append(chunkMeta)
    saveMeta(dsMerge)

    logger.debug("Saved merged chunk %d (%d lines remaining for next chunk)", chunkNr, len(remaining.index))
    printMemory()
    return remaining


def merge():
    """Merge the reduced chunks of all data sets into the chunks of dsMerge.

    The reduced chunks of the data sets are walked in parallel. In every
    round the smallest last ``server_name`` of the currently loaded chunks is
    the split point: from each data set all rows up to and including that
    name are cut off and stacked, one row per ``server_name`` is kept and the
    per data set ``count`` is attached as ``count_<id_normal>``. The rest of
    each chunk is kept for the next round. The merged rows are written in
    chunks of ``MERGE_CHUNK_SIZE`` lines; the total line count is stored as
    ``totalLines`` and ``domains`` in the results of ``dsMerge``.

    When a domain occurs in several data sets, the row of the data set that
    comes last in ``dataSets`` is the one that is kept.

    NOTE(review): this assumes the reduced chunks of a data set are sorted by
    ``server_name`` and follow each other in ascending order - confirm
    against the chunking step.
    """
    onStepStart('merge')

    if dsMerge['meta'].get('chunks'):
        # Delete old data. The file name prefix is escaped so that its dots
        # only match literal dots.
        oldChunk = re.compile('^' + re.escape(HOSTS_FILE + EXT_PREPARED + EXT_CHUNK) + r'\d+.*$')
        for f in os.listdir(dsMerge['dir']):
            if oldChunk.search(f):
                os.remove(os.path.join(dsMerge['dir'], f))
    dsMerge['meta']['chunks'] = []
    dsMerge['meta']['totalLines'] = 0

    # Merge data sets
    dsToMerge = []
    for ds in dataSets:
        ds['merge'] = {
            'chunk': 0,
            'df': pd.DataFrame()
        }
        dsToMerge.append(ds)

    merged = pd.DataFrame()
    while dsToMerge:
        logger.info("Merging chunks: " + str([(ds['id'], ds['merge']['chunk']) for ds in dsToMerge]))
        printMemory()

        # Load chunks
        for ds in dsToMerge:
            if ds['merge']['df'].empty:
                ds['merge']['df'] = loadChunk(ds, ds['merge']['chunk'], status='reduced')
        printMemory()

        # Select first chunk end
        splitName = min(ds['merge']['df'].iloc[-1]['server_name']
                        for ds in dsToMerge if not ds['merge']['df'].empty)

        # Merge chunk parts up to splitName
        mergeParts = []
        for ds in dsToMerge:
            df = ds['merge']['df']
            # side='right' yields the position behind the last name that is
            # <= splitName, whether or not splitName itself occurs in this
            # data set. (The default side plus one would drag along the next
            # larger name whenever splitName is missing and thereby produce
            # duplicate domains across rounds.)
            splitIndex = int(np.searchsorted(df['server_name'].values, splitName, side='right'))
            # Append data up to splitIndex to mergeParts
            mergeParts.append((ds['id_normal'], df[:splitIndex]))
            # Remove merged part from chunk
            ds['merge']['df'] = df[splitIndex:]
        del df
        printMemory()

        # Append all parts together. Later data sets come first, which makes
        # their row win in drop_duplicates below. pd.concat is used because
        # DataFrame.append no longer exists in current pandas.
        mergedPart = pd.concat([part for _, part in reversed(mergeParts)], ignore_index=True)
        mergedPart.drop(columns=['count'], inplace=True)
        logger.debug("Merged chunks up to '%s' (%d lines)", splitName, len(mergedPart.index))
        printMemory()

        # Merge to one entry per server_name
        mergedPart.drop_duplicates(subset=['server_name'], keep='first', inplace=True)
        logger.debug("Dropped duplicates in merged part (%d lines remaining)", len(mergedPart.index))
        printMemory()

        # Add count columns
        for dsName, part in mergeParts:
            counts = part[['server_name', 'count']].rename(columns={'count': 'count_' + dsName})
            mergedPart = mergedPart.merge(counts, on='server_name', how='left')
            del counts

        # Free memory
        del mergeParts
        gc.collect()
        printMemory()

        # Append merged part to merge chunk
        if merged.empty:
            merged = mergedPart
        else:
            merged = pd.concat([merged, mergedPart], ignore_index=True)
        logger.debug("Merged chunk now contains %d lines", len(merged.index))

        # Free memory
        del mergedPart
        gc.collect()
        printMemory()

        # Save all complete chunks
        while len(merged.index) >= MERGE_CHUNK_SIZE:
            merged = _saveMergedChunk(merged)

        # Select next chunk in data set with chunk end
        dsToDelete = []
        for dsNr, ds in enumerate(dsToMerge):
            if ds['merge']['df'].empty:
                # End of current chunk was reached
                if ds['merge']['chunk'] >= len(ds['meta']['chunks']) - 1:
                    # Current chunk is last chunk, remove from data sets to merge
                    del ds['merge']
                    dsToDelete.append(dsNr)
                else:
                    # Select next chunk
                    ds['merge']['chunk'] += 1

        # Delete ds marked for deletion (highest index first so the
        # remaining indices stay valid)
        dsToDelete.sort(reverse=True)
        for dsNr in dsToDelete:
            del dsToMerge[dsNr]
        del dsToDelete

        # Free memory
        gc.collect()
        printMemory()

    # Save the remaining lines as last chunk
    while len(merged.index) > 0:
        merged = _saveMergedChunk(merged)

    saveMeta(dsMerge)

    dsMerge['results']['totalLines'] = dsMerge['meta']['totalLines']
    dsMerge['results']['domains'] = dsMerge['meta']['totalLines']
    saveResults(dsMerge)

    # Free memory
    del merged
    gc.collect()
    printMemory()

merge()

## Analyze headers

* Analyzes the usage of the header and its configuration values

In [None]:
def analyzeHeaders():
    """Count sts header usage and its configuration values per data set.

    For every data set (including the merged one) each reduced chunk is
    loaded and the number of rows with a parse error, with an sts header,
    with includeSubDomains and with preload is stored in the chunk's meta
    results. The per chunk numbers are summed up and copied to the results.
    The max-age values of all rows with an sts header are counted; the full
    frequency list goes to the meta data, while the results get the counts
    aggregated into the buckets of ``maxAgeAggregation`` (a value falls into
    the bucket with the smallest limit that is not below it).
    """
    onStepStart('analyze headers')
    summedKeys = ['parse-error', 'sts', 'sts-includeSubDomains', 'sts-preload']
    # Result key, column to count and log message of the boolean directives
    directives = (
        ('sts-includeSubDomains', 'http-header--sts-includeSubDomains', "%d include sub domains"),
        ('sts-preload', 'http-header--sts-preload', "%d use preload"),
    )

    # Analyze parsed headers
    for ds in dataSets + [dsMerge]:
        metaResults = ds['meta'].setdefault('results', {})
        perChunk = metaResults.setdefault('chunks', [])

        maxAgeCounter = collections.Counter()

        logger.info("Analyzing headers in %s", ds['id'])
        printMemory()

        # Analysis per chunk
        for nr, _ in enumerate(ds['meta']['chunks']):
            logger.info("Processing chunk %d @ %s", nr, ds['id'])
            printMemory()

            # Chunks are visited in order, so a missing entry is always the
            # next one to append
            if len(perChunk) <= nr:
                perChunk.append({})
            results = perChunk[nr]

            # Load reduced chunk
            data = loadChunk(ds, nr, 'reduced')
            printMemory()

            # Analyze parse error rate
            withParseError = data[data['http-header-parse-error']]
            results['parse-error'] = len(withParseError.index)
            del withParseError
            logger.info("%d have a parse error", results['parse-error'])

            # General sts usage
            data = data[data['http-header--sts']]
            results['sts'] = len(data.index)
            logger.info("%d use sts", results['sts'])

            # max-age
            maxAgeCounter.update(data['http-header--sts-max-age'].tolist())

            # includeSubDomains and preload
            for resultKey, column, message in directives:
                results[resultKey] = int(data[data[column]][column].count())
                logger.info(message, results[resultKey])

            # Free memory
            del data
            gc.collect()
            printMemory()

            saveMeta(ds)

        # Sum results
        for key in summedKeys:
            metaResults[key] = sum(chunkResults.get(key, 0) for chunkResults in perChunk)

        metaResults['sts-max-age'] = [{'occurences': count, 'value': value}
                                      for value, count in maxAgeCounter.most_common()]
        saveMeta(ds)
        printMemory()

        # Copy end results
        for key in summedKeys:
            ds['results'][key] = metaResults[key]

        # Aggregate max age values
        aggregated = {aggKey: 0 for aggKey in maxAgeAggregation}
        ds['results']['sts-max-age'] = aggregated
        buckets = sorted(maxAgeAggregation.items(), key=lambda item: item[1])
        for value, count in maxAgeCounter.most_common():
            # First bucket (in ascending order of limits) that can hold the value
            bucket = next((name for name, limit in buckets if value <= limit), None)
            if bucket is not None:
                aggregated[bucket] += count
        saveResults(ds)
        printMemory()

analyzeHeaders()

In [None]:
def printResults():
    """Log the headline numbers of every data set and of the merged set.

    Logs the number of scanned domains, of domains using sts and of domains
    with a parse error (each with its share of all scanned domains), followed
    by the includeSubDomains and preload numbers relative to both the sts
    domains and all domains. Shares are left out when their denominator is
    zero; result values that have not been computed are treated as zero.
    """
    for ds in dataSets + [dsMerge]:
        results = ds.get('results', {})
        logger.info("Results for %s", ds['id'])

        domains = results.get('domains', 0)
        logger.info("%d scanned domains", domains)
        if domains == 0:
            # No share can be computed without scanned domains
            continue

        sts = results.get('sts', 0)
        logger.info("%d (%.4f%% of all scanned domains) use sts", sts, sts / domains * 100.0)

        # Parse errors relate to all scanned domains, so they are reported
        # even when no domain uses sts
        parseErrors = results.get('parse-error', 0)
        logger.info("%d (%.4f%% of all scanned domains) had a parse error",
                    parseErrors, parseErrors / domains * 100.0)

        if sts == 0:
            # The remaining shares are relative to the sts domains
            continue

        includeSubDomains = results.get('sts-includeSubDomains', 0)
        logger.info("%d (%.4f%% of domains with a valid sts header) include the subdomains (%.4f%% of all domains)",
                    includeSubDomains,
                    includeSubDomains / sts * 100.0,
                    includeSubDomains / domains * 100.0)

        preload = results.get('sts-preload', 0)
        logger.info("%d (%.4f%% of hosts with a valid sts header) preload the domain (%.4f%% of all domains)",
                    preload,
                    preload / sts * 100.0,
                    preload / domains * 100.0)

printResults()

### Generate HSTS Domain list

In [None]:
def genHstsDomainList():
    """Write the list of consistent domains that use sts for every data set.

    For every data set (including the merged one) the reduced chunks are
    loaded, rows flagged as inconsistent and rows without sts header are
    dropped and the remaining ``server_name`` values are written, sorted and
    without duplicates, one per line and without header line, to
    ``<data set file name>.consistent.hsts-domains.csv``.
    """
    onStepStart('hsts domain list')
    for ds in dataSets + [dsMerge]:
        logger.info("Processing %s", ds['id'])
        nameParts = []
        for nr, _ in enumerate(ds['meta']['chunks']):
            logger.info("Loading chunk %d", nr)
            df = loadChunk(ds, nr, status='reduced')

            # Only consistent domains that send the sts header
            isConsistent = ~df['inconsistent-existence'] & ~df['inconsistent-configuration']
            # Only the name is written, so the other columns are not kept
            nameParts.append(df.loc[isConsistent & df['http-header--sts'], 'server_name'])

            del isConsistent
            del df

        # pd.concat instead of DataFrame.append, which no longer exists in
        # current pandas. Without any chunk an empty list is written.
        if nameParts:
            names = pd.concat(nameParts, ignore_index=True)
        else:
            names = pd.Series([], dtype=object, name='server_name')
        del nameParts

        # Just to be sure
        names = names.drop_duplicates(keep='first').sort_values()

        # Save domain list. header=False is stated explicitly because the
        # default of Series.to_csv differs between pandas versions.
        names.to_csv(buildFileName(ds) + '.consistent.hsts-domains.csv', index=False, header=False)

        logger.info("Saved consistent hsts domains")

        # Free memory
        del names
        gc.collect()

#genHstsDomainList()