diff --git a/.gitignore b/.gitignore
index c170c8c..c54ac46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ __pycache__/
 # Distribution / packaging
 .Python
 env/
+.venv
 build/
 develop-eggs/
 dist/
@@ -57,3 +58,6 @@ target/
 
 # Vim stuff
 .ropeproject/
+
+# PyCharm settings
+.idea
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 66e74a0..915efc8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,9 +6,11 @@ matrix:
 python:
   - "2.7"
   - "3.6"
+  - "3.7"
+  - "3.8"
 install:
-  - "pip install ."
+  - "pip install tox"
   - "pip install -r requirements_dev.txt"
-script: py.test --cov haralyzer tests/
+script: py.test --cov haralyzer tests/ -vv
 after_success:
   - coveralls
diff --git a/README.rst b/README.rst
index 5561da7..b5fc94d 100644
--- a/README.rst
+++ b/README.rst
@@ -19,9 +19,11 @@ A Python Framework For Using HAR Files To Analyze Web Pages.
 Overview
 --------
 
-The haralyzer module contains two classes for analyzing web pages based
+The haralyzer module contains three classes for analyzing web pages based
 on a HAR file. ``HarParser()`` represents a full file (which might have
-multiple pages), and ``HarPage()`` represents a single page from said file.
+multiple pages). ``HarPage()`` represents a single page from said file.
+``HarEntry()`` represents an entry in a ``HarPage()``, and there are multiple entries per page.
+Each ``HarEntry`` has a request and response that contain items such as the headers, status code, timings, etc.
 
 ``HarParser`` has a couple of helpful methods for analyzing single entries
 from a HAR file, but most of the pertinent functions are inside of the page
@@ -119,6 +121,102 @@ to a page, an additional page will be created with an ID of `unknown`. This
 not have attributes for things like time to first byte or page load, and
 will return `None`.
 
+HarEntry
+++++++++
+
+The ``HarEntry()`` object contains useful information for each request. The main purpose is ease of use, as it exposes the entry data through a large set of attributes.
+Each entry also contains a ``Request()`` and ``Response()``, which are styled after the ``requests`` library::
+
+    import json
+    from haralyzer import HarPage
+
+    with open("humanssuck.net.har", 'r') as f:
+        har_page = HarPage('page_3', har_data=json.loads(f.read()))
+
+    ### GET BASIC INFO
+    print(har_page.hostname)
+    # 'humanssuck.net'
+    print(har_page.url)
+    # 'http://humanssuck.net/'
+
+    ### GET LIST OF ENTRIES
+    print(har_page.entries)
+    # [HarEntry for http://humanssuck.net/, HarEntry for http://humanssuck.net/test.css, ...]
+
+    ### WORKING WITH ENTRIES
+    single_entry = har_page.entries[0]
+
+    ### REQUEST HEADERS
+    print(single_entry.request.headers)
+    # [{'name': 'Host', 'value': 'humanssuck.net'}, {'name': 'User-Agent', 'value': 'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'}, ...]
+
+    ### RESPONSE HEADERS
+    print(single_entry.response.headers)
+    # [{'name': 'Server', 'value': 'nginx'}, {'name': 'Date', 'value': 'Mon, 23 Feb 2015 03:28:12 GMT'}, ...]
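+
+    ### GET A SINGLE RESPONSE HEADER (example; get_header_value() lookups are case-insensitive, and 'nginx' matches the sample output above)
+    print(single_entry.response.get_header_value("server"))
+    # 'nginx'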
+
+    ### RESPONSE CODE
+    print(single_entry.response.status)
+    # 200
+
+    # GET THE VALUE OF A REQUEST OR RESPONSE HEADER
+    print(single_entry.request.get_header_value("accept"))
+    # text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
+
+    # ALL ATTRIBUTES OF AN ENTRY
+
+    single_entry.cache -> Dictionary of cached content
+    single_entry.cookies -> List of combined cookies for request and response
+    single_entry.headers -> List of combined headers for request and response
+    single_entry.pageref -> String of the pageref
+    single_entry.port -> Integer of the port number for the server
+    single_entry.request -> Request object
+    single_entry.response -> Response object
+    single_entry.secure -> Bool if secure is set
+    single_entry.serverAddress -> String of the server IP
+    single_entry.startTime -> Datetime of the start time
+    single_entry.time -> Float of the total time for the entry in milliseconds
+    single_entry.timings -> Dictionary of the timings for a request
+    single_entry.url -> String of the request url
+
+    # ALL ATTRIBUTES OF A REQUEST
+
+    single_entry.request.accept -> String of the ``Accept`` header
+    single_entry.request.bodySize -> Integer of the body size for the request
+    single_entry.request.cacheControl -> String of the ``Cache-Control`` header
+    single_entry.request.cookies -> List of cookies
+    single_entry.request.encoding -> String of the ``Accept-Encoding`` header
+    single_entry.request.headers -> List of headers
+    single_entry.request.headersSize -> Integer of the size of the headers
+    single_entry.request.host -> String of the ``Host`` header
+    single_entry.request.httpVersion -> String of the http version used
+    single_entry.request.language -> String of the ``Accept-Language`` header
+    single_entry.request.method -> String of the HTTP method used
+    single_entry.request.queryString -> List of query string parameters used
+    single_entry.request.url -> String of the URL
+    single_entry.request.userAgent -> String of the User-Agent
+
+    # ALL ATTRIBUTES OF A RESPONSE
+    single_entry.response.bodySize -> Integer of the body size for the response
+    single_entry.response.cacheControl -> String of the ``Cache-Control`` header
+    single_entry.response.contentSecurityPolicy -> String of the ``Content-Security-Policy`` header
+    single_entry.response.contentSize -> Integer of the content size
+    single_entry.response.contentType -> String of the ``content-type`` header
+    single_entry.response.date -> String of the ``date`` header
+    single_entry.response.headers -> List of headers
+    single_entry.response.headersSize -> Integer of the size of the headers
+    single_entry.response.httpVersion -> String of the http version used
+    single_entry.response.lastModified -> String of the ``last-modified`` header
+    single_entry.response.mimeType -> String of the mimeType of the content
+    single_entry.response.redirectURL -> String of the redirect URL or None
+    single_entry.response.status -> Integer of the HTTP status code
+    single_entry.response.statusText -> String of the HTTP status text
+    single_entry.response.text -> String of content received
+
+    ** You are still able to access items like a dictionary.
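+    # (works on the entry as well as on .request/.response; the raw 'connection' field below is the server port as a string)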
+ print(single_entry["connection"]) + # "80" + + MultiHarParser ++++++++++++++ diff --git a/haralyzer/__init__.py b/haralyzer/__init__.py index 5f7a999..c1aef3e 100644 --- a/haralyzer/__init__.py +++ b/haralyzer/__init__.py @@ -1,7 +1,10 @@ """ Module for analyzing web pages using HAR files """ -from .assets import HarParser, HarPage +from .assets import HarParser, HarPage, HarEntry from .multihar import MultiHarParser + + +__all__ = ["HarPage", "HarParser", "MultiHarParser", "HarEntry"] diff --git a/haralyzer/assets.py b/haralyzer/assets.py index cc53b88..f85df57 100644 --- a/haralyzer/assets.py +++ b/haralyzer/assets.py @@ -2,24 +2,39 @@ Provides all of the main functional classes for analyzing HAR files """ +import functools import datetime import re -import dateutil from collections import Counter from cached_property import cached_property # I know this import is stupid, but I cannot use dateutil.parser without it from dateutil import parser -assert parser - from .compat import iteritems from .errors import PageNotFoundError +from .http import Request, Response +from .mixins import MimicDict DECIMAL_PRECISION = 0 +def convert_to_entry(func): + @functools.wraps(func) + def inner(*args, **kwargs): + # Changed to list because tuple does not support item assignment + changed_args = list(args) + # Convert the dict (first argument) to HarEntry + if isinstance(changed_args[0], dict): + changed_args[0] = HarEntry(changed_args[0]) + # For some cases have HarParser as the first type with the Entry and second + if isinstance(changed_args[0], HarParser): + changed_args[1] = HarEntry(changed_args[1]) + return func(*tuple(changed_args), **kwargs) + return inner + + class HarParser(object): """ A Basic HAR parser that also adds helpful stuff for analyzing the @@ -38,6 +53,7 @@ def __init__(self, har_data=None): ' to instantiate this class. Please RTFM.') self.har_data = har_data['log'] + @convert_to_entry def match_headers(self, entry, header_type, header, value, regex=True): """ Function to match headers. @@ -48,7 +64,7 @@ def match_headers(self, entry, header_type, header, value, regex=True): This function is case-insensitive - :param entry: entry object + :param entry: ``HarEntry`` object to analyze :param header_type: ``str`` of header type. Valid values: * 'request' @@ -60,12 +76,12 @@ def match_headers(self, entry, header_type, header, value, regex=True): :returns: a ``bool`` indicating whether a match was found """ - if header_type not in entry: + if header_type not in ["request", "response"]: raise ValueError('Invalid header_type, should be either:\n\n' '* \'request\'\n*\'response\'') # TODO - headers are empty in some HAR data.... need fallbacks here - for h in entry[header_type]['headers']: + for h in getattr(entry, header_type).headers: if h['name'].lower() == header.lower() and h['value'] is not None: if regex and re.search(value, h['value'], flags=re.IGNORECASE): return True @@ -74,56 +90,59 @@ def match_headers(self, entry, header_type, header, value, regex=True): return False @staticmethod + @convert_to_entry def match_content_type(entry, content_type, regex=True): """ Matches the content type of a request using the mimeType metadata. - :param entry: ``dict`` of a single entry from a HarPage + :param entry: ``HarEntry`` object to analyze :param content_type: ``str`` of regex to use for finding content type :param regex: ``bool`` indicating whether to use regex or exact match. 
""" - mimeType = entry['response']['content']['mimeType'] + mime_type = entry.response.mimeType - if regex and re.search(content_type, mimeType, flags=re.IGNORECASE): + if regex and re.search(content_type, mime_type, flags=re.IGNORECASE): return True - - elif content_type == mimeType: + elif content_type == mime_type: return True return False + @convert_to_entry def match_request_type(self, entry, request_type, regex=True): """ Helper function that returns entries with a request type matching the given `request_type` argument. - :param entry: entry object to analyze + :param entry: ``HarEntry`` object to analyze :param request_type: ``str`` of request type to match :param regex: ``bool`` indicating whether to use a regex or string match """ if regex: - return re.search(request_type, entry['request']['method'], + return re.search(request_type, entry.request.method, flags=re.IGNORECASE) is not None else: - return entry['request']['method'] == request_type + return entry.request.method == request_type @staticmethod + @convert_to_entry def match_http_version(entry, http_version, regex=True): """ Helper function that returns entries with a request type matching the given `request_type` argument. - :param entry: entry object to analyze - :param request_type: ``str`` of request type to match + :param entry: ``HarEntry`` object to analyze + :param http_version: ``str`` of HTTP version type to match :param regex: ``bool`` indicating whether to use a regex or string match """ - response_version = entry['response']['httpVersion'] + response_version = entry.response.httpVersion if regex: return re.search(http_version, response_version, flags=re.IGNORECASE) is not None else: return response_version == http_version + @convert_to_entry def match_status_code(self, entry, status_code, regex=True): """ Helper function that returns entries with a status code matching @@ -133,13 +152,13 @@ def match_status_code(self, entry, status_code, regex=True): :param entry: entry object to analyze :param status_code: ``str`` of status code to search for - :param request_type: ``regex`` of request type to match + :param regex: ``bool`` indicating whether to use a regex or string match """ if regex: return re.search(status_code, - str(entry['response']['status'])) is not None + str(entry.response.status)) is not None else: - return str(entry['response']['status']) == status_code + return str(entry.response.status) == status_code def create_asset_timeline(self, asset_list): """ @@ -152,8 +171,8 @@ def create_asset_timeline(self, asset_list): """ results = dict() for asset in asset_list: - time_key = dateutil.parser.parse(asset['startedDateTime']) - load_time = int(asset['time']) + time_key = asset.startTime + load_time = int(asset.time) # Add the start time and asset to the results dict if time_key in results: results[time_key].append(asset) @@ -214,7 +233,7 @@ class HarPage(object): def __init__(self, page_id, har_parser=None, har_data=None): """ :param page_id: ``str`` of the page ID - :param parser: a HarParser object + :param har_parser: a HarParser object :param har_data: ``dict`` of a file HAR file """ self.page_id = page_id @@ -290,7 +309,7 @@ def _get_asset_load(self, asset_type): value is in ms. 
""" if asset_type == 'initial': - return self.actual_page['time'] + return self.actual_page.time elif asset_type == 'content': return self.pageTimings['onContentLoad'] elif asset_type == 'page': @@ -350,7 +369,7 @@ def filter_entries( if http_version is not None and not p.match_http_version( entry, http_version, regex=regex): valid_entry = False - if load_time__gt is not None and entry.get('time') < load_time__gt: + if load_time__gt is not None and entry.time < load_time__gt: valid_entry = False if valid_entry: @@ -386,7 +405,7 @@ def get_load_time(self, request_type=None, content_type=None, if not asynchronous: time = 0 for entry in entries: - time += entry['time'] + time += entry.time return time else: return len(self.parser.create_asset_timeline(entries)) @@ -399,8 +418,8 @@ def get_total_size(self, entries): """ size = 0 for entry in entries: - if entry['response']['bodySize'] > 0: - size += entry['response']['bodySize'] + if entry.response.bodySize > 0: + size += entry.response.bodySize return size def get_total_size_trans(self, entries): @@ -413,8 +432,8 @@ def get_total_size_trans(self, entries): """ size = 0 for entry in entries: - if entry['response']['_transferSize'] > 0: - size += entry['response']['_transferSize'] + if entry.response.raw_entry['_transferSize'] > 0: + size += entry.response.raw_entry['_transferSize'] return size # BEGIN PROPERTIES # @@ -424,7 +443,7 @@ def hostname(self): """ Hostname of the initial request """ - for header in self.entries[0]['request']['headers']: + for header in self.entries[0].request.headers: if header['name'] == 'Host': return header['value'] @@ -433,8 +452,8 @@ def url(self): """ The absolute URL of the initial request. """ - if 'request' in self.entries[0] and 'url' in self.entries[0]['request']: - return self.entries[0]['request']['url'] + if 'request' in self.entries[0].raw_entry and 'url' in self.entries[0].request.raw_entry: + return self.entries[0].request.url return None @cached_property @@ -443,13 +462,12 @@ def entries(self): for entry in self.parser.har_data['entries']: if 'pageref' not in entry: if self.page_id == 'unknown': - page_entries.append(entry) + page_entries.append(HarEntry(entry)) elif entry['pageref'] == self.page_id: - page_entries.append(entry) + page_entries.append(HarEntry(entry)) # Make sure the entries are sorted chronologically - if all('startedDateTime' in x for x in page_entries): - return sorted(page_entries, - key=lambda entry: entry['startedDateTime']) + if all(x.startTime for x in page_entries): + return sorted(page_entries, key=lambda entry: entry.startTime) else: return page_entries @@ -464,14 +482,14 @@ def time_to_first_byte(self): return None ttfb = 0 for entry in self.entries: - if entry['response']['status'] == 200: - for k, v in iteritems(entry['timings']): + if entry.response.status == 200: + for k, v in iteritems(entry.timings): if k != 'receive': if v > 0: ttfb += v break else: - ttfb += entry['time'] + ttfb += entry.time return ttfb @@ -498,8 +516,7 @@ def actual_page(self): indicating that it is the actual page we care about (after redirects). 
""" for entry in self.entries: - if not (entry['response']['status'] >= 300 and - entry['response']['status'] <= 399): + if not (300 <= entry.response.status <= 399): return entry @cached_property @@ -507,9 +524,9 @@ def duplicate_url_request(self): """ Returns a dict of urls and its number of repetitions that are sent more than once """ - urls = [entry.get('request').get('url') for entry in self.entries] + urls = [entry.request.url for entry in self.entries] counted_urls = Counter(urls) - return {k:v for k,v in counted_urls.items() if v > 1} + return {k: v for k, v in counted_urls.items() if v > 1} # Convenience properties. Easy accessible through the API, but even easier # to use as properties @@ -632,3 +649,74 @@ def video_load_time(self): @cached_property def html_load_time(self): return self._get_asset_load('html') + + +class HarEntry(MimicDict): + """ + An object that represent one entry in a HAR Page + """ + def __init__(self, entry): + self.raw_entry = entry + + def __str__(self): + return "HarEntry for %s" % self.raw_entry["request"]["url"] + + def __repr__(self): + return "HarEntry for %s" % self.raw_entry["request"]["url"] + + @cached_property + def request(self): + return Request(entry=self.raw_entry["request"]) + + @cached_property + def response(self): + if isinstance(self.raw_entry, dict): + return Response(entry=self.raw_entry["response"]) + return self.raw_entry.response + + @cached_property + def startTime(self): + try: + return parser.parse(self.raw_entry.get("startedDateTime", "")) + except parser._parser.ParserError: + return None + + @cached_property + def cache(self): + return self.raw_entry["cache"] + + @cached_property + def cookies(self): + return self.raw_entry.get("cookies", []) + + @cached_property + def pageref(self): + return self.raw_entry["pageref"] + + @cached_property + def port(self): + return int(self.raw_entry["connection"]) + + @cached_property + def secure(self): + return self.raw_entry.get('_securityState', '') == 'secure' + + @cached_property + def serverAddress(self): + return self.raw_entry["serverIPAddress"] + + @cached_property + def status(self): + return self.raw_entry["response"]["status"] + + @cached_property + def time(self): + return self.raw_entry["time"] + + @cached_property + def timings(self): + return self.raw_entry["timings"] + + @cached_property + def url(self): + return self.raw_entry["request"]["url"] diff --git a/haralyzer/http.py b/haralyzer/http.py new file mode 100644 index 0000000..0913091 --- /dev/null +++ b/haralyzer/http.py @@ -0,0 +1,134 @@ +"""Creates the Request and Response sub class that are used by each entry""" +from cached_property import cached_property +from .mixins import HttpTransaction + + +class Request(HttpTransaction): + """Request object for an HarEntry""" + + def __str__(self): + return "HarEntry.Request for %s" % self.raw_entry["url"] + + def __repr__(self): + return "HarEntry.Request for %s" % self.raw_entry["url"] + + # Root Level values + + @cached_property + def bodySize(self): + return self.raw_entry["bodySize"] + + @cached_property + def cookies(self): + return self.raw_entry["cookies"] + + @cached_property + def headersSize(self): + return self.raw_entry["headersSize"] + + @cached_property + def httpVersion(self): + return self.raw_entry["httpVersion"] + + @cached_property + def method(self): + return self.raw_entry["method"] + + @cached_property + def queryString(self): + return self.raw_entry["queryString"] + + @cached_property + def url(self): + return self.raw_entry["url"] + + # Header Values 
+ + @cached_property + def accept(self): + return self.get_header_value("Accept") + + @cached_property + def cacheControl(self): + return self.get_header_value("Cache-Control") + + @cached_property + def encoding(self): + return self.get_header_value("Accept-Encoding") + + @cached_property + def host(self): + return self.get_header_value("Host") + + @cached_property + def language(self): + return self.get_header_value("Accept-Language") + + @cached_property + def userAgent(self): + return self.get_header_value("User-Agent") + + +class Response(HttpTransaction): + """Response object for a HarEntry""" + + # Root Level values + + @cached_property + def bodySize(self): + return self.raw_entry["bodySize"] + + @cached_property + def headersSize(self): + return self.raw_entry["headersSize"] + + @cached_property + def httpVersion(self): + return self.raw_entry["httpVersion"] + + @cached_property + def redirectURL(self): + if self.raw_entry["redirectURL"]: + return self.raw_entry["redirectURL"] + + @cached_property + def status(self): + return self.raw_entry["status"] + + @cached_property + def statusText(self): + return self.raw_entry["statusText"] + + # Header Values + + @cached_property + def cacheControl(self): + return self.get_header_value("cache-control") + + @cached_property + def contentSecurityPolicy(self): + return self.get_header_value("content-security-policy") + + @cached_property + def contentSize(self): + return self.raw_entry["content"]["size"] + + @cached_property + def contentType(self): + return self.get_header_value("content-type") + + @cached_property + def date(self): + return self.get_header_value("date") + + @cached_property + def lastModified(self): + return self.get_header_value("last-modified") + + @cached_property + def mimeType(self): + return self.raw_entry['content']['mimeType'] + + @cached_property + def text(self): + return self.raw_entry['content']['text'] diff --git a/haralyzer/mixins.py b/haralyzer/mixins.py new file mode 100644 index 0000000..ca14ced --- /dev/null +++ b/haralyzer/mixins.py @@ -0,0 +1,45 @@ +"""Mixin Objects that allow for shared methods""" +from cached_property import cached_property +from six.moves.collections_abc import MutableMapping + + +class GetHeaders(object): + """Mixin to get a header""" + def get_header_value(self, name): + """ + Returns the header value of the header defined in ``name`` + + :param name: ``str`` name of the header to get the value of + """ + for x in self.raw_entry["headers"]: + if x["name"].lower() == name.lower(): + return x["value"] + + +class MimicDict(MutableMapping): + """Mixin for functions to mimic a dictionary for backward compatibility""" + + def __getitem__(self, item): + return self.raw_entry[item] + + def __len__(self): + return len(self.raw_entry) + + def __delitem__(self, key): + del self.raw_entry[key] + + def __iter__(self): + return iter(self.raw_entry) + + def __setitem__(self, key, value): + self.raw_entry[key] = value + + +class HttpTransaction(GetHeaders, MimicDict): + def __init__(self, entry): + self.raw_entry = entry + + # Base class gets properties that belong to both request/response + @cached_property + def headers(self): + return self.raw_entry["headers"] diff --git a/requirements.txt b/requirements.txt index 5edc5cd..2e4a544 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ cached-property python-dateutil +six >= 1.13.0 diff --git a/setup.py b/setup.py index f8867c9..983c9a1 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,9 @@ from distutils.core import setup 
install_reqs = ['cached-property', - 'python-dateutil', ] + 'python-dateutil', + "six >= 1.13.0"] + if sys.version_info < (3, 4): install_reqs.extend([ "backports.statistics", diff --git a/tests/chrome/__init__.py b/tests/chrome/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/chrome/test_chrome_entry.py b/tests/chrome/test_chrome_entry.py new file mode 100644 index 0000000..d927546 --- /dev/null +++ b/tests/chrome/test_chrome_entry.py @@ -0,0 +1,139 @@ +import pytest +from haralyzer import HarPage, HarEntry + + +PAGE_ID = 'page_1' + + +def test_entry(har_data): + """ + Tests that HarEntry class works + """ + init_data = har_data("chrome.har") + single_entry = HarPage(PAGE_ID, har_data=init_data).entries[1] + assert isinstance(single_entry, HarEntry) + assert str(single_entry) == "HarEntry for https://jwhite.network/" + assert repr(single_entry) == "HarEntry for https://jwhite.network/" + + assert single_entry.cache == {} + assert len(single_entry.cookies) == 0 + assert single_entry.pageref == "page_1" + assert single_entry.port == 249 + assert single_entry.status == 301 + assert single_entry.secure is False + assert single_entry.serverAddress == "104.27.152.17" + assert single_entry.time == 110.02700000244658 + assert single_entry.timings == { + "blocked": 0.5099999980302528, + "dns": 0, + "ssl": 36.527, + "connect": 62.269, + "send": 1.0060000000000002, + "wait": 44.8429999964661, + "receive": 1.3990000079502352, + "_blocked_queueing": 0.5099999980302528 + } + assert single_entry.url == "https://jwhite.network/" + + +def test_request(har_data): + """ + Tests that HarEntry.request has the correct data + """ + init_data = har_data("chrome.har") + request = HarPage(PAGE_ID, har_data=init_data).entries[1].request + assert str(request) == "HarEntry.Request for https://jwhite.network/" + assert repr(request) == "HarEntry.Request for https://jwhite.network/" + + assert request.accept == "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" + assert request.cookies == [{ + "name": "__cfduid", + "value": "df477fc1d24c2bbce2fe8127a020316a11598723802", + "expires": None, + "httpOnly": False, + "secure": False + }] + assert request.bodySize == 0 + assert request.cacheControl == "no-cache" + assert request.encoding == "gzip, deflate, br" + assert len(request.headers) == 16 + assert request.headersSize == -1 + assert request.host is None + assert request.httpVersion == "http/2.0" + assert request.language == "en-US,en;q=0.9" + assert request.method == "GET" + assert len(request.queryString) == 0 + assert request.url == "https://jwhite.network/" + assert request.userAgent == "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" + + assert request.get_header_value("Connection") is None + + +def test_response(har_data): + """ + Tests the HarEntry.response has the correct data + """ + init_data = har_data("chrome.har") + response = HarPage(PAGE_ID, har_data=init_data).entries[1].response + assert response.bodySize == -1 + assert response.cacheControl == "max-age=3600" + assert response.contentSecurityPolicy is None + assert response.contentSize == 0 + assert response.contentType is None + assert response.date == "Thu, 24 Sep 2020 22:22:57 GMT" + assert len(response.headers) == 13 + assert response.headersSize == -1 + assert response.httpVersion == "http/2.0" + assert response.lastModified is None + assert response.mimeType == 
"x-unknown" + assert response.redirectURL == "https://www.jwhite.network" + assert response.status == 301 + assert response.statusText == "" + with pytest.raises(KeyError): + assert len(response.text) + + assert response.get_header_value("Server") == "cloudflare" + + +def test_backwards(har_data): + """ + Tests that HarEntry class works if expecting dictionary. + Made so it is a non-breaking change + """ + init_data = har_data("chrome.har") + single_entry = HarPage(PAGE_ID, har_data=init_data).entries[1] + assert single_entry["cache"] == {} + assert single_entry["pageref"] == "page_1" + assert single_entry["connection"] == "249" + with pytest.raises(KeyError): + assert single_entry["_securityState"] + assert single_entry["serverIPAddress"] == "104.27.152.17" + assert single_entry["time"] == 110.02700000244658 + assert single_entry["timings"] == { + "blocked": 0.5099999980302528, + "dns": 0, + "ssl": 36.527, + "connect": 62.269, + "send": 1.0060000000000002, + "wait": 44.8429999964661, + "receive": 1.3990000079502352, + "_blocked_queueing": 0.5099999980302528 + } + assert single_entry["request"]["method"] == "GET" + + assert len(single_entry) == 12 + assert len(single_entry.keys()) == 12 + assert len(single_entry.items()) == 12 + + assert single_entry.get("time") == 110.02700000244658 + assert single_entry.get("NothingHere", "Default") == "Default" + + assert single_entry.request["method"] == single_entry.request.get("method") == "GET" + + assert single_entry.response["status"] == single_entry.response.get("status") == 301 + + # MISC TESTS FOR DICT COMPATIBILITY/COVERAGE + single_entry["Testing"] = "HelloWorld" + assert "Testing" in single_entry + del single_entry["Testing"] + assert iter(single_entry) diff --git a/tests/chrome/test_chrome_page.py b/tests/chrome/test_chrome_page.py new file mode 100644 index 0000000..a2dac91 --- /dev/null +++ b/tests/chrome/test_chrome_page.py @@ -0,0 +1,168 @@ +import pytest +from haralyzer import HarPage, HarParser +from haralyzer.errors import PageNotFoundError +import re + +BAD_PAGE_ID = 'sup_dawg' +PAGE_ID = 'page_1' + + +def test_init(har_data): + """ + Test the object loading + """ + with pytest.raises(ValueError): + assert HarPage(PAGE_ID) + + init_data = har_data("chrome.har") + + # Throws PageNotFoundException with bad page ID + with pytest.raises(PageNotFoundError): + assert HarPage(BAD_PAGE_ID, har_data=init_data) + + # Make sure it can load with either har_data or a parser + page = HarPage(PAGE_ID, har_data=init_data) + assert isinstance(page, HarPage) + assert repr(page) == "ID: page_1, URL: http://jwhite.network/" + parser = HarParser(init_data) + page = HarPage(PAGE_ID, har_parser=parser) + assert isinstance(page, HarPage) + + assert len(page.entries) == 44 + # Make sure that the entries are actually in order. Going a little bit + # old school here. 
+ for index in range(0, len(page.entries)): + if index != len(page.entries) - 1: + current_date = page.entries[index].startTime + next_date = page.entries[index + 1].startTime + assert current_date <= next_date + + +def test_filter_entries(har_data): + """ + Tests ability to filter entries, with or without regex + """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + + # Filter by request type only + entries = page.filter_entries(request_type='.*ET') + assert len(entries) == 41 + for entry in entries: + assert entry.request.method == entry["request"]["method"] == 'GET' + + # Filter by request type and content_type + entries = page.filter_entries(request_type='.*ET', content_type='image.*') + assert len(entries) == 11 + for entry in entries: + assert entry.request.method == entry["request"]["method"] == 'GET' + for header in entry.request.headers: + if header['name'] == 'Content-Type': + assert re.search('image.*', header['value']) + + # Filter by request type, content type, and status code + entries = page.filter_entries(request_type='.*ET', content_type='image.*', + status_code='2.*') + assert len(entries) == 11 + for entry in entries: + assert entry.request.method == entry["request"]["method"] == 'GET' + assert re.search('2.*', str(entry.response.status)) + for header in entry.response.headers: + if header['name'] == 'Content-Type': + assert re.search('image.*', header['value']) + for header in entry["response"]["headers"]: + if header['name'] == 'Content-Type': + assert re.search('image.*', header['value']) + + entries = page.filter_entries(request_type='.*ST') + assert len(entries) == 3 + entries = page.filter_entries(request_type='.*ET', content_type='video.*') + assert len(entries) == 0 + entries = page.filter_entries(request_type='.*ET', content_type='image.*', + status_code='3.*') + assert len(entries) == 0 + + +def test_entries(har_data): + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + + for entry in page.entries: + assert entry.pageref == entry["pageref"] == page.page_id + + +def test_request_types(har_data): + """ + Test request type filters + """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + + # Check request type lists + for req in page.get_requests: + assert req.request.method == req["request"]["method"] == 'GET' + + for req in page.post_requests: + assert req.request.method == req["request"]["method"] == 'POST' + + +def test_load_times(har_data): + """ + This whole test really needs better sample data. I need to make a + web page with like 2-3 of each asset type to really test the load times. + """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + # Check initial page load + assert page.actual_page.request.url == 'https://www.jwhite.network/' + + # Check initial page load times + assert page.initial_load_time == 44.99499999656109 + assert page.content_load_time == 396.14499999879627 + # Check content type browser (async) load times + assert page.image_load_time == 770 + assert page.css_load_time == 170 + assert page.js_load_time == 761 + assert page.html_load_time == 44 + assert page.page_load_time == 621.5909999955329 + # TODO - Need to get sample data for these types + assert page.audio_load_time == 0 + assert page.video_load_time == 0 + + +def test_time_to_first_byte(har_data): + """ + Tests that TTFB is correctly reported as a property of the page. 
+ """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + assert page.time_to_first_byte == 157.19699999317527 + + +def test_hostname(har_data): + """ + Makes sure that the correct hostname is returned. + """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + assert page.hostname is None + + +def test_url(har_data): + """ + Makes sure that the correct URL is returned. + """ + init_data = har_data("chrome.har") + page = HarPage(PAGE_ID, har_data=init_data) + assert page.url == 'http://jwhite.network/' + + +def test_redirect(har_data): + """ + Makes sure that the entry.redirectURL works + """ + init_data = har_data("chrome.har") + entry = HarPage(PAGE_ID, har_data=init_data).entries[0] + assert entry.response.redirectURL == "https://jwhite.network/" + + diff --git a/tests/conftest.py b/tests/conftest.py index 7822799..9929a4f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ @pytest.fixture -def har_data(request): +def har_data(): """ Given a HAR file name, returns a ``dict`` of this data from the corresponding file name in tests/data @@ -19,7 +19,7 @@ def load_doc(filename): @pytest.fixture -def header_types(request): +def header_types(): """ Just returns all the headers we need to test """ diff --git a/tests/data/chrome.har b/tests/data/chrome.har new file mode 100644 index 0000000..f8d3851 --- /dev/null +++ b/tests/data/chrome.har @@ -0,0 +1,10739 @@ +{ + "log": { + "version": "1.2", + "creator": { + "name": "WebInspector", + "version": "537.36" + }, + "pages": [ + { + "startedDateTime": "2020-09-24T22:22:57.663Z", + "id": "page_1", + "title": "http://jwhite.network/", + "pageTimings": { + "onContentLoad": 396.14499999879627, + "onLoad": 621.5909999955329 + } + } + ], + "entries": [ + { + "_initiator": { + "type": "other" + }, + "_priority": "VeryHigh", + "_resourceType": "document", + "cache": {}, + "pageref": "page_1", + "request": { + "method": "GET", + "url": "http://jwhite.network/", + "httpVersion": "HTTP/1.1", + "headers": [ + { + "name": "Upgrade-Insecure-Requests", + "value": "1" + }, + { + "name": "User-Agent", + "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" + }, + { + "name": "Accept", + "value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" + } + ], + "queryString": [], + "cookies": [], + "headersSize": 320, + "bodySize": 0 + }, + "response": { + "status": 307, + "statusText": "Internal Redirect", + "httpVersion": "HTTP/1.1", + "headers": [ + { + "name": "Location", + "value": "https://jwhite.network/" + }, + { + "name": "Non-Authoritative-Reason", + "value": "HSTS" + } + ], + "cookies": [], + "content": { + "size": 0, + "mimeType": "x-unknown", + "compression": 101 + }, + "redirectURL": "https://jwhite.network/", + "headersSize": 101, + "bodySize": -101, + "_transferSize": 0, + "_error": null + }, + "serverIPAddress": "", + "startedDateTime": "2020-09-24T22:22:57.660Z", + "time": 5.87200000154553, + "timings": { + "blocked": 2.7460000038747676, + "dns": -1, + "ssl": -1, + "connect": -1, + "send": 0, + "wait": 1.6978010525248166e-9, + "receive": 3.125999995972961, + "_blocked_queueing": 2.6980000038747676 + } + }, + { + "_initiator": { + "type": "other" + }, + "_priority": "VeryHigh", + "_resourceType": "document", + "cache": {}, + "connection": "249", + "pageref": "page_1", + "request": { + "method": "GET", + "url": 
"https://jwhite.network/", + "httpVersion": "http/2.0", + "headers": [ + { + "name": ":method", + "value": "GET" + }, + { + "name": ":authority", + "value": "jwhite.network" + }, + { + "name": ":scheme", + "value": "https" + }, + { + "name": ":path", + "value": "/" + }, + { + "name": "pragma", + "value": "no-cache" + }, + { + "name": "cache-control", + "value": "no-cache" + }, + { + "name": "upgrade-insecure-requests", + "value": "1" + }, + { + "name": "user-agent", + "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" + }, + { + "name": "accept", + "value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" + }, + { + "name": "sec-fetch-site", + "value": "none" + }, + { + "name": "sec-fetch-mode", + "value": "navigate" + }, + { + "name": "sec-fetch-user", + "value": "?1" + }, + { + "name": "sec-fetch-dest", + "value": "document" + }, + { + "name": "accept-encoding", + "value": "gzip, deflate, br" + }, + { + "name": "accept-language", + "value": "en-US,en;q=0.9" + }, + { + "name": "cookie", + "value": "__cfduid=df477fc1d24c2bbce2fe8127a020316a11598723802" + } + ], + "queryString": [], + "cookies": [ + { + "name": "__cfduid", + "value": "df477fc1d24c2bbce2fe8127a020316a11598723802", + "expires": null, + "httpOnly": false, + "secure": false + } + ], + "headersSize": -1, + "bodySize": 0 + }, + "response": { + "status": 301, + "statusText": "", + "httpVersion": "http/2.0", + "headers": [ + { + "name": "status", + "value": "301" + }, + { + "name": "date", + "value": "Thu, 24 Sep 2020 22:22:57 GMT" + }, + { + "name": "location", + "value": "https://www.jwhite.network" + }, + { + "name": "cf-ray", + "value": "5d7fe83af9320d8e-IAD" + }, + { + "name": "cache-control", + "value": "max-age=3600" + }, + { + "name": "expires", + "value": "Thu, 24 Sep 2020 23:22:57 GMT" + }, + { + "name": "strict-transport-security", + "value": "max-age=31536000; includeSubDomains; preload" + }, + { + "name": "vary", + "value": "Accept-Encoding" + }, + { + "name": "cf-request-id", + "value": "0563cf78d900000d8e373c3200000001" + }, + { + "name": "expect-ct", + "value": "max-age=604800, report-uri=\"https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct\"" + }, + { + "name": "x-content-type-options", + "value": "nosniff" + }, + { + "name": "server", + "value": "cloudflare" + }, + { + "name": "alt-svc", + "value": "h3-27=\":443\"; ma=86400, h3-28=\":443\"; ma=86400, h3-29=\":443\"; ma=86400" + } + ], + "cookies": [], + "content": { + "size": 0, + "mimeType": "x-unknown" + }, + "redirectURL": "https://www.jwhite.network", + "headersSize": -1, + "bodySize": -1, + "_transferSize": 381, + "_error": null + }, + "serverIPAddress": "104.27.152.17", + "startedDateTime": "2020-09-24T22:22:57.666Z", + "time": 110.02700000244658, + "timings": { + "blocked": 0.5099999980302528, + "dns": 0, + "ssl": 36.527, + "connect": 62.269, + "send": 1.0060000000000002, + "wait": 44.8429999964661, + "receive": 1.3990000079502352, + "_blocked_queueing": 0.5099999980302528 + } + }, + { + "_initiator": { + "type": "other" + }, + "_priority": "VeryHigh", + "_resourceType": "document", + "cache": {}, + "connection": "249", + "pageref": "page_1", + "request": { + "method": "GET", + "url": "https://www.jwhite.network/", + "httpVersion": "http/2.0", + "headers": [ + { + "name": ":method", + "value": "GET" + }, + { + "name": ":authority", + "value": "www.jwhite.network" + }, + { + "name": 
":scheme", + "value": "https" + }, + { + "name": ":path", + "value": "/" + }, + { + "name": "pragma", + "value": "no-cache" + }, + { + "name": "cache-control", + "value": "no-cache" + }, + { + "name": "upgrade-insecure-requests", + "value": "1" + }, + { + "name": "user-agent", + "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" + }, + { + "name": "accept", + "value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" + }, + { + "name": "sec-fetch-site", + "value": "none" + }, + { + "name": "sec-fetch-mode", + "value": "navigate" + }, + { + "name": "sec-fetch-user", + "value": "?1" + }, + { + "name": "sec-fetch-dest", + "value": "document" + }, + { + "name": "accept-encoding", + "value": "gzip, deflate, br" + }, + { + "name": "accept-language", + "value": "en-US,en;q=0.9" + }, + { + "name": "cookie", + "value": "__cfduid=df477fc1d24c2bbce2fe8127a020316a11598723802" + } + ], + "queryString": [], + "cookies": [ + { + "name": "__cfduid", + "value": "df477fc1d24c2bbce2fe8127a020316a11598723802", + "expires": null, + "httpOnly": false, + "secure": false + } + ], + "headersSize": -1, + "bodySize": 0 + }, + "response": { + "status": 200, + "statusText": "", + "httpVersion": "http/2.0", + "headers": [ + { + "name": "status", + "value": "200" + }, + { + "name": "date", + "value": "Thu, 24 Sep 2020 22:22:57 GMT" + }, + { + "name": "content-type", + "value": "text/html; charset=utf-8" + }, + { + "name": "cf-ray", + "value": "5d7fe83b39bf0d8e-IAD" + }, + { + "name": "age", + "value": "102" + }, + { + "name": "cache-control", + "value": "max-age=31536000" + }, + { + "name": "last-modified", + "value": "Sat, 29 Aug 2020 20:36:06 GMT" + }, + { + "name": "strict-transport-security", + "value": "max-age=31536000; includeSubDomains; preload" + }, + { + "name": "vary", + "value": "x-fh-requested-host, accept-encoding" + }, + { + "name": "cf-cache-status", + "value": "HIT" + }, + { + "name": "cf-request-id", + "value": "0563cf790200000d8e373c7200000001" + }, + { + "name": "content-security-policy", + "value": "default-src 'none' ; script-src 'self' https://ajax.cloudflare.com https://cdnjs.cloudflare.com/ajax/libs/modernizr/2.8.3/modernizr.min.js https://cdn.jsdelivr.net/gh/hubspot/pace@1.0.2/pace.min.js https://cdn.jsdelivr.net/gh/jquery/jquery@3.5.1/dist/jquery.min.js https://*.linkedin.com https://*.licdn.com https://static.cloudflareinsights.com/beacon.min.js; style-src 'self' https://*.licdn.com; img-src 'self' https://*.licdn.com; font-src 'self' data:; media-src 'self' ; object-src 'none'; base-uri 'self'; connect-src https://www.jwhite.network/; frame-ancestors 'none'; report-uri https://jwhite.report-uri.com/r/d/csp/enforce; form-action 'none'" + }, + { + "name": "expect-ct", + "value": "max-age=604800, report-uri=\"https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct\"" + }, + { + "name": "expect-staple", + "value": "max-age=31536000; report-uri='https://jwhite.report-uri.com/r/d/staple/enforce'; includeSubDomains; preload" + }, + { + "name": "feature-policy", + "value": "vibrate 'none'; ambient-light-sensor 'none'; autoplay 'none'; accelerometer 'none'; camera 'none'; document-domain 'none'; encrypted-media 'none'; fullscreen 'none'; geolocation 'none'; gyroscope 'none'; magnetometer 'none'; microphone 'none'; midi 'none'; payment 'none'; picture-in-picture 'none'; speaker 'none'; sync-xhr 'none'; vr 'none'" + }, + { + "name": 
"hello", + "value": "Welcome to my website" + }, + { + "name": "nel", + "value": "{\"report_to\":\"default\",\"max_age\":31536000,\"include_subdomains\":true}" + }, + { + "name": "referrer-policy", + "value": "strict-origin" + }, + { + "name": "report-to", + "value": "{\"group\":\"default\",\"max_age\":31536000,\"endpoints\":[{\"url\":\"https://jwhite.report-uri.com/a/d/g\"}],\"include_subdomains\":true}" + }, + { + "name": "x-cache", + "value": "MISS" + }, + { + "name": "x-cache-hits", + "value": "0" + }, + { + "name": "x-content-type-options", + "value": "nosniff" + }, + { + "name": "x-frame-options", + "value": "DENY" + }, + { + "name": "x-served-by", + "value": "cache-bwi5047-BWI" + }, + { + "name": "x-timer", + "value": "S1598734736.090096,VS0,VE281" + }, + { + "name": "x-xss-protection", + "value": "1; report=https://jwhite.report-uri.com/r/d/xss/enforce" + }, + { + "name": "server", + "value": "cloudflare" + }, + { + "name": "content-encoding", + "value": "br" + }, + { + "name": "alt-svc", + "value": "h3-27=\":443\"; ma=86400, h3-28=\":443\"; ma=86400, h3-29=\":443\"; ma=86400" + } + ], + "cookies": [], + "content": { + "size": 18992, + "mimeType": "text/html", + "text": "\n